Commit 449e9ec

Add RL model tests and improve the models
1 parent 4093dbc commit 449e9ec

22 files changed: +295 −59 lines changed
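
Summary of the change: RLPlatform's step() and test() now return the environment's stepInfo object directly instead of unpacking it into a [state, reward, done] tuple; every caller (the view code and the DP/GA models) is updated to destructure { state, reward, done }; episode-history bookkeeping moves out of the Monte Carlo and policy-gradient views into MCAgent and PGAgent via new reset() and per-step update() methods; and Jest tests are added that train the A2C and DQN agents on the CartPole environment.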

js/platform/rl.js
Lines changed: 2 additions & 3 deletions

@@ -193,12 +193,11 @@ export default class RLPlatform extends BasePlatform {
 			this._plotter.printStep()
 			this._plotter.plotRewards()
 		}
-		return [stepInfo.state, stepInfo.reward, stepInfo.done]
+		return stepInfo
 	}
 
 	test(state, action, agent) {
-		const stepInfo = this._env.test(state, action, agent);
-		return [stepInfo.state, stepInfo.reward, stepInfo.done]
+		return this._env.test(state, action, agent)
 	}
 
 	sample_action(agent) {
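
For callers, the upshot is a switch from tuple to object destructuring (a minimal sketch; `env` stands for any RLPlatform instance):

	// before this commit:
	// const [next_state, reward, done] = env.step(action, agent)
	// after — step() forwards the environment's stepInfo object as-is:
	const { state, reward, done } = env.step(action, agent)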

js/view/a2c.js
Lines changed: 3 additions & 3 deletions

@@ -2,7 +2,7 @@ import A2CAgent from '../../lib/model/a2c.js'
 
 class A2CCBAgent {
 	constructor(env, resolution, layers, optimizer, use_worker, cb) {
-		this._agent = new A2CAgent(env, resolution, 50, layers, optimizer)
+		this._agent = new A2CAgent(env.env, resolution, 50, layers, optimizer)
 		cb && cb()
 	}
 
@@ -61,10 +61,10 @@ var dispA2C = function (elm, env) {
 		const learning_rate = +elm.select('[name=learning_rate]').property('value')
 		const batch = +elm.select('[name=batch]').property('value')
 		agent.get_action(cur_state, action => {
-			const [next_state, reward, done] = env.step(action, agent)
+			const { state, done } = env.step(action, agent)
 			agent.update(done, learning_rate, batch, () => {
 				const end_proc = () => {
-					cur_state = next_state
+					cur_state = state
 					cb && cb(done)
 				}
 				if (render) {

js/view/dqn.js
Lines changed: 3 additions & 3 deletions

@@ -70,10 +70,10 @@ var dispDQN = function (elm, env) {
 		const learning_rate = +elm.select('[name=learning_rate]').property('value')
 		const batch = +elm.select('[name=batch]').property('value')
 		agent.get_action(cur_state, Math.max(min_greedy_rate, greedy_rate * greedy_rate_update), action => {
-			let [next_state, reward, done] = env.step(action, agent)
-			agent.update(action, cur_state, next_state, reward, done, learning_rate, batch, () => {
+			const { state, reward, done } = env.step(action, agent)
+			agent.update(action, cur_state, state, reward, done, learning_rate, batch, () => {
 				const end_proc = () => {
-					cur_state = next_state
+					cur_state = state
 					if (done || env.epoch % 1000 === 999) {
 						elm.select('[name=greedy_rate]').property('value', greedy_rate * greedy_rate_update)
 					}

js/view/dynamic_programming.js
Lines changed: 2 additions & 2 deletions

@@ -54,9 +54,9 @@ var dispDP = function (elm, env) {
 	;(function loop() {
 		if (isMoving) {
 			const action = agent.get_action(cur_state)
-			const [next_state, reward, done] = env.step(action, agent)
+			const { state } = env.step(action, agent)
 			env.render(() => agent.get_score())
-			cur_state = next_state
+			cur_state = state
 			setTimeout(loop, 10)
 		}
 	})()

js/view/genetic_algorithm.js
Lines changed: 4 additions & 4 deletions

@@ -62,11 +62,11 @@ var dispGeneticAlgorithm = function (elm, env) {
 		testButton.attr('value', isTesting ? 'Stop' : 'Test')
 		if (isTesting) {
 			const topAgent = agent.top_agent()
-			let state = env.reset(topAgent)
+			let cur_state = env.reset(topAgent)
 			void (function loop() {
-				const action = topAgent.get_action(state)
-				const [next_state, reward, done] = env.step(action, topAgent)
-				state = next_state
+				const action = topAgent.get_action(cur_state)
+				const { state, done } = env.step(action, topAgent)
+				cur_state = state
 				env.render()
 				if (isTesting && !done) {
 					setTimeout(() => loop(), 0)

js/view/monte_carlo.js
Lines changed: 4 additions & 10 deletions

@@ -7,27 +7,21 @@ var dispMC = function (elm, env) {
 	let cur_state = env.reset(agent)
 	env.render(() => agent.get_score())
 
-	let action_history = []
-
 	const step = (render = true) => {
 		const greedy_rate = +elm.select('[name=greedy_rate]').property('value')
 		const action = agent.get_action(cur_state, greedy_rate)
-		const [next_state, reward, done] = env.step(action, agent)
-		action_history.push([action, cur_state, reward])
+		const { state, reward, done } = env.step(action, agent)
+		agent.update(action, cur_state, reward, done)
 		if (render) {
 			env.render()
 		}
-		cur_state = next_state
-		if (done) {
-			agent.update(action_history)
-			action_history = []
-		}
+		cur_state = state
 		return done
 	}
 
 	const reset = () => {
 		cur_state = env.reset(agent)
-		action_history = []
+		agent.reset()
 		env.render(() => agent.get_score())
 	}

js/view/policy_gradient.js
Lines changed: 4 additions & 10 deletions

@@ -7,27 +7,21 @@ var dispPolicyGradient = function (elm, env) {
 	let cur_state = env.reset(agent)
 	env.render(() => agent.get_score())
 
-	let action_history = []
-
 	const step = (render = true) => {
 		const learning_rate = +elm.select('[name=learning_rate]').property('value')
 		const action = agent.get_action(cur_state)
-		const [next_state, reward, done] = env.step(action, agent)
-		action_history.push([action, cur_state, reward])
+		const { state, reward, done } = env.step(action, agent)
+		agent.update(action, cur_state, reward, done, learning_rate)
 		if (render) {
 			env.render()
 		}
-		cur_state = next_state
-		if (done) {
-			agent.update(action_history, learning_rate)
-			action_history = []
-		}
+		cur_state = state
 		return done
 	}
 
 	const reset = () => {
 		cur_state = env.reset(agent)
-		action_history = []
+		agent.reset()
 		env.render(() => agent.get_score())
 	}

js/view/q_learning.js
Lines changed: 3 additions & 3 deletions

@@ -10,16 +10,16 @@ var dispQLearning = function (elm, env) {
 	const step = (render = true) => {
 		const greedy_rate = +elm.select('[name=greedy_rate]').property('value')
 		const action = agent.get_action(cur_state, greedy_rate)
-		const [next_state, reward, done] = env.step(action, agent)
-		agent.update(action, cur_state, next_state, reward)
+		const { state, reward, done } = env.step(action, agent)
+		agent.update(action, cur_state, state, reward)
 		if (render) {
 			if (env.epoch % 10 === 0) {
 				env.render(() => agent.get_score())
 			} else {
 				env.render()
 			}
 		}
-		cur_state = next_state
+		cur_state = state
 		return done
 	}

js/view/sarsa.js
Lines changed: 3 additions & 3 deletions

@@ -10,16 +10,16 @@ var dispSARSA = function (elm, env) {
 	const step = (render = true) => {
 		const greedy_rate = +elm.select('[name=greedy_rate]').property('value')
 		const action = agent.get_action(cur_state, greedy_rate)
-		const [next_state, reward, done] = env.step(action, agent)
-		agent.update(action, cur_state, next_state, reward)
+		const { state, reward, done } = env.step(action, agent)
+		agent.update(action, cur_state, state, reward)
 		if (render) {
 			if (env.epoch % 10 === 0) {
 				env.render(() => agent.get_score())
 			} else {
 				env.render()
 			}
 		}
-		cur_state = next_state
+		cur_state = state
 		if (done) {
 			agent.reset()
 		}

lib/model/a2c.js
Lines changed: 1 addition & 1 deletion

@@ -162,7 +162,7 @@ export default class A2CAgent {
 	constructor(env, resolution, procs, layers, optimizer) {
 		this._net = new ActorCriticNet(env, resolution, layers, optimizer)
 		this._procs = procs
-		this._env = env.env
+		this._env = env
 		this._advanced_step = 5
 		this._gamma = 0.99
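
Paired with the js/view/a2c.js change above, this inverts who unwraps the platform: A2CAgent now takes the raw environment, and the view passes `env.env` instead of the model doing the unwrapping internally. A sketch of the resulting construction (assuming, as the view code suggests, that the platform exposes its underlying environment as `.env`; the layer spec here is illustrative):

	const agent = new A2CAgent(platform.env, 20, 50, layers, 'adam')

The new test below instead passes a bare CartPoleRLEnvironment directly, which this change makes possible.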

lib/model/dynamic_programming.js
Lines changed: 4 additions & 4 deletions

@@ -48,8 +48,8 @@ class DPTable extends QTableBase {
 		let vs = []
 		a.fill(0)
 		do {
-			let [y, reward, done] = this._env.test(this._state_value(x), this._action_value(a))
-			y = this._state_index(y)
+			const { state, reward, done } = this._env.test(this._state_value(x), this._action_value(a))
+			const y = this._state_index(state)
 			const [s, e] = this._to_position(this._state_sizes, y)
 			const v = reward + this._gamma * lastV[s]
 			const [_, ps] = this._q(x, a)
@@ -75,8 +75,8 @@ class DPTable extends QTableBase {
 		a.fill(0)
 		const x_state = this._state_value(x)
 		do {
-			let [y, reward, done] = this._env.test(x_state, this._action_value(a))
-			y = this._state_index(y)
+			const { state, reward, done } = this._env.test(x_state, this._action_value(a))
+			const y = this._state_index(state)
 			const [s, e] = this._to_position(this._state_sizes, y)
 			const v = reward + this._gamma * lastV[s]
 			const [_, ps] = this._q(x, a)
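
In the unchanged surrounding lines, `const v = reward + this._gamma * lastV[s]` is the standard Bellman backup v = r + γ·V(s′); the edit only changes how the successor state index `y` is obtained, destructuring `state` from the object now returned by `this._env.test()` instead of reassigning a tuple element in place.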

lib/model/genetic_algorithm.js
Lines changed: 4 additions & 4 deletions

@@ -113,12 +113,12 @@ class GeneticAlgorithmAgent {
 	}
 
 	run(env) {
-		let state = env.reset(this)
+		let cur_state = env.reset(this)
 		let c = 0
 		while (c++ < this._max_epoch) {
-			const action = this.get_action(state)
-			const [next_state, reward, done] = env.step(action, this)
-			state = next_state
+			const action = this.get_action(cur_state)
+			const { state, reward, done } = env.step(action, this)
+			cur_state = state
 			this._total_reward += reward
 			if (done) break
 		}

lib/model/monte_carlo.js
Lines changed: 19 additions & 3 deletions

@@ -32,6 +32,15 @@ export default class MCAgent {
 	constructor(env, resolution = 20) {
 		this._env = env
 		this._table = new MCTable(env, resolution)
+
+		this._history = []
+	}
+
+	/**
+	 * Reset agent.
+	 */
+	reset() {
+		this._history = []
 	}
 
 	/**
@@ -58,9 +67,16 @@ export default class MCAgent {
 
 	/**
 	 * Update model.
-	 * @param {*[]} actions
+	 * @param {*[]} action
+	 * @param {*[]} state
+	 * @param {number} reward
+	 * @param {boolean} done
 	 */
-	update(actions) {
-		this._table.update(actions)
+	update(action, state, reward, done) {
+		this._history.push([action, state, reward])
+		if (done) {
+			this._table.update(this._history)
+			this._history = []
+		}
 	}
 }
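
A minimal training-loop sketch against the new per-step API (the `env` and `agent` names are illustrative; compare the updated js/view/monte_carlo.js above):

	let curState = env.reset(agent)
	for (;;) {
		const action = agent.get_action(curState, 0.1)
		const { state, reward, done } = env.step(action, agent)
		// the agent buffers [action, state, reward] internally and
		// flushes the whole episode into its table once done is true
		agent.update(action, curState, reward, done)
		curState = state
		if (done) break
	}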

lib/model/policy_gradient.js
Lines changed: 19 additions & 3 deletions

@@ -93,6 +93,15 @@ export default class PGAgent {
 	 */
 	constructor(env, resolution = 20) {
 		this._table = new SoftmaxPolicyGradient(env, resolution)
+
+		this._history = []
+	}
+
+	/**
+	 * Reset agent.
+	 */
+	reset() {
+		this._history = []
 	}
 
 	/**
@@ -114,10 +123,17 @@ export default class PGAgent {
 
 	/**
	 * Update model.
-	 * @param {*[]} actions
+	 * @param {*[]} action
+	 * @param {*[]} state
+	 * @param {number} reward
+	 * @param {boolean} done
 	 * @param {number} learning_rate
 	 */
-	update(actions, learning_rate) {
-		this._table.update(actions, learning_rate)
+	update(action, state, reward, done, learning_rate) {
+		this._history.push([action, state, reward])
+		if (done) {
+			this._table.update(this._history, learning_rate)
+			this._history = []
+		}
 	}
 }
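
PGAgent follows the same buffering pattern as MCAgent; the only signature difference is the trailing learning rate, so a step in the loop sketched above becomes `agent.update(action, curState, reward, done, learning_rate)`.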

tests/lib/model/a2c.test.js
Lines changed: 32 additions & 0 deletions

@@ -0,0 +1,32 @@
+import { jest } from '@jest/globals'
+jest.retryTimes(3)
+
+import A2CAgent from '../../../lib/model/a2c.js'
+import CartPoleRLEnvironment from '../../../lib/rl/cartpole.js'
+
+test('default', () => {
+	const env = new CartPoleRLEnvironment()
+	const agent = new A2CAgent(env, 20, 10, [{ type: 'full', out_size: 5, activation: 'tanh' }], 'adam')
+	for (let i = 0; i < 10000; i++) {
+		agent.update(true, 0.01, 10)
+	}
+
+	let totalReward = 0
+	let curState = env.reset()
+	while (true) {
+		const action = agent.get_action(curState)
+		const { state, reward, done } = env.step(action)
+		totalReward += reward
+		curState = state
+		if (done) {
+			break
+		}
+	}
+	expect(totalReward).toBeGreaterThan(150)
+	const score = agent.get_score()
+	expect(score).toHaveLength(20)
+	expect(score[0]).toHaveLength(20)
+	expect(score[0][0]).toHaveLength(20)
+	expect(score[0][0][0]).toHaveLength(20)
+	expect(score[0][0][0][0]).toHaveLength(2)
+})

tests/lib/model/dqn.test.js
Lines changed: 34 additions & 0 deletions

@@ -0,0 +1,34 @@
+import { jest } from '@jest/globals'
+jest.retryTimes(3)
+
+import DQNAgent from '../../../lib/model/dqn.js'
+import CartPoleRLEnvironment from '../../../lib/rl/cartpole.js'
+
+test('default', () => {
+	const env = new CartPoleRLEnvironment()
+	const agent = new DQNAgent(env, 20, [{ type: 'full', out_size: 10, activation: 'tanh' }], 'adam')
+
+	const totalRewards = []
+	const n = 200
+	for (let i = 0; i < n; i++) {
+		let curState = env.reset()
+		totalRewards[i] = 0
+		while (true) {
+			const action = agent.get_action(curState, 1 - (i / n) ** 2)
+			const { state, reward, done } = env.step(action)
+			agent.update(action, curState, state, reward, done, 0.001, 10)
+			totalRewards[i] += reward
+			curState = state
+			if (done) {
+				break
+			}
+		}
+	}
+	expect(totalRewards.slice(-5).reduce((s, v) => s + v, 0) / 5).toBeGreaterThan(150)
+	const score = agent.get_score()
+	expect(score).toHaveLength(20)
+	expect(score[0]).toHaveLength(20)
+	expect(score[0][0]).toHaveLength(20)
+	expect(score[0][0][0]).toHaveLength(20)
+	expect(score[0][0][0][0]).toHaveLength(2)
+})
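
Both tests call `jest.retryTimes(3)` because RL training is stochastic: an unlucky run can miss the total-reward threshold of 150 on CartPole, so Jest retries each test up to three times before reporting a failure. The trailing `get_score()` assertions pin the score-table shape: resolution 20 along each of CartPole's four state dimensions, times its two actions.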
