Jun / Sep 25 2019
Chapter 10 Mountain Car
using ReinforcementLearning, ReinforcementLearningEnvironments, RLIntro.MountainCar
Note that the MountainCar in RLIntro is slightly different from the one in ReinforcementLearningEnvironments.
using Plots
using StatsBase
using SparseArrays
env = MountainCar.MountainCarEnv()
MountainCarEnv(-0.534187, 0.0, MultiContinuousSpace{(2,),1}([-1.2, -0.07], [0.5, 0.07]), DiscreteSpace{Int64}(1, 3, 3))
obs_space = observation_space(env)
MultiContinuousSpace{(2,),1}([-1.2, -0.07], [0.5, 0.07])
ns, na = length(observation_space(env)), length(action_space(env))
(2, 3)
# Tile coding: 8 tilings, each a 9×9 grid of tiles (8 tiles spanning each dimension
# plus one for padding), with successive tilings offset by 1/8 of a tile width.
ntilings = 8
ntiles = 8
tiling = Tiling(Tuple(range(l, step=(h-l)/ntiles, length=ntiles+2) for (l, h) in zip(obs_space.low, obs_space.high)))
offset = (obs_space.high .- obs_space.low) ./ (ntiles * ntilings)
tilings = [tiling - offset .* (i-1) for i in 1:ntilings]
TilingPreprocessor(tilings)(obs_space.low), TilingPreprocessor(tilings)(obs_space.high)
([1, 1, 1, 1, 1, 1, 1, 1], [81, 81, 81, 81, 81, 81, 81, 81])
(POSITION_MIN, VELOCITY_MIN), (POSITION_MAX, VELOCITY_MAX) = obs_space.low, obs_space.high
([-1.2, -0.07], [0.5, 0.07])
By using TilingPreprocessor, we transform a state of two scalars (like [-0.55, 0.0]) into a vector of Ints (like [39, 40, 40, 40, 40, 40, 40, 40]), one tile index per tiling. To use the LinearQApproximator, we need to encode this new state together with an action into a feature vector:
const STATE_INDICES = LinearIndices(
    (
        ntilings,
        (ntiles+1)^2,  # the maximum index among the elements of the preprocessed state
        na,
    )
)
encode_state_action(state, action) = sparsevec(
    [STATE_INDICES[i, s, action] for (i, s) in enumerate(state)],
    ones(length(state)),
    length(STATE_INDICES)
)
encode_state_action (generic function with 1 method)
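As a quick sanity check, here is what the encoding produces for the example state above. The exact tile indices depend on the tilings constructed earlier, so the values in the comments are illustrative; the key point is that every feature vector has exactly one active component per tiling, and the linear approximator's estimate is just the dot product of its weights with this sparse vector.
s = TilingPreprocessor(tilings)([-0.55, 0.0])  # e.g. [39, 40, 40, 40, 40, 40, 40, 40]
x = encode_state_action(s, 1)
length(x), sum(x)  # (1944, 8.0): 8 * 81 * 3 indices in total, one active feature per tiling
w = zeros(length(STATE_INDICES))
w' * x             # 0.0, i.e. q̂(s, a) with zero-initialised weights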
function create_env_agent(α=2e-4, n=0)
    env = WrappedEnv(
        env=MountainCar.MountainCarEnv(),
        preprocessor=TilingPreprocessor(tilings)
    )
    agent = Agent(
        π=QBasedPolicy(
            learner=TDLearner(
                approximator=LinearQApproximator(
                    weights=zeros(length(STATE_INDICES)),
                    feature_func=encode_state_action,
                    actions=collect(1:na)
                ),
                optimizer=Descent(α),
                n=n
            ),
            selector=EpsilonGreedySelector(0.)  # ε = 0: always act greedily
        ),
        buffer=episode_RTSA_buffer(;state_eltype=Vector{Int})
    )
    env, agent
end
create_env_agent (generic function with 3 methods)
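The agent above corresponds to the chapter's (episodic semi-gradient) n-step Sarsa with linear function approximation over the tile-coded features. For intuition, a rough sketch of a single one-step update written against the helpers defined above (this is not the library's internal implementation; γ is 1 for this undiscounted task and the terminal-state case is omitted):
function sarsa_update!(w, s, a, r, s′, a′; α=2e-4)
    x  = encode_state_action(s, a)    # features of the current state-action pair
    x′ = encode_state_action(s′, a′)  # features of the successor pair
    δ  = r + w' * x′ - w' * x         # TD error (undiscounted)
    w .+= α * δ * x                   # semi-gradient step: ∇q̂(s, a, w) = x
    return δ
end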
function show_approximation(n)
    env, agent = create_env_agent()
    run(agent, env, StopAfterEpisode(n))
    # value of the greedy action on a 40×40 grid over (position, velocity)
    [agent.π.learner.approximator(env.preprocessor([p, v])) |> maximum
     for p in range(POSITION_MIN, stop=POSITION_MAX, length=40),
         v in range(VELOCITY_MIN, stop=VELOCITY_MAX, length=40)]
end
show_approximation (generic function with 1 method)
heatmap(show_approximation(1))
heatmap(show_approximation(12))
heatmap(show_approximation(104))
heatmap(show_approximation(1000))
heatmap(show_approximation(9000))
p = plot(legend=:topright)
n_runs = 10  # about 2 seconds per run, quite slow here, need revisit
for α in [0.1/8, 0.2/8, 0.5/8]  # step sizes are divided by the number of tilings
    avg_steps_per_episode = zeros(500)
    for _ in 1:n_runs
        env, agent = create_env_agent(α)
        hook = StepsPerEpisode()
        run(agent, env, StopAfterEpisode(500; is_show_progress=false); hook=hook)
        avg_steps_per_episode .+= hook.steps
    end
    plot!(p, avg_steps_per_episode ./ n_runs, label="α = $α")
end
p
function run_once(α, n)
    env, agent = create_env_agent(α, n)
    hook = StepsPerEpisode()
    run(agent, env, StopAfterEpisode(50), hook=hook)
    mean(hook.steps)
end
run_once (generic function with 1 method)
p = plot(legend=:topright)
for (A, n) in [(0.4:0.1:1.7, 1), (0.3:0.1:1.6, 2), (0.2:0.1:1.4, 4), (0.2:0.1:0.9, 8), (0.2:0.1:0.7, 16)]
    plot!(p, A, [mean(run_once(α/8, n) for _ in 1:5) for α in A], label="n = $n")
end
p