Jun / Sep 25 2019

Chapter 10 Mountain Car

using ReinforcementLearning, ReinforcementLearningEnvironments, RLIntro.MountainCar

Note that the MountainCar environment in RLIntro is slightly different from the one in ReinforcementLearningEnvironments.

using SparseArrays, Statistics, Plots  # Statistics for mean, Plots for the plot/heatmap calls below
env = MountainCar.MountainCarEnv()
MountainCarEnv(-0.534187, 0.0, MultiContinuousSpace{(2,),1}([-1.2, -0.07], [0.5, 0.07]), DiscreteSpace{Int64}(1, 3, 3))
obs_space = observation_space(env)
MultiContinuousSpace{(2,),1}([-1.2, -0.07], [0.5, 0.07])
ns, na = length(observation_space(env)), length(action_space(env))
(2, 3)
ntilings = 8  # number of overlapping tilings
ntiles = 8    # nominal number of tiles per dimension
# base tiling: ntiles + 2 grid points per dimension over the observation space
tiling = Tiling(Tuple(range(l, step=(h-l)/ntiles, length=ntiles+2) for (l, h) in zip(obs_space.low, obs_space.high)))
# shift each successive tiling by 1/ntilings of a tile width in both dimensions
offset = (obs_space.high .- obs_space.low) ./ (ntiles * ntilings)
tilings = [tiling - offset .* (i-1) for i in 1:ntilings]
TilingPreprocessor(tilings)(obs_space.low), TilingPreprocessor(tilings)(obs_space.high)
([1, 1, 1, 1, 1, 1, 1, 1], [81, 81, 81, 81, 81, 81, 81, 81])
(POSITION_MIN, VELOCITY_MIN), (POSITION_MAX, VELOCITY_MAX) = obs_space.low, obs_space.high
([-1.2, -0.07], [0.5, 0.07])
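
Each tiling has (ntiles + 1)^2 = 81 tiles: the ranges above contain ntiles + 2 = 10 grid points, i.e. ntiles + 1 = 9 intervals per dimension, and 9 * 9 = 81. That is why obs_space.high is mapped to index 81 in every tiling above, and the same factor appears when building the state-action indices below. As a quick check:

(ntiles + 1)^2
81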

By using the TilingPreprocessor, we transform a state of two scalars (like [-0.55, 0.0]) into a vector of integers (like [39, 40, 40, 40, 40, 40, 40, 40]), one tile index per tiling. To use the LinearQApproximator, we need to encode this new state together with an action into a feature vector:

const STATE_INDICES = LinearIndices(
    (
        ntilings,
        (ntiles+1)^2, # the maximum index among the elements of the preprocessed state
        na,
    )
)
encode_state_action(state, action) = sparsevec(
    [STATE_INDICES[i, s, action] for (i, s) in enumerate(state)],
    ones(length(state)),
    length(STATE_INDICES)
)
encode_state_action (generic function with 1 method)
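
As a quick illustration (the state is the example quoted in the paragraph above; s and ϕ are throwaway names used only here), the encoded feature vector lives in a space of length(STATE_INDICES) = 8 * 81 * 3 = 1944 entries and should have exactly one active feature per tiling:

s = TilingPreprocessor(tilings)([-0.55, 0.0])  # something like [39, 40, 40, 40, 40, 40, 40, 40]
ϕ = encode_state_action(s, 1)
nnz(ϕ), length(ϕ)  # expected: (8, 1944) -- 8 active tiles out of 8 * 81 * 3 features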
function create_env_agent(α=2e-4, n=0)
    env = WrappedEnv(
        env=MountainCar.MountainCarEnv(),
        preprocessor=TilingPreprocessor(tilings)  # states become vectors of tile indices
    )

    agent = Agent(
        π=QBasedPolicy(
            learner=TDLearner(
                approximator=LinearQApproximator(
                    weights=zeros(length(STATE_INDICES)),
                    feature_func=encode_state_action,
                    actions=collect(1:na)
                    ),
                optimizer=Descent(α),  # plain gradient descent with step size α
                n=n
                ),
            selector=EpsilonGreedySelector(0.)  # ϵ = 0: always act greedily w.r.t. the current estimates
            ),
        buffer=episode_RTSA_buffer(;state_eltype=Vector{Int})  # episodic trajectory buffer; states are the preprocessed Vector{Int}
    )

    env, agent
end
create_env_agent (generic function with 3 methods)
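
For reference, with a linear approximator the TD update used here amounts to the (n-step) semi-gradient Sarsa rule from the book. The sketch below is only illustrative and not the actual TDLearner implementation; sarsa_update!, ϕ and G are made-up names:

using LinearAlgebra

# Illustrative sketch of one semi-gradient Sarsa update (not the package's code):
#   w - the weight vector of the LinearQApproximator
#   ϕ - the sparse feature vector returned by encode_state_action(state, action)
#   G - the return target, e.g. R + γ * q̂(S′, A′) in the one-step case
#   α - the step size passed to Descent
function sarsa_update!(w, ϕ, α, G)
    # q̂(s, a) = w ⋅ ϕ(s, a), so the gradient w.r.t. w is just ϕ and only the
    # eight active tiles have their weights adjusted
    w .+= α .* (G - dot(w, ϕ)) .* ϕ
    return w
end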
function show_approximation(n)
    env, agent = create_env_agent()
    run(agent, env, StopAfterEpisode(n))
    [agent.π.learner.approximator(env.preprocessor([p, v])) |> maximum
        for p in range(POSITION_MIN, stop=POSITION_MAX, length=40),
            v in range(VELOCITY_MIN, stop=VELOCITY_MAX, length=40)]
end
show_approximation (generic function with 1 method)
# snapshots of the estimated value function (max over actions) after an increasing
# number of episodes (cf. the cost-to-go plots in Figure 10.1 of the book)
heatmap(show_approximation(1))
heatmap(show_approximation(12))
heatmap(show_approximation(104))
heatmap(show_approximation(1000))
heatmap(show_approximation(9000))
p = plot(legend=:topright)
n_runs = 10  # about 2 seconds per run; quite slow here, needs revisiting
# learning curves for several step sizes α, averaged over n_runs runs (cf. Figure 10.2 in the book)
for α in [0.1/8, 0.2/8, 0.5/8]
    avg_steps_per_episode = zeros(500)
    for _ in 1:n_runs
        env, agent = create_env_agent(α)
        hook = StepsPerEpisode()
        run(agent, env, StopAfterEpisode(500; is_show_progress=false); hook=hook)
        avg_steps_per_episode .+= hook.steps
    end
    plot!(p, avg_steps_per_episode ./ n_runs, label="α = $α")
end
p
function run_once(α, n)
    env, agent = create_env_agent(α, n)
    hook = StepsPerEpisode()
    run(agent, env, StopAfterEpisode(50), hook=hook)
    mean(hook.steps)
end
run_once (generic function with 1 method)
p = plot(legend=:topright)
# sweep the step size α for several n-step returns; each point is the mean number of
# steps over the first 50 episodes, averaged over 5 runs (cf. Figure 10.4 in the book)
for (A, n) in [(0.4:0.1:1.7, 1), (0.3:0.1:1.6, 2), (0.2:0.1:1.4, 4), (0.2:0.1:0.9, 8), (0.2:0.1:0.7, 16)]
    plot!(p, A, [mean(run_once(α/8, n) for _ in 1:5) for α in A], label="n = $n")
end
p