Jun / Sep 25 2019

Chapter 2: Ten-Armed Testbed

using StatsBase
using ReinforcementLearning, ReinforcementLearningEnvironments, RLIntro.MultiArmBandits
using Plots, StatsPlots

default(size=(800, 600))
env = MultiArmBanditsEnv()
MultiArmBanditsEnv([2.22248, -1.28341, -0.466235, -0.484057, -0.420341, 1.28472, 0.204828, 0.389035, -1.42047, -0.878386], 0.0, 1, false, 0.0, false, DiscreteSpace{Int64}(1, 1, 1), DiscreteSpace{Int64}(1, 10, 10))
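
The environment has 10 arms; the vector shown first above holds their true values q*(a), drawn from a standard normal distribution, and pulling arm a returns a reward sampled from a unit-variance normal centered at q*(a).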

Let's visualize the possible rewards for each action first:

violin([randn(100) .+ x for x in env.truevalues], leg=false)

Here we first define a customized hook to record whether each action we take is the best one.

# A hook that records at every step whether the action just taken was the best one.
struct CollectBestActions <: AbstractHook
    isbest::Vector{Bool}
    CollectBestActions() = new(Vector{Bool}())
end

# Called at the PostActStage of every step; the environment reports via `env.isbest`
# whether the last action was optimal.
(h::CollectBestActions)(::PostActStage, agent, env, action_obs) = push!(h.isbest, env.isbest)

Now we create a testbed to explore the impact of different values of ϵ.

function bandit_testbed(;selector=EpsilonGreedySelector(0.1), truereward=0.0, init=0., opt=InvDecay(1.0))
    env = MultiArmBanditsEnv(truereward=truereward)
    agent = Agent(
        QBasedPolicy(
            TDLearner(
                approximator = TabularQApproximator(
                    n_state=length(observation_space(env)),
                    n_action=length(action_space(env)),
                    init=init  # initial action-value estimates Q₁(a)
                ),
                optimizer = opt  # InvDecay(1.0) ≈ sample-average updates; Descent(α) is a constant step size
            ),
            selector
        ),
        episode_RTSA_buffer()
    )
    best_action_stats, reward_stats = CollectBestActions(), RewardsPerEpisode()
    run(agent, env, StopAfterStep(1000); hook=ComposedHook(best_action_stats, reward_stats))
    # return the reward of every step and whether each action taken was the best one
    collect(Iterators.flatten(reward_stats.rewards)), best_action_stats.isbest
end
bandit_testbed (generic function with 1 method)
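
For reference, the machinery assembled above boils down to the simple bandit algorithm from the book: ϵ-greedy action selection plus incremental sample-average estimates (which is what InvDecay(1.0) is used for here). The following standalone sketch only illustrates that reading and is not code used by the package; the names simple_bandit and q_true are made up for this example.

# A minimal sketch (illustration only) of ϵ-greedy selection with
# incremental sample-average updates: Q(a) ← Q(a) + (R − Q(a)) / N(a).
function simple_bandit(q_true; ϵ=0.1, n_steps=1000)
    k = length(q_true)
    Q, N = zeros(k), zeros(Int, k)
    rewards = zeros(n_steps)
    for t in 1:n_steps
        a = rand() < ϵ ? rand(1:k) : argmax(Q)   # explore with probability ϵ
        r = randn() + q_true[a]                  # reward ~ N(q*(a), 1)
        N[a] += 1
        Q[a] += (r - Q[a]) / N[a]                # sample-average update
        rewards[t] = r
    end
    rewards
end
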
p = plot(layout=(2, 1))
for ϵ in [0.1, 0.01, 0.0]
    stats = [bandit_testbed(;selector=EpsilonGreedySelector(ϵ)) for _ in 1:2000]
    plot!(p, mean(x[1] for x in stats), subplot=1, legend=:bottomright, label="epsilon=$ϵ")
    plot!(p, mean(x[2] for x in stats), subplot=2, legend=:bottomright, label="epsilon=$ϵ")
end

p
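
The next experiment reproduces the optimistic-initial-values trick: starting every estimate at Q₁ = 5 while acting greedily makes all arms look better than they really are, so the agent is pushed to try each of them early on. A constant step size is used here instead of sample averages; assuming Descent(0.1) applies a constant step size α = 0.1, the update is Q(a) ← Q(a) + α (R − Q(a)).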
p = plot(legend=:bottomright)
plot!(p, mean(bandit_testbed(;selector=EpsilonGreedySelector(0.), init=5., opt=Descent(0.1))[2] for _ in 1:2000), label="Q_1=5, epsilon=0.")
plot!(p, mean(bandit_testbed(;selector=EpsilonGreedySelector(0.1), init=0., opt=Descent(0.1))[2] for _ in 1:2000), label="Q_1=0, epsilon=0.1")
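
The next comparison swaps in UCBSelector. As a reminder of the idea, upper-confidence-bound selection picks the action maximizing Q(a) + c·sqrt(ln t / N(a)), treating never-tried actions as maximally promising. The sketch below (the name ucb_select is made up here) only illustrates that rule; it is not the selector's actual implementation.

# Minimal sketch (illustration only) of UCB action selection:
# choose the action maximizing Q(a) + c * sqrt(log(t) / N(a)),
# where actions that have never been tried (N(a) == 0) are picked first.
function ucb_select(Q, N, t; c=2.0)
    any(iszero, N) && return findfirst(iszero, N)
    argmax(Q .+ c .* sqrt.(log(t) ./ N))
end
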
p = plot(legend=:bottomright)
plot!(p, mean(bandit_testbed(;selector=UCBSelector(10), opt=Descent(0.1))[1] for _ in 1:5000), label="UpperConfidenceBound, c=2")
plot!(p, mean(bandit_testbed(;selector=EpsilonGreedySelector(0.1), opt=Descent(0.1))[1] for _ in 1:5000), label="epsilon-greedy, epsilon=0.1")

p
function gb_bandit_testbed(;baseline=0., selector=WeightedSelector(true), truereward=0.0, init=0., opt=InvDecay(1.0))
    env = MultiArmBanditsEnv(truereward=truereward)
    agent = Agent(
        QBasedPolicy(
            GradientBanditLearner(
                approximator=TabularQApproximator(
                    n_state=length(observation_space(env)),
                    n_action=length(action_space(env)),
                    init=init
                ),
                optimizer=opt,
                baseline=baseline
            ),
            selector
        ),
        episode_RTSA_buffer()
    )
    best_action_stats, reward_stats = CollectBestActions(), RewardsPerEpisode()
    run(agent, env, StopAfterStep(1000); hook=ComposedHook(best_action_stats, reward_stats))
    collect(Iterators.flatten(reward_stats.rewards)), best_action_stats.isbest
end
gb_bandit_testbed (generic function with 1 method)

As you can see, the only difference between gb_bandit_testbed and bandit_testbed is that it uses a GradientBanditLearner instead of a TDLearner.
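
For intuition: a gradient bandit learner keeps a numerical preference H(a) for each action, turns the preferences into action probabilities with a softmax, π(a) = exp(H(a)) / Σ_b exp(H(b)), and after receiving reward R updates H(a) ← H(a) + α (R − baseline)(1{a = A} − π(a)), where the baseline is typically the running average reward (here SampleAvg() presumably plays that role). The standalone sketch below (the name gradient_bandit_step! is made up here) illustrates one such update; it is not the GradientBanditLearner source.

# Minimal sketch (illustration only) of one gradient-bandit update:
# softmax over preferences H, then H(a) ← H(a) + α (R − baseline) (1{a = A} − π(a)).
function gradient_bandit_step!(H, a, r, baseline; α=0.1)
    probs = exp.(H) ./ sum(exp.(H))              # softmax action probabilities
    onehot = zeros(length(H)); onehot[a] = 1.0   # indicator of the chosen action
    H .+= α .* (r - baseline) .* (onehot .- probs)
    H
end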

truereward = 4.0

p = plot(legend=:bottomright)

plot!(p, mean(gb_bandit_testbed(;opt=Descent(0.1), baseline=SampleAvg(), truereward=truereward)[2] for _ in 1:2000), label="alpha = 0.1, with baseline")
plot!(p, mean(gb_bandit_testbed(;opt=Descent(0.4), baseline=SampleAvg(), truereward=truereward)[2] for _ in 1:2000), label="alpha = 0.4, with baseline")
plot!(p, mean(gb_bandit_testbed(;opt=Descent(0.1), truereward=truereward)[2] for _ in 1:2000), label="alpha = 0.1, without baseline")
plot!(p, mean(gb_bandit_testbed(;opt=Descent(0.4), truereward=truereward)[2] for _ in 1:2000), label="alpha = 0.4, without baseline")

p
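
The final plot is the chapter's parameter study: for each method, the reward averaged over the first 1000 steps (and over 2000 independent runs) is plotted against that method's own parameter (ϵ, α, c, or the initial value Q₁) on a log₂ scale, so the four families can be compared at their respective best settings.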
p = plot(legend=:topleft)

plot!(p, -7:-2, [mean(mean(bandit_testbed(;selector=EpsilonGreedySelector(2.0^i))[1] for _ in 1:2000)) for i in -7:-2], label="epsilon greedy")
plot!(p, -5:1, [mean(mean(gb_bandit_testbed(;selector=WeightedSelector(true), opt=Descent(2.0^i))[1] for _ in 1:2000)) for i in -5:1], label="gradient")
plot!(p, -4:2, [mean(mean(bandit_testbed(;selector=UCBSelector(10; c=2.0^i))[1] for _ in 1:2000)) for i in -4:2], label="UCB")
plot!(p, -2:2, [mean(mean(bandit_testbed(;selector=EpsilonGreedySelector(0.), init=(2.0^i))[1] for _ in 1:2000)) for i in -2:2], label="greedy with initialization")

p