【深入浅出强化学习-编程实战】1 多臂赌博机

多臂赌博机：
假设玩家共有N次摇动摇臂的机会，每次怎么选择可以使得最后得到的金币最多？

$\epsilon$-greedy 策略
玻尔兹曼策略
UCB策略

# 多臂赌博机
import numpy as np
import matplotlib.pyplot as plt

class KB_Game:
    """Three-armed bandit environment bundled with a sample-average agent.

    The object plays both roles: ``step`` simulates the bandit (each arm pays
    a unit-variance Gaussian reward) and ``train`` runs an agent that picks
    arms with one of three exploration policies (``'e_greedy'``, ``'ucb'``,
    ``'boltzmann'``) while maintaining a running mean reward per arm.

    Attributes:
        q: Estimated mean reward of each arm (index ``action - 1``).
        action_counts: Number of times each arm has been pulled.
        current_cumulate_rewards: Sum of all rewards received so far.
        actions: The action space; arm identifiers are 1-based.
        counts: Total number of plays so far.
        counts_history: Play index recorded at every training step.
        cumulative_rewards_history: Cumulative reward after every step.
        a: Most recently chosen action.
        reward: Most recently received reward (``r`` is kept as an alias).
    """

    # Mean of the Gaussian reward paid by each arm, keyed by action id.
    _ARM_MEANS = {1: 1, 2: 2, 3: 1.5}

    def __init__(self, *args, **kwargs):
        """Create a bandit with three arms and zeroed statistics.

        Positional and keyword arguments are accepted and ignored.
        """
        self.actions = [1, 2, 3]
        # All learning statistics are (re)initialised in one place.
        self.reset()

    def step(self, a):
        """Simulate one pull of arm ``a`` and return the sampled reward.

        Args:
            a: Action id (1, 2 or 3).

        Returns:
            A draw from ``Normal(mean_of_arm, 1)``; ``0`` when ``a`` is not
            a known arm (no random number is consumed in that case).
        """
        mean = self._ARM_MEANS.get(a)
        if mean is None:
            return 0
        return np.random.normal(mean, 1)

    def choose_action(self, policy, **kwargs):
        """Pick an arm according to the requested exploration policy.

        Args:
            policy: ``'e_greedy'``, ``'ucb'`` or ``'boltzmann'``.
            **kwargs: Hyper-parameter of the policy: ``epsilon`` for
                e_greedy, ``c_ratio`` for ucb, ``temperature`` for boltzmann.

        Returns:
            The chosen 1-based action id.

        Raises:
            KeyError: The hyper-parameter required by ``policy`` is missing.
            ValueError: ``policy`` is not one of the supported names.
        """
        if policy == 'e_greedy':
            # Explore uniformly with probability epsilon, otherwise exploit.
            if np.random.random() < kwargs['epsilon']:
                return np.random.randint(1, len(self.actions) + 1)
            return np.argmax(self.q) + 1

        if policy == 'ucb':
            c_ratio = kwargs['c_ratio']
            # N(a) is a denominator in the UCB bonus, so every arm has to be
            # pulled once before the formula can be evaluated.
            untried = np.flatnonzero(self.action_counts == 0)
            if untried.size:
                return untried[0] + 1
            bonus = c_ratio * np.sqrt(np.log(self.counts) / self.action_counts)
            return np.argmax(self.q + bonus) + 1

        if policy == 'boltzmann':
            tau = kwargs['temperature']
            logits = self.q / tau
            # Shifting by the max leaves the softmax unchanged but keeps
            # exp() from overflowing when the temperature is small.
            weights = np.exp(logits - np.max(logits))
            p = weights / np.sum(weights)
            return np.random.choice(self.actions, p=p)

        raise ValueError(
            "unknown policy %r; expected 'e_greedy', 'ucb' or 'boltzmann'"
            % (policy,)
        )

    def train(self, play_total, policy, **kwargs):
        """Play ``play_total`` rounds with ``policy`` and learn arm values.

        Each round chooses an action, samples its reward via ``step``,
        updates the pull count and running mean of that arm only, and
        appends the cumulative reward and round index to the histories.

        Args:
            play_total: Number of rounds to play.
            policy: Policy name understood by ``choose_action``.
            **kwargs: Hyper-parameters forwarded to ``choose_action``.
        """
        for i in range(play_total):
            self.a = self.choose_action(policy, **kwargs)
            # Interact with the environment once.
            self.r = self.step(self.a)
            self.reward = self.r
            self.counts += 1

            # Only the arm that was actually pulled gets its count bumped;
            # the running mean is then updated incrementally.
            arm = self.a - 1
            self.action_counts[arm] += 1
            self.q[arm] += (self.r - self.q[arm]) / self.action_counts[arm]

            self.current_cumulate_rewards += self.r
            self.cumulative_rewards_history.append(self.current_cumulate_rewards)
            self.counts_history.append(i)

    def reset(self):
        """Clear all learned statistics so another policy can be trained."""
        n_arms = len(self.actions)
        self.q = np.zeros(n_arms)
        self.action_counts = np.zeros(n_arms, dtype=int)
        self.current_cumulate_rewards = 0.0
        self.counts = 0
        self.counts_history = []
        self.cumulative_rewards_history = []
        # Any valid action works as the initial "current" action.
        self.a = 1
        self.reward = 0

    def plot(self, colors, policy, style):
        """Draw cumulative reward versus play index on figure 1.

        Args:
            colors: Matplotlib colour / format string for the curve.
            policy: Label shown in the legend.
            style: Matplotlib line style (e.g. ``'--'``, ``'-.'``, ``'-'``).
        """
        plt.figure(1)
        plt.plot(
            self.counts_history,
            self.cumulative_rewards_history,
            colors,
            linestyle=style,
            label=policy,
        )
        plt.legend()
        plt.xlabel('n', fontsize=18)
        plt.ylabel('total rewards', fontsize=18)

# KB_Game类完成了
# 写主程序
if __name__ == '__main__':
    # Fix the random seed so that repeated runs reproduce the same curves.
    np.random.seed(0)

    # One game object is reused for all policies and reset between runs.
    k_gamble = KB_Game()

    # Number of plays granted to every policy.
    total = 200

    # (policy name, curve colour, line style, policy hyper-parameters)
    experiments = (
        ('e_greedy', 'b', '--', {'epsilon': 0.05}),
        ('ucb', 'r', '-.', {'c_ratio': 0.5}),
        ('boltzmann', 'g', '-', {'temperature': 1}),
    )

    for policy_name, color, line_style, hyper_params in experiments:
        # Learn with this policy, add its curve to the shared figure, then
        # clear the statistics so the next policy starts from scratch.
        k_gamble.train(play_total=total, policy=policy_name, **hyper_params)
        k_gamble.plot(colors=color, policy=policy_name, style=line_style)
        k_gamble.reset()

    # Show the learning curves of all three policies together.
    plt.show()