import numpy as np
import matplotlib.pyplot as plt
def Greedy_method(Q, epsilon, numA):
    """Epsilon-greedy action selection.

    Args:
        Q: (1, numA) array of current action-value estimates.
        epsilon: exploration probability in [0, 1].
        numA: number of arms.

    Returns:
        Index of the chosen arm.
    """
    qt = Q[0]
    # Explore with probability epsilon. Strict `<` (not `<=`) so that
    # epsilon == 0 is purely greedy: np.random.rand() can return exactly 0.0,
    # which under `<=` would occasionally explore even with epsilon = 0.
    if np.random.rand() < epsilon:
        arm = np.random.randint(0, numA)
    else:
        arm = np.argmax(qt)
    return arm
def Softmax_method(Q, epsilon):
    """Softmax (Boltzmann) action selection.

    Args:
        Q: (1, numA) array of current action-value estimates.
        epsilon: temperature; higher values make selection more uniform.
            NOTE(review): epsilon == 0 divides by zero — assumed nonzero
            by callers; confirm against the eps lists in n_armed_testbed.

    Returns:
        Index of the sampled arm.
    """
    temperature = epsilon
    qt = Q[0]
    softe = np.exp(qt / temperature)
    soft_max = softe / np.sum(softe)
    # Sample from the softmax distribution by inverse-CDF: walk the
    # cumulative probabilities until they exceed a uniform draw.
    z = np.random.rand()
    cum_prob = 0.0
    # `range` (not Python 2's `xrange`, which is a NameError on Python 3).
    for i in range(len(soft_max)):
        cum_prob += soft_max[i]
        if cum_prob > z:
            return i
    # Floating-point round-off can leave cum_prob fractionally below z;
    # fall back to the last arm.
    return len(soft_max) - 1
def initial_velection(optimis, numA, te):
    """Build the starting action-value estimates and play counters.

    Args:
        optimis: nonzero enables optimistic initialization.
        numA: number of arms.
        te: index of the current exploration setting; optimistic 5.0
            initial values are used only for the first setting (te == 0).

    Returns:
        (qT, qN): two (1, numA) arrays — value estimates and zeroed counts.
    """
    qN = np.zeros((1, numA))
    # Optimistic start (all estimates at 5.0) applies only when requested
    # and only for the first exploration setting; otherwise start at zero.
    if optimis != 0 and te == 0:
        qT = np.ones((1, numA)) * 5.0
    else:
        qT = np.zeros((1, numA))
    return qT, qN
def n_armed_testbed(nBandit=2000, nArm=10, nPlay=1000, sigma=1.0, func_selection=0, optimistic=0):
    """Run the n-armed bandit testbed and plot the averaged learning curves.

    Args:
        nBandit: number of independent bandit problems to average over.
        nArm: number of arms per bandit.
        nPlay: number of plays per bandit.
        sigma: std-dev of the Gaussian reward noise around each arm's true mean.
        func_selection: 1 -> epsilon-greedy selection, otherwise softmax.
        optimistic: nonzero -> optimistic initial values for the first setting.

    Side effects:
        Shows two matplotlib figures: average reward per play and the
        percentage of optimal-arm choices per play, one curve per setting.
    """
    # Exploration parameters, one curve each (epsilon for greedy,
    # temperature for softmax — both are fed through the same list).
    if optimistic == 0 and func_selection == 0:
        eps = [0.01, 0.1, 1]
    elif optimistic == 0 and func_selection == 1:
        eps = [0, 0.01, 0.1]
    else:
        eps = [0.0, 0.1]
    # True arm means: each bandit row is an N(0, I) draw over the arms.
    qTmean = np.random.multivariate_normal(np.zeros(nArm), np.eye(nArm), nBandit)
    average_reward = np.zeros((len(eps), nPlay))
    perOptAction = np.zeros((len(eps), nPlay))
    # `range` throughout (Python 2's `xrange` is a NameError on Python 3).
    for ei in range(len(eps)):
        teps = eps[ei]
        Rewards = np.zeros((nBandit, nPlay))
        optAction = np.zeros((nBandit, nPlay))
        for bi in range(nBandit):
            qT, qN = initial_velection(optimistic, nArm, ei)
            # Constant for the whole bandit: hoisted out of the play loop.
            best_arm = np.argmax(qTmean[bi, :])
            for p in range(nPlay):
                if func_selection == 1:
                    arm = Greedy_method(qT, teps, nArm)
                else:
                    arm = Softmax_method(qT, teps)
                if arm == best_arm:
                    optAction[bi, p] = 1.0
                reward = qTmean[bi, arm] + sigma * np.random.normal(0, 1)
                Rewards[bi, p] = reward
                # Constant step-size (alpha = 0.1) incremental update.
                qT[0, arm] = qT[0, arm] + 0.1 * (reward - qT[0, arm])
        average_reward[ei, :] = np.mean(Rewards, 0)
        perOptAction[ei, :] = np.mean(optAction, 0)
    list_label = ["epsilon=" + str(i) for i in eps]
    x = np.linspace(0, nPlay, nPlay)
    for i in range(len(eps)):
        plt.plot(x, average_reward[i])
    plt.xlabel('Plays')
    plt.ylabel('Average Rewards')
    plt.legend(list_label)
    plt.show()
    for i in range(len(eps)):
        plt.plot(x, perOptAction[i])
    plt.xlabel('Plays')
    plt.ylabel('Optimal Actions(%)')
    plt.legend(list_label)
    plt.show()
def main():
    """Script entry point: run the testbed with its default settings."""
    n_armed_testbed()


if __name__ == "__main__":
    main()