Move to Right [ns3-gym-multiagent, reinforcement learning (Q-Learning) example]

[Description]

   Based on ns3-gym-multiagent and my previous move-to-right example, I wrote this lab. An agent starts at position 0 (state=0) and tries to reach position 5 (state=5). At each step the agent can move left (action=0) or move right (action=1). When the agent reaches position 5 (state=5), it gets a reward of 1.0; otherwise it gets nothing (reward=0.0). I will show how to use Q-learning to make the agent move right as fast as possible. In this lab there are two agents, each trying to reach position 5, and the task is finished only when both agents have reached position 5.
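
The update applied in test.py below is the standard tabular Q-learning rule, Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)). The minimal sketch below isolates that single update step, using the same pandas Q-table layout as test.py; the helper name q_update and its default parameters are illustrative only and are not part of the lab files.

import numpy as np
import pandas as pd

ACTIONS = ['left', 'right']     # action names, used as Q-table columns

def build_q_table(n_states, actions):
    # all-zero Q table: one row per state, one column per action
    return pd.DataFrame(np.zeros((n_states, len(actions))), columns=actions)

def q_update(q_table, state, action, reward, next_state, alpha=0.1, gamma=0.9):
    # one tabular Q-learning step: Q(s,a) += alpha * (q_target - Q(s,a))
    q_predict = q_table.loc[state, action]
    if next_state != 5:                       # state 5 is the terminal (goal) state
        q_target = reward + gamma * q_table.iloc[next_state, :].max()
    else:
        q_target = reward                     # no future value at the goal
    q_table.loc[state, action] += alpha * (q_target - q_predict)
    return q_table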

 

Note. If you want to run this example, you need to install ns3-gym-multiagent first. Then create a directory named move2right-multiagent under scratch and put the following files in that directory, as laid out below.
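
With the files below, the layout under the ns-3 root should look like this (only the four file names matter; the exact ns-3 path depends on your installation):

scratch/
  move2right-multiagent/
    mygym.h
    mygym.cc
    sim.cc
    test.py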

 

[mygym.h]

/* -*-  Mode: C++; c-file-style: "gnu"; indent-tabs-mode:nil; -*- */

/*

 * Copyright (c) 2019

 *

 * This program is free software; you can redistribute it and/or modify

 * it under the terms of the GNU General Public License version 2 as

 * published by the Free Software Foundation;

 *

 * This program is distributed in the hope that it will be useful,

 * but WITHOUT ANY WARRANTY; without even the implied warranty of

 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

 * GNU General Public License for more details.

 *

 * You should have received a copy of the GNU General Public License

 * along with this program; if not, write to the Free Software

 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

 *

 * Author: ZhangminWang

 */

 

 

#ifndef MY_GYM_ENTITY_H

#define MY_GYM_ENTITY_H

 

#include "ns3/opengym-module.h"

#include "ns3/nstime.h"

 

namespace ns3 {

 

/**

 * \note This is only a test; no learning algorithm is used on the C++ side

 */

class MyGymEnv : public OpenGymMultiEnv

{

public:

  MyGymEnv ();

  MyGymEnv (Time stepTime);

  virtual ~MyGymEnv ();

  static TypeId GetTypeId (void);

  virtual void DoDispose ();

 

  Ptr<OpenGymSpace> GetActionSpace(uint32_t id);

  Ptr<OpenGymSpace> GetObservationSpace(uint32_t id);

  bool GetDone(uint32_t id);

  Ptr<OpenGymDataContainer> GetObservation(uint32_t id);

  float GetReward(uint32_t id);

  std::string GetInfo(uint32_t id);

  bool ExecuteActions(uint32_t id, Ptr<OpenGymDataContainer> action);

 

private:

  void ScheduleNextStateRead();

 

  Time m_interval;

};

 

}

 

#endif // MY_GYM_ENTITY_H

 

[mygym.cc]

/* -*-  Mode: C++; c-file-style: "gnu"; indent-tabs-mode:nil; -*- */

/*

 * Copyright (c) 2018

 *

 * This program is free software; you can redistribute it and/or modify

 * it under the terms of the GNU General Public License version 2 as

 * published by the Free Software Foundation;

 *

 * This program is distributed in the hope that it will be useful,

 * but WITHOUT ANY WARRANTY; without even the implied warranty of

 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

 * GNU General Public License for more details.

 *

 * You should have received a copy of the GNU General Public License

 * along with this program; if not, write to the Free Software

 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

 *

 * Author: ZhangminWang

 */

 

#include "mygym.h"

#include "ns3/object.h"

#include "ns3/core-module.h"

#include "ns3/wifi-module.h"

#include "ns3/node-list.h"

#include "ns3/log.h"

#include <sstream>

#include <iostream>

 

const int n = 2;     // number of agents

int mypos[n + 1];    // current position of each agent, indexed by agent id (1..n)

int myact[n + 1];    // last action received for each agent, indexed by agent id

 

namespace ns3 {

 

NS_LOG_COMPONENT_DEFINE ("MyGymEnv");

 

NS_OBJECT_ENSURE_REGISTERED (MyGymEnv);

 

MyGymEnv::MyGymEnv ()

{

  NS_LOG_FUNCTION (this);

  m_interval = Seconds (0.1);

 

  Simulator::Schedule (Seconds (0.0), &MyGymEnv::ScheduleNextStateRead, this);

}

 

MyGymEnv::MyGymEnv (Time stepTime)

{

  NS_LOG_FUNCTION (this);

  m_interval = stepTime;

 

  Simulator::Schedule (Seconds (0.0), &MyGymEnv::ScheduleNextStateRead, this);

}

 

void

MyGymEnv::ScheduleNextStateRead ()

{

  NS_LOG_FUNCTION (this);

  Simulator::Schedule (m_interval, &MyGymEnv::ScheduleNextStateRead, this);

  // Notify();

  Step ();

}

 

MyGymEnv::~MyGymEnv ()

{

  NS_LOG_FUNCTION (this);

}

 

TypeId

MyGymEnv::GetTypeId (void)

{

  static TypeId tid = TypeId ("MyGymEnv")

                          .SetParent<OpenGymMultiEnv> ()

                          .SetGroupName ("OpenGym")

                          .AddConstructor<MyGymEnv> ();

  return tid;

}

 

void

MyGymEnv::DoDispose ()

{

  NS_LOG_FUNCTION (this);

}

 

/*

Define observation space

*/

Ptr<OpenGymSpace>

MyGymEnv::GetObservationSpace (uint32_t id)

{

  // uint32_t nodeNum = 5;

  uint32_t low = 0;

  uint32_t high = 5;

  std::vector<uint32_t> shape = {1};

  std::string dtype = TypeNameGet<uint32_t> ();

 

  Ptr<OpenGymBoxSpace> box = CreateObject<OpenGymBoxSpace> (low, high, shape, dtype);

 

  NS_LOG_UNCOND ("ID " << id << " MyGetObservationSpace: " << box);

  return box;

}

 

/*

Define action space

*/

Ptr<OpenGymSpace>

MyGymEnv::GetActionSpace (uint32_t id)

{

  Ptr<OpenGymDiscreteSpace> discrete = CreateObject<OpenGymDiscreteSpace> (2);

 

  NS_LOG_UNCOND ("ID " << id << " MyGetActionSpace: " << discrete);

  return discrete;

}

 

/*

Define game over condition

*/

bool

MyGymEnv::GetDone (uint32_t id)

{

  bool isGameOver = false;

  static int stepCounter[n+1] = {0};

  stepCounter[id] += 1;

  if (mypos[id]==5) {

      isGameOver = true;

  }

  NS_LOG_UNCOND ("ID:"<< id << " MyGetGameOver: " << isGameOver << " stepCounter:"<< stepCounter[id]);

  return isGameOver;

}

 

/*

Collect observations

*/

Ptr<OpenGymDataContainer>

MyGymEnv::GetObservation (uint32_t id)

{

  std::vector<uint32_t> shape = {1,}; 

  Ptr<OpenGymBoxContainer<uint32_t>> box = CreateObject<OpenGymBoxContainer<uint32_t>> (shape);

 

  box->AddValue (mypos[id]);

 

  Ptr<OpenGymTupleContainer> data = CreateObject<OpenGymTupleContainer> ();

  data->Add (box);

 

  // Print data from tuple

  Ptr<OpenGymBoxContainer<uint32_t>> mbox =

      DynamicCast<OpenGymBoxContainer<uint32_t>> (data->Get (0));

  NS_LOG_UNCOND ("Now:" << Simulator::Now ().GetSeconds ());

  NS_LOG_UNCOND ("ID " << id << " MyGetObservation: " << data);

  //NS_LOG_UNCOND ("---" << mbox);

 

  return data;

}

 

/*

Define reward function

*/

float

MyGymEnv::GetReward (uint32_t id)

{

  float reward = 0.0;

  if(mypos[id]==5)

    reward=1.0;

  NS_LOG_UNCOND ("ID " << id << " MyGetReward: " << reward);

  return reward;

}

 

/*

Define extra info. Optional

*/

std::string

MyGymEnv::GetInfo (uint32_t id)

{

  std::string myInfo = "testInfo";

  NS_LOG_UNCOND ("ID " << id << " MyGetExtraInfo: " << myInfo);

  return myInfo;

}

 

/*

Execute received actions

*/

bool

MyGymEnv::ExecuteActions (uint32_t id, Ptr<OpenGymDataContainer> action)

{

  Ptr<OpenGymDiscreteContainer> discrete = DynamicCast<OpenGymDiscreteContainer>(action);

  myact[id]=discrete->GetValue();

  if(myact[id]==1){

    if(mypos[id]==5)

      mypos[id]=5;

    else

      mypos[id]+=1;

  } else {

    if (mypos[id]==0)

      mypos[id]=0;

    else

      mypos[id]-=1;

  }

  

  NS_LOG_UNCOND ("ID " << id << " MyExecuteActions: " << myact);

  return true;

}

 

} // namespace ns3

 

[sim.cc]

/* -*-  Mode: C++; c-file-style: "gnu"; indent-tabs-mode:nil; -*- */

/*

 * Copyright (c) 2018 Piotr Gawlowicz

 *

 * This program is free software; you can redistribute it and/or modify

 * it under the terms of the GNU General Public License version 2 as

 * published by the Free Software Foundation;

 *

 * This program is distributed in the hope that it will be useful,

 * but WITHOUT ANY WARRANTY; without even the implied warranty of

 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

 * GNU General Public License for more details.

 *

 * You should have received a copy of the GNU General Public License

 * along with this program; if not, write to the Free Software

 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

 *

 * Author: Piotr Gawlowicz <gawlowicz.p@gmail.com>

 *

 */

 

#include "ns3/core-module.h"

#include "ns3/opengym-module.h"

#include "mygym.h"

 

using namespace ns3;

 

NS_LOG_COMPONENT_DEFINE ("OpenGym");

 

int

main (int argc, char *argv[])

{

  // Parameters of the scenario

  uint32_t simSeed = 1;

  double simulationTime = 10; //seconds

  double envStepTime = 0.1; //seconds, ns3gym env step time interval

  uint32_t testArg = 0;

  uint32_t openGymPort = 5555;

 

  CommandLine cmd;

  // required parameters for OpenGym interface

  cmd.AddValue ("openGymPort", "Port number for OpenGym env. Default: 5555", openGymPort);

  cmd.AddValue ("simSeed", "Seed for random generator. Default: 1", simSeed);

  // optional parameters

  cmd.AddValue ("simTime", "Simulation time in seconds. Default: 1s", simulationTime);

  cmd.AddValue ("stepTime", "Gym Env step time in seconds. Default: 0.1s", envStepTime);

  cmd.AddValue ("testArg", "Extra simulation argument. Default: 0", testArg);

  cmd.Parse (argc, argv);

 

  NS_LOG_UNCOND ("Ns3Env parameters:");

  NS_LOG_UNCOND ("--simulationTime: " << simulationTime);

  NS_LOG_UNCOND ("--envStepTime: " << envStepTime);

  NS_LOG_UNCOND ("--seed: " << simSeed);

  NS_LOG_UNCOND ("--testArg: " << testArg);

 

  RngSeedManager::SetSeed (1);

  RngSeedManager::SetRun (simSeed);

 

  // OpenGym MultiEnv

  Ptr<MyGymEnv> myGymEnv = CreateObject<MyGymEnv> (Seconds (envStepTime));

  for (uint32_t id = 1; id <= 2; id++)

    {

      myGymEnv->AddAgentId (id);

    }

 

  NS_LOG_UNCOND ("Simulation start");

  Simulator::Stop (Seconds (simulationTime));

  Simulator::Run ();

  NS_LOG_UNCOND ("Simulation stop");

 

  myGymEnv->NotifySimulationEnd ();

  Simulator::Destroy ();

}

 

[test.py]

#!/usr/bin/env python3

# -*- coding: utf-8 -*-

 

import argparse

from ns3gym import ns3_multiagent_env as ns3env

 

import numpy as np

import pandas as pd

import time

 

N_STATES = 6                    # the length of the 1 dimensional world

ACTIONS = ['left', 'right']     # available actions

EPSILON = 0.9   # greedy policy: probability of choosing the best-known action

ALPHA = 0.1     # learning rate

GAMMA = 0.9     # discount factor

MAX_EPISODES = 10   # maximum episodes

FRESH_TIME = 0.1    # fresh time for one move

 

# NOTE: This is only a simple test using tabular Q-learning; no RL library is required

 

parser = argparse.ArgumentParser(description='Start simulation script on/off')

parser.add_argument('--start',

                    type=int,

                    default=1,

                    help='Start ns-3 simulation script 0/1, Default: 1')

parser.add_argument('--iterations',

                    type=int,

                    default=MAX_EPISODES,

                    help='Number of iterations')

 

args = parser.parse_args()

startSim = bool(args.start)

iterationNum = int(args.iterations)

 

port = 5555

simTime = 10 # seconds

stepTime = 0.1  # seconds

seed = 1

simArgs = {"--simTime": simTime,

           "--stepTime": stepTime,

           "--testArg": 123}

debug = False

 

env = ns3env.MultiEnv(port=port, stepTime=stepTime, startSim=startSim, simSeed=seed, simArgs=simArgs, debug=debug)

# simpler:

#env = ns3env.Ns3Env()

#env.reset()

ob_spaces = env.observation_space

ac_spaces = env.action_space

 

print("Observation space: ", ob_spaces, " size: ", len(ob_spaces))

print("Action space: ", ac_spaces, " size: ", len(ac_spaces))

 

stepIdx = 0

currIt = 0

agent_num = 2

 

def build_q_table(n_states, actions):

    table = pd.DataFrame(

        np.zeros((n_states, len(actions))),     # q_table initial values

        columns=actions,    # actions' names

    )

    # print(table)    # show table

    return table

 

def choose_action(state, q_table):

    # This is how to choose an action

    state_actions = q_table.iloc[state, :]

    #print("q_table=", q_table)

    #print("state_actions=", state_actions)

    if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()):  # act non-greedily, or this state's actions have no value yet

        action_name = np.random.choice(ACTIONS)

        #print("action_name1=", action_name)

    else:   # act greedy

        action_name = state_actions.idxmax()    # use idxmax instead of argmax, since argmax behaves differently in newer versions of pandas

        #print("action_name2=", action_name)

    return action_name

 

q_table=[]

for ag in range(agent_num):

  q_table.append(build_q_table(N_STATES, ACTIONS))

  print("q_table[", ag, "]=", q_table[ag])

 

try:

    while True:

      print("Start iteration: ", currIt)

     

      obs = env.reset()

      print("obs=", obs)

      #print(" type(obs[0][0][0])=", type(obs[0][0][0]), " obs[0][0][0]=", obs[0][0][0])

 

      stepIdx=0

      while True:

        stepIdx += 1

        print("Step: ", stepIdx)

        actions = []

        for ag in range(agent_num):

          A = choose_action(obs[ag][0][0], q_table[ag])

          if A == 'right':

            act=1

          else:

            act=0

          actions.append(act)       

        print("---actions: ", actions) 

 

        next_obs, reward, done, _ = env.step(actions)

        print("---next_obs, reward, done:", next_obs[-2:], reward[-2:], done[-2:])

       

 

        q_predict=[]

        for ag in range(agent_num):

          if(actions[ag]==0):

            q_predict.append(q_table[ag].loc[obs[ag][0][0], "left"])

          else:

            q_predict.append(q_table[ag].loc[obs[ag][0][0], "right"])

        

          #print("next_obs[-2:][ag][0][0]=", next_obs[-2:][ag][0][0])

          #print("reward[-2:][ag]=", reward[-2:][ag])

          #print("q_table[ag].iloc[next_obs[-2:][ag][0][0], :].max()=", q_table[ag].iloc[next_obs[-2:][ag][0][0], :].max())

 

          if next_obs[-2:][ag][0][0] != 5:

             q_target = reward[-2:][ag] + GAMMA * q_table[ag].iloc[next_obs[-2:][ag][0][0], :].max()

          else:

             q_target = reward[-2:][ag]     # next state is terminal

          #print("q_target=", q_target, "q_predict[ag]=", q_predict[ag])

 

          if(actions[ag]==0):

            q_table[ag].loc[obs[ag][0][0], "left"] += ALPHA * (q_target - q_predict[ag])

            #print("q_table[ag].loc[obs[ag][0][0], 'left']=", q_table[ag].loc[obs[ag][0][0], "left"])

          else:

            q_table[ag].loc[obs[ag][0][0], "right"] += ALPHA * (q_target - q_predict[ag])

            #print("q_table[ag].loc[obs[ag][0][0], 'right']=", q_table[ag].loc[obs[ag][0][0], "right"])

        obs = next_obs[-2:]  # move to next state 

        #print("111 obs=", obs, " obs[0][0][0]=", obs[0][0][0])

 

        ok=1

        for ag in range(agent_num):

          if(done[-2:][ag]==False):

            ok=0

            break

 

        if(ok==1):

          print("total steps:", stepIdx)

          for ag in range(agent_num):

           print("q_table[", ag, "]=", q_table[ag])

          break

     

      time.sleep(5)

      currIt += 1

      if currIt == iterationNum:

        break     

 

   

except KeyboardInterrupt:

    print("Ctrl-C -> Exit")

finally:

    env.close()

    print("Done")

 

[Execution]

Open one terminal.

 

Open another terminal.

…….

For the first run, the two agents need 12 steps to finish the job.

…………………

 

After a few runs, the two agents need only 5 steps to finish the job, which is the minimum possible: each agent must move right five times to get from position 0 to position 5.


Last Modified: 2022/11/15

 

[Author]

Dr. Chih-Heng Ke

Department of Computer Science and Information Engineering, National Quemoy University, Kinmen, Taiwan

Email: smallko@gmail.com