The SARSA Algorithm

  • model-free algorithm
  • similar to the Q-learning algorithm, but the next action is sampled from the current policy, and the target uses the Q-value of that policy-selected action in the new state instead of the maximum Q-value (see the update rule below)
  • SARSA is called on-policy because the experience used for learning is acquired by following the current policy
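
Written out, the update uses the tuple (s, a, r, s', a') that gives the algorithm its name:

Q(s, a) \leftarrow Q(s, a) + \alpha \left[ r + \gamma \, Q(s', a') - Q(s, a) \right]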

SARSA Example Implementation

Please see my Svelte TD Learning Repository for the complete code and the interactive Gridworld Examples for more information.

const SarsaQTableUpdate = (state, a, r, stateNext, aNext) => {
  let g;
  if (mazeComp.isTerminal(stateNext)) {
    // Terminal next state: the target is just the immediate reward.
    g = r;
  } else {
    // SARSA target: bootstrap with the Q-value of the action the
    // policy actually selected in the next state (not the maximum).
    g = r + gamma * mazeComp.getQValue(stateNext, aNext);
  }
  // Move the old estimate towards the target with learning rate alpha.
  let q = (1.0 - alpha) * mazeComp.getQValue(state, a) + alpha * g;
  mazeComp.setQValue(state, a, q);
};
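
Note that the convex-combination form (1 - alpha) * Q + alpha * g used above is algebraically identical to the incremental form Q + alpha * (g - Q) found in most textbooks; which one to use is purely a matter of style.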

const runSarsaEpisodeStep = (state, a) => {
  let stateNext;
  let aNext, r;

  if (mazeComp.isTerminal(state)) {
    // Episode finished: start the next one (runEpisode calls runSarsaEpisode).
    runEpisode();
  } else {
    // setTimeout with a 0 ms delay yields to the browser between steps,
    // so the UI can render the agent's progress.
    stepTimer = setTimeout(() => {
      // Take the action, observe the reward and next state.
      [stateNext, r] = mazeComp.step(state, a);
      // Sample the next action from the current (epsilon-greedy) policy.
      aNext = mazeComp.getEpsilonGreedyAction(stateNext, epsilon);
      // Update Q(state, a) with the full (s, a, r, s', a') tuple.
      SarsaQTableUpdate(state, a, r, stateNext, aNext);
      state = [...stateNext];
      a = Number(aNext);
      runSarsaEpisodeStep(state, a);
    }, 0);
  }
};
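
The epsilon-greedy action selection itself is encapsulated in mazeComp. As a rough sketch of what such a helper might look like (the function below is hypothetical, assumes a discrete action set of size numActions, and reuses mazeComp.getQValue; it is not the actual repository code):

const epsilonGreedyAction = (state, eps, numActions) => {
  // Explore: with probability eps, pick a uniformly random action.
  if (Math.random() < eps) {
    return Math.floor(Math.random() * numActions);
  }
  // Exploit: otherwise pick the action with the highest Q-value.
  let best = 0;
  for (let i = 1; i < numActions; i++) {
    if (mazeComp.getQValue(state, i) > mazeComp.getQValue(state, best)) {
      best = i;
    }
  }
  return best;
};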

const runSarsaEpisode = () => {
  // Start a fresh episode: random start state, first action from the policy.
  let state = mazeComp.getRandomStartState();
  let a = mazeComp.getEpsilonGreedyAction(state, epsilon);
  runSarsaEpisodeStep(state, a);
};
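
The runEpisode driver referenced in runSarsaEpisodeStep is not shown here; a minimal sketch, assuming a fixed episode budget (episodeCount and numEpisodes are hypothetical names, not from the repository), could look like this:

let episodeCount = 0;    // hypothetical episode counter
const numEpisodes = 500; // hypothetical episode budget

const runEpisode = () => {
  // Start the next episode until the budget is exhausted.
  if (episodeCount < numEpisodes) {
    episodeCount += 1;
    runSarsaEpisode();
  }
};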