import * as React from 'react'
  /* @jsx mdx */
import { mdx } from '@mdx-js/react';
/* @jsx mdx */

import Sketch from '../../components/sketch';
import sketchSarsa from '../../scripts/sSarsa/sSarsa';
import sketchQ from '../../scripts/sQlearningSarsa/sQlearningSarsa';
import { Link } from 'gatsby';
import Layout from '../../components/blogLayout';
import DateConverter from '../../components/dateConverter';
import { SideNote, MarginNote } from '../../components/sideNote';
import { KatexBlock, KatexInline } from "../../components/Katex";
import Commento from "../../components/commento";
import NewThought from '../../components/newThought';
import sketchPolicy from '../../scripts/sPolicy/sPolicy';
import sketchValue from '../../scripts/sValue/sValue';
// Front matter metadata for this post (title, subtitle, date, slug, author).
// NOTE(review): the subtitle value looks like placeholder text — confirm before publishing.
export const _frontmatter = {
  title: 'SARSA (vs. Q-learning)',
  subtitle: 'blabla',
  date: '2022-01-07',
  slug: 'sarsa',
  author: 'JAN MALTE LICHTENBERG',
};

const makeShortcode = name => function MDXDefaultShortcode(props) {
  console.warn("Component " + name + " was not imported, exported, or provided by MDXProvider as global scope");
  return <div {...props} />;
};

// Element type of the outermost JSX element returned by MDXContent.
// NOTE(review): presumably the "wrapper" string is resolved to an actual
// component by the `mdx` JSX pragma — confirm against @mdx-js/react.
const MDXLayout = "wrapper";
export default function MDXContent({
  components,
  ...props
}) {
  return <MDXLayout {...props} components={components} mdxType="MDXLayout">
    <Layout slug={props.pageContext.frontmatter.slug} mdxType="Layout">
      <div>
        <DateConverter frontmatter={props.pageContext.frontmatter} mdxType="DateConverter" />
        <p className="blogtitle">
          {props.pageContext.frontmatter.title}
        </p>
      </div>
      <p className="intro">
        <p>{`This is the fourth post of the blog series `}
          <Link to="/posts/rllbl" mdxType="Link">{`Reinforcement learning: line by line`}</Link>
          {`. Here we take a
look at the tabular Sarsa algorithm and compare it to the Q-learning algorithm
(discussed in the `}
          <Link to="/posts/qlearning" mdxType="Link">{`previous post`}</Link>
          {`). In case you
are completely new to reinforcement learning (RL), see `}
          <Link to="/posts/rllbl#rl" mdxType="Link">{`here`}</Link>
          {` for an informal introduction.`}</p>
      </p>
      <p>{`The interactive sketch below shows an implementation of the tabular Sarsa
algorithm applied to a version of a simple game, called the `}
        <em {...{
          "parentName": "p"
        }}>{`Pancakes
Gridworld`}</em>
        {`.`}
        <SideNote mdxType="SideNote">{`See `}
          <Link to="/posts/mdp" mdxType="Link">{`here`}</Link>
          {` for more information
about the Pancakes Gridworld.`}</SideNote></p>
      <Sketch sketch={sketchSarsa} mdxType="Sketch" />
      <h4>{`SARSA: State, Action, Reward, State, Action`}</h4>
      <p>{`The tabular Sarsa algorithm is conceptually very similar to the Q-learning
algorithm in that, in every time step, the agent uses only information from the
current transition to improve its action-value estimates. The main conceptual difference
between the two algorithms is the update rule (line 6 of the pseudo code), as
discussed `}
        <Link to="#sarsaupdate" mdxType="Link">{`further below`}</Link>
        {` in more detail.`}</p>
      <p>{`One technical consequence of using this other update rule is that the Sarsa algorithm has
a slightly different algorithmic structure. Specifically, the Sarsa algorithm
requires  in each iteration the current state `}
        <span {...{
          "className": "math math-inline",
          "parentName": "p"
        }}><span {...{
            "className": "katex",
            "parentName": "span"
          }}><span {...{
              "className": "katex-mathml",
              "parentName": "span"
            }}><math {...{
                "xmlns": "http://www.w3.org/1998/Math/MathML",
                "parentName": "span"
              }}><semantics {...{
                  "parentName": "math"
                }}><mrow {...{
                    "parentName": "semantics"
                  }}><mi {...{
                      "parentName": "mrow"
                    }}>{`S`}</mi></mrow>
                  <annotation {...{
                    "encoding": "application/x-tex",
                    "parentName": "semantics"
                  }}>{`S`}</annotation></semantics></math></span>
            <span {...{
              "className": "katex-html",
              "aria-hidden": "true",
              "parentName": "span"
            }}><span {...{
                "className": "base",
                "parentName": "span"
              }}><span {...{
                  "className": "strut",
                  "style": {
                    "height": "0.68333em",
                    "verticalAlign": "0em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord mathnormal",
                  "style": {
                    "marginRight": "0.05764em"
                  },
                  "parentName": "span"
                }}>{`S`}</span></span></span></span></span>
        {`, the current action `}
        <span {...{
          "className": "math math-inline",
          "parentName": "p"
        }}><span {...{
            "className": "katex",
            "parentName": "span"
          }}><span {...{
              "className": "katex-mathml",
              "parentName": "span"
            }}><math {...{
                "xmlns": "http://www.w3.org/1998/Math/MathML",
                "parentName": "span"
              }}><semantics {...{
                  "parentName": "math"
                }}><mrow {...{
                    "parentName": "semantics"
                  }}><mi {...{
                      "parentName": "mrow"
                    }}>{`A`}</mi></mrow>
                  <annotation {...{
                    "encoding": "application/x-tex",
                    "parentName": "semantics"
                  }}>{`A`}</annotation></semantics></math></span>
            <span {...{
              "className": "katex-html",
              "aria-hidden": "true",
              "parentName": "span"
            }}><span {...{
                "className": "base",
                "parentName": "span"
              }}><span {...{
                  "className": "strut",
                  "style": {
                    "height": "0.68333em",
                    "verticalAlign": "0em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`A`}</span></span></span></span></span>
        {`, the reward `}
        <span {...{
          "className": "math math-inline",
          "parentName": "p"
        }}><span {...{
            "className": "katex",
            "parentName": "span"
          }}><span {...{
              "className": "katex-mathml",
              "parentName": "span"
            }}><math {...{
                "xmlns": "http://www.w3.org/1998/Math/MathML",
                "parentName": "span"
              }}><semantics {...{
                  "parentName": "math"
                }}><mrow {...{
                    "parentName": "semantics"
                  }}><mi {...{
                      "parentName": "mrow"
                    }}>{`R`}</mi></mrow>
                  <annotation {...{
                    "encoding": "application/x-tex",
                    "parentName": "semantics"
                  }}>{`R`}</annotation></semantics></math></span>
            <span {...{
              "className": "katex-html",
              "aria-hidden": "true",
              "parentName": "span"
            }}><span {...{
                "className": "base",
                "parentName": "span"
              }}><span {...{
                  "className": "strut",
                  "style": {
                    "height": "0.68333em",
                    "verticalAlign": "0em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord mathnormal",
                  "style": {
                    "marginRight": "0.00773em"
                  },
                  "parentName": "span"
                }}>{`R`}</span></span></span></span></span>
        {`, the next
state `}
        <span {...{
          "className": "math math-inline",
          "parentName": "p"
        }}><span {...{
            "className": "katex",
            "parentName": "span"
          }}><span {...{
              "className": "katex-mathml",
              "parentName": "span"
            }}><math {...{
                "xmlns": "http://www.w3.org/1998/Math/MathML",
                "parentName": "span"
              }}><semantics {...{
                  "parentName": "math"
                }}><mrow {...{
                    "parentName": "semantics"
                  }}><msup {...{
                      "parentName": "mrow"
                    }}><mi {...{
                        "parentName": "msup"
                      }}>{`S`}</mi>
                      <mo {...{
                        "mathvariant": "normal",
                        "lspace": "0em",
                        "rspace": "0em",
                        "parentName": "msup"
                      }}>{`′`}</mo></msup></mrow>
                  <annotation {...{
                    "encoding": "application/x-tex",
                    "parentName": "semantics"
                  }}>{`S'`}</annotation></semantics></math></span>
            <span {...{
              "className": "katex-html",
              "aria-hidden": "true",
              "parentName": "span"
            }}><span {...{
                "className": "base",
                "parentName": "span"
              }}><span {...{
                  "className": "strut",
                  "style": {
                    "height": "0.751892em",
                    "verticalAlign": "0em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord",
                  "parentName": "span"
                }}><span {...{
                    "className": "mord mathnormal",
                    "style": {
                      "marginRight": "0.05764em"
                    },
                    "parentName": "span"
                  }}>{`S`}</span>
                  <span {...{
                    "className": "msupsub",
                    "parentName": "span"
                  }}><span {...{
                      "className": "vlist-t",
                      "parentName": "span"
                    }}><span {...{
                        "className": "vlist-r",
                        "parentName": "span"
                      }}><span {...{
                          "className": "vlist",
                          "style": {
                            "height": "0.751892em"
                          },
                          "parentName": "span"
                        }}><span {...{
                            "style": {
                              "top": "-3.063em",
                              "marginRight": "0.05em"
                            },
                            "parentName": "span"
                          }}><span {...{
                              "className": "pstrut",
                              "style": {
                                "height": "2.7em"
                              },
                              "parentName": "span"
                            }} />
                            <span {...{
                              "className": "sizing reset-size6 size3 mtight",
                              "parentName": "span"
                            }}><span {...{
                                "className": "mord mtight",
                                "parentName": "span"
                              }}><span {...{
                                  "className": "mord mtight",
                                  "parentName": "span"
                                }}>{`′`}</span></span></span></span></span></span></span></span></span></span></span></span></span>
        {`, `}
        <em {...{
          "parentName": "p"
        }}>{`and the next action`}</em>
        {` `}
        <span {...{
          "className": "math math-inline",
          "parentName": "p"
        }}><span {...{
            "className": "katex",
            "parentName": "span"
          }}><span {...{
              "className": "katex-mathml",
              "parentName": "span"
            }}><math {...{
                "xmlns": "http://www.w3.org/1998/Math/MathML",
                "parentName": "span"
              }}><semantics {...{
                  "parentName": "math"
                }}><mrow {...{
                    "parentName": "semantics"
                  }}><msup {...{
                      "parentName": "mrow"
                    }}><mi {...{
                        "parentName": "msup"
                      }}>{`A`}</mi>
                      <mo {...{
                        "mathvariant": "normal",
                        "lspace": "0em",
                        "rspace": "0em",
                        "parentName": "msup"
                      }}>{`′`}</mo></msup></mrow>
                  <annotation {...{
                    "encoding": "application/x-tex",
                    "parentName": "semantics"
                  }}>{`A'`}</annotation></semantics></math></span>
            <span {...{
              "className": "katex-html",
              "aria-hidden": "true",
              "parentName": "span"
            }}><span {...{
                "className": "base",
                "parentName": "span"
              }}><span {...{
                  "className": "strut",
                  "style": {
                    "height": "0.751892em",
                    "verticalAlign": "0em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord",
                  "parentName": "span"
                }}><span {...{
                    "className": "mord mathnormal",
                    "parentName": "span"
                  }}>{`A`}</span>
                  <span {...{
                    "className": "msupsub",
                    "parentName": "span"
                  }}><span {...{
                      "className": "vlist-t",
                      "parentName": "span"
                    }}><span {...{
                        "className": "vlist-r",
                        "parentName": "span"
                      }}><span {...{
                          "className": "vlist",
                          "style": {
                            "height": "0.751892em"
                          },
                          "parentName": "span"
                        }}><span {...{
                            "style": {
                              "top": "-3.063em",
                              "marginRight": "0.05em"
                            },
                            "parentName": "span"
                          }}><span {...{
                              "className": "pstrut",
                              "style": {
                                "height": "2.7em"
                              },
                              "parentName": "span"
                            }} />
                            <span {...{
                              "className": "sizing reset-size6 size3 mtight",
                              "parentName": "span"
                            }}><span {...{
                                "className": "mord mtight",
                                "parentName": "span"
                              }}><span {...{
                                  "className": "mord mtight",
                                  "parentName": "span"
                                }}>{`′`}</span></span></span></span></span></span></span></span></span></span></span></span></span>
        {`. The Q-learning update,
on the other hand, only requires the variables `}
        <span {...{
          "className": "math math-inline",
          "parentName": "p"
        }}><span {...{
            "className": "katex",
            "parentName": "span"
          }}><span {...{
              "className": "katex-mathml",
              "parentName": "span"
            }}><math {...{
                "xmlns": "http://www.w3.org/1998/Math/MathML",
                "parentName": "span"
              }}><semantics {...{
                  "parentName": "math"
                }}><mrow {...{
                    "parentName": "semantics"
                  }}><mi {...{
                      "parentName": "mrow"
                    }}>{`S`}</mi>
                    <mo {...{
                      "separator": "true",
                      "parentName": "mrow"
                    }}>{`,`}</mo>
                    <mi {...{
                      "parentName": "mrow"
                    }}>{`A`}</mi>
                    <mo {...{
                      "separator": "true",
                      "parentName": "mrow"
                    }}>{`,`}</mo>
                    <mi {...{
                      "parentName": "mrow"
                    }}>{`R`}</mi>
                    <mo {...{
                      "separator": "true",
                      "parentName": "mrow"
                    }}>{`,`}</mo>
                    <msup {...{
                      "parentName": "mrow"
                    }}><mi {...{
                        "parentName": "msup"
                      }}>{`S`}</mi>
                      <mo {...{
                        "mathvariant": "normal",
                        "lspace": "0em",
                        "rspace": "0em",
                        "parentName": "msup"
                      }}>{`′`}</mo></msup></mrow>
                  <annotation {...{
                    "encoding": "application/x-tex",
                    "parentName": "semantics"
                  }}>{`S, A, R, S'`}</annotation></semantics></math></span>
            <span {...{
              "className": "katex-html",
              "aria-hidden": "true",
              "parentName": "span"
            }}><span {...{
                "className": "base",
                "parentName": "span"
              }}><span {...{
                  "className": "strut",
                  "style": {
                    "height": "0.946332em",
                    "verticalAlign": "-0.19444em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord mathnormal",
                  "style": {
                    "marginRight": "0.05764em"
                  },
                  "parentName": "span"
                }}>{`S`}</span>
                <span {...{
                  "className": "mpunct",
                  "parentName": "span"
                }}>{`,`}</span>
                <span {...{
                  "className": "mspace",
                  "style": {
                    "marginRight": "0.16666666666666666em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`A`}</span>
                <span {...{
                  "className": "mpunct",
                  "parentName": "span"
                }}>{`,`}</span>
                <span {...{
                  "className": "mspace",
                  "style": {
                    "marginRight": "0.16666666666666666em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord mathnormal",
                  "style": {
                    "marginRight": "0.00773em"
                  },
                  "parentName": "span"
                }}>{`R`}</span>
                <span {...{
                  "className": "mpunct",
                  "parentName": "span"
                }}>{`,`}</span>
                <span {...{
                  "className": "mspace",
                  "style": {
                    "marginRight": "0.16666666666666666em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord",
                  "parentName": "span"
                }}><span {...{
                    "className": "mord mathnormal",
                    "style": {
                      "marginRight": "0.05764em"
                    },
                    "parentName": "span"
                  }}>{`S`}</span>
                  <span {...{
                    "className": "msupsub",
                    "parentName": "span"
                  }}><span {...{
                      "className": "vlist-t",
                      "parentName": "span"
                    }}><span {...{
                        "className": "vlist-r",
                        "parentName": "span"
                      }}><span {...{
                          "className": "vlist",
                          "style": {
                            "height": "0.751892em"
                          },
                          "parentName": "span"
                        }}><span {...{
                            "style": {
                              "top": "-3.063em",
                              "marginRight": "0.05em"
                            },
                            "parentName": "span"
                          }}><span {...{
                              "className": "pstrut",
                              "style": {
                                "height": "2.7em"
                              },
                              "parentName": "span"
                            }} />
                            <span {...{
                              "className": "sizing reset-size6 size3 mtight",
                              "parentName": "span"
                            }}><span {...{
                                "className": "mord mtight",
                                "parentName": "span"
                              }}><span {...{
                                  "className": "mord mtight",
                                  "parentName": "span"
                                }}>{`′`}</span></span></span></span></span></span></span></span></span></span></span></span></span>
        {`. This leads to
a different structure of learning updates: `}
        <KatexBlock formula={` 
  \\text{Q-learning: } \\ \\rlap{$\\overbrace{\\phantom{S_0, A_0, R_1, S_1}}^{1^\\text{st}
  \\text{update}}$} S_0, A_0, R_1, 
  \\rlap{$\\underbrace{\\phantom{S_1, A_1, R_2, S_2}}_{2^\\text{nd}
  \\text{update}}$} S_1, A_1, R_2, 
  \\rlap{$\\overbrace{\\phantom{S_2, A_2, R_3, S_3}}^{3^\\text{rd}
  \\text{update}}$} S_2, A_2, R_3, 
  \\rlap{$\\underbrace{\\phantom{S_3, A_3, R_4, S_4}}_{4^\\text{th}
  \\text{update}}$} S_3, A_3, R_4, S_4, A_4, \\dots
  `} mdxType="KatexBlock" />
        {`
`}
        <KatexBlock formula={` 
  \\text{Sarsa: } \\ \\rlap{$\\overbrace{\\phantom{S_0, A_0, R_1, S_1, A_1}}^{1^\\text{st}
  \\text{update}}$} S_0, A_0, R_1, 
  \\rlap{$\\underbrace{\\phantom{S_1, A_1, R_2, S_2, A_2}}_{2^\\text{nd}
  \\text{update}}$} S_1, A_1, R_2, 
  \\rlap{$\\overbrace{\\phantom{S_2, A_2, R_3, S_3, A_3}}^{3^\\text{rd}
  \\text{update}}$} S_2, A_2, R_3, 
  \\rlap{$\\underbrace{\\phantom{S_3, A_3, R_4, S_4, A_4}}_{4^\\text{th}
  \\text{update}}$} S_3, A_3, R_4, S_4, A_4, \\dots
  `} mdxType="KatexBlock" /></p>
      <p>{`For the Q-learning algorithm, only the state is shared between two successive updates.
For the Sarsa algorithm, both state and action are shared between successive
updates. This difference is the reason for the different `}
        {`“`}
        {`rhythm`}
        {`”`}
        {` in
the algorithm structure, as further explained in the algorithm description. In
particular, for the Sarsa algorithm we always have to carry around two
actions (current action `}
        <span {...{
          "className": "math math-inline",
          "parentName": "p"
        }}><span {...{
            "className": "katex",
            "parentName": "span"
          }}><span {...{
              "className": "katex-mathml",
              "parentName": "span"
            }}><math {...{
                "xmlns": "http://www.w3.org/1998/Math/MathML",
                "parentName": "span"
              }}><semantics {...{
                  "parentName": "math"
                }}><mrow {...{
                    "parentName": "semantics"
                  }}><mi {...{
                      "parentName": "mrow"
                    }}>{`A`}</mi></mrow>
                  <annotation {...{
                    "encoding": "application/x-tex",
                    "parentName": "semantics"
                  }}>{`A`}</annotation></semantics></math></span>
            <span {...{
              "className": "katex-html",
              "aria-hidden": "true",
              "parentName": "span"
            }}><span {...{
                "className": "base",
                "parentName": "span"
              }}><span {...{
                  "className": "strut",
                  "style": {
                    "height": "0.68333em",
                    "verticalAlign": "0em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`A`}</span></span></span></span></span>
        {` and next action `}
        <span {...{
          "className": "math math-inline",
          "parentName": "p"
        }}><span {...{
            "className": "katex",
            "parentName": "span"
          }}><span {...{
              "className": "katex-mathml",
              "parentName": "span"
            }}><math {...{
                "xmlns": "http://www.w3.org/1998/Math/MathML",
                "parentName": "span"
              }}><semantics {...{
                  "parentName": "math"
                }}><mrow {...{
                    "parentName": "semantics"
                  }}><msup {...{
                      "parentName": "mrow"
                    }}><mi {...{
                        "parentName": "msup"
                      }}>{`A`}</mi>
                      <mo {...{
                        "mathvariant": "normal",
                        "lspace": "0em",
                        "rspace": "0em",
                        "parentName": "msup"
                      }}>{`′`}</mo></msup></mrow>
                  <annotation {...{
                    "encoding": "application/x-tex",
                    "parentName": "semantics"
                  }}>{`A'`}</annotation></semantics></math></span>
            <span {...{
              "className": "katex-html",
              "aria-hidden": "true",
              "parentName": "span"
            }}><span {...{
                "className": "base",
                "parentName": "span"
              }}><span {...{
                  "className": "strut",
                  "style": {
                    "height": "0.751892em",
                    "verticalAlign": "0em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord",
                  "parentName": "span"
                }}><span {...{
                    "className": "mord mathnormal",
                    "parentName": "span"
                  }}>{`A`}</span>
                  <span {...{
                    "className": "msupsub",
                    "parentName": "span"
                  }}><span {...{
                      "className": "vlist-t",
                      "parentName": "span"
                    }}><span {...{
                        "className": "vlist-r",
                        "parentName": "span"
                      }}><span {...{
                          "className": "vlist",
                          "style": {
                            "height": "0.751892em"
                          },
                          "parentName": "span"
                        }}><span {...{
                            "style": {
                              "top": "-3.063em",
                              "marginRight": "0.05em"
                            },
                            "parentName": "span"
                          }}><span {...{
                              "className": "pstrut",
                              "style": {
                                "height": "2.7em"
                              },
                              "parentName": "span"
                            }} />
                            <span {...{
                              "className": "sizing reset-size6 size3 mtight",
                              "parentName": "span"
                            }}><span {...{
                                "className": "mord mtight",
                                "parentName": "span"
                              }}><span {...{
                                  "className": "mord mtight",
                                  "parentName": "span"
                                }}>{`′`}</span></span></span></span></span></span></span></span></span></span></span></span></span>
        {`), whereas the Q-learning
algorithm requires only one action variable (`}
        <span {...{
          "className": "math math-inline",
          "parentName": "p"
        }}><span {...{
            "className": "katex",
            "parentName": "span"
          }}><span {...{
              "className": "katex-mathml",
              "parentName": "span"
            }}><math {...{
                "xmlns": "http://www.w3.org/1998/Math/MathML",
                "parentName": "span"
              }}><semantics {...{
                  "parentName": "math"
                }}><mrow {...{
                    "parentName": "semantics"
                  }}><mi {...{
                      "parentName": "mrow"
                    }}>{`A`}</mi></mrow>
                  <annotation {...{
                    "encoding": "application/x-tex",
                    "parentName": "semantics"
                  }}>{`A`}</annotation></semantics></math></span>
            <span {...{
              "className": "katex-html",
              "aria-hidden": "true",
              "parentName": "span"
            }}><span {...{
                "className": "base",
                "parentName": "span"
              }}><span {...{
                  "className": "strut",
                  "style": {
                    "height": "0.68333em",
                    "verticalAlign": "0em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`A`}</span></span></span></span></span>
        {`).`}</p>
      <p>{`The remainder of this section goes through the Sarsa pseudo code, line by line.
It focuses on the differences to the Q-learning pseudo code, which was explained `}
        <Link to="/posts/qlearning/#qpc" mdxType="Link">{`here`}</Link>
        {` (pseudo code lines that
are identical in both algorithms are grayed out).`}</p>
      <p className="muted">
        <p><span className="algoLineMainText"><KatexInline formula={`0: \\text{Loop for
  each episode:}`} mdxType="KatexInline" />
            {` `}</span>
          {` `}
          {` `}
          {` (`}
          {`“`}
          {`outer loop`}
          {`”`}
          {`, same as in Q-learning)`}</p>
      </p>
      <p className="muted">
        <p><span className="algoLineMainText"><KatexInline formula={`1: S \\leftarrow
  s_0`} mdxType="KatexInline" />
            {` `}</span>
          {` `}
          {` `}
          {` (`}
          {`“`}
          {`set agent to starting state`}
          {`”`}
          {`, same as in Q-learning)`}</p>
      </p>
      <span className="algoLineMainText">
        <KatexInline formula={`2: \\text{Select
  action } A \\text{ from state } S \\text{ using } \\epsilon\\text{-greedy
  policy}`} mdxType="KatexInline" />
      </span>
      <ul>

        <li {...{
          "parentName": "ul"
        }}>{`This is similar to Q-learning's code line 3, yet here we already select an action
before entering the inner loop (Sarsa's code line 3). This is the different
`}
          {`“`}
          {`rhythm`}
          {`”`}
          {` I talked about earlier: the Sarsa algorithm requires the
current action and the next action for its update.`}</li>

      </ul>
      <p className="muted">
        <p><span className="algoLineMainText"><KatexInline formula={`3: \\text{Loop until
  a terminal state is reached:}`} mdxType="KatexInline" />
            {` `}</span>
          {` `}
          {` `}
          {` (`}
          {`“`}
          {`inner loop`}
          {`”`}
          {`, same as in
Q-learning)`}</p>
      </p>
      <p className="muted">
        <p><span className="algoLineMainText"><KatexInline formula={`4: \\text{Take
  action } A, \\text{ observe reward } R \\text{ and new state } S'`} mdxType="KatexInline" />
            {` `}</span>
          {` (same as in Q-learning)`}</p>
      </p>
      <span className="algoLineMainText">
        <KatexInline formula={`5: \\text{Select
  action } A' \\text{ from state } S' \\text{ using } \\epsilon\\text{-greedy
  policy}`} mdxType="KatexInline" />
      </span>
      <ul>

        <li {...{
          "parentName": "ul"
        }}>{`Here the `}
          <em {...{
            "parentName": "li"
          }}>{`next action`}</em>
          {` `}
          <span {...{
            "className": "math math-inline",
            "parentName": "li"
          }}><span {...{
              "className": "katex",
              "parentName": "span"
            }}><span {...{
                "className": "katex-mathml",
                "parentName": "span"
              }}><math {...{
                  "xmlns": "http://www.w3.org/1998/Math/MathML",
                  "parentName": "span"
                }}><semantics {...{
                    "parentName": "math"
                  }}><mrow {...{
                      "parentName": "semantics"
                    }}><msup {...{
                        "parentName": "mrow"
                      }}><mi {...{
                          "parentName": "msup"
                        }}>{`A`}</mi>
                        <mo {...{
                          "mathvariant": "normal",
                          "lspace": "0em",
                          "rspace": "0em",
                          "parentName": "msup"
                        }}>{`′`}</mo></msup></mrow>
                    <annotation {...{
                      "encoding": "application/x-tex",
                      "parentName": "semantics"
                    }}>{`A'`}</annotation></semantics></math></span>
              <span {...{
                "className": "katex-html",
                "aria-hidden": "true",
                "parentName": "span"
              }}><span {...{
                  "className": "base",
                  "parentName": "span"
                }}><span {...{
                    "className": "strut",
                    "style": {
                      "height": "0.751892em",
                      "verticalAlign": "0em"
                    },
                    "parentName": "span"
                  }} />
                  <span {...{
                    "className": "mord",
                    "parentName": "span"
                  }}><span {...{
                      "className": "mord mathnormal",
                      "parentName": "span"
                    }}>{`A`}</span>
                    <span {...{
                      "className": "msupsub",
                      "parentName": "span"
                    }}><span {...{
                        "className": "vlist-t",
                        "parentName": "span"
                      }}><span {...{
                          "className": "vlist-r",
                          "parentName": "span"
                        }}><span {...{
                            "className": "vlist",
                            "style": {
                              "height": "0.751892em"
                            },
                            "parentName": "span"
                          }}><span {...{
                              "style": {
                                "top": "-3.063em",
                                "marginRight": "0.05em"
                              },
                              "parentName": "span"
                            }}><span {...{
                                "className": "pstrut",
                                "style": {
                                  "height": "2.7em"
                                },
                                "parentName": "span"
                              }} />
                              <span {...{
                                "className": "sizing reset-size6 size3 mtight",
                                "parentName": "span"
                              }}><span {...{
                                  "className": "mord mtight",
                                  "parentName": "span"
                                }}><span {...{
                                    "className": "mord mtight",
                                    "parentName": "span"
                                  }}>{`′`}</span></span></span></span></span></span></span></span></span></span></span></span></span>
          {` is selected according to an `}
          <span {...{
            "className": "math math-inline",
            "parentName": "li"
          }}><span {...{
              "className": "katex",
              "parentName": "span"
            }}><span {...{
                "className": "katex-mathml",
                "parentName": "span"
              }}><math {...{
                  "xmlns": "http://www.w3.org/1998/Math/MathML",
                  "parentName": "span"
                }}><semantics {...{
                    "parentName": "math"
                  }}><mrow {...{
                      "parentName": "semantics"
                    }}><mi {...{
                        "parentName": "mrow"
                      }}>{`ϵ`}</mi></mrow>
                    <annotation {...{
                      "encoding": "application/x-tex",
                      "parentName": "semantics"
                    }}>{`\\epsilon`}</annotation></semantics></math></span>
              <span {...{
                "className": "katex-html",
                "aria-hidden": "true",
                "parentName": "span"
              }}><span {...{
                  "className": "base",
                  "parentName": "span"
                }}><span {...{
                    "className": "strut",
                    "style": {
                      "height": "0.43056em",
                      "verticalAlign": "0em"
                    },
                    "parentName": "span"
                  }} />
                  <span {...{
                    "className": "mord mathnormal",
                    "parentName": "span"
                  }}>{`ϵ`}</span></span></span></span></span>
          {`-greedy
policy, based on the current Q-value estimates.`}</li>

      </ul>
      <span className="algoLineMainText" id="sarsaupdate">
        <KatexInline formula={`6: Q(S, A)
  \\leftarrow (1 - \\alpha) \\ Q(S, A) + \\alpha \\ [R + \\gamma Q(S',
  A')]`} mdxType="KatexInline" />
      </span>
      <ul>

        <li {...{
          "parentName": "ul"
        }}>

          <p {...{
            "parentName": "li"
          }}>{`The learning update. The same general intuition from the Q-learning update
applies here as well (the new estimate is a weighted average of the old estimate and a
Bellman estimate, as described `}
            <Link to="/posts/qlearning#qupdate" mdxType="Link">{`here`}</Link>
            {`).
The difference is that for Sarsa the Bellman estimate is not based on the Bellman equation
of the optimal value function (as it is for Q-learning), but on the (deterministic) Bellman equation of
the `}
            <strong {...{
              "parentName": "p"
            }}>{`agent's current policy`}</strong>
            {` `}
            <span {...{
              "className": "math math-inline",
              "parentName": "p"
            }}><span {...{
                "className": "katex",
                "parentName": "span"
              }}><span {...{
                  "className": "katex-mathml",
                  "parentName": "span"
                }}><math {...{
                    "xmlns": "http://www.w3.org/1998/Math/MathML",
                    "parentName": "span"
                  }}><semantics {...{
                      "parentName": "math"
                    }}><mrow {...{
                        "parentName": "semantics"
                      }}><mi {...{
                          "parentName": "mrow"
                        }}>{`π`}</mi></mrow>
                      <annotation {...{
                        "encoding": "application/x-tex",
                        "parentName": "semantics"
                      }}>{`\\pi`}</annotation></semantics></math></span>
                <span {...{
                  "className": "katex-html",
                  "aria-hidden": "true",
                  "parentName": "span"
                }}><span {...{
                    "className": "base",
                    "parentName": "span"
                  }}><span {...{
                      "className": "strut",
                      "style": {
                        "height": "0.43056em",
                        "verticalAlign": "0em"
                      },
                      "parentName": "span"
                    }} />
                    <span {...{
                      "className": "mord mathnormal",
                      "style": {
                        "marginRight": "0.03588em"
                      },
                      "parentName": "span"
                    }}>{`π`}</span></span></span></span></span>
            {`, given by
`}
            <KatexBlock formula={`
      q_{\\pi}(s, a) = r(s, a) + \\gamma q_{\\pi}(s', a') , \\quad \\text{ for
  all } s \\in \\mathcal{S} \\text{ and } a \\in \\mathcal{A},
  `} mdxType="KatexBlock" /></p>

        </li>


        <li {...{
          "parentName": "ul"
        }}>

          <p {...{
            "parentName": "li"
          }}>{`where `}
            <span {...{
              "className": "math math-inline",
              "parentName": "p"
            }}><span {...{
                "className": "katex",
                "parentName": "span"
              }}><span {...{
                  "className": "katex-mathml",
                  "parentName": "span"
                }}><math {...{
                    "xmlns": "http://www.w3.org/1998/Math/MathML",
                    "parentName": "span"
                  }}><semantics {...{
                      "parentName": "math"
                    }}><mrow {...{
                        "parentName": "semantics"
                      }}><msup {...{
                          "parentName": "mrow"
                        }}><mi {...{
                            "parentName": "msup"
                          }}>{`s`}</mi>
                          <mo {...{
                            "mathvariant": "normal",
                            "lspace": "0em",
                            "rspace": "0em",
                            "parentName": "msup"
                          }}>{`′`}</mo></msup>
                        <mo {...{
                          "stretchy": "false",
                          "parentName": "mrow"
                        }}>{`(`}</mo>
                        <mi {...{
                          "parentName": "mrow"
                        }}>{`s`}</mi>
                        <mo {...{
                          "separator": "true",
                          "parentName": "mrow"
                        }}>{`,`}</mo>
                        <mi {...{
                          "parentName": "mrow"
                        }}>{`a`}</mi>
                        <mo {...{
                          "stretchy": "false",
                          "parentName": "mrow"
                        }}>{`)`}</mo></mrow>
                      <annotation {...{
                        "encoding": "application/x-tex",
                        "parentName": "semantics"
                      }}>{`s'(s, a)`}</annotation></semantics></math></span>
                <span {...{
                  "className": "katex-html",
                  "aria-hidden": "true",
                  "parentName": "span"
                }}><span {...{
                    "className": "base",
                    "parentName": "span"
                  }}><span {...{
                      "className": "strut",
                      "style": {
                        "height": "1.001892em",
                        "verticalAlign": "-0.25em"
                      },
                      "parentName": "span"
                    }} />
                    <span {...{
                      "className": "mord",
                      "parentName": "span"
                    }}><span {...{
                        "className": "mord mathnormal",
                        "parentName": "span"
                      }}>{`s`}</span>
                      <span {...{
                        "className": "msupsub",
                        "parentName": "span"
                      }}><span {...{
                          "className": "vlist-t",
                          "parentName": "span"
                        }}><span {...{
                            "className": "vlist-r",
                            "parentName": "span"
                          }}><span {...{
                              "className": "vlist",
                              "style": {
                                "height": "0.751892em"
                              },
                              "parentName": "span"
                            }}><span {...{
                                "style": {
                                  "top": "-3.063em",
                                  "marginRight": "0.05em"
                                },
                                "parentName": "span"
                              }}><span {...{
                                  "className": "pstrut",
                                  "style": {
                                    "height": "2.7em"
                                  },
                                  "parentName": "span"
                                }} />
                                <span {...{
                                  "className": "sizing reset-size6 size3 mtight",
                                  "parentName": "span"
                                }}><span {...{
                                    "className": "mord mtight",
                                    "parentName": "span"
                                  }}><span {...{
                                      "className": "mord mtight",
                                      "parentName": "span"
                                    }}>{`′`}</span></span></span></span></span></span></span></span></span>
                    <span {...{
                      "className": "mopen",
                      "parentName": "span"
                    }}>{`(`}</span>
                    <span {...{
                      "className": "mord mathnormal",
                      "parentName": "span"
                    }}>{`s`}</span>
                    <span {...{
                      "className": "mpunct",
                      "parentName": "span"
                    }}>{`,`}</span>
                    <span {...{
                      "className": "mspace",
                      "style": {
                        "marginRight": "0.16666666666666666em"
                      },
                      "parentName": "span"
                    }} />
                    <span {...{
                      "className": "mord mathnormal",
                      "parentName": "span"
                    }}>{`a`}</span>
                    <span {...{
                      "className": "mclose",
                      "parentName": "span"
                    }}>{`)`}</span></span></span></span></span>
            {` is the next state as determined by the transition function,
and `}
            <span {...{
              "className": "math math-inline",
              "parentName": "p"
            }}><span {...{
                "className": "katex",
                "parentName": "span"
              }}><span {...{
                  "className": "katex-mathml",
                  "parentName": "span"
                }}><math {...{
                    "xmlns": "http://www.w3.org/1998/Math/MathML",
                    "parentName": "span"
                  }}><semantics {...{
                      "parentName": "math"
                    }}><mrow {...{
                        "parentName": "semantics"
                      }}><msup {...{
                          "parentName": "mrow"
                        }}><mi {...{
                            "parentName": "msup"
                          }}>{`a`}</mi>
                          <mo {...{
                            "mathvariant": "normal",
                            "lspace": "0em",
                            "rspace": "0em",
                            "parentName": "msup"
                          }}>{`′`}</mo></msup>
                        <mo {...{
                          "parentName": "mrow"
                        }}>{`=`}</mo>
                        <mi {...{
                          "parentName": "mrow"
                        }}>{`π`}</mi>
                        <mo {...{
                          "stretchy": "false",
                          "parentName": "mrow"
                        }}>{`(`}</mo>
                        <msup {...{
                          "parentName": "mrow"
                        }}><mi {...{
                            "parentName": "msup"
                          }}>{`s`}</mi>
                          <mo {...{
                            "mathvariant": "normal",
                            "lspace": "0em",
                            "rspace": "0em",
                            "parentName": "msup"
                          }}>{`′`}</mo></msup>
                        <mo {...{
                          "stretchy": "false",
                          "parentName": "mrow"
                        }}>{`)`}</mo></mrow>
                      <annotation {...{
                        "encoding": "application/x-tex",
                        "parentName": "semantics"
                      }}>{`a' = \\pi(s')`}</annotation></semantics></math></span>
                <span {...{
                  "className": "katex-html",
                  "aria-hidden": "true",
                  "parentName": "span"
                }}><span {...{
                    "className": "base",
                    "parentName": "span"
                  }}><span {...{
                      "className": "strut",
                      "style": {
                        "height": "0.751892em",
                        "verticalAlign": "0em"
                      },
                      "parentName": "span"
                    }} />
                    <span {...{
                      "className": "mord",
                      "parentName": "span"
                    }}><span {...{
                        "className": "mord mathnormal",
                        "parentName": "span"
                      }}>{`a`}</span>
                      <span {...{
                        "className": "msupsub",
                        "parentName": "span"
                      }}><span {...{
                          "className": "vlist-t",
                          "parentName": "span"
                        }}><span {...{
                            "className": "vlist-r",
                            "parentName": "span"
                          }}><span {...{
                              "className": "vlist",
                              "style": {
                                "height": "0.751892em"
                              },
                              "parentName": "span"
                            }}><span {...{
                                "style": {
                                  "top": "-3.063em",
                                  "marginRight": "0.05em"
                                },
                                "parentName": "span"
                              }}><span {...{
                                  "className": "pstrut",
                                  "style": {
                                    "height": "2.7em"
                                  },
                                  "parentName": "span"
                                }} />
                                <span {...{
                                  "className": "sizing reset-size6 size3 mtight",
                                  "parentName": "span"
                                }}><span {...{
                                    "className": "mord mtight",
                                    "parentName": "span"
                                  }}><span {...{
                                      "className": "mord mtight",
                                      "parentName": "span"
                                    }}>{`′`}</span></span></span></span></span></span></span></span></span>
                    <span {...{
                      "className": "mspace",
                      "style": {
                        "marginRight": "0.2777777777777778em"
                      },
                      "parentName": "span"
                    }} />
                    <span {...{
                      "className": "mrel",
                      "parentName": "span"
                    }}>{`=`}</span>
                    <span {...{
                      "className": "mspace",
                      "style": {
                        "marginRight": "0.2777777777777778em"
                      },
                      "parentName": "span"
                    }} /></span>
                  <span {...{
                    "className": "base",
                    "parentName": "span"
                  }}><span {...{
                      "className": "strut",
                      "style": {
                        "height": "1.001892em",
                        "verticalAlign": "-0.25em"
                      },
                      "parentName": "span"
                    }} />
                    <span {...{
                      "className": "mord mathnormal",
                      "style": {
                        "marginRight": "0.03588em"
                      },
                      "parentName": "span"
                    }}>{`π`}</span>
                    <span {...{
                      "className": "mopen",
                      "parentName": "span"
                    }}>{`(`}</span>
                    <span {...{
                      "className": "mord",
                      "parentName": "span"
                    }}><span {...{
                        "className": "mord mathnormal",
                        "parentName": "span"
                      }}>{`s`}</span>
                      <span {...{
                        "className": "msupsub",
                        "parentName": "span"
                      }}><span {...{
                          "className": "vlist-t",
                          "parentName": "span"
                        }}><span {...{
                            "className": "vlist-r",
                            "parentName": "span"
                          }}><span {...{
                              "className": "vlist",
                              "style": {
                                "height": "0.751892em"
                              },
                              "parentName": "span"
                            }}><span {...{
                                "style": {
                                  "top": "-3.063em",
                                  "marginRight": "0.05em"
                                },
                                "parentName": "span"
                              }}><span {...{
                                  "className": "pstrut",
                                  "style": {
                                    "height": "2.7em"
                                  },
                                  "parentName": "span"
                                }} />
                                <span {...{
                                  "className": "sizing reset-size6 size3 mtight",
                                  "parentName": "span"
                                }}><span {...{
                                    "className": "mord mtight",
                                    "parentName": "span"
                                  }}><span {...{
                                      "className": "mord mtight",
                                      "parentName": "span"
                                    }}>{`′`}</span></span></span></span></span></span></span></span></span>
                    <span {...{
                      "className": "mclose",
                      "parentName": "span"
                    }}>{`)`}</span></span></span></span></span>
            {` is the action chosen by the current policy in the next
state.`}
            <SideNote snId="detvsstoch" mdxType="SideNote">{`The Bellman equation looks slightly more
complicated if the policy and/or the environment is stochastic, yet the
intuition remains the same. The stochastic versions are provided in Sutton & Barto's
book.`}</SideNote>
            {` The Sarsa update is said to be
`}
            <em {...{
              "parentName": "p"
            }}>{`on-policy`}</em>
            {` because the Bellman estimate uses the Q-value of the next action that is
actually chosen by the current policy. The Q-learning update, on the other hand,
is said to be `}
            <em {...{
              "parentName": "p"
            }}>{`off-policy`}</em>
            {`, because it always considers the maximum Q-value in
the next state, regardless of which action the agent chooses next.`}</p>

        </li>

      </ul>
      <span className="algoLineMainText">
        <KatexInline formula={`7: S \\leftarrow
  S'; A \\leftarrow A'`} mdxType="KatexInline" />
      </span>
      <ul>

        <li {...{
          "parentName": "ul"
        }}>{`The `}
          {`“`}
          {`next state`}
          {`”`}
          {` becomes the `}
          {`“`}
          {`current state`}
          {`”`}
          {` of the
next iteration, just as in Q-learning. In addition, the `}
          {`“`}
          {`next
action`}
          {`”`}
          {` becomes the `}
          {`“`}
          {`current action`}
          {`”`}
          {`.`}</li>

      </ul>
      <h4>{`Q-learning vs. Sarsa`}</h4>
      <p>{`What difference does it make to use the Sarsa update rule in place of the
Q-learning update rule? Let's first compare the policies learned by the two
algorithms in the Pancakes Gridworld and then try to generalize our findings.`}</p>
      <p>{`You might have noticed that the Pancakes Gridworld in the sketch above is
different from the one that was used in the Q-learning post. The version used in
the present post imitates the `}
        {`“`}
        {`Cliff Walking`}
        {`”`}
        {` example from Sutton &
Barto's book.`}
        <SideNote snId="cliff" mdxType="SideNote">{`Example 6.6 (page 132) in Sutton & Barto
(`}
          <a href="http://incompleteideas.net/book/the-book-2nd.html">{`2018`}</a>
          {`).`}</SideNote>
        {`
It's a great example to show the difference between on-policy and off-policy
learning for an explorative, stochastic policy (for example, the
`}
        <span {...{
          "className": "math math-inline",
          "parentName": "p"
        }}><span {...{
            "className": "katex",
            "parentName": "span"
          }}><span {...{
              "className": "katex-mathml",
              "parentName": "span"
            }}><math {...{
                "xmlns": "http://www.w3.org/1998/Math/MathML",
                "parentName": "span"
              }}><semantics {...{
                  "parentName": "math"
                }}><mrow {...{
                    "parentName": "semantics"
                  }}><mi {...{
                      "parentName": "mrow"
                    }}>{`ϵ`}</mi></mrow>
                  <annotation {...{
                    "encoding": "application/x-tex",
                    "parentName": "semantics"
                  }}>{`\\epsilon`}</annotation></semantics></math></span>
            <span {...{
              "className": "katex-html",
              "aria-hidden": "true",
              "parentName": "span"
            }}><span {...{
                "className": "base",
                "parentName": "span"
              }}><span {...{
                  "className": "strut",
                  "style": {
                    "height": "0.43056em",
                    "verticalAlign": "0em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`ϵ`}</span></span></span></span></span>
        {`-greedy policy), as you will see further below.`}</p>
      <p>{`The following sketch shows a `}
        <strong {...{
          "parentName": "p"
        }}>{`Q-learning`}</strong>
        {` agent in the
same Pancakes environment that was used in the Sarsa-sketch at the top of this
post.`}</p>
      <Sketch sketch={sketchQ} mdxType="Sketch" />
      <p>{`If you run both algorithms for long enough, the typical (non-exploring) trajectories taken
by the respective agents will eventually look as follows.`}
        <SideNote snId="exploon" mdxType="SideNote">{`
Assuming that both algorithms used a constant, small exploration
parameter `}
          <span {...{
            "className": "math math-inline"
          }}><span {...{
              "className": "katex",
              "parentName": "span"
            }}><span {...{
                "className": "katex-mathml",
                "parentName": "span"
              }}><math {...{
                  "xmlns": "http://www.w3.org/1998/Math/MathML",
                  "parentName": "span"
                }}><semantics {...{
                    "parentName": "math"
                  }}><mrow {...{
                      "parentName": "semantics"
                    }}><mi {...{
                        "parentName": "mrow"
                      }}>{`ϵ`}</mi></mrow>
                    <annotation {...{
                      "encoding": "application/x-tex",
                      "parentName": "semantics"
                    }}>{`\\epsilon`}</annotation></semantics></math></span>
              <span {...{
                "className": "katex-html",
                "aria-hidden": "true",
                "parentName": "span"
              }}><span {...{
                  "className": "base",
                  "parentName": "span"
                }}><span {...{
                    "className": "strut",
                    "style": {
                      "height": "0.43056em",
                      "verticalAlign": "0em"
                    },
                    "parentName": "span"
                  }} />
                  <span {...{
                    "className": "mord mathnormal",
                    "parentName": "span"
                  }}>{`ϵ`}</span></span></span></span></span>
          {` `}
          <em>{`during`}</em>
          {` the learning process and then turned off
exploration (i.e., `}
          <span {...{
            "className": "math math-inline"
          }}><span {...{
              "className": "katex",
              "parentName": "span"
            }}><span {...{
                "className": "katex-mathml",
                "parentName": "span"
              }}><math {...{
                  "xmlns": "http://www.w3.org/1998/Math/MathML",
                  "parentName": "span"
                }}><semantics {...{
                    "parentName": "math"
                  }}><mrow {...{
                      "parentName": "semantics"
                    }}><mi {...{
                        "parentName": "mrow"
                      }}>{`ϵ`}</mi>
                      <mo {...{
                        "parentName": "mrow"
                      }}>{`=`}</mo>
                      <mn {...{
                        "parentName": "mrow"
                      }}>{`0`}</mn></mrow>
                    <annotation {...{
                      "encoding": "application/x-tex",
                      "parentName": "semantics"
                    }}>{`\\epsilon=0`}</annotation></semantics></math></span>
              <span {...{
                "className": "katex-html",
                "aria-hidden": "true",
                "parentName": "span"
              }}><span {...{
                  "className": "base",
                  "parentName": "span"
                }}><span {...{
                    "className": "strut",
                    "style": {
                      "height": "0.43056em",
                      "verticalAlign": "0em"
                    },
                    "parentName": "span"
                  }} />
                  <span {...{
                    "className": "mord mathnormal",
                    "parentName": "span"
                  }}>{`ϵ`}</span>
                  <span {...{
                    "className": "mspace",
                    "style": {
                      "marginRight": "0.2777777777777778em"
                    },
                    "parentName": "span"
                  }} />
                  <span {...{
                    "className": "mrel",
                    "parentName": "span"
                  }}>{`=`}</span>
                  <span {...{
                    "className": "mspace",
                    "style": {
                      "marginRight": "0.2777777777777778em"
                    },
                    "parentName": "span"
                  }} /></span>
                <span {...{
                  "className": "base",
                  "parentName": "span"
                }}><span {...{
                    "className": "strut",
                    "style": {
                      "height": "0.64444em",
                      "verticalAlign": "0em"
                    },
                    "parentName": "span"
                  }} />
                  <span {...{
                    "className": "mord",
                    "parentName": "span"
                  }}>{`0`}</span></span></span></span></span>
          {`) to create these trajectories.`}</SideNote>
        {`
`}
        <Sketch sketch={sketchPolicy} mdxType="Sketch" /></p>
      <p><em {...{
          "parentName": "p"
        }}>{`Why are the policies learned by Q-learning and Sarsa different?`}</em></p>
      <p>{`The different policies are a direct consequence of the different update rules
used by the two algorithms. In what follows, we'll consider an example
transition and compare the corresponding learning updates.`}</p>
      <p>{`Assume that the agent has already interacted with the environment and has learned the
following value estimates (hover your mouse over a grid cell to see numerical values).
`}
        <Sketch sketch={sketchValue} mdxType="Sketch" /></p>
      <p>{`Furthermore, assume that the agent transitioned from `}
        <span {...{
          "className": "math math-inline",
          "parentName": "p"
        }}><span {...{
            "className": "katex",
            "parentName": "span"
          }}><span {...{
              "className": "katex-mathml",
              "parentName": "span"
            }}><math {...{
                "xmlns": "http://www.w3.org/1998/Math/MathML",
                "parentName": "span"
              }}><semantics {...{
                  "parentName": "math"
                }}><mrow {...{
                    "parentName": "semantics"
                  }}><msub {...{
                      "parentName": "mrow"
                    }}><mi {...{
                        "parentName": "msub"
                      }}>{`s`}</mi>
                      <mn {...{
                        "parentName": "msub"
                      }}>{`5`}</mn></msub></mrow>
                  <annotation {...{
                    "encoding": "application/x-tex",
                    "parentName": "semantics"
                  }}>{`s_5`}</annotation></semantics></math></span>
            <span {...{
              "className": "katex-html",
              "aria-hidden": "true",
              "parentName": "span"
            }}><span {...{
                "className": "base",
                "parentName": "span"
              }}><span {...{
                  "className": "strut",
                  "style": {
                    "height": "0.58056em",
                    "verticalAlign": "-0.15em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord",
                  "parentName": "span"
                }}><span {...{
                    "className": "mord mathnormal",
                    "parentName": "span"
                  }}>{`s`}</span>
                  <span {...{
                    "className": "msupsub",
                    "parentName": "span"
                  }}><span {...{
                      "className": "vlist-t vlist-t2",
                      "parentName": "span"
                    }}><span {...{
                        "className": "vlist-r",
                        "parentName": "span"
                      }}><span {...{
                          "className": "vlist",
                          "style": {
                            "height": "0.30110799999999993em"
                          },
                          "parentName": "span"
                        }}><span {...{
                            "style": {
                              "top": "-2.5500000000000003em",
                              "marginLeft": "0em",
                              "marginRight": "0.05em"
                            },
                            "parentName": "span"
                          }}><span {...{
                              "className": "pstrut",
                              "style": {
                                "height": "2.7em"
                              },
                              "parentName": "span"
                            }} />
                            <span {...{
                              "className": "sizing reset-size6 size3 mtight",
                              "parentName": "span"
                            }}><span {...{
                                "className": "mord mtight",
                                "parentName": "span"
                              }}>{`5`}</span></span></span></span>
                        <span {...{
                          "className": "vlist-s",
                          "parentName": "span"
                        }}>{`​`}</span></span>
                      <span {...{
                        "className": "vlist-r",
                        "parentName": "span"
                      }}><span {...{
                          "className": "vlist",
                          "style": {
                            "height": "0.15em"
                          },
                          "parentName": "span"
                        }}><span {...{
                            "parentName": "span"
                          }} /></span></span></span></span></span></span></span></span></span>
        {` to `}
        <span {...{
          "className": "math math-inline",
          "parentName": "p"
        }}><span {...{
            "className": "katex",
            "parentName": "span"
          }}><span {...{
              "className": "katex-mathml",
              "parentName": "span"
            }}><math {...{
                "xmlns": "http://www.w3.org/1998/Math/MathML",
                "parentName": "span"
              }}><semantics {...{
                  "parentName": "math"
                }}><mrow {...{
                    "parentName": "semantics"
                  }}><msub {...{
                      "parentName": "mrow"
                    }}><mi {...{
                        "parentName": "msub"
                      }}>{`s`}</mi>
                      <mn {...{
                        "parentName": "msub"
                      }}>{`6`}</mn></msub></mrow>
                  <annotation {...{
                    "encoding": "application/x-tex",
                    "parentName": "semantics"
                  }}>{`s_6`}</annotation></semantics></math></span>
            <span {...{
              "className": "katex-html",
              "aria-hidden": "true",
              "parentName": "span"
            }}><span {...{
                "className": "base",
                "parentName": "span"
              }}><span {...{
                  "className": "strut",
                  "style": {
                    "height": "0.58056em",
                    "verticalAlign": "-0.15em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord",
                  "parentName": "span"
                }}><span {...{
                    "className": "mord mathnormal",
                    "parentName": "span"
                  }}>{`s`}</span>
                  <span {...{
                    "className": "msupsub",
                    "parentName": "span"
                  }}><span {...{
                      "className": "vlist-t vlist-t2",
                      "parentName": "span"
                    }}><span {...{
                        "className": "vlist-r",
                        "parentName": "span"
                      }}><span {...{
                          "className": "vlist",
                          "style": {
                            "height": "0.30110799999999993em"
                          },
                          "parentName": "span"
                        }}><span {...{
                            "style": {
                              "top": "-2.5500000000000003em",
                              "marginLeft": "0em",
                              "marginRight": "0.05em"
                            },
                            "parentName": "span"
                          }}><span {...{
                              "className": "pstrut",
                              "style": {
                                "height": "2.7em"
                              },
                              "parentName": "span"
                            }} />
                            <span {...{
                              "className": "sizing reset-size6 size3 mtight",
                              "parentName": "span"
                            }}><span {...{
                                "className": "mord mtight",
                                "parentName": "span"
                              }}>{`6`}</span></span></span></span>
                        <span {...{
                          "className": "vlist-s",
                          "parentName": "span"
                        }}>{`​`}</span></span>
                      <span {...{
                        "className": "vlist-r",
                        "parentName": "span"
                      }}><span {...{
                          "className": "vlist",
                          "style": {
                            "height": "0.15em"
                          },
                          "parentName": "span"
                        }}><span {...{
                            "parentName": "span"
                          }} /></span></span></span></span></span></span></span></span></span>
        {` and then
makes an exploratory`}
        <SideNote snId="explolink" mdxType="SideNote">{` Recall that sometimes the agent needs to randomly explore
the environment during the learning process, as
described `}
          <Link to="/posts/qlearning#faq" mdxType="Link">{`here`}</Link>
          {`.`}</SideNote>
        {` move
and selects action `}
        {`“`}
        {`south`}
        {`”`}
        {`, or more formally:
`}
        <KatexBlock formula={`
      S = s_5, \\ A = east, \\ R = -1, \\ S' = s_6, \\ A' = south.
  `} mdxType="KatexBlock" /></p>
      <p>{`The Sarsa update for this transition is given by
`}
        <KatexBlock formula={`
      Q(s_5, east) \\leftarrow (1 - \\alpha) \\ Q(s_5, east) + 
      \\alpha \\ [-1 + \\gamma Q(s_6, \\textcolor{blue}{south})],
  `} mdxType="KatexBlock" /></p>
      <p>{`whereas the Q-learning update is given by
`}
        <KatexBlock formula={`
      Q(s_5, east) \\leftarrow (1 - \\alpha) \\ Q(s_5, east) + 
      \\alpha \\ [-1 + \\gamma Q(s_6, \\textcolor{red}{east})].
  `} mdxType="KatexBlock" /></p>
      <p>{`To put it in words, the Sarsa update uses the value corresponding to the
`}
        {`“`}
        {`on-policy action`}
        {`”`}
        {` for its update. Q-learning, on the other hand, uses the
value-maximizing action (in `}
        <span {...{
          "className": "math math-inline",
          "parentName": "p"
        }}><span {...{
            "className": "katex",
            "parentName": "span"
          }}><span {...{
              "className": "katex-mathml",
              "parentName": "span"
            }}><math {...{
                "xmlns": "http://www.w3.org/1998/Math/MathML",
                "parentName": "span"
              }}><semantics {...{
                  "parentName": "math"
                }}><mrow {...{
                    "parentName": "semantics"
                  }}><msub {...{
                      "parentName": "mrow"
                    }}><mi {...{
                        "parentName": "msub"
                      }}>{`s`}</mi>
                      <mn {...{
                        "parentName": "msub"
                      }}>{`6`}</mn></msub></mrow>
                  <annotation {...{
                    "encoding": "application/x-tex",
                    "parentName": "semantics"
                  }}>{`s_6`}</annotation></semantics></math></span>
            <span {...{
              "className": "katex-html",
              "aria-hidden": "true",
              "parentName": "span"
            }}><span {...{
                "className": "base",
                "parentName": "span"
              }}><span {...{
                  "className": "strut",
                  "style": {
                    "height": "0.58056em",
                    "verticalAlign": "-0.15em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord",
                  "parentName": "span"
                }}><span {...{
                    "className": "mord mathnormal",
                    "parentName": "span"
                  }}>{`s`}</span>
                  <span {...{
                    "className": "msupsub",
                    "parentName": "span"
                  }}><span {...{
                      "className": "vlist-t vlist-t2",
                      "parentName": "span"
                    }}><span {...{
                        "className": "vlist-r",
                        "parentName": "span"
                      }}><span {...{
                          "className": "vlist",
                          "style": {
                            "height": "0.30110799999999993em"
                          },
                          "parentName": "span"
                        }}><span {...{
                            "style": {
                              "top": "-2.5500000000000003em",
                              "marginLeft": "0em",
                              "marginRight": "0.05em"
                            },
                            "parentName": "span"
                          }}><span {...{
                              "className": "pstrut",
                              "style": {
                                "height": "2.7em"
                              },
                              "parentName": "span"
                            }} />
                            <span {...{
                              "className": "sizing reset-size6 size3 mtight",
                              "parentName": "span"
                            }}><span {...{
                                "className": "mord mtight",
                                "parentName": "span"
                              }}>{`6`}</span></span></span></span>
                        <span {...{
                          "className": "vlist-s",
                          "parentName": "span"
                        }}>{`​`}</span></span>
                      <span {...{
                        "className": "vlist-r",
                        "parentName": "span"
                      }}><span {...{
                          "className": "vlist",
                          "style": {
                            "height": "0.15em"
                          },
                          "parentName": "span"
                        }}><span {...{
                            "parentName": "span"
                          }} /></span></span></span></span></span></span></span></span></span>
        {`, the action `}
        {`“`}
        <span {...{
          "className": "math math-inline",
          "parentName": "p"
        }}><span {...{
            "className": "katex",
            "parentName": "span"
          }}><span {...{
              "className": "katex-mathml",
              "parentName": "span"
            }}><math {...{
                "xmlns": "http://www.w3.org/1998/Math/MathML",
                "parentName": "span"
              }}><semantics {...{
                  "parentName": "math"
                }}><mrow {...{
                    "parentName": "semantics"
                  }}><mi {...{
                      "parentName": "mrow"
                    }}>{`e`}</mi>
                    <mi {...{
                      "parentName": "mrow"
                    }}>{`a`}</mi>
                    <mi {...{
                      "parentName": "mrow"
                    }}>{`s`}</mi>
                    <mi {...{
                      "parentName": "mrow"
                    }}>{`t`}</mi></mrow>
                  <annotation {...{
                    "encoding": "application/x-tex",
                    "parentName": "semantics"
                  }}>{`east`}</annotation></semantics></math></span>
            <span {...{
              "className": "katex-html",
              "aria-hidden": "true",
              "parentName": "span"
            }}><span {...{
                "className": "base",
                "parentName": "span"
              }}><span {...{
                  "className": "strut",
                  "style": {
                    "height": "0.61508em",
                    "verticalAlign": "0em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`e`}</span>
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`a`}</span>
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`s`}</span>
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`t`}</span></span></span></span></span>
        {`”`}
        {` has the highest value
according to the current estimates). If we plug in the actual values (and
set `}
        <span {...{
          "className": "math math-inline",
          "parentName": "p"
        }}><span {...{
            "className": "katex",
            "parentName": "span"
          }}><span {...{
              "className": "katex-mathml",
              "parentName": "span"
            }}><math {...{
                "xmlns": "http://www.w3.org/1998/Math/MathML",
                "parentName": "span"
              }}><semantics {...{
                  "parentName": "math"
                }}><mrow {...{
                    "parentName": "semantics"
                  }}><mi {...{
                      "parentName": "mrow"
                    }}>{`α`}</mi>
                    <mo {...{
                      "parentName": "mrow"
                    }}>{`=`}</mo>
                    <mn {...{
                      "parentName": "mrow"
                    }}>{`0.2`}</mn></mrow>
                  <annotation {...{
                    "encoding": "application/x-tex",
                    "parentName": "semantics"
                  }}>{`\\alpha = 0.2`}</annotation></semantics></math></span>
            <span {...{
              "className": "katex-html",
              "aria-hidden": "true",
              "parentName": "span"
            }}><span {...{
                "className": "base",
                "parentName": "span"
              }}><span {...{
                  "className": "strut",
                  "style": {
                    "height": "0.43056em",
                    "verticalAlign": "0em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord mathnormal",
                  "style": {
                    "marginRight": "0.0037em"
                  },
                  "parentName": "span"
                }}>{`α`}</span>
                <span {...{
                  "className": "mspace",
                  "style": {
                    "marginRight": "0.2777777777777778em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mrel",
                  "parentName": "span"
                }}>{`=`}</span>
                <span {...{
                  "className": "mspace",
                  "style": {
                    "marginRight": "0.2777777777777778em"
                  },
                  "parentName": "span"
                }} /></span>
              <span {...{
                "className": "base",
                "parentName": "span"
              }}><span {...{
                  "className": "strut",
                  "style": {
                    "height": "0.64444em",
                    "verticalAlign": "0em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord",
                  "parentName": "span"
                }}>{`0`}</span>
                <span {...{
                  "className": "mord",
                  "parentName": "span"
                }}>{`.`}</span>
                <span {...{
                  "className": "mord",
                  "parentName": "span"
                }}>{`2`}</span></span></span></span></span>
        {` and `}
        <span {...{
          "className": "math math-inline",
          "parentName": "p"
        }}><span {...{
            "className": "katex",
            "parentName": "span"
          }}><span {...{
              "className": "katex-mathml",
              "parentName": "span"
            }}><math {...{
                "xmlns": "http://www.w3.org/1998/Math/MathML",
                "parentName": "span"
              }}><semantics {...{
                  "parentName": "math"
                }}><mrow {...{
                    "parentName": "semantics"
                  }}><mi {...{
                      "parentName": "mrow"
                    }}>{`γ`}</mi>
                    <mo {...{
                      "parentName": "mrow"
                    }}>{`=`}</mo>
                    <mn {...{
                      "parentName": "mrow"
                    }}>{`1`}</mn></mrow>
                  <annotation {...{
                    "encoding": "application/x-tex",
                    "parentName": "semantics"
                  }}>{`\\gamma = 1`}</annotation></semantics></math></span>
            <span {...{
              "className": "katex-html",
              "aria-hidden": "true",
              "parentName": "span"
            }}><span {...{
                "className": "base",
                "parentName": "span"
              }}><span {...{
                  "className": "strut",
                  "style": {
                    "height": "0.625em",
                    "verticalAlign": "-0.19444em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord mathnormal",
                  "style": {
                    "marginRight": "0.05556em"
                  },
                  "parentName": "span"
                }}>{`γ`}</span>
                <span {...{
                  "className": "mspace",
                  "style": {
                    "marginRight": "0.2777777777777778em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mrel",
                  "parentName": "span"
                }}>{`=`}</span>
                <span {...{
                  "className": "mspace",
                  "style": {
                    "marginRight": "0.2777777777777778em"
                  },
                  "parentName": "span"
                }} /></span>
              <span {...{
                "className": "base",
                "parentName": "span"
              }}><span {...{
                  "className": "strut",
                  "style": {
                    "height": "0.64444em",
                    "verticalAlign": "0em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord",
                  "parentName": "span"
                }}>{`1`}</span></span></span></span></span>
        {`), we see a
big difference in the new value estimates for `}
        <span {...{
          "className": "math math-inline",
          "parentName": "p"
        }}><span {...{
            "className": "katex",
            "parentName": "span"
          }}><span {...{
              "className": "katex-mathml",
              "parentName": "span"
            }}><math {...{
                "xmlns": "http://www.w3.org/1998/Math/MathML",
                "parentName": "span"
              }}><semantics {...{
                  "parentName": "math"
                }}><mrow {...{
                    "parentName": "semantics"
                  }}><mi {...{
                      "parentName": "mrow"
                    }}>{`Q`}</mi>
                    <mo {...{
                      "stretchy": "false",
                      "parentName": "mrow"
                    }}>{`(`}</mo>
                    <msub {...{
                      "parentName": "mrow"
                    }}><mi {...{
                        "parentName": "msub"
                      }}>{`s`}</mi>
                      <mn {...{
                        "parentName": "msub"
                      }}>{`5`}</mn></msub>
                    <mo {...{
                      "separator": "true",
                      "parentName": "mrow"
                    }}>{`,`}</mo>
                    <mi {...{
                      "parentName": "mrow"
                    }}>{`e`}</mi>
                    <mi {...{
                      "parentName": "mrow"
                    }}>{`a`}</mi>
                    <mi {...{
                      "parentName": "mrow"
                    }}>{`s`}</mi>
                    <mi {...{
                      "parentName": "mrow"
                    }}>{`t`}</mi>
                    <mo {...{
                      "stretchy": "false",
                      "parentName": "mrow"
                    }}>{`)`}</mo></mrow>
                  <annotation {...{
                    "encoding": "application/x-tex",
                    "parentName": "semantics"
                  }}>{`Q(s_5, east)`}</annotation></semantics></math></span>
            <span {...{
              "className": "katex-html",
              "aria-hidden": "true",
              "parentName": "span"
            }}><span {...{
                "className": "base",
                "parentName": "span"
              }}><span {...{
                  "className": "strut",
                  "style": {
                    "height": "1em",
                    "verticalAlign": "-0.25em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`Q`}</span>
                <span {...{
                  "className": "mopen",
                  "parentName": "span"
                }}>{`(`}</span>
                <span {...{
                  "className": "mord",
                  "parentName": "span"
                }}><span {...{
                    "className": "mord mathnormal",
                    "parentName": "span"
                  }}>{`s`}</span>
                  <span {...{
                    "className": "msupsub",
                    "parentName": "span"
                  }}><span {...{
                      "className": "vlist-t vlist-t2",
                      "parentName": "span"
                    }}><span {...{
                        "className": "vlist-r",
                        "parentName": "span"
                      }}><span {...{
                          "className": "vlist",
                          "style": {
                            "height": "0.30110799999999993em"
                          },
                          "parentName": "span"
                        }}><span {...{
                            "style": {
                              "top": "-2.5500000000000003em",
                              "marginLeft": "0em",
                              "marginRight": "0.05em"
                            },
                            "parentName": "span"
                          }}><span {...{
                              "className": "pstrut",
                              "style": {
                                "height": "2.7em"
                              },
                              "parentName": "span"
                            }} />
                            <span {...{
                              "className": "sizing reset-size6 size3 mtight",
                              "parentName": "span"
                            }}><span {...{
                                "className": "mord mtight",
                                "parentName": "span"
                              }}>{`5`}</span></span></span></span>
                        <span {...{
                          "className": "vlist-s",
                          "parentName": "span"
                        }}>{`​`}</span></span>
                      <span {...{
                        "className": "vlist-r",
                        "parentName": "span"
                      }}><span {...{
                          "className": "vlist",
                          "style": {
                            "height": "0.15em"
                          },
                          "parentName": "span"
                        }}><span {...{
                            "parentName": "span"
                          }} /></span></span></span></span></span>
                <span {...{
                  "className": "mpunct",
                  "parentName": "span"
                }}>{`,`}</span>
                <span {...{
                  "className": "mspace",
                  "style": {
                    "marginRight": "0.16666666666666666em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`e`}</span>
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`a`}</span>
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`s`}</span>
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`t`}</span>
                <span {...{
                  "className": "mclose",
                  "parentName": "span"
                }}>{`)`}</span></span></span></span></span>
        {`, depending on which
update rule is used: For Sarsa, the new value is given by
`}
        <KatexBlock formula={`
      Q(s_5, east) =  0.8 \\times (-1.82) + 
      0.2 \\times [-1 + (\\textcolor{blue}{-36.00})] = -8.86.
  `} mdxType="KatexBlock" /></p>
      <p>{`For Q-learning, the new value estimate for `}
        <span {...{
          "className": "math math-inline",
          "parentName": "p"
        }}><span {...{
            "className": "katex",
            "parentName": "span"
          }}><span {...{
              "className": "katex-mathml",
              "parentName": "span"
            }}><math {...{
                "xmlns": "http://www.w3.org/1998/Math/MathML",
                "parentName": "span"
              }}><semantics {...{
                  "parentName": "math"
                }}><mrow {...{
                    "parentName": "semantics"
                  }}><mi {...{
                      "parentName": "mrow"
                    }}>{`Q`}</mi>
                    <mo {...{
                      "stretchy": "false",
                      "parentName": "mrow"
                    }}>{`(`}</mo>
                    <msub {...{
                      "parentName": "mrow"
                    }}><mi {...{
                        "parentName": "msub"
                      }}>{`s`}</mi>
                      <mn {...{
                        "parentName": "msub"
                      }}>{`5`}</mn></msub>
                    <mo {...{
                      "separator": "true",
                      "parentName": "mrow"
                    }}>{`,`}</mo>
                    <mi {...{
                      "parentName": "mrow"
                    }}>{`e`}</mi>
                    <mi {...{
                      "parentName": "mrow"
                    }}>{`a`}</mi>
                    <mi {...{
                      "parentName": "mrow"
                    }}>{`s`}</mi>
                    <mi {...{
                      "parentName": "mrow"
                    }}>{`t`}</mi>
                    <mo {...{
                      "stretchy": "false",
                      "parentName": "mrow"
                    }}>{`)`}</mo></mrow>
                  <annotation {...{
                    "encoding": "application/x-tex",
                    "parentName": "semantics"
                  }}>{`Q(s_5, east)`}</annotation></semantics></math></span>
            <span {...{
              "className": "katex-html",
              "aria-hidden": "true",
              "parentName": "span"
            }}><span {...{
                "className": "base",
                "parentName": "span"
              }}><span {...{
                  "className": "strut",
                  "style": {
                    "height": "1em",
                    "verticalAlign": "-0.25em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`Q`}</span>
                <span {...{
                  "className": "mopen",
                  "parentName": "span"
                }}>{`(`}</span>
                <span {...{
                  "className": "mord",
                  "parentName": "span"
                }}><span {...{
                    "className": "mord mathnormal",
                    "parentName": "span"
                  }}>{`s`}</span>
                  <span {...{
                    "className": "msupsub",
                    "parentName": "span"
                  }}><span {...{
                      "className": "vlist-t vlist-t2",
                      "parentName": "span"
                    }}><span {...{
                        "className": "vlist-r",
                        "parentName": "span"
                      }}><span {...{
                          "className": "vlist",
                          "style": {
                            "height": "0.30110799999999993em"
                          },
                          "parentName": "span"
                        }}><span {...{
                            "style": {
                              "top": "-2.5500000000000003em",
                              "marginLeft": "0em",
                              "marginRight": "0.05em"
                            },
                            "parentName": "span"
                          }}><span {...{
                              "className": "pstrut",
                              "style": {
                                "height": "2.7em"
                              },
                              "parentName": "span"
                            }} />
                            <span {...{
                              "className": "sizing reset-size6 size3 mtight",
                              "parentName": "span"
                            }}><span {...{
                                "className": "mord mtight",
                                "parentName": "span"
                              }}>{`5`}</span></span></span></span>
                        <span {...{
                          "className": "vlist-s",
                          "parentName": "span"
                        }}>{`​`}</span></span>
                      <span {...{
                        "className": "vlist-r",
                        "parentName": "span"
                      }}><span {...{
                          "className": "vlist",
                          "style": {
                            "height": "0.15em"
                          },
                          "parentName": "span"
                        }}><span {...{
                            "parentName": "span"
                          }} /></span></span></span></span></span>
                <span {...{
                  "className": "mpunct",
                  "parentName": "span"
                }}>{`,`}</span>
                <span {...{
                  "className": "mspace",
                  "style": {
                    "marginRight": "0.16666666666666666em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`e`}</span>
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`a`}</span>
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`s`}</span>
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`t`}</span>
                <span {...{
                  "className": "mclose",
                  "parentName": "span"
                }}>{`)`}</span></span></span></span></span>
        {` is given by
`}
        <KatexBlock formula={`
  Q(s_5, east) =  0.8 \\times (-1.82) + 
  0.2 \\times [-1 + \\textcolor{red}{1.22}] = -1.41.
  `} mdxType="KatexBlock" /></p>
      <p>{`Compared with the value estimate of `}
        <span {...{
          "className": "math math-inline",
          "parentName": "p"
        }}><span {...{
            "className": "katex",
            "parentName": "span"
          }}><span {...{
              "className": "katex-mathml",
              "parentName": "span"
            }}><math {...{
                "xmlns": "http://www.w3.org/1998/Math/MathML",
                "parentName": "span"
              }}><semantics {...{
                  "parentName": "math"
                }}><mrow {...{
                    "parentName": "semantics"
                  }}><mi {...{
                      "parentName": "mrow"
                    }}>{`Q`}</mi>
                    <mo {...{
                      "stretchy": "false",
                      "parentName": "mrow"
                    }}>{`(`}</mo>
                    <msub {...{
                      "parentName": "mrow"
                    }}><mi {...{
                        "parentName": "msub"
                      }}>{`s`}</mi>
                      <mn {...{
                        "parentName": "msub"
                      }}>{`5`}</mn></msub>
                    <mo {...{
                      "separator": "true",
                      "parentName": "mrow"
                    }}>{`,`}</mo>
                    <mi {...{
                      "parentName": "mrow"
                    }}>{`e`}</mi>
                    <mi {...{
                      "parentName": "mrow"
                    }}>{`a`}</mi>
                    <mi {...{
                      "parentName": "mrow"
                    }}>{`s`}</mi>
                    <mi {...{
                      "parentName": "mrow"
                    }}>{`t`}</mi>
                    <mo {...{
                      "stretchy": "false",
                      "parentName": "mrow"
                    }}>{`)`}</mo></mrow>
                  <annotation {...{
                    "encoding": "application/x-tex",
                    "parentName": "semantics"
                  }}>{`Q(s_5, east)`}</annotation></semantics></math></span>
            <span {...{
              "className": "katex-html",
              "aria-hidden": "true",
              "parentName": "span"
            }}><span {...{
                "className": "base",
                "parentName": "span"
              }}><span {...{
                  "className": "strut",
                  "style": {
                    "height": "1em",
                    "verticalAlign": "-0.25em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`Q`}</span>
                <span {...{
                  "className": "mopen",
                  "parentName": "span"
                }}>{`(`}</span>
                <span {...{
                  "className": "mord",
                  "parentName": "span"
                }}><span {...{
                    "className": "mord mathnormal",
                    "parentName": "span"
                  }}>{`s`}</span>
                  <span {...{
                    "className": "msupsub",
                    "parentName": "span"
                  }}><span {...{
                      "className": "vlist-t vlist-t2",
                      "parentName": "span"
                    }}><span {...{
                        "className": "vlist-r",
                        "parentName": "span"
                      }}><span {...{
                          "className": "vlist",
                          "style": {
                            "height": "0.30110799999999993em"
                          },
                          "parentName": "span"
                        }}><span {...{
                            "style": {
                              "top": "-2.5500000000000003em",
                              "marginLeft": "0em",
                              "marginRight": "0.05em"
                            },
                            "parentName": "span"
                          }}><span {...{
                              "className": "pstrut",
                              "style": {
                                "height": "2.7em"
                              },
                              "parentName": "span"
                            }} />
                            <span {...{
                              "className": "sizing reset-size6 size3 mtight",
                              "parentName": "span"
                            }}><span {...{
                                "className": "mord mtight",
                                "parentName": "span"
                              }}>{`5`}</span></span></span></span>
                        <span {...{
                          "className": "vlist-s",
                          "parentName": "span"
                        }}>{`​`}</span></span>
                      <span {...{
                        "className": "vlist-r",
                        "parentName": "span"
                      }}><span {...{
                          "className": "vlist",
                          "style": {
                            "height": "0.15em"
                          },
                          "parentName": "span"
                        }}><span {...{
                            "parentName": "span"
                          }} /></span></span></span></span></span>
                <span {...{
                  "className": "mpunct",
                  "parentName": "span"
                }}>{`,`}</span>
                <span {...{
                  "className": "mspace",
                  "style": {
                    "marginRight": "0.16666666666666666em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`e`}</span>
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`a`}</span>
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`s`}</span>
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`t`}</span>
                <span {...{
                  "className": "mclose",
                  "parentName": "span"
                }}>{`)`}</span></span></span></span></span>
        {` before the update
(`}
        <span {...{
          "className": "math math-inline",
          "parentName": "p"
        }}><span {...{
            "className": "katex",
            "parentName": "span"
          }}><span {...{
              "className": "katex-mathml",
              "parentName": "span"
            }}><math {...{
                "xmlns": "http://www.w3.org/1998/Math/MathML",
                "parentName": "span"
              }}><semantics {...{
                  "parentName": "math"
                }}><mrow {...{
                    "parentName": "semantics"
                  }}><mo {...{
                      "parentName": "mrow"
                    }}>{`=`}</mo>
                    <mo {...{
                      "parentName": "mrow"
                    }}>{`−`}</mo>
                    <mn {...{
                      "parentName": "mrow"
                    }}>{`1.82`}</mn></mrow>
                  <annotation {...{
                    "encoding": "application/x-tex",
                    "parentName": "semantics"
                  }}>{`=-1.82`}</annotation></semantics></math></span>
            <span {...{
              "className": "katex-html",
              "aria-hidden": "true",
              "parentName": "span"
            }}><span {...{
                "className": "base",
                "parentName": "span"
              }}><span {...{
                  "className": "strut",
                  "style": {
                    "height": "0.36687em",
                    "verticalAlign": "0em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mrel",
                  "parentName": "span"
                }}>{`=`}</span>
                <span {...{
                  "className": "mspace",
                  "style": {
                    "marginRight": "0.2777777777777778em"
                  },
                  "parentName": "span"
                }} /></span>
              <span {...{
                "className": "base",
                "parentName": "span"
              }}><span {...{
                  "className": "strut",
                  "style": {
                    "height": "0.72777em",
                    "verticalAlign": "-0.08333em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord",
                  "parentName": "span"
                }}>{`−`}</span>
                <span {...{
                  "className": "mord",
                  "parentName": "span"
                }}>{`1`}</span>
                <span {...{
                  "className": "mord",
                  "parentName": "span"
                }}>{`.`}</span>
                <span {...{
                  "className": "mord",
                  "parentName": "span"
                }}>{`8`}</span>
                <span {...{
                  "className": "mord",
                  "parentName": "span"
                }}>{`2`}</span></span></span></span></span>
        {`), the Q-learning
update increases the estimate but the Sarsa update decreases the estimate.
Importantly, the new value-maximizing action in state `}
        <span {...{
          "className": "math math-inline",
          "parentName": "p"
        }}><span {...{
            "className": "katex",
            "parentName": "span"
          }}><span {...{
              "className": "katex-mathml",
              "parentName": "span"
            }}><math {...{
                "xmlns": "http://www.w3.org/1998/Math/MathML",
                "parentName": "span"
              }}><semantics {...{
                  "parentName": "math"
                }}><mrow {...{
                    "parentName": "semantics"
                  }}><msub {...{
                      "parentName": "mrow"
                    }}><mi {...{
                        "parentName": "msub"
                      }}>{`s`}</mi>
                      <mn {...{
                        "parentName": "msub"
                      }}>{`5`}</mn></msub></mrow>
                  <annotation {...{
                    "encoding": "application/x-tex",
                    "parentName": "semantics"
                  }}>{`s_5`}</annotation></semantics></math></span>
            <span {...{
              "className": "katex-html",
              "aria-hidden": "true",
              "parentName": "span"
            }}><span {...{
                "className": "base",
                "parentName": "span"
              }}><span {...{
                  "className": "strut",
                  "style": {
                    "height": "0.58056em",
                    "verticalAlign": "-0.15em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord",
                  "parentName": "span"
                }}><span {...{
                    "className": "mord mathnormal",
                    "parentName": "span"
                  }}>{`s`}</span>
                  <span {...{
                    "className": "msupsub",
                    "parentName": "span"
                  }}><span {...{
                      "className": "vlist-t vlist-t2",
                      "parentName": "span"
                    }}><span {...{
                        "className": "vlist-r",
                        "parentName": "span"
                      }}><span {...{
                          "className": "vlist",
                          "style": {
                            "height": "0.30110799999999993em"
                          },
                          "parentName": "span"
                        }}><span {...{
                            "style": {
                              "top": "-2.5500000000000003em",
                              "marginLeft": "0em",
                              "marginRight": "0.05em"
                            },
                            "parentName": "span"
                          }}><span {...{
                              "className": "pstrut",
                              "style": {
                                "height": "2.7em"
                              },
                              "parentName": "span"
                            }} />
                            <span {...{
                              "className": "sizing reset-size6 size3 mtight",
                              "parentName": "span"
                            }}><span {...{
                                "className": "mord mtight",
                                "parentName": "span"
                              }}>{`5`}</span></span></span></span>
                        <span {...{
                          "className": "vlist-s",
                          "parentName": "span"
                        }}>{`​`}</span></span>
                      <span {...{
                        "className": "vlist-r",
                        "parentName": "span"
                      }}><span {...{
                          "className": "vlist",
                          "style": {
                            "height": "0.15em"
                          },
                          "parentName": "span"
                        }}><span {...{
                            "parentName": "span"
                          }} /></span></span></span></span></span></span></span></span></span>
        {` is now `}
        <span {...{
          "className": "math math-inline",
          "parentName": "p"
        }}><span {...{
            "className": "katex",
            "parentName": "span"
          }}><span {...{
              "className": "katex-mathml",
              "parentName": "span"
            }}><math {...{
                "xmlns": "http://www.w3.org/1998/Math/MathML",
                "parentName": "span"
              }}><semantics {...{
                  "parentName": "math"
                }}><mrow {...{
                    "parentName": "semantics"
                  }}><mi {...{
                      "parentName": "mrow"
                    }}>{`n`}</mi>
                    <mi {...{
                      "parentName": "mrow"
                    }}>{`o`}</mi>
                    <mi {...{
                      "parentName": "mrow"
                    }}>{`r`}</mi>
                    <mi {...{
                      "parentName": "mrow"
                    }}>{`t`}</mi>
                    <mi {...{
                      "parentName": "mrow"
                    }}>{`h`}</mi></mrow>
                  <annotation {...{
                    "encoding": "application/x-tex",
                    "parentName": "semantics"
                  }}>{`north`}</annotation></semantics></math></span>
            <span {...{
              "className": "katex-html",
              "aria-hidden": "true",
              "parentName": "span"
            }}><span {...{
                "className": "base",
                "parentName": "span"
              }}><span {...{
                  "className": "strut",
                  "style": {
                    "height": "0.69444em",
                    "verticalAlign": "0em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`n`}</span>
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`o`}</span>
                <span {...{
                  "className": "mord mathnormal",
                  "style": {
                    "marginRight": "0.02778em"
                  },
                  "parentName": "span"
                }}>{`r`}</span>
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`t`}</span>
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`h`}</span></span></span></span></span>
        {`
for the Sarsa agent, whereas it stays `}
        <span {...{
          "className": "math math-inline",
          "parentName": "p"
        }}><span {...{
            "className": "katex",
            "parentName": "span"
          }}><span {...{
              "className": "katex-mathml",
              "parentName": "span"
            }}><math {...{
                "xmlns": "http://www.w3.org/1998/Math/MathML",
                "parentName": "span"
              }}><semantics {...{
                  "parentName": "math"
                }}><mrow {...{
                    "parentName": "semantics"
                  }}><mi {...{
                      "parentName": "mrow"
                    }}>{`e`}</mi>
                    <mi {...{
                      "parentName": "mrow"
                    }}>{`a`}</mi>
                    <mi {...{
                      "parentName": "mrow"
                    }}>{`s`}</mi>
                    <mi {...{
                      "parentName": "mrow"
                    }}>{`t`}</mi></mrow>
                  <annotation {...{
                    "encoding": "application/x-tex",
                    "parentName": "semantics"
                  }}>{`east`}</annotation></semantics></math></span>
            <span {...{
              "className": "katex-html",
              "aria-hidden": "true",
              "parentName": "span"
            }}><span {...{
                "className": "base",
                "parentName": "span"
              }}><span {...{
                  "className": "strut",
                  "style": {
                    "height": "0.61508em",
                    "verticalAlign": "0em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`e`}</span>
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`a`}</span>
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`s`}</span>
                <span {...{
                  "className": "mord mathnormal",
                  "parentName": "span"
                }}>{`t`}</span></span></span></span></span>
        {` for the Q-learning agent.
This is consistent with the typical trajectories of both algorithms shown further above.`}</p>
      <p><em {...{
          "parentName": "p"
        }}>{`So... which policy is optimal?`}</em></p>
      <p>{`It depends! The Sarsa algorithm learns the
optimal policy for an agent that never ceases to explore. For an exploring agent,
it is dangerous to walk too close to the mushrooms, and it thus pays off to take the
two extra steps required to use the middle lane (`}
        <span {...{
          "className": "math math-inline",
          "parentName": "p"
        }}><span {...{
            "className": "katex",
            "parentName": "span"
          }}><span {...{
              "className": "katex-mathml",
              "parentName": "span"
            }}><math {...{
                "xmlns": "http://www.w3.org/1998/Math/MathML",
                "parentName": "span"
              }}><semantics {...{
                  "parentName": "math"
                }}><mrow {...{
                    "parentName": "semantics"
                  }}><msub {...{
                      "parentName": "mrow"
                    }}><mi {...{
                        "parentName": "msub"
                      }}>{`s`}</mi>
                      <mn {...{
                        "parentName": "msub"
                      }}>{`10`}</mn></msub></mrow>
                  <annotation {...{
                    "encoding": "application/x-tex",
                    "parentName": "semantics"
                  }}>{`s_{10}`}</annotation></semantics></math></span>
            <span {...{
              "className": "katex-html",
              "aria-hidden": "true",
              "parentName": "span"
            }}><span {...{
                "className": "base",
                "parentName": "span"
              }}><span {...{
                  "className": "strut",
                  "style": {
                    "height": "0.58056em",
                    "verticalAlign": "-0.15em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord",
                  "parentName": "span"
                }}><span {...{
                    "className": "mord mathnormal",
                    "parentName": "span"
                  }}>{`s`}</span>
                  <span {...{
                    "className": "msupsub",
                    "parentName": "span"
                  }}><span {...{
                      "className": "vlist-t vlist-t2",
                      "parentName": "span"
                    }}><span {...{
                        "className": "vlist-r",
                        "parentName": "span"
                      }}><span {...{
                          "className": "vlist",
                          "style": {
                            "height": "0.30110799999999993em"
                          },
                          "parentName": "span"
                        }}><span {...{
                            "style": {
                              "top": "-2.5500000000000003em",
                              "marginLeft": "0em",
                              "marginRight": "0.05em"
                            },
                            "parentName": "span"
                          }}><span {...{
                              "className": "pstrut",
                              "style": {
                                "height": "2.7em"
                              },
                              "parentName": "span"
                            }} />
                            <span {...{
                              "className": "sizing reset-size6 size3 mtight",
                              "parentName": "span"
                            }}><span {...{
                                "className": "mord mtight",
                                "parentName": "span"
                              }}><span {...{
                                  "className": "mord mtight",
                                  "parentName": "span"
                                }}>{`1`}</span>
                                <span {...{
                                  "className": "mord mtight",
                                  "parentName": "span"
                                }}>{`0`}</span></span></span></span></span>
                        <span {...{
                          "className": "vlist-s",
                          "parentName": "span"
                        }}>{`​`}</span></span>
                      <span {...{
                        "className": "vlist-r",
                        "parentName": "span"
                      }}><span {...{
                          "className": "vlist",
                          "style": {
                            "height": "0.15em"
                          },
                          "parentName": "span"
                        }}><span {...{
                            "parentName": "span"
                          }} /></span></span></span></span></span></span></span></span></span>
        {` to `}
        <span {...{
          "className": "math math-inline",
          "parentName": "p"
        }}><span {...{
            "className": "katex",
            "parentName": "span"
          }}><span {...{
              "className": "katex-mathml",
              "parentName": "span"
            }}><math {...{
                "xmlns": "http://www.w3.org/1998/Math/MathML",
                "parentName": "span"
              }}><semantics {...{
                  "parentName": "math"
                }}><mrow {...{
                    "parentName": "semantics"
                  }}><msub {...{
                      "parentName": "mrow"
                    }}><mi {...{
                        "parentName": "msub"
                      }}>{`s`}</mi>
                      <mn {...{
                        "parentName": "msub"
                      }}>{`14`}</mn></msub></mrow>
                  <annotation {...{
                    "encoding": "application/x-tex",
                    "parentName": "semantics"
                  }}>{`s_{14}`}</annotation></semantics></math></span>
            <span {...{
              "className": "katex-html",
              "aria-hidden": "true",
              "parentName": "span"
            }}><span {...{
                "className": "base",
                "parentName": "span"
              }}><span {...{
                  "className": "strut",
                  "style": {
                    "height": "0.58056em",
                    "verticalAlign": "-0.15em"
                  },
                  "parentName": "span"
                }} />
                <span {...{
                  "className": "mord",
                  "parentName": "span"
                }}><span {...{
                    "className": "mord mathnormal",
                    "parentName": "span"
                  }}>{`s`}</span>
                  <span {...{
                    "className": "msupsub",
                    "parentName": "span"
                  }}><span {...{
                      "className": "vlist-t vlist-t2",
                      "parentName": "span"
                    }}><span {...{
                        "className": "vlist-r",
                        "parentName": "span"
                      }}><span {...{
                          "className": "vlist",
                          "style": {
                            "height": "0.30110799999999993em"
                          },
                          "parentName": "span"
                        }}><span {...{
                            "style": {
                              "top": "-2.5500000000000003em",
                              "marginLeft": "0em",
                              "marginRight": "0.05em"
                            },
                            "parentName": "span"
                          }}><span {...{
                              "className": "pstrut",
                              "style": {
                                "height": "2.7em"
                              },
                              "parentName": "span"
                            }} />
                            <span {...{
                              "className": "sizing reset-size6 size3 mtight",
                              "parentName": "span"
                            }}><span {...{
                                "className": "mord mtight",
                                "parentName": "span"
                              }}><span {...{
                                  "className": "mord mtight",
                                  "parentName": "span"
                                }}>{`1`}</span>
                                <span {...{
                                  "className": "mord mtight",
                                  "parentName": "span"
                                }}>{`4`}</span></span></span></span></span>
                        <span {...{
                          "className": "vlist-s",
                          "parentName": "span"
                        }}>{`​`}</span></span>
                      <span {...{
                        "className": "vlist-r",
                        "parentName": "span"
                      }}><span {...{
                          "className": "vlist",
                          "style": {
                            "height": "0.15em"
                          },
                          "parentName": "span"
                        }}><span {...{
                            "parentName": "span"
                          }} /></span></span></span></span></span></span></span></span></span>
        {`) further
away from the mushrooms.`}</p>
      <p>{`The Q-learning algorithm, on the other hand, learns the optimal policy for an
agent that at some point stops exploration and thus can safely walk
close to the mushrooms without ever accidentally eating them.`}</p>
      <p>{`In the particular setting of the Pancakes world studied here, nothing speaks against`}
        <br {...{
          "parentName": "p"
        }} />
        {`
`}
        {`stopping exploration at some point after the environment has been
thoroughly explored. Therefore, the policy learned by the Q-learning algorithm
seems like the better option. However, in general, one could think of environments that change over
time (for example, the positions of gold and/or mushrooms could change over
time) and thus may require the agent to continue exploring forever. In such a
case the Sarsa algorithm might be the better option.`}</p>
      <p className="intro">
        <p>{`If you liked this post, please consider following me on `}
          <Link to="https://twitter.com/JanMalteL" mdxType="Link">{`Twitter`}</Link>
          {` for updates on new blog posts.`}</p>
      </p>
      <Commento id={props.pageContext.frontmatter.slug} mdxType="Commento" />
    </Layout>

  </MDXLayout>;
}
;
// Attach a static `isMDXComponent` marker to the default-exported MDXContent
// component. Nothing in this file reads the flag.
// NOTE(review): presumably consumed by the MDX runtime/tooling to recognize
// compiled MDX components — confirm against the @mdx-js documentation.
MDXContent.isMDXComponent = true;
      