ReAgent/usage.html

<!DOCTYPE html>
<html class="writer-html5" lang="en" >
<head>
  <meta charset="utf-8" /><meta name="generator" content="Docutils 0.17.1: http://docutils.sourceforge.net/" />

  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>Usage &mdash; ReAgent 1.0 documentation</title>
      <link rel="stylesheet" href="_static/pygments.css" type="text/css" />
      <link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
  <!--[if lt IE 9]>
    <script src="_static/js/html5shiv.min.js"></script>
  <![endif]-->

        <script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
        <script src="_static/jquery.js"></script>
        <script src="_static/underscore.js"></script>
        <script src="_static/doctools.js"></script>
    <script src="_static/js/theme.js"></script>
    <link rel="index" title="Index" href="genindex.html" />
    <link rel="search" title="Search" href="search.html" />
    <link rel="next" title="ReAgent Serving Platform (RASP)" href="rasp_tutorial.html" />
    <link rel="prev" title="Installation" href="installation.html" />
</head>

<body class="wy-body-for-nav">
  <div class="wy-grid-for-nav">
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
        <div class="wy-side-nav-search" >
            <a href="index.html" class="icon icon-home"> ReAgent
          </a>
<div role="search">
  <form id="rtd-search-form" class="wy-form" action="search.html" method="get">
    <input type="text" name="q" placeholder="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>
        </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
              <p class="caption" role="heading"><span class="caption-text">Getting Started</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
<li class="toctree-l1 current"><a class="current reference internal" href="#">Usage</a><ul>
<li class="toctree-l2"><a class="reference internal" href="#quick-start">Quick Start</a></li>
<li class="toctree-l2"><a class="reference internal" href="#on-policy-rl-training">1 - On-Policy RL Training</a></li>
<li class="toctree-l2"><a class="reference internal" href="#offline-rl-training-batch-rl">2 - Offline RL Training (Batch RL)</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#step-1-create-training-data">Step 1 - Create training data</a></li>
<li class="toctree-l3"><a class="reference internal" href="#step-2-convert-the-data-to-the-timeline-format">Step 2 - Convert the data to the <code class="docutils literal notranslate"><span class="pre">Timeline</span></code> format</a></li>
<li class="toctree-l3"><a class="reference internal" href="#step-3-determine-normalization-parameters">Step 3 - Determine normalization parameters</a></li>
<li class="toctree-l3"><a class="reference internal" href="#step-4-train-model">Step 4 - Train model</a></li>
<li class="toctree-l3"><a class="reference internal" href="#step-5-evaluate-the-model">Step 5 - Evaluate the Model</a></li>
<li class="toctree-l3"><a class="reference internal" href="#step-6-visualize-results-via-tensorboard">Step 6 - Visualize Results via Tensorboard</a></li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="rasp_tutorial.html">RASP (Not Actively Maintained)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Advanced Topics</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="continuous_integration.html">Continuous Integration</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Package Reference</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="api/reagent.core.html">Core</a></li>
<li class="toctree-l1"><a class="reference internal" href="api/reagent.data.html">Data</a></li>
<li class="toctree-l1"><a class="reference internal" href="api/reagent.gym.html">Gym</a></li>
<li class="toctree-l1"><a class="reference internal" href="api/reagent.evaluation.html">Evaluation</a></li>
<li class="toctree-l1"><a class="reference internal" href="api/reagent.lite.html">Lite</a></li>
<li class="toctree-l1"><a class="reference internal" href="api/reagent.mab.html">MAB</a></li>
<li class="toctree-l1"><a class="reference internal" href="api/reagent.model_managers.html">Model Managers</a></li>
<li class="toctree-l1"><a class="reference internal" href="api/reagent.model_utils.html">Model Utils</a></li>
<li class="toctree-l1"><a class="reference internal" href="api/reagent.net_builder.html">Net Builders</a></li>
<li class="toctree-l1"><a class="reference internal" href="api/reagent.optimizer.html">Optimizers</a></li>
<li class="toctree-l1"><a class="reference internal" href="api/reagent.models.html">Models</a></li>
<li class="toctree-l1"><a class="reference internal" href="api/reagent.prediction.html">Prediction</a></li>
<li class="toctree-l1"><a class="reference internal" href="api/reagent.preprocessing.html">Preprocessing</a></li>
<li class="toctree-l1"><a class="reference internal" href="api/reagent.training.html">Training</a></li>
<li class="toctree-l1"><a class="reference internal" href="api/reagent.workflow.html">Workflow</a></li>
<li class="toctree-l1"><a class="reference internal" href="api/modules.html">All Modules</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Others</span></p>
<ul>
<li class="toctree-l1"><a class="reference external" href="https://github.com/facebookresearch/ReAgent">Github</a></li>
<li class="toctree-l1"><a class="reference internal" href="license.html">License</a></li>
</ul>

        </div>
      </div>
    </nav>

    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
          <a href="index.html">ReAgent</a>
      </nav>

      <div class="wy-nav-content">
        <div class="rst-content">
          <div role="navigation" aria-label="Page navigation">
  <ul class="wy-breadcrumbs">
      <li><a href="index.html" class="icon icon-home"></a> &raquo;</li>
      <li>Usage</li>
      <li class="wy-breadcrumbs-aside">
            <a href="_sources/usage.rst.txt" rel="nofollow"> View page source</a>
      </li>
  </ul>
  <hr/>
</div>
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">

  <section id="usage">
<span id="id1"></span><h1>Usage<a class="headerlink" href="#usage" title="Permalink to this headline"></a></h1>
<p>ReAgent is designed for large-scale, distributed recommendation/optimization tasks where we don’t
have access to a simulator.  In this environment, it’s typically better to train offline on batches
of data, and release new policies slowly over time.  Because the policy updates slowly and in
batches, we use <em>off-policy</em> algorithms.  To test a new policy without deploying it, we rely on
<em>counter-factual policy evaluation (CPE)</em>, a set of techniques for estimating the performance of a policy based on the
actions of another policy.</p>
<p>This tutorial is tested in our CircleCI <a class="reference external" href="https://github.com/facebookresearch/ReAgent/blob/62661e35b62b06ed161e661b906616a2d389eb3a/.circleci/config.yml#L79-L128">end-to-end tests</a>.
If there is anything not kept up-to-date in this tutorial, please always refer to the latest code.</p>
<section id="quick-start">
<h2>Quick Start<a class="headerlink" href="#quick-start" title="Permalink to this headline"></a></h2>
<p>We have set up <a class="reference external" href="https://click.palletsprojects.com/en/7.x/">Click</a> commands to run our RL workflow. The basic usage pattern is</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="o">./</span><span class="n">reagent</span><span class="o">/</span><span class="n">workflow</span><span class="o">/</span><span class="n">cli</span><span class="o">.</span><span class="n">py</span> <span class="n">run</span> <span class="o">&lt;</span><span class="n">module</span><span class="o">.</span><span class="n">function</span><span class="o">&gt;</span> <span class="o">&lt;</span><span class="n">path</span><span class="o">/</span><span class="n">to</span><span class="o">/</span><span class="n">config</span><span class="o">&gt;</span>
</pre></div>
</div>
<p>To train a model online with OpenAI Gym, simply run the Click command:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span> # set the config
 export CONFIG=reagent/gym/tests/configs/cartpole/discrete_dqn_cartpole_online.yaml
 # train and evaluate model on gym environment
./reagent/workflow/cli.py run reagent.gym.tests.test_gym.run_test $CONFIG
</pre></div>
</div>
<p>To train a batch RL model, run the following commands:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span># set the config
export CONFIG=reagent/workflow/sample_configs/discrete_dqn_cartpole_offline.yaml
# gather some random transitions (can replace with your own)
./reagent/workflow/cli.py run reagent.workflow.gym_batch_rl.offline_gym_random $CONFIG
# convert data to timeline format
./reagent/workflow/cli.py run reagent.workflow.gym_batch_rl.timeline_operator $CONFIG
# train model based on timeline data
./reagent/workflow/cli.py run reagent.workflow.training.identify_and_train_network $CONFIG
# evaluate the model
./reagent/workflow/cli.py run reagent.workflow.gym_batch_rl.evaluate_gym &quot;$CONFIG&quot;
</pre></div>
</div>
<p>Now we will describe how the above commands work, starting with a traditional RL setup with a simulator where we can trivially evaluate new policies:</p>
</section>
<section id="on-policy-rl-training">
<h2>1 - On-Policy RL Training<a class="headerlink" href="#on-policy-rl-training" title="Permalink to this headline"></a></h2>
<p>OpenAI Gym is a set of environments: simulators that can run policies for a given task and generate rewards.  If a simulator is accessible, on-policy training (where the latest version of the policy makes new decisions in real-time) can give better results. We have a suite of benchmarks on OpenAI Gym, which is listed in <code class="docutils literal notranslate"><span class="pre">reagent/gym/tests/test_gym.py</span></code>’s <code class="docutils literal notranslate"><span class="pre">GYM_TESTS</span></code>. To train a model on OpenAI Gym, simply run the Click command:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="o">./</span><span class="n">reagent</span><span class="o">/</span><span class="n">workflow</span><span class="o">/</span><span class="n">cli</span><span class="o">.</span><span class="n">py</span> <span class="n">run</span> <span class="n">reagent</span><span class="o">.</span><span class="n">gym</span><span class="o">.</span><span class="n">tests</span><span class="o">.</span><span class="n">test_gym</span><span class="o">.</span><span class="n">run_test</span> <span class="n">reagent</span><span class="o">/</span><span class="n">gym</span><span class="o">/</span><span class="n">tests</span><span class="o">/</span><span class="n">configs</span><span class="o">/</span><span class="n">cartpole</span><span class="o">/</span><span class="n">discrete_dqn_cartpole_online</span><span class="o">.</span><span class="n">yaml</span>
</pre></div>
</div>
<p>Configs for different environments and algorithms can be found in <code class="docutils literal notranslate"><span class="pre">reagent/gym/tests/configs/&lt;env_name&gt;/&lt;algorithm&gt;_&lt;env_name&gt;_online.yaml</span></code>.</p>
<p>While this is typically the setup for people conducting RL research, it isn’t always practical to deploy on-policy RL for several reasons:</p>
<ol class="arabic simple">
<li><p>We don’t have a simulator and the problem may be so complex that building an accurate simulator is non-trivial.</p></li>
<li><p>Thousands or even tens-of-thousands of machines must execute the policy in parallel, and keeping the latest policy in sync on all of these nodes is difficult.</p></li>
<li><p>We want to evaluate the behavior of the policy offline and then keep the policy constant afterwards to reduce the risk that the policy will degrade at odd hours.</p></li>
<li><p>We are building on top of traditional recommender systems that typically rely on a fixed, stochastic policy.</p></li>
</ol>
<p>For these reasons, ReAgent is designed to support batch, off-policy RL.  Let’s now walk through how to train a model with ReAgent:</p>
</section>
<section id="offline-rl-training-batch-rl">
<h2>2 - Offline RL Training (Batch RL)<a class="headerlink" href="#offline-rl-training-batch-rl" title="Permalink to this headline"></a></h2>
<p>The main use case of ReAgent is to train RL models in the <strong>batch</strong> setting. In batch reinforcement learning the data collection and policy learning steps are decoupled. Specifically, we try to learn the best possible policy given the input data. In batch RL, being able to handle thousands of varying feature types and distributions, and to estimate algorithm performance before deployment, is of key importance.</p>
<p>In this example, we will train a DQN model on Offline <code class="docutils literal notranslate"><span class="pre">CartPole-v0</span></code> data, where Click command config should be set to</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">export</span> <span class="n">CONFIG</span><span class="o">=</span><span class="n">reagent</span><span class="o">/</span><span class="n">workflow</span><span class="o">/</span><span class="n">sample_configs</span><span class="o">/</span><span class="n">discrete_dqn_cartpole_offline</span><span class="o">.</span><span class="n">yaml</span>
</pre></div>
</div>
<p>We now proceed to give pseudo-code to sketch out the main ideas of our batch RL workflow.</p>
<section id="step-1-create-training-data">
<h3>Step 1 - Create training data<a class="headerlink" href="#step-1-create-training-data" title="Permalink to this headline"></a></h3>
<p>We first generate data from a random policy (chooses random actions) run on the <code class="docutils literal notranslate"><span class="pre">CartPole-v0</span></code> environment.
In particular, the following Click command runs 150 episodes of <code class="docutils literal notranslate"><span class="pre">CartPole-v0</span></code> (max steps of 200) and stores the pickled dataframe in <code class="docutils literal notranslate"><span class="pre">/tmp/tmp_pickle.pkl</span></code>, which you may inspect via <code class="docutils literal notranslate"><span class="pre">pd_df</span> <span class="pre">=</span> <span class="pre">pd.read_pickle(pkl_path)</span></code>.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>./reagent/workflow/cli.py run reagent.workflow.gym_batch_rl.offline_gym_random $CONFIG
</pre></div>
</div>
<p>The command essentially performs the following pseudo-code:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ReplayBuffer</span><span class="p">()</span>
<span class="k">for</span> <span class="n">epoch</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">num_episodes_for_data_batch</span><span class="p">):</span>
  <span class="n">run_episode</span> <span class="o">&amp;</span> <span class="n">store</span> <span class="n">transitions</span>

<span class="n">df</span> <span class="o">=</span> <span class="n">dataset</span><span class="o">.</span><span class="n">to_pandas_df</span><span class="p">()</span>
<span class="n">df</span><span class="o">.</span><span class="n">to_pickle</span><span class="p">(</span><span class="n">pkl_path</span><span class="p">)</span>
</pre></div>
</div>
<p>In practice, end users would generate a dataset in a similar format from their production system. For this example, the data is stored as a pickled Pandas dataframe.</p>
<p>This is human-readable, but not the most efficient way to store tabular data.  Other ways to store input data are parquet, CSV, or any other format that can be read by Apache Spark.  All of these formats are fine, as long as the following schema is maintained:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 33%" />
<col style="width: 33%" />
<col style="width: 33%" />
</colgroup>
<thead>
<tr class="row-odd"><th class="head"><p>Column</p></th>
<th class="head"><p>Type</p></th>
<th class="head"><p>Description</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p>mdp_id</p></td>
<td><p>string</p></td>
<td><p>A unique ID for the episode (e.g. an entire playthrough of a game)</p></td>
</tr>
<tr class="row-odd"><td><p>sequence_number</p></td>
<td><p>integer</p></td>
<td><p>Defines the ordering of states in an MDP (e.g. the timestamp of an event)</p></td>
</tr>
<tr class="row-even"><td><p>state_features</p></td>
<td><p><code class="docutils literal notranslate"><span class="pre">map&lt;integer,float&gt;</span></code></p></td>
<td><p>A set of features describing the state.</p></td>
</tr>
<tr class="row-odd"><td><p>action</p></td>
<td><p>string</p></td>
<td><p>The name of the action chosen</p></td>
</tr>
<tr class="row-even"><td><p>reward</p></td>
<td><p>float</p></td>
<td><p>The reward at this state/action</p></td>
</tr>
<tr class="row-odd"><td><p>possible_actions</p></td>
<td><p><code class="docutils literal notranslate"><span class="pre">list&lt;string&gt;</span></code></p></td>
<td><p>A list of all possible actions at this state.  Note that the action taken must be present in this list.</p></td>
</tr>
<tr class="row-even"><td><p>action_probability</p></td>
<td><p>float</p></td>
<td><p>The probability of taking this action if the policy is stochastic, else <code class="docutils literal notranslate"><span class="pre">null</span></code>.  Note that we strongly encourage using a stochastic policy instead of choosing the best action at every timestep.  This exploration will improve the evaluation and ultimately result in better learned policies.</p></td>
</tr>
<tr class="row-odd"><td><p>ds</p></td>
<td><p>string</p></td>
<td><p>A unique ID for this dataset.</p></td>
</tr>
</tbody>
</table>
<p>Once you have data in this format, you can move on to Step 2.</p>
</section>
<section id="step-2-convert-the-data-to-the-timeline-format">
<h3>Step 2 - Convert the data to the <code class="docutils literal notranslate"><span class="pre">Timeline</span></code> format<a class="headerlink" href="#step-2-convert-the-data-to-the-timeline-format" title="Permalink to this headline"></a></h3>
<p>Models are trained on consecutive pairs of state/action tuples. To assist in creating this table, we have an <code class="docutils literal notranslate"><span class="pre">RLTimelineOperator</span></code> spark operator. Let’s build and run the timeline operator on the data:</p>
<p>First, we need to build the Spark library that will execute the timeline.  Apache Spark is a platform for doing massively-parallel processing.  Although we are running this on a single file, Spark is designed to work on thousands of files distributed across many machines.  Explaining HDFS, Hive, and Spark is beyond the scope of this tutorial, but for large datasets it’s important to understand these concepts and that it’s possible to run ReAgent in a distributed environment by simply changing the location of the input from a file to an HDFS folder.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># Build timeline package (only need to do this first time)</span>
<span class="n">mvn</span> <span class="o">-</span><span class="n">f</span> <span class="n">preprocessing</span><span class="o">/</span><span class="n">pom</span><span class="o">.</span><span class="n">xml</span> <span class="n">clean</span> <span class="n">package</span>
</pre></div>
</div>
<p>When running spark locally, spark creates a fake “cluster” where it stores all of the data.  We want to remove this before running so we don’t accidentally pull in data from a prior run.  In a production setting, we would delete the output data table before running using a Hive command.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># Clear last run&#39;s spark data (in case of interruption)</span>
<span class="n">rm</span> <span class="o">-</span><span class="n">Rf</span> <span class="n">spark</span><span class="o">-</span><span class="n">warehouse</span> <span class="n">derby</span><span class="o">.</span><span class="n">log</span> <span class="n">metastore_db</span> <span class="n">preprocessing</span><span class="o">/</span><span class="n">spark</span><span class="o">-</span><span class="n">warehouse</span> <span class="n">preprocessing</span><span class="o">/</span><span class="n">metastore_db</span> <span class="n">preprocessing</span><span class="o">/</span><span class="n">derby</span><span class="o">.</span><span class="n">log</span>
</pre></div>
</div>
<p>Now that we are ready, let’s run our spark job on our local machine. This will produce a massive amount of logging (because we are running many systems that typically are distributed across many nodes) and there will be some exception stack traces printed because we are running in a pseudo-distributed mode.  Generally this is fine as long as the output data is generated. To do so, run the following Click command:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>./reagent/workflow/cli.py run reagent.workflow.gym_batch_rl.timeline_operator $CONFIG
</pre></div>
</div>
<p>The command essentially performs the following pseudo-code:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># load pandas dataframe</span>
<span class="n">pd_df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_pickle</span><span class="p">(</span><span class="n">pkl_path</span><span class="p">)</span>

<span class="c1"># convert to Spark dataframe</span>
<span class="n">spark</span> <span class="o">=</span> <span class="n">get_spark_session</span><span class="p">()</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">pd_df</span><span class="p">)</span>

<span class="c1"># run timeline operator</span>
<span class="n">json_params</span> <span class="o">=</span> <span class="n">make_input_to_timeline_operator</span><span class="p">()</span>
<span class="n">spark</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">com</span><span class="o">.</span><span class="n">facebook</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">rl</span><span class="o">.</span><span class="n">Timeline</span><span class="o">.</span><span class="n">main</span><span class="p">(</span><span class="n">json_params</span><span class="p">)</span>
</pre></div>
</div>
<p>Now that our data is a Spark table in Hive storage, we’re ready to run the training workflow (Steps 3-4). These steps are altogether accomplished with the following Click command:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>./reagent/workflow/cli.py run reagent.workflow.training.identify_and_train_network $CONFIG
</pre></div>
</div>
<p>We now proceed to describe this command and present some pseudo-code.</p>
</section>
<section id="step-3-determine-normalization-parameters">
<h3>Step 3 - Determine normalization parameters<a class="headerlink" href="#step-3-determine-normalization-parameters" title="Permalink to this headline"></a></h3>
<p>Data from production systems is often sparse, noisy and arbitrarily distributed. Literature has shown that neural networks learn faster and better when operating on batches of features that are normally distributed. ReAgent includes a workflow that automatically analyzes the training dataset and determines the best transformation function and corresponding normalization parameters for each feature. We do this via <code class="docutils literal notranslate"><span class="pre">ModelManager.run_feature_identification</span></code>, where <code class="docutils literal notranslate"><span class="pre">input_table_spec</span></code> points to a Spark table with the timeline data.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">model</span><span class="p">:</span> <span class="n">ModelManager__Union</span>
<span class="n">manager</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">value</span>
<span class="n">manager</span><span class="o">.</span><span class="n">run_feature_identification</span><span class="p">(</span><span class="n">input_table_spec</span><span class="p">)</span>
</pre></div>
</div>
<p>The normalization is a Python dictionary where each key is a feature id and each value is NormalizationData.
An example of this, in JSON format, is</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
    <span class="s2">&quot;0&quot;</span><span class="p">:</span> <span class="s2">&quot;{</span><span class="se">\&quot;</span><span class="s2">feature_type</span><span class="se">\&quot;</span><span class="s2">:</span><span class="se">\&quot;</span><span class="s2">CONTINUOUS</span><span class="se">\&quot;</span><span class="s2">,</span><span class="se">\&quot;</span><span class="s2">mean</span><span class="se">\&quot;</span><span class="s2">:0.5675003528594971,</span><span class="se">\&quot;</span><span class="s2">stddev</span><span class="se">\&quot;</span><span class="s2">:1.0,</span><span class="se">\&quot;</span><span class="s2">min_value</span><span class="se">\&quot;</span><span class="s2">:-0.1467551738023758,</span><span class="se">\&quot;</span><span class="s2">max_value</span><span class="se">\&quot;</span><span class="s2">:2.1779561042785645}&quot;</span><span class="p">,</span>
    <span class="s2">&quot;1&quot;</span><span class="p">:</span> <span class="s2">&quot;{</span><span class="se">\&quot;</span><span class="s2">feature_type</span><span class="se">\&quot;</span><span class="s2">:</span><span class="se">\&quot;</span><span class="s2">CONTINUOUS</span><span class="se">\&quot;</span><span class="s2">,</span><span class="se">\&quot;</span><span class="s2">mean</span><span class="se">\&quot;</span><span class="s2">:0.42259514331817627,</span><span class="se">\&quot;</span><span class="s2">stddev</span><span class="se">\&quot;</span><span class="s2">:1.0,</span><span class="se">\&quot;</span><span class="s2">min_value</span><span class="se">\&quot;</span><span class="s2">:-1.3586808443069458,</span><span class="se">\&quot;</span><span class="s2">max_value</span><span class="se">\&quot;</span><span class="s2">:1.8529225587844849}&quot;</span><span class="p">,</span>
    <span class="s2">&quot;2&quot;</span><span class="p">:</span> <span class="s2">&quot;{</span><span class="se">\&quot;</span><span class="s2">feature_type</span><span class="se">\&quot;</span><span class="s2">:</span><span class="se">\&quot;</span><span class="s2">CONTINUOUS</span><span class="se">\&quot;</span><span class="s2">,</span><span class="se">\&quot;</span><span class="s2">mean</span><span class="se">\&quot;</span><span class="s2">:0.028220390900969505,</span><span class="se">\&quot;</span><span class="s2">stddev</span><span class="se">\&quot;</span><span class="s2">:1.0,</span><span class="se">\&quot;</span><span class="s2">min_value</span><span class="se">\&quot;</span><span class="s2">:-0.14581388235092163,</span><span class="se">\&quot;</span><span class="s2">max_value</span><span class="se">\&quot;</span><span class="s2">:0.19483095407485962}&quot;</span><span class="p">,</span>
    <span class="s2">&quot;3&quot;</span><span class="p">:</span> <span class="s2">&quot;{</span><span class="se">\&quot;</span><span class="s2">feature_type</span><span class="se">\&quot;</span><span class="s2">:</span><span class="se">\&quot;</span><span class="s2">CONTINUOUS</span><span class="se">\&quot;</span><span class="s2">,</span><span class="se">\&quot;</span><span class="s2">mean</span><span class="se">\&quot;</span><span class="s2">:0.02947876788675785,</span><span class="se">\&quot;</span><span class="s2">stddev</span><span class="se">\&quot;</span><span class="s2">:1.0,</span><span class="se">\&quot;</span><span class="s2">min_value</span><span class="se">\&quot;</span><span class="s2">:-2.194336175918579,</span><span class="se">\&quot;</span><span class="s2">max_value</span><span class="se">\&quot;</span><span class="s2">:2.164193868637085}&quot;</span>
<span class="p">}</span>
</pre></div>
</div>
<p>NB: <code class="docutils literal notranslate"><span class="pre">reagent/workflow/training.py</span></code> is what the pseudo-code in Steps 3 and 4 is trying to depict. Models should subclass <code class="docutils literal notranslate"><span class="pre">ModelManager</span></code> and implement all abstract methods (including <code class="docutils literal notranslate"><span class="pre">run_feature_identification</span></code> and <code class="docutils literal notranslate"><span class="pre">query_data</span></code>) to be added to our registry of models.</p>
</section>
<section id="step-4-train-model">
<h3>Step 4 - Train model<a class="headerlink" href="#step-4-train-model" title="Permalink to this headline"></a></h3>
<p>To train the model, we first save our Spark table to Parquet format, and use <a class="reference external" href="https://github.com/uber/petastorm">Petastorm</a>’s PyTorch DataLoader, which can efficiently read Parquet formatted data. We do this via <code class="docutils literal notranslate"><span class="pre">ModelManager.query_data</span></code>, which each <code class="docutils literal notranslate"><span class="pre">ModelManager</span></code> in our registry of models must implement. In this step, we also process the rewards, i.e. computing multi-step rewards or computing the reward from <code class="docutils literal notranslate"><span class="pre">metrics</span></code> columns directly.</p>
<p>Now we are ready to train a model by running:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># make preprocessor from the normalization parameters of Step 3</span>
<span class="n">batch_preprocessor</span> <span class="o">=</span> <span class="n">manager</span><span class="o">.</span><span class="n">build_batch_preprocessor</span><span class="p">(</span><span class="n">use_gpu</span><span class="p">)</span>

<span class="c1"># read preprocessed data</span>
<span class="n">data_reader</span> <span class="o">=</span> <span class="n">petastorm</span><span class="o">.</span><span class="n">make_batch_reader</span><span class="p">(</span><span class="n">train_dataset</span><span class="o">.</span><span class="n">parquet_url</span><span class="p">)</span>
<span class="k">with</span> <span class="n">DataLoader</span><span class="p">(</span><span class="n">data_reader</span><span class="p">,</span> <span class="n">batch_preprocessor</span><span class="p">)</span> <span class="k">as</span> <span class="n">dataloader</span><span class="p">:</span>
  <span class="k">for</span> <span class="n">batch</span> <span class="ow">in</span> <span class="n">dataloader</span><span class="p">:</span>
    <span class="n">trainer</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">batch</span><span class="p">)</span>

<span class="c1"># Store model outputs</span>
<span class="n">torchscript_output_path</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">&quot;model_</span><span class="si">{</span><span class="nb">round</span><span class="p">(</span><span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">())</span><span class="si">}</span><span class="s2">.torchscript&quot;</span>
<span class="n">serving_module</span> <span class="o">=</span> <span class="n">manager</span><span class="o">.</span><span class="n">build_serving_module</span><span class="p">()</span>
<span class="n">torch</span><span class="o">.</span><span class="n">jit</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">serving_module</span><span class="p">,</span> <span class="n">torchscript_output_path</span><span class="p">)</span>

<span class="c1"># store for later use</span>
<span class="n">training_output</span><span class="o">.</span><span class="n">output_path</span> <span class="o">=</span> <span class="n">torchscript_output_path</span>
</pre></div>
</div>
<p>Note that the model is trained purely on the randomly generated data we collected in Step 1.
We are taking a batch of data that we generated previously and training by looping over that data and iteratively learning a better policy than the policy that generated the data.
Effectively, this is learning to perform a task by observing completely random transitions from an environment! While doing so, we are not even building a dynamics model of the environment.</p>
<p>NB: We can do the same for the <code class="docutils literal notranslate"><span class="pre">eval_dataset</span></code> if we want to perform CPE during training as a diagnosis tool.</p>
</section>
<section id="step-5-evaluate-the-model">
<h3>Step 5 - Evaluate the Model<a class="headerlink" href="#step-5-evaluate-the-model" title="Permalink to this headline"></a></h3>
<p>Now that we have trained a new policy on the offline <code class="docutils literal notranslate"><span class="pre">CartPole-v0</span></code> data, we can try it out to see how it does:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>./reagent/workflow/cli.py run reagent.workflow.gym_batch_rl.evaluate_gym $CONFIG
</pre></div>
</div>
<p>which performs the following pseudo-code:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># load our previous serving module</span>
<span class="n">jit_model</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">jit</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">saved_serving_module</span><span class="p">)</span>

<span class="c1"># wrap around module to fit our gymrunner interface</span>
<span class="n">policy</span> <span class="o">=</span> <span class="n">create_predictor_policy_from_model</span><span class="p">(</span><span class="n">env</span><span class="p">,</span> <span class="n">jit_model</span><span class="p">)</span>
<span class="n">agent</span> <span class="o">=</span> <span class="n">Agent</span><span class="o">.</span><span class="n">create_for_env_with_serving_policy</span><span class="p">(</span><span class="n">env</span><span class="p">,</span> <span class="n">policy</span><span class="o">=</span><span class="n">policy</span><span class="p">)</span>

<span class="c1"># run Agent on environment, and record rewards</span>
<span class="n">rewards</span> <span class="o">=</span> <span class="n">evaluate_for_n_episodes</span><span class="p">(</span>
    <span class="n">n</span><span class="o">=</span><span class="n">num_eval_episodes</span><span class="p">,</span> <span class="n">env</span><span class="o">=</span><span class="n">env</span><span class="p">,</span> <span class="n">agent</span><span class="o">=</span><span class="n">agent</span><span class="p">,</span> <span class="n">max_steps</span><span class="o">=</span><span class="n">max_steps</span>
<span class="p">)</span>
</pre></div>
</div>
<p>Even on completely random data, DQN can learn a policy that obtains scores close to the maximum possible score of 200!</p>
</section>
<section id="step-6-visualize-results-via-tensorboard">
<h3>Step 6 - Visualize Results via Tensorboard<a class="headerlink" href="#step-6-visualize-results-via-tensorboard" title="Permalink to this headline"></a></h3>
<p>We can now view loss plots and CPE estimates in Tensorboard after running:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">tensorboard</span> <span class="o">--</span><span class="n">logdir</span> <span class="n">outputs</span><span class="o">/</span>
</pre></div>
</div>
<p>at <a class="reference external" href="http://localhost:6006">localhost:6006</a>. When done viewing the results, deactivate the virtualenv by typing <code class="docutils literal notranslate"><span class="pre">deactivate</span></code>.</p>
</section>
</section>
</section>


           </div>
          </div>
          <footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
        <a href="installation.html" class="btn btn-neutral float-left" title="Installation" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
        <a href="rasp_tutorial.html" class="btn btn-neutral float-right" title="ReAgent Serving Platform (RASP)" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
    </div>

  <hr/>

  <div role="contentinfo">
    <p>&#169; Copyright 2022, Meta Platforms, Inc.</p>
  </div>

  Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
    <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
    provided by <a href="https://readthedocs.org">Read the Docs</a>.


</footer>
        </div>
      </div>
    </section>
  </div>
  <script>
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
  </script>

</body>
</html>