Solved bonus notebook on finches - included a test with a multivariate normal model

ericmjl · ericmjl · commit 62442a69a97e · 2018-07-10T11:49:42.000-05:00
diff --git a/notebooks/bonus-exploration-finches.ipynb b/notebooks/bonus-exploration-finches.ipynb
@@ -254,7 +254,7 @@
     "\n",
     "    mu = pm.HalfNormal('mu', sd=20, shape=(2,))\n",
     "    \n",
-    "    like = pm.MvNormal('like', mu=mu, cov=sigma, observed=df[['beak_depth', 'beak_length']].values)"
+    "    like = pm.MvNormal('like', mu=mu, cov=sigma, observed=df.iloc[scandens_idx][['beak_depth', 'beak_length']].values)"
    ]
   },
   {
@@ -300,15 +300,172 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "samples_mv = pm.sample_ppc(trace, model=mv_beaks)"
+    "samples_mv = pm.sample_ppc(trace_mv, model=mv_beaks)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "samples_mv['like'][:, 0]  # beak_depth\n",
+    "samples_mv['like'][:, 1]  # beak_length"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plt.figure()\n",
+    "ax1 = fig.add_subplot(121)\n",
+    "ax2 = fig.add_subplot(122)\n",
+    "\n",
+    "x, y = ECDF(samples_mv['like'][:, 0])\n",
+    "ax1.plot(x, y, label='ppc')\n",
+    "x, y = ECDF(df.iloc[scandens_idx]['beak_depth'])\n",
+    "ax1.plot(x, y, label='data')\n",
+    "ax1.set_title('beak depth')\n",
+    "ax1.legend()\n",
+    "\n",
+    "x, y = ECDF(samples_mv['like'][:, 1])\n",
+    "ax2.plot(x, y, label='ppc')\n",
+    "x, y = ECDF(df.iloc[scandens_idx]['beak_length'])\n",
+    "ax2.plot(x, y, label='data')\n",
+    "ax2.set_title('beak length')\n",
+    "ax2.legend()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig, ax = plt.subplots()  # one figure with a single axes\n",
+    "\n",
+    "x, y = ECDF(trace_mv['sigma'][:, 0, 1])  # entry [0, 1] of every sampled 'sigma'\n",
+    "ax.plot(x, y, label='samples')\n",
+    "x, y = ECDF(df.iloc[scandens_idx]['shape'])  # observed 'shape' column for the scandens_idx rows\n",
+    "ax.plot(x, y, label='data')  # label the data curve so the legend can tell the two apart\n",
+    "ax.legend()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Maybe the right way to estimate shape is to regress beak depth on beak length and take the slope. After all, a depth/length ratio is just the slope of a line through the origin.\n",
+    "\n",
+    "We will assume a no-intercept model, $y = mx$, where $y$ is beak depth, $x$ is beak length, and the slope $m$ plays the role of shape."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with pm.Model() as shape_model:\n",
+    "    shape = pm.Normal('shape', mu=0, sd=100)\n",
+    "    sd = pm.HalfCauchy('sd', beta=100)\n",
+    "    \n",
+    "    mu = shape * df.iloc[scandens_idx]['beak_length'].values\n",
+    "    \n",
+    "    like = pm.Normal('like', mu=mu, sd=sd, observed=df.iloc[scandens_idx]['beak_depth'].values)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with shape_model:\n",
+    "    trace_shape = pm.sample(2000)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plt.figure()\n",
+    "ax = fig.add_subplot(111)\n",
+    "\n",
+    "x, y = ECDF(trace_shape['shape'])\n",
+    "ax.plot(x, y, label='sample')\n",
+    "x, y = ECDF(df.iloc[scandens_idx]['shape'].values)\n",
+    "ax.plot(x, y, label='data')\n",
+    "ax.legend()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This model is mis-specified for what I want: it gives me the posterior distribution over a single slope, but not the distribution of the observed shapes. I guess a slope and a per-observation shape are related but different quantities.\n",
+    "\n",
+    "Let's try just estimating shape directly."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with pm.Model() as shape_model:\n",
+    "    mu = pm.HalfNormal('mu', sd=100)\n",
+    "    sd = pm.HalfCauchy('sd', beta=100)\n",
+    "    \n",
+    "    like = pm.Normal('shape', mu=mu, sd=sd, observed=df.iloc[scandens_idx]['shape'].values)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with shape_model:\n",
+    "    trace = pm.sample(2000)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "samples = pm.sample_ppc(trace, model=shape_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plt.figure()\n",
+    "ax = fig.add_subplot(111)\n",
+    "\n",
+    "x, y = ECDF(samples['shape'])\n",
+    "ax.plot(x, y, label='samples')\n",
+    "x, y = ECDF(df.iloc[scandens_idx]['shape'])\n",
+    "ax.plot(x, y, label='data')\n",
+    "ax.legend()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As it turns out, the simplest model is the best-fitting one..."
+   ]
   }
  ],
  "metadata": {