{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Does Compulsory School Attendance Affect Schooling and Earnings?\n",
    "\n",
    "We replicate tables IV, V, and VI of Angrist and Krueger (1991).\n",
    "We start by loading the data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import requests\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "import subprocess\n",
    "import tempfile\n",
    "\n",
    "url = \"https://economics.mit.edu/sites/default/files/inline-files/NEW7080_1.rar\"\n",
    "\n",
    "\n",
    "dir = tempfile.TemporaryDirectory()\n",
    "with open(f\"{dir.name}/file.rar\", 'wb') as file:\n",
    "    file.write(requests.get(url).content)\n",
    "\n",
    "subprocess.run([\"tar\", \"xf\", f\"{dir.name}/file.rar\", \"-C\", dir.name])\n",
    "df = pd.read_stata(f\"{dir.name}/NEW7080.dta\")\n",
    "\n",
    "# renaming from\n",
    "# https://economics.mit.edu/sites/default/files/inline-files/Descriptive%20Statistics%20QOB.txt\n",
    "df = df.rename(columns={\n",
    "    \"v1\": \"age\",\n",
    "    \"v2\": \"ageq\",\n",
    "    \"v4\": \"educ\",\n",
    "    \"v5\": \"enocent\",\n",
    "    \"v6\": \"esocent\",\n",
    "    \"v9\": \"lwklywge\",\n",
    "    \"v10\": \"married\",\n",
    "    \"v11\": \"midatl\",\n",
    "    \"v12\": \"mt\",\n",
    "    \"v13\": \"neweng\",\n",
    "    \"v16\": \"census\",\n",
    "    \"v18\": \"qob\",\n",
    "    \"v19\": \"race\",\n",
    "    \"v20\": \"smsa\",\n",
    "    \"v21\": \"soatl\",\n",
    "    \"v24\": \"wnocent\",\n",
    "    \"v25\": \"wsocent\",\n",
    "    \"v27\": \"yob\",\n",
    "})\n",
    "\n",
    "# replace AGEQ=AGEQ-1900 if CENSUS==80\n",
    "df.loc[lambda x: x[\"census\"].eq(80), \"ageq\"] -= 1900\n",
    "# gen AGEQSQ= AGEQ*AGEQ\n",
    "df[\"ageqsq\"] = df[\"ageq\"] ** 2\n",
    "\n",
    "df[\"yob_dummies\"] = df[\"yob\"] % 10\n",
    "yob_encoder = OneHotEncoder(\n",
    "    categories=[list(range(9))],\n",
    "    sparse_output=False,\n",
    "    handle_unknown=\"ignore\"\n",
    ")\n",
    "yob_encoder.set_output(transform=\"pandas\")\n",
    "yob_dummies = yob_encoder.fit_transform(df[[\"yob_dummies\"]])\n",
    "\n",
    "df[\"yqob\"] = df[\"yob_dummies\"].astype(\"str\") + df[\"qob\"].astype(\"str\")\n",
    "yqob_encoder = OneHotEncoder(\n",
    "    categories=[[f\"{y}{q}\" for y in range(10) for q in [2, 3, 4]]],\n",
    "    sparse_output=False,\n",
    "    handle_unknown=\"ignore\"\n",
    ").set_output(transform=\"pandas\")\n",
    "yqob_dummies = yqob_encoder.fit_transform(df[[\"yqob\"]])\n",
    "\n",
    "df = pd.concat([df, yob_dummies, yqob_dummies], axis=1)\n",
    "\n",
    "cohorts = {\n",
    "    \"IV\": df[lambda x: x[\"yob\"].isin(range(1920, 1930))],\n",
    "    \"V\": df[lambda x: x[\"yob\"].isin(range(30, 40))],\n",
    "    \"VI\": df[lambda x: x[\"yob\"].isin(range(40, 50))],\n",
    "}\n",
    "\n",
    "age = [\"age\", \"ageqsq\"]\n",
    "other = [\"race\", \"married\", \"smsa\"]\n",
    "region = [\"neweng\", \"midatl\", \"enocent\", \"wnocent\", \"soatl\", \"esocent\", \"wsocent\", \"mt\"]\n",
    "\n",
    "yob_names = yob_encoder.get_feature_names_out().tolist()\n",
    "yqob_names = yqob_encoder.get_feature_names_out().tolist()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We now replicate results from tables IV, V, and VI.\n",
    "We don't perfectly replicate columns (4), (8), as the authors include age and age squared only in the first, but not the second stage.\n",
    "We include them in both stages.\n",
    "See also the following code from https://economics.mit.edu/sites/default/files/inline-files/QOB%20Table%20IV.do:\n",
    "```stata\n",
    "** Col 2 4 6 8 ***\n",
    "ivregress 2sls LWKLYWGE YR20-YR28 (EDUC = QTR120-QTR129 QTR220-QTR229 QTR320-QTR329 YR20-YR28)\n",
    "ivregress 2sls LWKLYWGE YR20-YR28 AGEQ AGEQSQ (EDUC = QTR120-QTR129 QTR220-QTR229 QTR320-QTR329 YR20-YR28)\n",
    "ivregress 2sls LWKLYWGE YR20-YR28 RACE MARRIED SMSA NEWENG MIDATL ENOCENT WNOCENT SOATL ESOCENT WSOCENT MT  (EDUC = QTR120-QTR129 QTR220-QTR229 QTR320-QTR329 YR20-YR28)\n",
    "ivregress 2sls LWKLYWGE YR20-YR28 RACE MARRIED SMSA NEWENG MIDATL ENOCENT WNOCENT SOATL ESOCENT WSOCENT MT AGEQ AGEQSQ (EDUC = QTR120-QTR129 QTR220-QTR229 QTR320-QTR329 YR20-YR28)\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Table IV\n",
      "Column (1), 0.0802 (0.0004)\n",
      "Column (2), 0.0769 (0.0150)\n",
      "wald: 3.2e-07, ar: 0.0085, clr: 0.00052, lm: 0.00093, j: 0.17\n",
      "Column (3), 0.0802 (0.0004)\n",
      "Column (4), 0.1352 (0.0337)\n",
      "wald: 6e-05, ar: 0.095, clr: 0.11, lm: 0.33, j: 0.8\n",
      "Column (5), 0.0701 (0.0004)\n",
      "Column (6), 0.0669 (0.0151)\n",
      "wald: 9.4e-06, ar: 0.028, clr: 0.002, lm: 0.0028, j: 0.23\n",
      "Column (7), 0.0701 (0.0004)\n",
      "Column (8), 0.1039 (0.0341)\n",
      "wald: 0.0023, ar: 0.2, clr: 0.27, lm: 0.7, j: 0.69\n",
      "\n",
      "Table V\n",
      "Column (1), 0.0711 (0.0003)\n",
      "Column (2), 0.0891 (0.0161)\n",
      "wald: 3.2e-08, ar: 0.013, clr: 1.2e-05, lm: 1e-05, j: 0.66\n",
      "Column (3), 0.0711 (0.0003)\n",
      "Column (4), 0.0655 (0.0280)\n",
      "wald: 0.019, ar: 0.64, clr: 0.38, lm: 0.33, j: 0.71\n",
      "Column (5), 0.0632 (0.0003)\n",
      "Column (6), 0.0806 (0.0164)\n",
      "wald: 8.8e-07, ar: 0.064, clr: 7.4e-05, lm: 5e-05, j: 0.8\n",
      "Column (7), 0.0632 (0.0003)\n",
      "Column (8), 0.0509 (0.0279)\n",
      "wald: 0.069, ar: 0.85, clr: 0.51, lm: 0.42, j: 0.87\n",
      "\n",
      "Table VI\n",
      "Column (1), 0.0573 (0.0003)\n",
      "Column (2), 0.0553 (0.0138)\n",
      "wald: 5.8e-05, ar: 5.6e-11, clr: 0.0089, lm: 0.039, j: 5.4e-10\n",
      "Column (3), 0.0574 (0.0003)\n",
      "Column (4), 0.1293 (0.0191)\n",
      "wald: 1.4e-11, ar: 1.6e-09, clr: 1.9e-10, lm: 9.8e-08, j: 0.0032\n",
      "Column (5), 0.0520 (0.0003)\n",
      "Column (6), 0.0393 (0.0145)\n",
      "wald: 0.0067, ar: 1e-08, clr: 0.19, lm: 0.29, j: 1.1e-08\n",
      "Column (7), 0.0521 (0.0003)\n",
      "Column (8), 0.1138 (0.0200)\n",
      "wald: 1.3e-08, ar: 2.5e-07, clr: 1.3e-07, lm: 1.6e-05, j: 0.0025\n"
     ]
    }
   ],
   "source": [
    "from ivmodels import KClass\n",
    "from ivmodels.tests import wald_test, anderson_rubin_test, conditional_likelihood_ratio_test, lagrange_multiplier_test, j_test\n",
    "\n",
    "for table, cohort in cohorts.items():\n",
    "    print(f\"\\nTable {table}\")\n",
    "    for column, kappa, exogenous in [\n",
    "        (\"(1)\", \"ols\", yob_names),\n",
    "        (\"(2)\", \"tsls\", yob_names),\n",
    "        (\"(3)\", \"ols\", yob_names + age),\n",
    "        (\"(4)\", \"tsls\", yob_names + age),\n",
    "        (\"(5)\", \"ols\", yob_names + region + other),\n",
    "        (\"(6)\", \"tsls\", yob_names + region + other),\n",
    "        (\"(7)\", \"ols\", yob_names + region + other + age),\n",
    "        (\"(8)\", \"tsls\", yob_names + region + other + age)\n",
    "    ]:\n",
    "        y = cohort[[\"lwklywge\"]]\n",
    "        X = cohort[[\"educ\"]]\n",
    "        C = cohort[exogenous]\n",
    "        Z = cohort[yqob_names]\n",
    "        estimator = KClass(kappa).fit(X=X, y=y, C=C, Z=Z)\n",
    "\n",
    "        wald_stat, wald_p = wald_test(X=X, y=y, Z=Z, C=C, beta=np.zeros(1), estimator=kappa)\n",
    "        std_error = np.abs(estimator.coef_[0]) / np.sqrt(wald_stat)\n",
    "\n",
    "        print(f\"Column {column}, {estimator.coef_[0]:.4f} ({std_error:.4f})\")\n",
    "\n",
    "        if kappa == \"tsls\":\n",
    "            _, ar_p = anderson_rubin_test(X=X, y=y, Z=Z, C=C, beta=np.zeros(1))\n",
    "            _, clr_p = conditional_likelihood_ratio_test(X=X, y=y, Z=Z, C=C, beta=np.zeros(1))\n",
    "            _, lm_p = lagrange_multiplier_test(X=X, y=y, Z=Z, C=C, beta=np.zeros(1))\n",
    "            _, j_p = j_test(X=X, y=y, Z=Z, C=C)\n",
    "            print(f\"wald: {wald_p:.2g}, ar: {ar_p:.2g}, clr: {clr_p:.2g}, lm: {lm_p:.2g}, j: {j_p:.2g}\")\n",
    "            \n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Notably, for cohorts 1920 - 1929 and 1930 - 1939, the causal effect of education on wages is no longer significant at level 0.05 if using weak-instrument-robust inference and if age and its square are included as an exogenous variables.\n",
    "The LIML variant of the J-statistic rejects the null of correct model specification at level 0.01 for cohort 1940 - 49, making any inference questionable."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "ivmodels",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}