From 3668c65cb9cec13314f99e88005a80f0614445d4 Mon Sep 17 00:00:00 2001 From: Shaun Setlock Date: Sun, 24 Apr 2022 21:32:26 -0400 Subject: [PATCH] Working on jupyter notebook, finished proximity filtering. --- main/analysis.ipynb | 492 +++++++++++++++++++++++++++++++++----------- 1 file changed, 368 insertions(+), 124 deletions(-) diff --git a/main/analysis.ipynb b/main/analysis.ipynb index 8c6ca6c..853617f 100644 --- a/main/analysis.ipynb +++ b/main/analysis.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 70, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -32,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -50,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -108,7 +108,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -129,14 +129,81 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "For Boston, drop all schools that aren't in Massachusetts." + "We should definitely remove all schools that aren't in Massachusetts." ] }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 36, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 1789 schools from the original API results.\n", + "Allowing only schools from Massachusetts reduces the dataset to 1375 schools.\n" + ] + } + ], + "source": [ + "print(f'There are {len(boston_df)} schools from the original API results.')\n", + "\n", + "# only allow from MA\n", + "boston_df = boston_df[boston_df['state'] == \"MA\"]\n", + "print(f'Allowing only schools from Massachusetts reduces the dataset to {len(boston_df)} schools.')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "How many unique district IDs are there?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "There are 230 unique school districts.\n", + "\n" + ] + } + ], + "source": [ + "# get unique districts\n", + "districts = boston_df[\"district-id\"].unique()\n", + "print(f'\\nThere are {len(districts)} unique school districts.\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Which of these districts are close to both work and downtown boston?" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 116 school districts within reasonable proximity to downtown and work.\n", + "\n", + "There are 820 schools within these proximal districts.\n", + "\n" + ] + }, { "data": { "text/html": [ @@ -201,165 +268,342 @@ " \n", " \n", " \n", - " 2501042\n", - " 7050505\n", - " Masconomet Regional High School\n", + " 2500363\n", + " 380013\n", + " Spofford Pond\n", " public\n", - " h\n", - " 9,10,11,12\n", + " e\n", + " 3,4,5,6\n", " Boxford\n", " MA\n", " 1921\n", " Essex County\n", - " 42.627754\n", - " -70.974693\n", - " Masconomet School District\n", - " 259\n", - " 8.0\n", + " 42.697018\n", + " -71.017365\n", + " Boxford School District\n", + " 102\n", + " 7.0\n", " 2021.0\n", - " (42.627754, -70.974693)\n", - " 30.005931\n", - " 28.583420\n", + " (42.697018, -71.017365)\n", + " 22.917933\n", + " 19.554889\n", " \n", " \n", - " 2500337\n", - " 350380\n", - " Young Achievers Science and Math School\n", + " 2506356\n", + " 100305\n", + " Gibbs School\n", " public\n", - " p,e,m\n", - " PK,KG,1,2,3,4,5,6,7,8\n", - " Mattapan\n", + " e,m\n", + " 6\n", + " Arlington\n", " MA\n", - " 2126\n", - " Suffolk County\n", - " 42.282269\n", - " -71.095016\n", - " Boston School District\n", - " 99\n", - " 2.0\n", + " 2474\n", + " Middlesex County\n", + " 42.410576\n", + " 
-71.145081\n", + " Arlington Public Schools\n", + " 69\n", + " 7.0\n", " 2021.0\n", - " (42.282269, -71.095016)\n", - " 9.673200\n", - " 24.989359\n", + " (42.410576, -71.145081)\n", + " 4.794958\n", + " 7.066929\n", " \n", " \n", - " 2500402\n", - " 440017\n", - " Kennedy K-5 Elementary School\n", - " public\n", - " e\n", - " KG,1,2,3,4,5\n", - " Brockton\n", - " MA\n", - " 2301\n", - " Plymouth County\n", - " 42.059696\n", - " -71.037262\n", - " Brockton School District\n", - " 111\n", - " 4.0\n", - " 2021.0\n", - " (42.059696, -71.037262)\n", - " 34.339345\n", - " 49.384728\n", - " \n", - " \n", - " 2501682\n", - " 3070010\n", - " Boyden\n", - " public\n", - " e\n", - " KG,1,2,3,4,5\n", - " Walpole\n", - " MA\n", - " 2071\n", - " Norfolk County\n", - " 42.105808\n", - " -71.258743\n", - " Walpole School District\n", - " 426\n", - " 6.0\n", - " 2021.0\n", - " (42.105808, -71.258743)\n", - " 32.933990\n", - " 40.921772\n", - " \n", - " \n", - " 2501507\n", - " 2760305\n", - " P. Brent Trottier Middle School\n", + " 2501835\n", + " 3470410\n", + " Daniel L Joyce Middle School\n", " public\n", " m\n", " 6,7,8\n", - " Southborough\n", + " Woburn\n", " MA\n", - " 1772\n", - " Worcester County\n", - " 42.299240\n", - " -71.542259\n", - " Southborough School District\n", - " 387\n", + " 1801\n", + " Middlesex County\n", + " 42.477467\n", + " -71.175484\n", + " Woburn School District\n", + " 467\n", + " 4.0\n", + " 2021.0\n", + " (42.477467, -71.175484)\n", + " 9.264922\n", + " 4.013598\n", + " \n", + " \n", + " 2501714\n", + " 3150005\n", + " Claypit Hill School\n", + " public\n", + " e\n", + " KG,1,2,3,4,5\n", + " Wayland\n", + " MA\n", + " 1778\n", + " Middlesex County\n", + " 42.373108\n", + " -71.344765\n", + " Wayland School District\n", + " 434\n", " 8.0\n", " 2021.0\n", - " (42.29924, -71.542259)\n", - " 39.445654\n", - " 30.606258\n", + " (42.373108, -71.344765)\n", + " 13.952791\n", + " 8.347379\n", + " \n", + " \n", + " 2502631\n", + " 1810055\n", + " Tenney 
Grammar School\n", + " public\n", + " p,e,m\n", + " PK,KG,1,2,3,4,5,6,7,8\n", + " Methuen\n", + " MA\n", + " 1844\n", + " Essex County\n", + " 42.732357\n", + " -71.177345\n", + " Methuen School District\n", + " 270\n", + " 3.0\n", + " 2021.0\n", + " (42.732357, -71.177345)\n", + " 25.763243\n", + " 18.273064\n", + " \n", + " \n", + " 2500515\n", + " 710505\n", + " Danvers High School\n", + " public\n", + " h\n", + " 9,10,11,12,UG\n", + " Danvers\n", + " MA\n", + " 1923\n", + " Essex County\n", + " 42.582523\n", + " -70.931618\n", + " Danvers School District\n", + " 141\n", + " 6.0\n", + " 2021.0\n", + " (42.582523, -70.931618)\n", + " 16.464503\n", + " 18.045917\n", + " \n", + " \n", + " 2501498\n", + " 2740410\n", + " Next Wave Junior High School\n", + " public\n", + " m\n", + " 7,8\n", + " Somerville\n", + " MA\n", + " 2145\n", + " Middlesex County\n", + " 42.387581\n", + " -71.087326\n", + " Somerville School District\n", + " 383\n", + " NaN\n", + " NaN\n", + " (42.387581, -71.087326)\n", + " 1.609308\n", + " 10.378716\n", + " \n", + " \n", + " 2501384\n", + " 2430310\n", + " Broad Meadows Middle School\n", + " public\n", + " m\n", + " 6,7,8\n", + " Quincy\n", + " MA\n", + " 2169\n", + " Norfolk County\n", + " 42.259659\n", + " -70.985237\n", + " Quincy School District\n", + " 349\n", + " 4.0\n", + " 2021.0\n", + " (42.259659, -70.985237)\n", + " 8.646003\n", + " 20.169491\n", + " \n", + " \n", + " 2500916\n", + " 1570006\n", + " Hanscom Primary School\n", + " public\n", + " p,e\n", + " PK,KG,1,2,3\n", + " Hanscom Air Force Bs\n", + " MA\n", + " 1731\n", + " Middlesex County\n", + " 42.456898\n", + " -71.278549\n", + " Lincoln School District\n", + " 242\n", + " 3.0\n", + " 2021.0\n", + " (42.456898, -71.278549)\n", + " 12.234463\n", + " 1.705602\n", + " \n", + " \n", + " 2501788\n", + " 3360065\n", + " Lawrence W Pingree\n", + " public\n", + " e\n", + " KG,1,2,3,4\n", + " Weymouth\n", + " MA\n", + " 2189\n", + " Norfolk County\n", + " 42.217670\n", + " 
-70.925240\n", + " Weymouth School District\n", + " 455\n", + " 8.0\n", + " 2021.0\n", + " (42.21767, -70.92524)\n", + " 12.754639\n", + " 24.381842\n", " \n", " \n", "\n", "" ], "text/plain": [ - " state-id name type \\\n", - "universal-id \n", - "2501042 7050505 Masconomet Regional High School public \n", - "2500337 350380 Young Achievers Science and Math School public \n", - "2500402 440017 Kennedy K-5 Elementary School public \n", - "2501682 3070010 Boyden public \n", - "2501507 2760305 P. Brent Trottier Middle School public \n", + " state-id name type level-codes \\\n", + "universal-id \n", + "2500363 380013 Spofford Pond public e \n", + "2506356 100305 Gibbs School public e,m \n", + "2501835 3470410 Daniel L Joyce Middle School public m \n", + "2501714 3150005 Claypit Hill School public e \n", + "2502631 1810055 Tenney Grammar School public p,e,m \n", + "2500515 710505 Danvers High School public h \n", + "2501498 2740410 Next Wave Junior High School public m \n", + "2501384 2430310 Broad Meadows Middle School public m \n", + "2500916 1570006 Hanscom Primary School public p,e \n", + "2501788 3360065 Lawrence W Pingree public e \n", "\n", - " level-codes level city state zip \\\n", - "universal-id \n", - "2501042 h 9,10,11,12 Boxford MA 1921 \n", - "2500337 p,e,m PK,KG,1,2,3,4,5,6,7,8 Mattapan MA 2126 \n", - "2500402 e KG,1,2,3,4,5 Brockton MA 2301 \n", - "2501682 e KG,1,2,3,4,5 Walpole MA 2071 \n", - "2501507 m 6,7,8 Southborough MA 1772 \n", + " level city state zip \\\n", + "universal-id \n", + "2500363 3,4,5,6 Boxford MA 1921 \n", + "2506356 6 Arlington MA 2474 \n", + "2501835 6,7,8 Woburn MA 1801 \n", + "2501714 KG,1,2,3,4,5 Wayland MA 1778 \n", + "2502631 PK,KG,1,2,3,4,5,6,7,8 Methuen MA 1844 \n", + "2500515 9,10,11,12,UG Danvers MA 1923 \n", + "2501498 7,8 Somerville MA 2145 \n", + "2501384 6,7,8 Quincy MA 2169 \n", + "2500916 PK,KG,1,2,3 Hanscom Air Force Bs MA 1731 \n", + "2501788 KG,1,2,3,4 Weymouth MA 2189 \n", "\n", " county lat lon \\\n", 
"universal-id \n", - "2501042 Essex County 42.627754 -70.974693 \n", - "2500337 Suffolk County 42.282269 -71.095016 \n", - "2500402 Plymouth County 42.059696 -71.037262 \n", - "2501682 Norfolk County 42.105808 -71.258743 \n", - "2501507 Worcester County 42.299240 -71.542259 \n", + "2500363 Essex County 42.697018 -71.017365 \n", + "2506356 Middlesex County 42.410576 -71.145081 \n", + "2501835 Middlesex County 42.477467 -71.175484 \n", + "2501714 Middlesex County 42.373108 -71.344765 \n", + "2502631 Essex County 42.732357 -71.177345 \n", + "2500515 Essex County 42.582523 -70.931618 \n", + "2501498 Middlesex County 42.387581 -71.087326 \n", + "2501384 Norfolk County 42.259659 -70.985237 \n", + "2500916 Middlesex County 42.456898 -71.278549 \n", + "2501788 Norfolk County 42.217670 -70.925240 \n", "\n", - " district-name district-id rating year \\\n", - "universal-id \n", - "2501042 Masconomet School District 259 8.0 2021.0 \n", - "2500337 Boston School District 99 2.0 2021.0 \n", - "2500402 Brockton School District 111 4.0 2021.0 \n", - "2501682 Walpole School District 426 6.0 2021.0 \n", - "2501507 Southborough School District 387 8.0 2021.0 \n", + " district-name district-id rating year \\\n", + "universal-id \n", + "2500363 Boxford School District 102 7.0 2021.0 \n", + "2506356 Arlington Public Schools 69 7.0 2021.0 \n", + "2501835 Woburn School District 467 4.0 2021.0 \n", + "2501714 Wayland School District 434 8.0 2021.0 \n", + "2502631 Methuen School District 270 3.0 2021.0 \n", + "2500515 Danvers School District 141 6.0 2021.0 \n", + "2501498 Somerville School District 383 NaN NaN \n", + "2501384 Quincy School District 349 4.0 2021.0 \n", + "2500916 Lincoln School District 242 3.0 2021.0 \n", + "2501788 Weymouth School District 455 8.0 2021.0 \n", "\n", " coordinates distance-to-downtown distance-to-work \n", "universal-id \n", - "2501042 (42.627754, -70.974693) 30.005931 28.583420 \n", - "2500337 (42.282269, -71.095016) 9.673200 24.989359 \n", - "2500402 
(42.059696, -71.037262) 34.339345 49.384728 \n", - "2501682 (42.105808, -71.258743) 32.933990 40.921772 \n", - "2501507 (42.29924, -71.542259) 39.445654 30.606258 " + "2500363 (42.697018, -71.017365) 22.917933 19.554889 \n", + "2506356 (42.410576, -71.145081) 4.794958 7.066929 \n", + "2501835 (42.477467, -71.175484) 9.264922 4.013598 \n", + "2501714 (42.373108, -71.344765) 13.952791 8.347379 \n", + "2502631 (42.732357, -71.177345) 25.763243 18.273064 \n", + "2500515 (42.582523, -70.931618) 16.464503 18.045917 \n", + "2501498 (42.387581, -71.087326) 1.609308 10.378716 \n", + "2501384 (42.259659, -70.985237) 8.646003 20.169491 \n", + "2500916 (42.456898, -71.278549) 12.234463 1.705602 \n", + "2501788 (42.21767, -70.92524) 12.754639 24.381842 " ] }, - "execution_count": 74, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "boston_df = boston_df[boston_df['state'] == \"MA\"]\n", + "# calculate distance to PoI using geo-center of districts\n", + "distances_to_downtown = {k: np.mean(list(v)) for k, v in boston_df.groupby('district-id')['distance-to-downtown']}\n", + "distances_to_work = {k: np.mean(list(v)) for k, v in boston_df.groupby('district-id')['distance-to-work']}\n", "\n", - "boston_df.sample(5)" + "df_downtown = pd.DataFrame.from_dict(distances_to_downtown, orient='index')\n", + "df_work = pd.DataFrame.from_dict(distances_to_work, orient='index')\n", + "\n", + "# merge these new columns\n", + "both_df = pd.merge(left=df_downtown, right=df_work, how='inner', left_index=True, right_index=True)\n", + "both_df.rename(columns={'0_x': \"downtown\", '0_y': \"work\"}, inplace=True)\n", + "\n", + "both_df = both_df[both_df[\"downtown\"] < 35.0]\n", + "both_df = both_df[both_df[\"work\"] < 25.0]\n", + "\n", + "print(f'There are {len(both_df)} school districts within reasonable proximity to downtown and work.\\n')\n", + "\n", + "# filter out all schools which aren't in proximal districts\n", + "proximal_district_ids = 
list(both_df.index)\n", + "boston_df = boston_df[boston_df['district-id'].isin(proximal_district_ids)]\n", + "\n", + "print(f'There are {len(boston_df)} schools within these proximal districts.\\n')\n", + "\n", + "boston_df.sample(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Some of these districts don't have enough rating data. Those should be dropped." + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "boston_df.groupby(['district-id'])" ] } ],