In [32]:
# imports
import pandas as pd
import numpy as np

from great_schools import get_nearby_schools
from distance import get_distance
from secret import get_key


## Shaun and Daniela's Boston Public School Analysis
#### 2021.04.10

Fetch the API key from the local filesystem.

In [33]:
# get the API key
api_key_file = '../keys/api.key'
api_key = get_key(api_key_file)

Use the `nearby_schools` API endpoint to grab raw data of all schools within the maximum radius

In [34]:
# Some columns will dropped immediately as pre-processing.
drops = [
    'nces-id',
    'school-summary',
    'street',
    'fipscounty',
    'phone',
    'fax',
    'web-site',
    'overview-url',
    'rating-description',
    'distance',
]

# Grab data for Boston.
refresh = False
boston_nearby_schools_file = '../data/nearby_schools/boston.csv'
if refresh:
    boston_schools = get_nearby_schools(api_key,"42.3","-71.2","50")
    boston_df = pd.DataFrame.from_dict(boston_schools)
    boston_df.drop(columns=drops,inplace=True)
    boston_df.to_csv(boston_nearby_schools_file, )
else:
    boston_df = pd.read_csv(boston_nearby_schools_file)
    boston_df.set_index(keys=["universal-id"], drop=True, inplace=True)
    boston_df.drop(columns=["Unnamed: 0"], inplace=True)

# Grab data for Buffalo.
refresh = False
buffalo_nearby_schools_file = '../data/nearby_schools/buffalo.csv'
if refresh:
    buffalo_schools = get_nearby_schools(api_key,"42.9625","-78.7425","50")
    buffalo_df = pd.DataFrame.from_dict(buffalo_schools)
    buffalo_df.drop(columns=drops,inplace=True)
    buffalo_df.to_csv(buffalo_nearby_schools_file)
else:
    buffalo_df = pd.read_csv(buffalo_nearby_schools_file)
    buffalo_df.set_index(keys=["universal-id"], drop=True, inplace=True)
    buffalo_df.drop(columns=["Unnamed: 0"], inplace=True)

Process the `lat` and `lon` columns from the API output into tuples.

Then create two new columns:
- Distance to Downtown
- Distance to Work

In [35]:
# Form tuple to represent coordinates
boston_df['coordinates'] = list(zip(boston_df.lat,boston_df.lon))
#boston_df.drop(columns=['lat', 'lon'], inplace=True)

# Define coordinates of important places
downtown=(42.3674836866797, -71.07134540735377) # Science Museum
work=(42.47381059540949, -71.25414135292398) # Hartwell

# Create new columns to tabulate distance to these important places
boston_df['distance-to-downtown'] = boston_df['coordinates'].apply(func=get_distance,p2=downtown)
boston_df['distance-to-work'] = boston_df['coordinates'].apply(func=get_distance,p2=work)

We should definitely removal all schools that aren't in Massachusetts.

In [36]:
print(f'There are {len(boston_df)} schools from the original API results.')

# only allow from MA
boston_df = boston_df[boston_df['state'] == "MA"]
print(f'Allowing only schools from Massachusetts reduces the dataset to {len(boston_df)} schools.')

There are 1789 schools from the original API results.
Allowing only schools from Massachusetts reduces the dataset to 1375 schools.


How many unique district id's are there?

In [37]:
# get unique districts
districts = boston_df["district-id"].unique()
print(f'\nThere are {len(districts)} unique school districts.\n')


There are 230 unique school districts.



Which of these districts are close to both work and downtown boston?

In [38]:
# calculate distance to PoI using geo-center of districts
distances_to_downtown = {k: np.mean(list(v)) for k, v in boston_df.groupby('district-id')['distance-to-downtown']}
distances_to_work = {k: np.mean(list(v)) for k, v in boston_df.groupby('district-id')['distance-to-work']}

df_downtown = pd.DataFrame.from_dict(distances_to_downtown, orient='index')
df_work = pd.DataFrame.from_dict(distances_to_work, orient='index')

# merge these new columns
both_df = pd.merge(left=df_downtown, right=df_work, how='inner', left_index=True, right_index=True)
both_df.rename(columns={'0_x': "downtown", '0_y': "work"}, inplace=True)

both_df = both_df[both_df["downtown"] < 35.0]
both_df = both_df[both_df["work"] < 25.0]

print(f'There are {len(both_df)} school districts within reasonable proximity to downtown and work.\n')

# filter out all schools which aren't in proximal districts
proximal_district_ids = list(both_df.index)
boston_df = boston_df[boston_df['district-id'].isin(proximal_district_ids)]

print(f'There are {len(boston_df)} schools within these proximal districts.\n')

boston_df.sample(10)

There are 116 school districts within reasonable proximity to downtown and work.

There are 820 schools within these proximal districts.



Unnamed: 0_level_0,state-id,name,type,level-codes,level,city,state,zip,county,lat,lon,district-name,district-id,rating,year,coordinates,distance-to-downtown,distance-to-work
universal-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2500363,380013,Spofford Pond,public,e,3456,Boxford,MA,1921,Essex County,42.697018,-71.017365,Boxford School District,102,7.0,2021.0,"(42.697018, -71.017365)",22.917933,19.554889
2506356,100305,Gibbs School,public,"e,m",6,Arlington,MA,2474,Middlesex County,42.410576,-71.145081,Arlington Public Schools,69,7.0,2021.0,"(42.410576, -71.145081)",4.794958,7.066929
2501835,3470410,Daniel L Joyce Middle School,public,m,678,Woburn,MA,1801,Middlesex County,42.477467,-71.175484,Woburn School District,467,4.0,2021.0,"(42.477467, -71.175484)",9.264922,4.013598
2501714,3150005,Claypit Hill School,public,e,"KG,1,2,3,4,5",Wayland,MA,1778,Middlesex County,42.373108,-71.344765,Wayland School District,434,8.0,2021.0,"(42.373108, -71.344765)",13.952791,8.347379
2502631,1810055,Tenney Grammar School,public,"p,e,m","PK,KG,1,2,3,4,5,6,7,8",Methuen,MA,1844,Essex County,42.732357,-71.177345,Methuen School District,270,3.0,2021.0,"(42.732357, -71.177345)",25.763243,18.273064
2500515,710505,Danvers High School,public,h,"9,10,11,12,UG",Danvers,MA,1923,Essex County,42.582523,-70.931618,Danvers School District,141,6.0,2021.0,"(42.582523, -70.931618)",16.464503,18.045917
2501498,2740410,Next Wave Junior High School,public,m,78,Somerville,MA,2145,Middlesex County,42.387581,-71.087326,Somerville School District,383,,,"(42.387581, -71.087326)",1.609308,10.378716
2501384,2430310,Broad Meadows Middle School,public,m,678,Quincy,MA,2169,Norfolk County,42.259659,-70.985237,Quincy School District,349,4.0,2021.0,"(42.259659, -70.985237)",8.646003,20.169491
2500916,1570006,Hanscom Primary School,public,"p,e","PK,KG,1,2,3",Hanscom Air Force Bs,MA,1731,Middlesex County,42.456898,-71.278549,Lincoln School District,242,3.0,2021.0,"(42.456898, -71.278549)",12.234463,1.705602
2501788,3360065,Lawrence W Pingree,public,e,"KG,1,2,3,4",Weymouth,MA,2189,Norfolk County,42.21767,-70.92524,Weymouth School District,455,8.0,2021.0,"(42.21767, -70.92524)",12.754639,24.381842


Some of these districts don't have enough rating data. Those should be dropped.

In [40]:
boston_df.groupby(['district-id'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f54f95addf0>