Awesome
Observed Trends:
- Looking at all the reading scores and math scores in each table, students tend to do better in reading than in math.
- Looking at the top performing schools, bottom performing schools, and passing rates by school type, students have a much higher passing rate in charter schools (~95%) than in district schools (~74%).
- Looking at the summary of performance by spending ranges per student, it seems that students have a higher passing rate in the lower spending ranges (<$615).
# Import dependencies
import pandas as pd
import numpy as np
# Create paths to csv files
school_path = "raw_data/schools_complete.csv"
student_path = "raw_data/students_complete.csv"
# Read the csv files
schools = pd.read_csv(school_path)
schools = schools.rename(columns={"name": "school"}) # so we can merge later
students = pd.read_csv(student_path)
students = pd.DataFrame(students)
District Summary
# Calculate total schools, students, and budget of district
total_schools = schools['School ID'].count()
total_students = schools['size'].sum()
total_budget = schools['budget'].sum()
# Calculate average math and reading scores
avg_math = students['math_score'].mean()
avg_read = students['reading_score'].mean()
# Calculate percent passing rates
pass_math = students.loc[students['math_score'] >= 70]
per_math = len(pass_math)/ total_students * 100
pass_read = students.loc[students['reading_score'] >= 70]
per_read = len(pass_read)/ total_students * 100
overall = (per_math + per_read)/2
# Create dataframe for district summary
district_summary = pd.DataFrame({"Total Schools": total_schools,
"Total Students": total_students,
"Total Budget": total_budget,
"Average Math Score": avg_math,
"Average Reading Score": avg_read,
"% Passing Math": per_math,
"% Passing Reading": per_read,
"% Overall Passing Rate": [overall]})
#Reorganize columns with double brackets
district_summary = district_summary[["Total Schools", "Total Students", "Total Budget",
"Average Math Score", "Average Reading Score",
"% Passing Math", "% Passing Reading", "% Overall Passing Rate"]]
# Format to "$0,000.00"
district_summary['Total Budget'] = district_summary['Total Budget'].map('${:,.2f}'.format)
district_summary
<div>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>Total Schools</th>
<th>Total Students</th>
<th>Total Budget</th>
<th>Average Math Score</th>
<th>Average Reading Score</th>
<th>% Passing Math</th>
<th>% Passing Reading</th>
<th>% Overall Passing Rate</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>15</td>
<td>39170</td>
<td>$24,649,428.00</td>
<td>78.985371</td>
<td>81.87784</td>
<td>74.980853</td>
<td>85.805463</td>
<td>80.393158</td>
</tr>
</tbody>
</table>
</div>
School Summary
# Merge schools and students
merge_df = pd.merge(schools, students, on="school")
merge_df = merge_df.drop(['School ID', 'Student ID'], axis=1)
# Set school as index, get type, total students, total budget and per student budget
school_index = schools.set_index('school')
school_type = school_index['type']
school_students = school_index['size']
school_budget = school_index['budget']
psb = school_budget/school_students
# Set school as index and groupby school for merge_df to get average scores by school
index = merge_df.set_index('school')
grouped = index.groupby(['school'])
# Get average math and reading score
school_avg_math = grouped['math_score'].mean()
school_avg_read = grouped['reading_score'].mean()
# Calculate percent that passed math
total_stu = grouped['name'].count()
grouped_math = pass_math.groupby('school')
school_pass_math = grouped_math['name'].count()/total_stu*100
# Calculate percent that passed reading
grouped_reading = pass_read.groupby('school')
school_pass_read = grouped_reading['name'].count()/total_stu*100
# Calculate overall passing rate
overall_pass = (school_pass_math + school_pass_read)/2
# Create dataframe for school summary
school_summary = pd.DataFrame({"School Type": school_type,
"Total Students": school_students,
"Total School Budget": school_budget,
"Per Student Budget": psb,
"Average Math Score": school_avg_math,
"Average Reading Score": school_avg_read,
"% Passing Math": school_pass_math,
"% Passing Reading": school_pass_read,
"Overall Passing Rate": overall_pass})
# Reorganize school summary columns
school_summary = school_summary[["School Type", "Total Students", "Total School Budget",
"Per Student Budget", "Average Math Score", "Average Reading Score",
"% Passing Math", "% Passing Reading", "Overall Passing Rate"]]
# Format to "$0,000.00"
school_summary['Total School Budget'] = school_summary['Total School Budget'].map('${:,.2f}'.format)
school_summary['Per Student Budget'] = school_summary['Per Student Budget'].map('${:,.2f}'.format)
school_summary.head()
<div>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>School Type</th>
<th>Total Students</th>
<th>Total School Budget</th>
<th>Per Student Budget</th>
<th>Average Math Score</th>
<th>Average Reading Score</th>
<th>% Passing Math</th>
<th>% Passing Reading</th>
<th>Overall Passing Rate</th>
</tr>
</thead>
<tbody>
<tr>
<th>Bailey High School</th>
<td>District</td>
<td>4976</td>
<td>$3,124,928.00</td>
<td>$628.00</td>
<td>77.048432</td>
<td>81.033963</td>
<td>66.680064</td>
<td>81.933280</td>
<td>74.306672</td>
</tr>
<tr>
<th>Cabrera High School</th>
<td>Charter</td>
<td>1858</td>
<td>$1,081,356.00</td>
<td>$582.00</td>
<td>83.061895</td>
<td>83.975780</td>
<td>94.133477</td>
<td>97.039828</td>
<td>95.586652</td>
</tr>
<tr>
<th>Figueroa High School</th>
<td>District</td>
<td>2949</td>
<td>$1,884,411.00</td>
<td>$639.00</td>
<td>76.711767</td>
<td>81.158020</td>
<td>65.988471</td>
<td>80.739234</td>
<td>73.363852</td>
</tr>
<tr>
<th>Ford High School</th>
<td>District</td>
<td>2739</td>
<td>$1,763,916.00</td>
<td>$644.00</td>
<td>77.102592</td>
<td>80.746258</td>
<td>68.309602</td>
<td>79.299014</td>
<td>73.804308</td>
</tr>
<tr>
<th>Griffin High School</th>
<td>Charter</td>
<td>1468</td>
<td>$917,500.00</td>
<td>$625.00</td>
<td>83.351499</td>
<td>83.816757</td>
<td>93.392371</td>
<td>97.138965</td>
<td>95.265668</td>
</tr>
</tbody>
</table>
</div>
Top Performing Schools (By Passing Rate)
# Create top performing schools summary by passing rate
top_summary = school_summary.loc[school_summary['Overall Passing Rate'] > 90]
top_summary.sort_values(['Overall Passing Rate'], ascending=False).head()
<div>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>School Type</th>
<th>Total Students</th>
<th>Total School Budget</th>
<th>Per Student Budget</th>
<th>Average Math Score</th>
<th>Average Reading Score</th>
<th>% Passing Math</th>
<th>% Passing Reading</th>
<th>Overall Passing Rate</th>
</tr>
</thead>
<tbody>
<tr>
<th>Cabrera High School</th>
<td>Charter</td>
<td>1858</td>
<td>$1,081,356.00</td>
<td>$582.00</td>
<td>83.061895</td>
<td>83.975780</td>
<td>94.133477</td>
<td>97.039828</td>
<td>95.586652</td>
</tr>
<tr>
<th>Thomas High School</th>
<td>Charter</td>
<td>1635</td>
<td>$1,043,130.00</td>
<td>$638.00</td>
<td>83.418349</td>
<td>83.848930</td>
<td>93.272171</td>
<td>97.308869</td>
<td>95.290520</td>
</tr>
<tr>
<th>Pena High School</th>
<td>Charter</td>
<td>962</td>
<td>$585,858.00</td>
<td>$609.00</td>
<td>83.839917</td>
<td>84.044699</td>
<td>94.594595</td>
<td>95.945946</td>
<td>95.270270</td>
</tr>
<tr>
<th>Griffin High School</th>
<td>Charter</td>
<td>1468</td>
<td>$917,500.00</td>
<td>$625.00</td>
<td>83.351499</td>
<td>83.816757</td>
<td>93.392371</td>
<td>97.138965</td>
<td>95.265668</td>
</tr>
<tr>
<th>Wilson High School</th>
<td>Charter</td>
<td>2283</td>
<td>$1,319,574.00</td>
<td>$578.00</td>
<td>83.274201</td>
<td>83.989488</td>
<td>93.867718</td>
<td>96.539641</td>
<td>95.203679</td>
</tr>
</tbody>
</table>
</div>
Bottom Performing Schools (By Passing Rate)
# Create bottom performing schools summary by passing rate
bottom_summary = school_summary.loc[school_summary['Overall Passing Rate'] < 75]
bottom_summary.sort_values(['Overall Passing Rate'], ascending=True).head()
<div>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>School Type</th>
<th>Total Students</th>
<th>Total School Budget</th>
<th>Per Student Budget</th>
<th>Average Math Score</th>
<th>Average Reading Score</th>
<th>% Passing Math</th>
<th>% Passing Reading</th>
<th>Overall Passing Rate</th>
</tr>
</thead>
<tbody>
<tr>
<th>Rodriguez High School</th>
<td>District</td>
<td>3999</td>
<td>$2,547,363.00</td>
<td>$637.00</td>
<td>76.842711</td>
<td>80.744686</td>
<td>66.366592</td>
<td>80.220055</td>
<td>73.293323</td>
</tr>
<tr>
<th>Figueroa High School</th>
<td>District</td>
<td>2949</td>
<td>$1,884,411.00</td>
<td>$639.00</td>
<td>76.711767</td>
<td>81.158020</td>
<td>65.988471</td>
<td>80.739234</td>
<td>73.363852</td>
</tr>
<tr>
<th>Huang High School</th>
<td>District</td>
<td>2917</td>
<td>$1,910,635.00</td>
<td>$655.00</td>
<td>76.629414</td>
<td>81.182722</td>
<td>65.683922</td>
<td>81.316421</td>
<td>73.500171</td>
</tr>
<tr>
<th>Johnson High School</th>
<td>District</td>
<td>4761</td>
<td>$3,094,650.00</td>
<td>$650.00</td>
<td>77.072464</td>
<td>80.966394</td>
<td>66.057551</td>
<td>81.222432</td>
<td>73.639992</td>
</tr>
<tr>
<th>Ford High School</th>
<td>District</td>
<td>2739</td>
<td>$1,763,916.00</td>
<td>$644.00</td>
<td>77.102592</td>
<td>80.746258</td>
<td>68.309602</td>
<td>79.299014</td>
<td>73.804308</td>
</tr>
</tbody>
</table>
</div>
Math Scores by Grade
# Math scores by grade
ninth = students.loc[students['grade']=='9th'].groupby("school")
ninth_math = ninth['math_score'].mean()
tenth = students.loc[students['grade']=='10th'].groupby("school")
tenth_math = tenth['math_score'].mean()
eleventh = students.loc[students['grade']=='11th'].groupby("school")
eleventh_math = eleventh['math_score'].mean()
twelfth = students.loc[students['grade']=='12th'].groupby("school")
twelfth_math = twelfth['math_score'].mean()
# Create dataframe for math scores summary
math_summary = pd.DataFrame({"9th": ninth_math,
"10th": tenth_math,
"11th": eleventh_math,
"12th": twelfth_math})
math_summary = math_summary[["9th","10th","11th","12th"]]
del math_summary.index.name
math_summary.head()
<div>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>9th</th>
<th>10th</th>
<th>11th</th>
<th>12th</th>
</tr>
</thead>
<tbody>
<tr>
<th>Bailey High School</th>
<td>77.083676</td>
<td>76.996772</td>
<td>77.515588</td>
<td>76.492218</td>
</tr>
<tr>
<th>Cabrera High School</th>
<td>83.094697</td>
<td>83.154506</td>
<td>82.765560</td>
<td>83.277487</td>
</tr>
<tr>
<th>Figueroa High School</th>
<td>76.403037</td>
<td>76.539974</td>
<td>76.884344</td>
<td>77.151369</td>
</tr>
<tr>
<th>Ford High School</th>
<td>77.361345</td>
<td>77.672316</td>
<td>76.918058</td>
<td>76.179963</td>
</tr>
<tr>
<th>Griffin High School</th>
<td>82.044010</td>
<td>84.229064</td>
<td>83.842105</td>
<td>83.356164</td>
</tr>
</tbody>
</table>
</div>
Reading Scores by Grade
# Reading scores by grade
r_ninth = students.loc[students['grade'] == '9th'].groupby("school")
ninth_read = r_ninth['reading_score'].mean()
r_tenth = students.loc[students['grade'] == '10th'].groupby("school")
tenth_read = r_tenth['reading_score'].mean()
r_eleventh = students.loc[students['grade'] == '11th'].groupby("school")
eleventh_read = r_eleventh['reading_score'].mean()
r_twelfth = students.loc[students['grade'] == '12th'].groupby("school")
twelfth_read = r_twelfth['reading_score'].mean()
# Create dataframe for reading scores summary
read_summary = pd.DataFrame({"9th": ninth_read,
"10th": tenth_read,
"11th": eleventh_read,
"12th": twelfth_read})
read_summary = read_summary[["9th","10th","11th","12th"]]
del read_summary.index.name
read_summary.head()
<div>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>9th</th>
<th>10th</th>
<th>11th</th>
<th>12th</th>
</tr>
</thead>
<tbody>
<tr>
<th>Bailey High School</th>
<td>81.303155</td>
<td>80.907183</td>
<td>80.945643</td>
<td>80.912451</td>
</tr>
<tr>
<th>Cabrera High School</th>
<td>83.676136</td>
<td>84.253219</td>
<td>83.788382</td>
<td>84.287958</td>
</tr>
<tr>
<th>Figueroa High School</th>
<td>81.198598</td>
<td>81.408912</td>
<td>80.640339</td>
<td>81.384863</td>
</tr>
<tr>
<th>Ford High School</th>
<td>80.632653</td>
<td>81.262712</td>
<td>80.403642</td>
<td>80.662338</td>
</tr>
<tr>
<th>Griffin High School</th>
<td>83.369193</td>
<td>83.706897</td>
<td>84.288089</td>
<td>84.013699</td>
</tr>
</tbody>
</table>
</div>
Scores by School Spending
# Create bins
bins = [0,585,615,645,675]
# Create names for the bins
spending_range = ['<$585','$585-615','$615-645','$645-675']
# Change formatting of per student budget in school_summary from string back to float so it can be binned
school_summary['Per Student Budget'] = school_summary['Per Student Budget'].str.replace('$', '')
school_summary['Per Student Budget'] = school_summary['Per Student Budget'].astype(float)
school_summary["Spending Ranges (Per Student)"] = pd.cut(school_summary["Per Student Budget"],
bins, labels=spending_range)
spend_summary = school_summary.groupby("Spending Ranges (Per Student)")
spend_summary = spend_summary[["Average Math Score", "Average Reading Score", "% Passing Math",
"% Passing Reading", "Overall Passing Rate"]]
spend_summary.mean()
<div>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>Average Math Score</th>
<th>Average Reading Score</th>
<th>% Passing Math</th>
<th>% Passing Reading</th>
<th>Overall Passing Rate</th>
</tr>
<tr>
<th>Spending Ranges (Per Student)</th>
<th></th>
<th></th>
<th></th>
<th></th>
<th></th>
</tr>
</thead>
<tbody>
<tr>
<th><$585</th>
<td>83.455399</td>
<td>83.933814</td>
<td>93.460096</td>
<td>96.610877</td>
<td>95.035486</td>
</tr>
<tr>
<th>$585-615</th>
<td>83.599686</td>
<td>83.885211</td>
<td>94.230858</td>
<td>95.900287</td>
<td>95.065572</td>
</tr>
<tr>
<th>$615-645</th>
<td>79.079225</td>
<td>81.891436</td>
<td>75.668212</td>
<td>86.106569</td>
<td>80.887391</td>
</tr>
<tr>
<th>$645-675</th>
<td>76.997210</td>
<td>81.027843</td>
<td>66.164813</td>
<td>81.133951</td>
<td>73.649382</td>
</tr>
</tbody>
</table>
</div>
Scores by School Size
# Create bins
bins2 = [0, 1000, 2000, 5000]
# Create names for bins
size_range = ['Small', 'Medium', 'Large']
school_summary["School Size"] = pd.cut(school_summary["Total Students"],
bins2, labels=size_range)
size_summary = school_summary.groupby("School Size")
size_summary = size_summary[["Average Math Score", "Average Reading Score", "% Passing Math",
"% Passing Reading", "Overall Passing Rate"]]
size_summary.mean()
<div>
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>Average Math Score</th>
<th>Average Reading Score</th>
<th>% Passing Math</th>
<th>% Passing Reading</th>
<th>Overall Passing Rate</th>
</tr>
<tr>
<th>School Size</th>
<th></th>
<th></th>
<th></th>
<th></th>
<th></th>
</tr>
</thead>
<tbody>
<tr>
<th>Small</th>
<td>83.821598</td>
<td>83.929843</td>
<td>93.550225</td>
<td>96.099437</td>
<td>94.824831</td>
</tr>
<tr>
<th>Medium</th>
<td>83.374684</td>
<td>83.864438</td>
<td>93.599695</td>
<td>96.790680</td>
<td>95.195187</td>
</tr>
<tr>
<th>Large</th>
<td>77.746417</td>
<td>81.344493</td>
<td>69.963361</td>
<td>82.766634</td>
<td>76.364998</td>
</tr>
</tbody>
</table>
</div>
Scores by School Type
type_summary = school_summary.groupby("School Type")
type_summary = type_summary[["Average Math Score", "Average Reading Score", "% Passing Math",
"% Passing Reading", "Overall Passing Rate"]]
type_summary.mean()
<div>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>Average Math Score</th>
<th>Average Reading Score</th>
<th>% Passing Math</th>
<th>% Passing Reading</th>
<th>Overall Passing Rate</th>
</tr>
<tr>
<th>School Type</th>
<th></th>
<th></th>
<th></th>
<th></th>
<th></th>
</tr>
</thead>
<tbody>
<tr>
<th>Charter</th>
<td>83.473852</td>
<td>83.896421</td>
<td>93.620830</td>
<td>96.586489</td>
<td>95.103660</td>
</tr>
<tr>
<th>District</th>
<td>76.956733</td>
<td>80.966636</td>
<td>66.548453</td>
<td>80.799062</td>
<td>73.673757</td>
</tr>
</tbody>
</table>
</div>