Home

Awesome

Observed Trends:

# Import dependencies
import pandas as pd
import numpy as np

# Create paths to csv files
school_path = "raw_data/schools_complete.csv"
student_path = "raw_data/students_complete.csv"

# Read the csv files
schools = pd.read_csv(school_path)
schools = schools.rename(columns={"name": "school"}) # so we can merge later

students = pd.read_csv(student_path)
students = pd.DataFrame(students)

District Summary

# Calculate total schools, students, and budget of district
total_schools = schools['School ID'].count()
total_students = schools['size'].sum()
total_budget = schools['budget'].sum()

# Calculate average math and reading scores
avg_math = students['math_score'].mean()
avg_read = students['reading_score'].mean()

# Calculate percent passing rates
pass_math = students.loc[students['math_score'] >= 70]
per_math = len(pass_math)/ total_students * 100

pass_read = students.loc[students['reading_score'] >= 70]
per_read = len(pass_read)/ total_students * 100

overall = (per_math + per_read)/2 

# Create dataframe for district summary
district_summary = pd.DataFrame({"Total Schools": total_schools,
                                "Total Students": total_students,
                                "Total Budget": total_budget,
                                "Average Math Score": avg_math,
                                "Average Reading Score": avg_read,
                                "% Passing Math": per_math,
                                "% Passing Reading": per_read,
                                "% Overall Passing Rate": [overall]})

#Reorganize columns with double brackets
district_summary = district_summary[["Total Schools", "Total Students", "Total Budget", 
                                     "Average Math Score", "Average Reading Score", 
                                     "% Passing Math", "% Passing Reading", "% Overall Passing Rate"]]

# Format to "$0,000.00"
district_summary['Total Budget'] = district_summary['Total Budget'].map('${:,.2f}'.format)

district_summary
<div> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>Total Schools</th> <th>Total Students</th> <th>Total Budget</th> <th>Average Math Score</th> <th>Average Reading Score</th> <th>% Passing Math</th> <th>% Passing Reading</th> <th>% Overall Passing Rate</th> </tr> </thead> <tbody> <tr> <th>0</th> <td>15</td> <td>39170</td> <td>$24,649,428.00</td> <td>78.985371</td> <td>81.87784</td> <td>74.980853</td> <td>85.805463</td> <td>80.393158</td> </tr> </tbody> </table> </div>

School Summary

# Merge schools and students
merge_df = pd.merge(schools, students, on="school")
merge_df = merge_df.drop(['School ID', 'Student ID'], axis=1)

# Set school as index, get type, total students, total budget and per student budget
school_index = schools.set_index('school')
school_type = school_index['type']
school_students = school_index['size']
school_budget = school_index['budget']
psb = school_budget/school_students

# Set school as index and groupby school for merge_df to get average scores by school
index = merge_df.set_index('school')
grouped = index.groupby(['school'])

# Get average math and reading score
school_avg_math = grouped['math_score'].mean()
school_avg_read = grouped['reading_score'].mean()

# Calculate percent that passed math
total_stu = grouped['name'].count()
grouped_math = pass_math.groupby('school')
school_pass_math = grouped_math['name'].count()/total_stu*100

# Calculate percent that passed reading
grouped_reading = pass_read.groupby('school')
school_pass_read = grouped_reading['name'].count()/total_stu*100
                             
# Calculate overall passing rate
overall_pass = (school_pass_math + school_pass_read)/2

# Create dataframe for school summary
school_summary = pd.DataFrame({"School Type": school_type,
                              "Total Students": school_students,
                              "Total School Budget": school_budget,
                              "Per Student Budget": psb,
                              "Average Math Score": school_avg_math,
                              "Average Reading Score": school_avg_read,
                              "% Passing Math": school_pass_math,
                              "% Passing Reading": school_pass_read,
                              "Overall Passing Rate": overall_pass})

# Reorganize school summary columns 
school_summary = school_summary[["School Type", "Total Students", "Total School Budget", 
                                 "Per Student Budget", "Average Math Score", "Average Reading Score", 
                                 "% Passing Math", "% Passing Reading", "Overall Passing Rate"]]

# Format to "$0,000.00"
school_summary['Total School Budget'] = school_summary['Total School Budget'].map('${:,.2f}'.format)
school_summary['Per Student Budget'] = school_summary['Per Student Budget'].map('${:,.2f}'.format)

school_summary.head()
<div> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>School Type</th> <th>Total Students</th> <th>Total School Budget</th> <th>Per Student Budget</th> <th>Average Math Score</th> <th>Average Reading Score</th> <th>% Passing Math</th> <th>% Passing Reading</th> <th>Overall Passing Rate</th> </tr> </thead> <tbody> <tr> <th>Bailey High School</th> <td>District</td> <td>4976</td> <td>$3,124,928.00</td> <td>$628.00</td> <td>77.048432</td> <td>81.033963</td> <td>66.680064</td> <td>81.933280</td> <td>74.306672</td> </tr> <tr> <th>Cabrera High School</th> <td>Charter</td> <td>1858</td> <td>$1,081,356.00</td> <td>$582.00</td> <td>83.061895</td> <td>83.975780</td> <td>94.133477</td> <td>97.039828</td> <td>95.586652</td> </tr> <tr> <th>Figueroa High School</th> <td>District</td> <td>2949</td> <td>$1,884,411.00</td> <td>$639.00</td> <td>76.711767</td> <td>81.158020</td> <td>65.988471</td> <td>80.739234</td> <td>73.363852</td> </tr> <tr> <th>Ford High School</th> <td>District</td> <td>2739</td> <td>$1,763,916.00</td> <td>$644.00</td> <td>77.102592</td> <td>80.746258</td> <td>68.309602</td> <td>79.299014</td> <td>73.804308</td> </tr> <tr> <th>Griffin High School</th> <td>Charter</td> <td>1468</td> <td>$917,500.00</td> <td>$625.00</td> <td>83.351499</td> <td>83.816757</td> <td>93.392371</td> <td>97.138965</td> <td>95.265668</td> </tr> </tbody> </table> </div>

Top Performing Schools (By Passing Rate)

# Create top performing schools summary by passing rate
top_summary = school_summary.loc[school_summary['Overall Passing Rate'] > 90]
top_summary.sort_values(['Overall Passing Rate'], ascending=False).head()
<div> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>School Type</th> <th>Total Students</th> <th>Total School Budget</th> <th>Per Student Budget</th> <th>Average Math Score</th> <th>Average Reading Score</th> <th>% Passing Math</th> <th>% Passing Reading</th> <th>Overall Passing Rate</th> </tr> </thead> <tbody> <tr> <th>Cabrera High School</th> <td>Charter</td> <td>1858</td> <td>$1,081,356.00</td> <td>$582.00</td> <td>83.061895</td> <td>83.975780</td> <td>94.133477</td> <td>97.039828</td> <td>95.586652</td> </tr> <tr> <th>Thomas High School</th> <td>Charter</td> <td>1635</td> <td>$1,043,130.00</td> <td>$638.00</td> <td>83.418349</td> <td>83.848930</td> <td>93.272171</td> <td>97.308869</td> <td>95.290520</td> </tr> <tr> <th>Pena High School</th> <td>Charter</td> <td>962</td> <td>$585,858.00</td> <td>$609.00</td> <td>83.839917</td> <td>84.044699</td> <td>94.594595</td> <td>95.945946</td> <td>95.270270</td> </tr> <tr> <th>Griffin High School</th> <td>Charter</td> <td>1468</td> <td>$917,500.00</td> <td>$625.00</td> <td>83.351499</td> <td>83.816757</td> <td>93.392371</td> <td>97.138965</td> <td>95.265668</td> </tr> <tr> <th>Wilson High School</th> <td>Charter</td> <td>2283</td> <td>$1,319,574.00</td> <td>$578.00</td> <td>83.274201</td> <td>83.989488</td> <td>93.867718</td> <td>96.539641</td> <td>95.203679</td> </tr> </tbody> </table> </div>

Bottom Performing Schools (By Passing Rate)

# Create bottom performing schools summary by passing rate
bottom_summary = school_summary.loc[school_summary['Overall Passing Rate'] < 75]
bottom_summary.sort_values(['Overall Passing Rate'], ascending=True).head()
<div> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>School Type</th> <th>Total Students</th> <th>Total School Budget</th> <th>Per Student Budget</th> <th>Average Math Score</th> <th>Average Reading Score</th> <th>% Passing Math</th> <th>% Passing Reading</th> <th>Overall Passing Rate</th> </tr> </thead> <tbody> <tr> <th>Rodriguez High School</th> <td>District</td> <td>3999</td> <td>$2,547,363.00</td> <td>$637.00</td> <td>76.842711</td> <td>80.744686</td> <td>66.366592</td> <td>80.220055</td> <td>73.293323</td> </tr> <tr> <th>Figueroa High School</th> <td>District</td> <td>2949</td> <td>$1,884,411.00</td> <td>$639.00</td> <td>76.711767</td> <td>81.158020</td> <td>65.988471</td> <td>80.739234</td> <td>73.363852</td> </tr> <tr> <th>Huang High School</th> <td>District</td> <td>2917</td> <td>$1,910,635.00</td> <td>$655.00</td> <td>76.629414</td> <td>81.182722</td> <td>65.683922</td> <td>81.316421</td> <td>73.500171</td> </tr> <tr> <th>Johnson High School</th> <td>District</td> <td>4761</td> <td>$3,094,650.00</td> <td>$650.00</td> <td>77.072464</td> <td>80.966394</td> <td>66.057551</td> <td>81.222432</td> <td>73.639992</td> </tr> <tr> <th>Ford High School</th> <td>District</td> <td>2739</td> <td>$1,763,916.00</td> <td>$644.00</td> <td>77.102592</td> <td>80.746258</td> <td>68.309602</td> <td>79.299014</td> <td>73.804308</td> </tr> </tbody> </table> </div>

Math Scores by Grade

# Math scores by grade
ninth = students.loc[students['grade']=='9th'].groupby("school")
ninth_math = ninth['math_score'].mean()
tenth = students.loc[students['grade']=='10th'].groupby("school")
tenth_math = tenth['math_score'].mean()
eleventh = students.loc[students['grade']=='11th'].groupby("school")
eleventh_math = eleventh['math_score'].mean()
twelfth = students.loc[students['grade']=='12th'].groupby("school")
twelfth_math = twelfth['math_score'].mean()

# Create dataframe for math scores summary
math_summary = pd.DataFrame({"9th": ninth_math,
                            "10th": tenth_math,
                            "11th": eleventh_math,
                            "12th": twelfth_math})
math_summary = math_summary[["9th","10th","11th","12th"]]
del math_summary.index.name

math_summary.head()
<div> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>9th</th> <th>10th</th> <th>11th</th> <th>12th</th> </tr> </thead> <tbody> <tr> <th>Bailey High School</th> <td>77.083676</td> <td>76.996772</td> <td>77.515588</td> <td>76.492218</td> </tr> <tr> <th>Cabrera High School</th> <td>83.094697</td> <td>83.154506</td> <td>82.765560</td> <td>83.277487</td> </tr> <tr> <th>Figueroa High School</th> <td>76.403037</td> <td>76.539974</td> <td>76.884344</td> <td>77.151369</td> </tr> <tr> <th>Ford High School</th> <td>77.361345</td> <td>77.672316</td> <td>76.918058</td> <td>76.179963</td> </tr> <tr> <th>Griffin High School</th> <td>82.044010</td> <td>84.229064</td> <td>83.842105</td> <td>83.356164</td> </tr> </tbody> </table> </div>

Reading Scores by Grade

# Reading scores by grade
r_ninth = students.loc[students['grade'] == '9th'].groupby("school")
ninth_read = r_ninth['reading_score'].mean()
r_tenth = students.loc[students['grade'] == '10th'].groupby("school")
tenth_read = r_tenth['reading_score'].mean()
r_eleventh = students.loc[students['grade'] == '11th'].groupby("school")
eleventh_read = r_eleventh['reading_score'].mean()
r_twelfth = students.loc[students['grade'] == '12th'].groupby("school")
twelfth_read = r_twelfth['reading_score'].mean()

# Create dataframe for reading scores summary
read_summary = pd.DataFrame({"9th": ninth_read,
                            "10th": tenth_read,
                            "11th": eleventh_read,
                            "12th": twelfth_read})
read_summary = read_summary[["9th","10th","11th","12th"]]
del read_summary.index.name

read_summary.head()
<div> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>9th</th> <th>10th</th> <th>11th</th> <th>12th</th> </tr> </thead> <tbody> <tr> <th>Bailey High School</th> <td>81.303155</td> <td>80.907183</td> <td>80.945643</td> <td>80.912451</td> </tr> <tr> <th>Cabrera High School</th> <td>83.676136</td> <td>84.253219</td> <td>83.788382</td> <td>84.287958</td> </tr> <tr> <th>Figueroa High School</th> <td>81.198598</td> <td>81.408912</td> <td>80.640339</td> <td>81.384863</td> </tr> <tr> <th>Ford High School</th> <td>80.632653</td> <td>81.262712</td> <td>80.403642</td> <td>80.662338</td> </tr> <tr> <th>Griffin High School</th> <td>83.369193</td> <td>83.706897</td> <td>84.288089</td> <td>84.013699</td> </tr> </tbody> </table> </div>

Scores by School Spending

# Create bins
bins = [0,585,615,645,675]

# Create names for the bins
spending_range = ['<$585','$585-615','$615-645','$645-675']

# Change formatting of per student budget in school_summary from string back to float so it can be binned 
school_summary['Per Student Budget'] = school_summary['Per Student Budget'].str.replace('$', '')
school_summary['Per Student Budget'] = school_summary['Per Student Budget'].astype(float)
                                                      
school_summary["Spending Ranges (Per Student)"] = pd.cut(school_summary["Per Student Budget"], 
                                                         bins, labels=spending_range)
spend_summary = school_summary.groupby("Spending Ranges (Per Student)")
spend_summary = spend_summary[["Average Math Score", "Average Reading Score", "% Passing Math", 
                               "% Passing Reading", "Overall Passing Rate"]]
spend_summary.mean()
<div> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>Average Math Score</th> <th>Average Reading Score</th> <th>% Passing Math</th> <th>% Passing Reading</th> <th>Overall Passing Rate</th> </tr> <tr> <th>Spending Ranges (Per Student)</th> <th></th> <th></th> <th></th> <th></th> <th></th> </tr> </thead> <tbody> <tr> <th>&lt;$585</th> <td>83.455399</td> <td>83.933814</td> <td>93.460096</td> <td>96.610877</td> <td>95.035486</td> </tr> <tr> <th>$585-615</th> <td>83.599686</td> <td>83.885211</td> <td>94.230858</td> <td>95.900287</td> <td>95.065572</td> </tr> <tr> <th>$615-645</th> <td>79.079225</td> <td>81.891436</td> <td>75.668212</td> <td>86.106569</td> <td>80.887391</td> </tr> <tr> <th>$645-675</th> <td>76.997210</td> <td>81.027843</td> <td>66.164813</td> <td>81.133951</td> <td>73.649382</td> </tr> </tbody> </table> </div>

Scores by School Size

# Create bins
bins2 = [0, 1000, 2000, 5000]

# Create names for bins
size_range = ['Small', 'Medium', 'Large']

school_summary["School Size"] = pd.cut(school_summary["Total Students"], 
                                                         bins2, labels=size_range)
size_summary = school_summary.groupby("School Size")
size_summary = size_summary[["Average Math Score", "Average Reading Score", "% Passing Math", 
                             "% Passing Reading", "Overall Passing Rate"]]
size_summary.mean()
<div> </style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>Average Math Score</th> <th>Average Reading Score</th> <th>% Passing Math</th> <th>% Passing Reading</th> <th>Overall Passing Rate</th> </tr> <tr> <th>School Size</th> <th></th> <th></th> <th></th> <th></th> <th></th> </tr> </thead> <tbody> <tr> <th>Small</th> <td>83.821598</td> <td>83.929843</td> <td>93.550225</td> <td>96.099437</td> <td>94.824831</td> </tr> <tr> <th>Medium</th> <td>83.374684</td> <td>83.864438</td> <td>93.599695</td> <td>96.790680</td> <td>95.195187</td> </tr> <tr> <th>Large</th> <td>77.746417</td> <td>81.344493</td> <td>69.963361</td> <td>82.766634</td> <td>76.364998</td> </tr> </tbody> </table> </div>

Scores by School Type

type_summary = school_summary.groupby("School Type")
type_summary = type_summary[["Average Math Score", "Average Reading Score", "% Passing Math", 
                             "% Passing Reading", "Overall Passing Rate"]]
type_summary.mean()
<div> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>Average Math Score</th> <th>Average Reading Score</th> <th>% Passing Math</th> <th>% Passing Reading</th> <th>Overall Passing Rate</th> </tr> <tr> <th>School Type</th> <th></th> <th></th> <th></th> <th></th> <th></th> </tr> </thead> <tbody> <tr> <th>Charter</th> <td>83.473852</td> <td>83.896421</td> <td>93.620830</td> <td>96.586489</td> <td>95.103660</td> </tr> <tr> <th>District</th> <td>76.956733</td> <td>80.966636</td> <td>66.548453</td> <td>80.799062</td> <td>73.673757</td> </tr> </tbody> </table> </div>