CMSC320 Final Project -- Women and Minorities in STEM

Written by: Jakob Wachter

Welcome, ye weary traveller! It appears you have decided to learn the art of data science! Join me as we journey through the data science pipeline, which consists of five distinct phases, taking us from data collection all the way to insight and conclusions.

In order to travel through the entirety of the data science pipeline, we motivate ourselves with one simple question: Are women and minorities underrepresented in STEM, and if so, by how much? We will try to find an answer to this question by going through the data science pipeline bit-by-bit, and hopefully our analyses will help us draw a meaningful conclusion.

*Do note that for the majority of this document, basic knowledge of Python is assumed. Reliance on deep understanding of the language is kept to a minimum, but familiarity with it may help clarify certain actions.

Data Collection

The first step of the data science pipeline is data collection. In effect, data collection is the process of compiling large amounts of information regarding our topic of interest, so that we may perform analysis on it and draw insight from it later on.

The source of our data on this topic is the High School Longitudinal Study of 2009 (HSLS:09), which is a survey of over 20,000 high school age students. The students in this survey respond to a variety of questions, including sex, race, familial relationship, current academic ability, and subjective questions relating to that student's experience. This data is provided by the US National Center for Education Statistics (NCES), which is an authoritative source on public educational data for the United States.

Longitudinal refers to the fact that the data is collected over a long period of time--this data begins with the students in ninth grade and follows their educational career well into their tertiary education.

We will begin by loading in all of the libraries that we will need to perform our collection. Our primary tool for this section, and generally throughout our analysis, is the pandas library in Python. Pandas provides a fantastic object known as the DataFrame: a headered table whose columns can each carry a different data type. It is fast, flexible, and easy to use. You can read more about pandas here. pandas relies on numpy for its calculations, so we pre-load numpy as well in case we want to use any of its functions.
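As a minimal sketch, loading the two libraries looks like this:

```python
# Core libraries for the rest of the walkthrough.
import numpy as np   # numerical routines that pandas is built on
import pandas as pd  # DataFrame handling
```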

With that out of the way, let us now import the HSLS:09 study. This data is included on my computer in the file hsls_17_student_pets_sr_v1_0.csv; csv stands for "comma separated values" and is a very common format for raw data collection. If one would like to download this data for themselves, here is how to do it:

  1. Head to this website, agree to the terms of use, and then hit "close" on the popup that appears.
  2. Click on the only link under the header HSLS.
  3. Click "Download" on the right side and select "R".
  4. In the first box that says "R Formatted Data Files", click and scroll down to the option that says "CSV".
  5. Click on the first download link that appears.

Be wary if you would like to download this data for your home computer: the full .csv file contains 888.2MB of information when unzipped! Reading this file can take a while on just about any computer out there.

So we have our data. Note that this data is over 9,000 columns wide, which is a ridiculous amount of information! First, what I would like to do is drop (database terminology for getting rid of) all of the columns that involve information regarding replicate weights.

Replicate weights essentially allow for one to derive standard errors from survey responses. While this is an important way to ensure results are slightly more accurate, it is unnecessary for our purposes and adds many thousands of columns of data, so we will drop it.

(Every column after column 4014 is a replicate weight, which is why that number was chosen.)
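A sketch of this load-and-drop step, assuming the filename from the download instructions; a tiny stand-in frame (with invented column names) replaces the real 888 MB file so the example runs anywhere:

```python
import numpy as np
import pandas as pd

# Loading the real file would look like this (slow, ~888 MB unzipped):
#   students = pd.read_csv("hsls_17_student_pets_sr_v1_0.csv", low_memory=False)
# For illustration, mock a small frame whose trailing columns play the role
# of the replicate weights (column names here are invented):
rng = np.random.default_rng(0)
cols = ["X1SEX", "X1RACE", "W1STUDENT001", "W1STUDENT002", "W1STUDENT003"]
students = pd.DataFrame(rng.integers(1, 9, size=(4, 5)), columns=cols)

# In the real data, every column after position 4014 is a replicate weight,
# so slicing with .iloc keeps only the columns before that cutoff. Here the
# cutoff is 2 instead of 4014:
students = students.iloc[:, :2]
```

On the full dataset the same `.iloc` pattern applies, with the cutoff at the replicate-weight boundary instead of 2.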

With this load and change, we have completed the data collection process. Of course, other interests or studies may require a broader range of collection tools, such as scraping data from web pages or querying a public API.

Of course, our data exists in pandas DataFrame format, but that does not mean that it is entirely useful just yet. This, of course, leads us to...

Data Processing

We now move onward to the task of trying to process the data in a way that makes it more "palatable" to a human reader. What we'd like to do is extract the useful information from these 4000 columns to make sure we specifically highlight the aspects we're trying to look at. This way, analysis of these factors comes more quickly to us in the future.

Our first task in the data processing step is to get rid of all of the suppressed columns in the dataset. The NCES website mentions that the public data available to anyone from the HSLS:09 withholds some of the information given in the survey responses, like social strata, what college a student went to, their school code, and other similar information.

Thankfully, the NCES is extremely thorough in their documentation and has helpfully included a "layout" file that tells us what each column means and what values it can take. Columns where the data is suppressed simply read -5 for all values, which tells us that the data was removed specifically for the public release. We will move forward with getting rid of all of these columns now.
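One way to do this, sketched on a small stand-in frame (the variable names below are illustrative, not the real HSLS codes):

```python
import pandas as pd

# Stand-in data: one column is suppressed (-5 for every row).
students = pd.DataFrame({
    "X1SEX": [1, 2, 1],
    "SCHOOL_CODE": [-5, -5, -5],  # suppressed in the public release
    "MATH_SCORE": [0.4, -1.2, 0.8],
})

# Find every column whose values are all -5, then drop them in one go.
suppressed = [col for col in students.columns if (students[col] == -5).all()]
students = students.drop(columns=suppressed)
```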

So now we've gotten rid of suppressed data, which thankfully helps a lot with thinning out the columns. Unfortunately, there are still over 3,000 left. This is where we must get selective with what we keep and what we don't.

The NCES layout file for the HSLS:09 appears to categorize the variables into distinct series according to the first letter of their variable names.

In order to "thin the herd", so to speak, we can get rid of the P, M, N, A, and C series variables. These are important metrics for academic success, of course, but we're focused more on the specific question that involves how a student's sex and ethnicity affects their scholastic achievement. As a consequence, we are safe to remove these variables.

Let us do that now:
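A sketch of the removal, using a handful of invented column names to stand in for the real series:

```python
import pandas as pd

# Invented column names, one per series, standing in for the real variables.
students = pd.DataFrame(columns=["X1SEX", "S1SUREALG", "P1RELSHP",
                                 "M1SEX", "N1SEX", "A1FULLPART", "C1FBGPA"])

# Drop every column belonging to the P, M, N, A, or C series.
drop_prefixes = ("P", "M", "N", "A", "C")
students = students[[c for c in students.columns
                     if not c.startswith(drop_prefixes)]]
```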

Fantastic! We've removed a good chunk of the columns and are now down to less than half of what we started with. But that still leaves us with almost 2000 columns to play with--so which ones are useful to us?

This is where hand-combing through the rest will take us. We've already performed systematic removal of a lot of variables, so now we can start playing with going through one-by-one and looking for ones that interest us. Specifically, I'd like the variables to answer two very specific questions:

  1. Does sex impact STEM representation? If so, why?
  2. Does ethnicity or race impact STEM representation? If so, why?

For this reason, I decided to keep only explicit data on sex and ethnicity, as well as a small set of survey questions that I thought sounded interesting and that primarily gauge the student's perceptions of STEM and academic success. Unfortunately, we cannot use gender data, as it was suppressed in the public set.

These specific variables are listed in the arrays in the code below. We now filter specifically these columns. Explanations of the different variables will come when they are of importance to us--for now, rest assured that I have spent quite some time in picking out interesting variables for us to use.
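The filtering itself is a one-liner; here is the pattern on a stand-in frame (the variable lists below are illustrative placeholders for the curated arrays described above):

```python
import pandas as pd

# Placeholder variable lists; the real arrays hold the hand-picked HSLS codes.
demographics = ["X1SEX", "X1RACE"]
survey_vars = ["MATH_PERSON", "SCI_PERSON"]

students = pd.DataFrame({
    "X1SEX": [1, 2], "X1RACE": [8, 3],
    "MATH_PERSON": [1, 4], "SCI_PERSON": [2, 3],
    "MATH_SCORE": [0.4, -1.2],  # on neither list, so it is dropped
})

# Selecting with a list of column names keeps exactly those columns, in order.
students = students[demographics + survey_vars]
```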

Now we have a database with a bunch of data and a curated list of important and useful columns, which we can now use to do exploratory analysis.

"Great!", you may be saying at this very moment. But what about the rows? We haven't done any useful analysis on those. All we want is to remove rows where race or sex is not present. According to the HSLS:09 layout, this corresponds to a value in that row of -9; so let's remove all rows with that value.
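Filtering those rows out can be done with a boolean mask; a sketch, again on stand-in data, with -9 as the missing-response code from the layout:

```python
import pandas as pd

students = pd.DataFrame({
    "X1SEX": [1, -9, 2],
    "X1RACE": [8, 3, -9],
})

# Keep only rows where both sex and race are known (-9 marks a missing
# response). The .copy() sidesteps pandas' SettingWithCopyWarning if we
# modify the filtered frame later.
known = (students["X1SEX"] != -9) & (students["X1RACE"] != -9)
students = students[known].copy()
```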

This gets rid of a final 1000 rows that wouldn't be able to help us answer our question, as their sex and race are unknown. pandas gives us a warning, but that's something we can safely ignore moving forward, since we're not going to be using the original database any more. With this, we can finally move forward with performing analysis on our data. And we can also rest assured that it is nice and tidy!

Exploratory Analysis

This is the intermediate phase of the data science pipeline. We are moving away from the data wrangling portion and towards the analysis and testing portion of the pipeline. This is usually the more fun part on the side of the data scientist, as one stops futzing with huge databases and starts playing with data.

Now is as important a time as ever to note that although we are traversing what is known as the data science pipeline, that doesn't necessarily mean the pipeline is linear. A better analogy is to think of working in data science as solving an escape room puzzle. Oftentimes one will need to go back and look at old clues and sections of the "room" before moving forward to the next step. Don't be afraid to go back and edit your database should the need arise!

The point of this section is mainly to get creative and see if you observe any interesting trends. To that end, we'll be performing five different analyses, to see if there's anything interesting worth noting. We will go through them one by one. The point of this isn't to create a research paper out of each one, merely to think of interesting ideas that utilize the data we've been working on and to see if there are any interesting ideas to explore. It's okay if an idea ends up being a dead end or uninteresting!

Analysis 1: High School Graduates

These ideas fall roughly in the order in which I thought to explore them. My immediate thought was to attempt to see how many ninth graders ended up graduating from high school, and to examine how this number changed across race and sex. This way we can see if the United States has any notable discrepancies.

This involves the usage of the following variables:

To perform this analysis, we will generate two tables that tally the type of high school completion by race--one for men, and the other for women. We will also calculate, within each race and sex, the proportion of students reaching each outcome. Let us do this now:
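A sketch of the tallies using pd.crosstab on stand-in data (the codings below--1/2 for sex, small integers for race and completion status--are assumptions for the example, not the exact HSLS values):

```python
import pandas as pd

students = pd.DataFrame({
    "X1SEX": [1, 1, 2, 2, 1, 2],          # 1 = male, 2 = female (assumed coding)
    "X1RACE": [8, 3, 8, 3, 8, 8],
    "HS_COMPLETION": [1, 2, 1, 1, 1, 3],  # completion-type code (assumed)
})

men = students[students["X1SEX"] == 1]
# Raw tallies of completion type per race...
men_counts = pd.crosstab(men["X1RACE"], men["HS_COMPLETION"])
# ...and the same table normalized so each race's row sums to 1.
men_props = pd.crosstab(men["X1RACE"], men["HS_COMPLETION"], normalize="index")
```

Repeating the two calls with `X1SEX == 2` produces the women's tables.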

We note a couple of interesting points here, which could be worth expounding upon:

Do you notice any other interesting trends?

Analysis 2: Enrollment Rate over Time

My next idea was to determine the retention rate of students by sex and ethnicity. To do this, we will use the following variables. Note that this analysis is a bit more subjective, since the metrics do not measure exactly the same thing at every sample point.

Unfortunately, since the survey and its questions were different at each sample point, the retention rate measurement can't be perfectly equivalent across waves. Hence, I decided to do the following:

  1. For the first sample point, all races and sexes are at 100% retention rate, since it is assumed they are in ninth grade automatically.
  2. For the second sample point, use the enrollment stat to determine retention rate.
  3. For the third sample point, subtract the number of dropped students from the total number to get the current enrollment rate.
  4. For the fourth sample point, use the application stat to determine retention rate.
  5. For the fifth sample point, use anyone who had received at least an associate's degree.

I chose these specific points for their flexibility regarding gap years, but also to track which students went on to pursue college. Ideally they will give us a rough idea as to how many students are actively pursuing school at each survey point.
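The five rules above can be sketched as a small groupby computation. The per-wave flag columns here are invented stand-ins for the real survey variables:

```python
import pandas as pd

# Invented per-student flags for waves 2-5 (1 = yes, 0 = no).
students = pd.DataFrame({
    "X1RACE": [8, 8, 3, 3],
    "W2_ENROLLED": [1, 1, 1, 0],
    "W3_DROPPED": [0, 1, 0, 0],
    "W4_APPLIED": [1, 0, 1, 0],
    "W5_DEGREE": [1, 0, 1, 0],
})

by_race = students.groupby("X1RACE")
rates = pd.DataFrame({
    "wave1": 1.0,                               # everyone starts enrolled
    "wave2": by_race["W2_ENROLLED"].mean(),     # enrollment stat
    "wave3": 1 - by_race["W3_DROPPED"].mean(),  # total minus dropouts
    "wave4": by_race["W4_APPLIED"].mean(),      # application stat
    "wave5": by_race["W5_DEGREE"].mean(),       # at least an associate's
})
```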

Note that the fifth survey occurred towards the end of the entire study, so the data from X5HIGHDEG likely understates the proportion of the entire sample that ultimately achieved an associate's degree.

Given the type of analyses we performed earlier, do you notice anything interesting in these findings? Any surprises? Think about what you might want to test or examine further, having looked at these numbers. What stands out to you?

Analysis 3: Involvement with STEM

My third analysis, and the first of my original analysis ideas, is to see what involvement people of a given sex and race have with the STEM discipline. Does it appear that a given race or sex is more attracted to STEM than another? To answer this question, we use the following variables:

What I'd like to do is create two different tables that highlight the following:

  1. Do certain races or sexes have a higher amount of STEM degrees, and by how much?
  2. Do certain races or sexes have a higher GPA in STEM classes than others, and how wide is the disparity if it exists?

To do this, we will look at the number of degrees awarded by race and use those counts to find the proportion of a given race that earns a certain degree. We can compare GPAs by finding the mean GPA of each race/sex group, both for their STEM classes and for every class.

Here we use the mean, which is a measure of central tendency; other measures that could work appropriately include the median. I use the mean because I am making the assumption that GPAs are roughly normally distributed within each race and sex, which means that the probability of having a GPA above or below the mean falls off in roughly equal proportion on either side. Under that assumption, extreme outliers are rare enough that the mean is not overly distorted by them, and it can be used to earmark what an "average" GPA for a given race or sex might look like.
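The comparison boils down to a groupby-and-mean; a sketch on stand-in data (the GPA column names are invented):

```python
import pandas as pd

students = pd.DataFrame({
    "X1RACE": [8, 8, 3, 3],
    "X1SEX": [1, 2, 1, 2],
    "STEM_GPA": [3.2, 3.6, 3.0, 3.8],     # invented column names
    "OVERALL_GPA": [3.4, 3.7, 3.1, 3.9],
})

# Mean STEM and overall GPA for each race/sex group.
gpa_means = students.groupby(["X1RACE", "X1SEX"])[["STEM_GPA", "OVERALL_GPA"]].mean()
```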

Analysis 4: Are they Gifts?

The last two analyses are not quite as in depth. They were simply curiosities of mine that happened to arise while thinking about the previous 3 analyses. For this analysis, I am curious to see whether or not students who are naturally gifted at math (or, on the contrary, are not good at it) believe that the ability to do mathematics is a skill or a trait. To do this, the following variables are used:

Here we are interested in seeing whether or not the response correlates with the self-efficacy of the student. I am also interested in seeing how sex correlates with these responses.

There are many more variables that I was unable to perform analyses on, but that piqued my curiosity. See if you can think of things to do with them! The ones I did not use are numerous and test a variety of aspects of a student's life and beliefs. You can find out what they mean and use them by doing the following:

  1. Perform the same steps 1-4 as were needed to download the raw csv dataset we are working with.
  2. Download the "Codebook and Layout" instead.
  3. The variable names and meanings are located in the file Layout_STUDENT.txt. You can use Ctrl+F on your keyboard to search for certain variables and find out what they mean.

Try to think of some interesting tests that may yield surprising results. Consider the following analysis, if you are hesitant to try to find something new:

Extra Analysis: Is there Bias?

Suppose that you want to see what the students thought about racial and gender bias amongst their professors. This involves the usage of the following variables:

Think about what type of categories (combinations of race/sex) that might produce interesting results for you to look at. What kinds of questions might you be able to ask from your final table? Do you think plotting it might give you something visually interesting?

With our analyses complete, we now move to doing something pretty with our analysis!

Data Visualization

Now that we have successfully performed a variety of analyses on our data, it is in our best interest to get a visual representation of what we've been doing this whole time! The whole point of this is to make pretty pictures and see if any trends appear visually that aren't immediately obvious from the tables and data provided above.

Note that normally this step and the previous one are done in tandem; usually, once one has completed some bit of exploratory analysis that yields a subset of the data or an interesting table, they move straight ahead to plotting it, looking for visually interesting characteristics. And, after all, who doesn't like looking at pretty pictures?

I have decided to split these two sections up specifically to recognize that these are two different ways of tackling the problem of understanding and looking for trends in your data. A good data scientist takes a hybrid approach in this matter.

Analysis 1

For Analysis 1, our culminating result was a table giving the proportion of students from a given race that had one of five high school outcomes: graduation, a GED or high school equivalent, dropping out, continuing education, or unknown.

The best way I could think of representing this data was by using a bar chart, to see how each type of high school outcome stacked against one another.

One can make nice plots of pandas data by using the matplotlib library in Python. We will specifically be using the pyplot package of matplotlib, but the entire library is extensive and extremely useful! You can find more information about it at this link.
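A sketch of such a bar chart with pyplot, on made-up proportions (the outcome labels and numbers are illustrative):

```python
import matplotlib
matplotlib.use("Agg")  # headless backend so the sketch runs anywhere
import matplotlib.pyplot as plt
import pandas as pd

# Illustrative outcome proportions for two race groups.
outcomes = pd.DataFrame(
    {"Diploma": [0.85, 0.78], "GED": [0.05, 0.08], "Dropped out": [0.10, 0.14]},
    index=["Race A", "Race B"],
)

# DataFrame.plot.bar draws one group of bars per row.
ax = outcomes.plot.bar(rot=0)
ax.set_xlabel("Race")
ax.set_ylabel("Proportion of students")
ax.set_title("High school outcomes by race")
plt.tight_layout()
plt.savefig("analysis1_bar.png")
```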

Visually, our plot tells us much the same story as our original analysis--however, it makes the job of analyzing that data much easier, and quite interesting! Hopefully now you understand why the two are usually performed in tandem.

Analysis 2

Part of data science is being able to effectively communicate the information that you have synthesized in your analyses. A critical part of that is ensuring that you are using the proper type of graph for the data that you have on hand. For example, I could have done a scatter plot for Analysis 1, but it would be almost meaningless--you would end up leaving the graph with more questions than answers, some of them relating to my well-being and sanity. However, since I chose a bar graph, you likely got a nice visual look at what was going on with the data. Great! That is the goal.

We cannot simply approach every graph we have with a bar chart, however. Since our data for Analysis 2 tracks how retention rates change over time across a large number of students, we can use a stack plot to succinctly communicate this information.
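A pyplot stackplot sketch, using invented enrollment counts in place of the real tallies:

```python
import matplotlib
matplotlib.use("Agg")  # headless backend so the sketch runs anywhere
import matplotlib.pyplot as plt

# Invented enrollment counts at the five survey waves.
waves = [1, 2, 3, 4, 5]
white = [11000, 10600, 10100, 5800, 2900]
black = [2500, 2400, 2250, 1600, 800]
hispanic = [3600, 3450, 3300, 2200, 1100]

fig, ax = plt.subplots()
# Each series is stacked on top of the previous one, so the band heights
# show each group's share of the total at every wave.
ax.stackplot(waves, white, black, hispanic,
             labels=["White", "Black", "Hispanic"])
ax.set_xlabel("Survey wave")
ax.set_ylabel("Students enrolled")
ax.legend(loc="upper right")
fig.savefig("analysis2_stack.png")
```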

This plot told us something that simply looking at the data did not: what proportion of a race is at a given level compared to another. We note that the populations decline dramatically after high school ends, as many students choose not to attend college.

We also note that white students make up more than half of the initial data, and yet appear roughly equal to (if not fewer than) the number of minority students that apply to colleges. We could have chosen a line graph to represent this information as well, or a pie chart if we wanted to look at a certain point in time.

Analysis 3

Here our data gave us the average GPA and number of students enrolled in college for each race. Let us make two different plots: one that looks at how the difference between STEM and overall GPA varies between students, and another for the variation between men and women in STEM degree attainment.

Now it's your turn! Do you think that the data from Analysis 4, the Extra Analysis, or your own analysis could produce a visually interesting image? Try to visualize the data you've created and see what you can find!

A couple of questions you should be asking yourself along the way:

Hypothesis Testing

With our data visualization and exploratory analysis complete, let's now move forward with performing hypothesis testing.

Remember how previously we had been asking questions about our data and trying to look for visual and numerical trends? In effect, hypothesis testing is our way of quantifying how much those trends subvert our expectations, and if there really is something interesting lurking beneath all of the numbers.

So, how does one perform a hypothesis test? Well, it all starts with a hypothesis. We need to formulate the question we want to answer first before we can answer it. The questions I'd like to ask are the following two, which should give the new data scientist an idea of what kinds of questions are normally asked at this phase.

  1. Do your race and sex have an influence on your graduation rate from high school?
  2. Does your mathematical self-efficacy correlate with your belief in mathematics as a skill?

To answer these questions, we will perform two different types of hypothesis tests: chi-squared tests, which check whether observed frequencies differ significantly across groups, suitable for the first question; and linear regression analysis, which looks for linear trends in data and determines whether or not a significant correlation exists between variables. Let us answer the first question now.

Chi Squared Testing

In order to perform our Chi-Squared tests, we will be using the scipy library in Python. You can read more about its documentation at this link.

Let us now use scipy to test the hypothesis that race influences one's high school graduation rate. Here, we make an assumption, called the null hypothesis, that we would like to see whether or not we can reject. Our null hypothesis in this chi-squared test is that graduation outcomes occur at the same rate for every group--that is, that outcome is independent of race. Think about it: if our education system were ideal, this would be true, and all students would have the same graduation rate regardless of upbringing. So, let's test it now:
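A sketch of the test with scipy.stats.chi2_contingency, using an invented contingency table in place of the real tallies:

```python
import pandas as pd
from scipy.stats import chi2_contingency

# Invented counts: rows are race groups, columns are graduation outcomes.
table = pd.DataFrame(
    {"Graduated": [900, 700], "Did not graduate": [100, 300]},
    index=["Race A", "Race B"],
)

# chi2_contingency compares the observed counts with the counts expected
# under the null hypothesis of independence.
chi2, p_value, dof, expected = chi2_contingency(table)
reject_null = p_value < 0.05  # standard 5% significance level
```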

Here we find a significant association between the race of a student and the probability that they will graduate. Hence, it appears that our chi-squared test can reject the null hypothesis: a student's graduation rate is not independent of their race.

Linear Regression Modeling

Instead of scipy for this test, we will instead be using statsmodels, a robust linear regression library for Python. You can read more about its documentation at this link. Both are great tools for performing statistical analysis on your data!

You may be asking--what is a linear regression? In effect, a linear regression is just like the line of best fit that you may have seen in algebra 1 or statistics, but generalized to support any number of variables.

We can then test, in our specific example, whether or not there is a correlation between the self-efficacy, race, and sex of a student, and whether or not that student believes that mathematics is a skill that can be taught to them. Unfortunately, I do not have enough time to run this analysis now, but one can look at this specific documentation page to get a peek at what kinds of linear regression modelling can be done.
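To give a flavour of what that modelling could look like, here is a sketch with statsmodels' formula API on synthetic stand-in data (the column names and the relationship between them are fabricated purely for illustration):

```python
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

# Synthetic data: belief in math-as-a-skill loosely driven by self-efficacy.
rng = np.random.default_rng(42)
n = 200
efficacy = rng.normal(size=n)
belief = 0.5 * efficacy + rng.normal(scale=0.5, size=n)
df = pd.DataFrame({"efficacy": efficacy, "belief": belief})

# Ordinary least squares: belief regressed on efficacy.
model = smf.ols("belief ~ efficacy", data=df).fit()
slope = model.params["efficacy"]
p_value = model.pvalues["efficacy"]
```

A real analysis would add race and sex as categorical terms, e.g. `belief ~ efficacy + C(race) + C(sex)`.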

Machine Learning

Although this tutorial does not get into the very complicated subject of machine learning, ML is still an extremely valuable tool to the data scientist.

Machine Learning, at its core, is the idea that one can create a model that will detect patterns in data and utilize those patterns to make predictions. These models can then be used to look at new data points and evaluate certain things about them. For example, suppose we have finished our hypothesis testing, and want to determine the following question:

Given certain characteristics about a student and their high school education, can we predict whether or not that student will graduate from college?

We could then decide on a specific set of variables that we think make for good training data. Since a good model is motivated with sound reasoning, we need to argue for our individual choices.

As you can hopefully see, the decision space of variables for a machine learning model, especially in an example such as this, has to be carefully curated before moving forward. Understand the framework in which you are using a machine learning model, what it will be used for, and why you are letting the model train on each individual variable.

You can read more about the kinds of questions scientists are asking about machine learning ethics in this article by Nature. Additionally, one can read more about the ways in which machine learning algorithms can and will discriminate if metrics are not carefully regulated.

Once one has decided to move forward with creating a machine learning model, there are a variety of tools available to the user. One of the most popular is scikit-learn, a library designed specifically for machine learning models. You can find documentation for scikit-learn at this website. It is very extensive--for the budding data scientist, I recommend looking at the tutorials.
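As a taste of the library, here is a minimal classification sketch on synthetic data; a real model would train on carefully argued-for HSLS variables, as discussed above:

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Synthetic features (think: GPA, a survey score) and a graduation label.
rng = np.random.default_rng(0)
X = rng.normal(size=(300, 2))
y = (X[:, 0] + 0.5 * X[:, 1] + rng.normal(scale=0.5, size=300) > 0).astype(int)

# Hold out a test set so we can measure how well the model generalizes.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = LogisticRegression().fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
```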

Insight & Conclusions

We have completed our analysis of the data, and I imagine that you, intrepid reader, have learned a thing or two about data science principles! Unless you are my TA, of course, in which case you almost certainly knew all of this stuff.

Quickly, let's recap what we learned about data science and our data along the way:

So, what have we learned? Well, it appears that race does have a significant association with the graduation rates and STEM achievement of a student; I am willing to bet that, with more hypothesis testing, we could draw the same conclusions about sex, and, if we had the data, gender identity.

So, where do we go from here? We've completed our trip through the pipeline, but there's still more to be done. If you are particularly interested, you can look into ways of doing more analysis on this data, or maybe you would like to compile a larger body of data to inform policy solutions. The world is your oyster at this point. Data science is primarily a tool that can help you, rather than being its own science--it is interdisciplinary by nature!

And hopefully you, future data scientist, have learned much about what data science is, and how to do it! I look forward to seeing your own project soon enough.