%matplotlib inline
import pandas as pd
# Load the two survey exports and stack them into one frame.
# Every row is tagged with the arm it came from: 0 = control, 1 = experimental.
control = pd.read_csv("control.csv").assign(group=0)
experimental = pd.read_csv("experimental.csv").assign(group=1)

# ignore_index=True renumbers the combined rows 0..n-1 instead of keeping
# each file's own row numbers.
data = pd.concat([control, experimental], ignore_index=True)
data.head(1)
# It's cleaning time.
# Give every column a short, code-friendly name. The names are applied by
# position, so the frame is expected to have exactly these 17 columns in
# this order: 3 leading fields, q1..q10, then 4 trailing fields.
leading_cols = ['timestamp', 'likeable', 'surprising']
question_cols = ['q' + str(n) for n in range(1, 11)]
trailing_cols = ['age', 'gender', 'edu', 'group']
data.columns = leading_cols + question_cols + trailing_cols
data.head()
# Drop anyone who didn't respond to Likeable & Surprising
data.dropna(subset=['likeable', 'surprising'], inplace=True)

# And drop anyone who didn't answer at least ONE question in Q1-10
question_cols = ["q" + str(n) for n in range(1, 11)]
data.dropna(subset=question_cols, how='all', inplace=True)

# Any other blank Q's to 0.
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on data[col]: that form mutates a Series selected out
# of the frame, which is chained assignment and does not update `data` when
# pandas Copy-on-Write is active.
for col in question_cols:
    data[col] = data[col].fillna(0)
# Convert edu to 'none', 'grade', 'high', '2yr', '4yr', 'masters', 'phd'

# This step no longer performs chained assignment itself; the warning is still
# switched off here because other parts of the notebook may rely on it.
pd.options.mode.chained_assignment = None

print("Before:")
print(data["edu"].value_counts(dropna=False))

# Survey answer text -> short code.
rename_edu = {
    "Grade school/Primary school": "grade",
    "High school/Secondary school/GED": "high",
    "2-year college degree (Associate's)": "2yr",
    "4-year college degree (Bachelor's)": "4yr",
    "Master's degree": "masters",
    "PhD or other advanced professional degree": "phd",
    "None / Don't know / Rather not say": "none",
}

# Missing answers count as "none"; answers not listed in rename_edu are left
# unchanged. The result is assigned back to the column rather than written
# through a Series pulled out of the frame, because such writes do not reach
# `data` when pandas Copy-on-Write is active.
data["edu"] = data["edu"].fillna("none").replace(rename_edu)

print("\n\nAfter:")
print(data["edu"].value_counts(dropna=False))
# Convert gender to 'm', 'f', 'nb', 'none'
data["gender"] = data["gender"].str.lower().str.strip()  # strip & lowercase it all

print("Before:")
print(data["gender"].value_counts(dropna=False))

# Normalised free-text answer -> one of 'm', 'f', 'nb'.
rename_gender = {
    "m": "m",
    "male": "m",
    "man": "m",
    "men": "m",
    "guy": "m",
    "f": "f",
    "female": "f",
    "woman": "f",
    "w": "f",
    "gal": "f",
    "nonbinary": "nb",
    "non-binary": "nb",
    "non binary": "nb",
    "agender": "nb",
    "questioning": "nb",
    "genderfluid": "nb",
    "other": "nb",
}

# map() yields NaN for every value that is not a key of rename_gender, which
# covers both missing answers and unrecognised spellings; all of those become
# "none". The result is assigned back to the column rather than written
# through a Series pulled out of the frame, because such writes do not reach
# `data` when pandas Copy-on-Write is active.
data["gender"] = data["gender"].map(rename_gender).fillna("none")

print("\n\nAfter:")
print(data["gender"].value_counts(dropna=False))
# Rows were dropped during cleaning, so the index has gaps; renumber it
# 0..n-1 and discard the old labels.
data.reset_index(drop=True, inplace=True)
len(data)

# Write the cleaned responses out (the index is written as the first column).
data.to_csv("cleaned.csv")