In [1]:
%matplotlib inline
import pandas as pd
In [2]:
# Load both survey exports, tag every row with its group, then stack them.

control = pd.read_csv("control.csv").assign(group=0)  # 0 = control
experimental = pd.read_csv("experimental.csv").assign(group=1)  # 1 = experimental

# ignore_index=True renumbers the rows so the two files don't share index labels
data = pd.concat([control, experimental], ignore_index=True)
data.head(1)
Out[2]:
Timestamp How much did you LIKE the previous page? (Be honest!) How SURPRISING did the statistics feel to you? (regardless if you dis/liked the presentation style) From 1910–2015, what was the % change in extreme poverty? (if decrease, type negative number) From 1990–2016, what was the % change in air particle pollution? (if decrease, type negative number) In 2016, what % of deaths were from homicide + war + terrorism combined? In 2016, what % of world had at least one mental/substance disorder? In 2016, what % of deaths were from heart disease? From 1986–2014, what was the % change in nuclear warheads? (if decrease, type negative number) From 1990–2016, what was the % change in the suicide death rate? (if decrease, type negative number) In 2015, what % of people lived in a democracy? From 1950–2015, what was the % change in the world's fertility rate? (if decrease, type negative number) From 1959–2016, what was the % change in CO2 emissions? (if decrease, type negative number) What's your age? (OPTIONAL) What's your gender? (OPTIONAL) What's the highest level of academic schooling you've completed so far? (OPTIONAL) group
0 2019/01/10 11:13:00 AM EST 7.0 3.0 -88.0 15.0 1.0 15.0 33.0 -80.0 -33.0 58.0 -50.0 33.0 27.0 Male 4-year college degree (Bachelor's) 0
In [3]:
# It's cleaning time.

# Swap the long survey questions for short names.
# The assignment is positional, so the order below has to match the column
# order of the combined frame: 3 leading columns, q1..q10, then 4 trailing ones.
question_names = ['q' + str(n) for n in range(1, 11)]
data.columns = (
    ['timestamp', 'likeable', 'surprising']
    + question_names
    + ['age', 'gender', 'edu', 'group']
)
data.head()
Out[3]:
timestamp likeable surprising q1 q2 q3 q4 q5 q6 q7 q8 q9 q10 age gender edu group
0 2019/01/10 11:13:00 AM EST 7.0 3.0 -88.0 15.0 1.0 15.0 33.0 -80.0 -33.0 58.0 -50.0 33.0 27.0 Male 4-year college degree (Bachelor's) 0
1 2019/01/10 11:32:59 AM EST 4.0 5.0 -30.0 30.0 1.0 14.0 10.0 -50.0 -5.0 51.0 -30.0 10.0 18.0 Male High school/Secondary school/GED 0
2 2019/01/10 11:33:55 AM EST 3.0 3.0 -85.0 20.0 3.0 35.0 36.0 -86.0 100.0 50.0 -30.0 300.0 21.0 Male High school/Secondary school/GED 0
3 2019/01/10 11:34:50 AM EST 7.0 3.0 -88.0 30.0 1.0 15.0 30.0 -88.0 -33.0 56.0 -51.0 300.0 45.0 Male 4-year college degree (Bachelor's) 0
4 2019/01/10 11:35:08 AM EST 7.0 7.0 -86.0 32.0 1.0 26.0 32.0 -36.0 16.0 56.0 -17.0 40.0 17.0 Male NaN 0
In [4]:
# Drop anyone who didn't respond to Likeable & Surprising
data.dropna(subset=['likeable', 'surprising'], inplace=True)

# And drop anyone who didn't answer at least ONE question in Q1-10
question_cols = ["q" + str(i + 1) for i in range(10)]
data.dropna(subset=question_cols, how='all', inplace=True)

# Any other blank Q's to 0.
# The filled columns are assigned back onto the frame on purpose:
# `data[col].fillna(0, inplace=True)` fills a Series pulled out of the frame,
# which leaves `data` itself unchanged when pandas Copy-on-Write is active.
data[question_cols] = data[question_cols].fillna(0)
In [5]:
# Convert edu to 'none', 'grade', 'high', '2yr', '4yr', 'masters', 'phd'

# Global pandas setting, left as-is so the notebook's warning behaviour does not
# change. Nothing in this cell performs a chained assignment any more.
pd.options.mode.chained_assignment = None

print("Before:")
print(data["edu"].value_counts(dropna=False))

# Survey wording -> short label
rename_edu = {
    "Grade school/Primary school": "grade",
    "High school/Secondary school/GED": "high",
    "2-year college degree (Associate's)": "2yr",
    "4-year college degree (Bachelor's)": "4yr",
    "Master's degree": "masters",
    "PhD or other advanced professional degree": "phd",
    "None / Don't know / Rather not say": "none"
}

# Blank answers count as "none"; every other answer is swapped for its short
# label by exact match. The result is written back to the frame explicitly --
# editing a Series taken from `data["edu"]` does not reach `data` when pandas
# Copy-on-Write is active.
data["edu"] = data["edu"].fillna("none").replace(rename_edu)

print("\n\nAfter:")
print(data["edu"].value_counts(dropna=False))
Before:
4-year college degree (Bachelor's)           473
Master's degree                              403
High school/Secondary school/GED             297
PhD or other advanced professional degree    225
2-year college degree (Associate's)           88
NaN                                           63
Grade school/Primary school                   48
None / Don't know / Rather not say            23
Name: edu, dtype: int64


After:
4yr        473
masters    403
high       297
phd        225
2yr         88
none        86
grade       48
Name: edu, dtype: int64
In [6]:
# Convert gender to 'm', 'f', 'nb', 'none'

data["gender"] = data["gender"].str.lower().str.strip() # strip & lowercase it all

print("Before:")
print(data["gender"].value_counts(dropna=False))

# Exact-match spellings (already lowercased & stripped) -> category
rename_gender = {
    "m": "m",
    "male": "m",
    "man": "m",
    "men": "m",
    "guy": "m",

    "f": "f",
    "female": "f",
    "woman": "f",
    "w": "f",
    "gal": "f",

    "nonbinary": "nb",
    "non-binary": "nb",
    "non binary": "nb",
    "agender": "nb",
    "questioning": "nb",
    "genderfluid": "nb",
    "other": "nb"
}

# Blank answers count as "none"; known spellings are swapped for their category.
gender_clean = data["gender"].fillna("none").replace(rename_gender)

# Everything else gets none (typos, free-text answers, ...).
# Answers that were already literally "m", "f" or "nb" are kept.
# The result is written back to the frame explicitly -- editing a Series taken
# from `data["gender"]` does not reach `data` when pandas Copy-on-Write is active.
data["gender"] = gender_clean.where(gender_clean.isin(["m", "f", "nb"]), "none")

print("\n\nAfter:")
print(data["gender"].value_counts(dropna=False))
Before:
male                                           905
female                                         221
NaN                                            181
m                                              144
f                                               28
man                                             19
woman                                           13
agender                                         11
non-binary                                      10
nonbinary                                        9
men                                              6
non binary                                       5
cis male                                         4
nb                                               3
make                                             3
w                                                2
mail                                             2
other                                            2
questioning                                      2
none                                             1
woman, suuuppper trans and kinda an enby         1
male (bonne chance with the experiment btw)      1
mqle                                             1
fgfffffffffffffff                                1
trans female                                     1
nonbinary trans woman                            1
masculine                                        1
she/her                                          1
mal                                              1
guy                                              1
                                              ... 
cis woman                                        1
nonbinary man                                    1
a friggin' macho male <3                         1
human                                            1
female (she/her)                                 1
demimale                                         1
мале                                             1
transgender male                                 1
femalr                                           1
boy                                              1
femme                                            1
cis man                                          1
male       ҉                                     1
"they" say male                                  1
cis m                                            1
female (trans)                                   1
baguette                                         1
gender?                                          1
male (but genderfluid)                           1
what                                             1
they/them                                        1
demi-male                                        1
female / non-binary                              1
female (cis)                                     1
queer                                            1
maöe                                             1
male-ish??                                       1
nb woman                                         1
girl                                             1
male (questioning)                               1
Name: gender, Length: 69, dtype: int64


After:
m       1075
f        264
none     238
nb        43
Name: gender, dtype: int64
In [7]:
# Rows were dropped earlier, so renumber the remaining ones 0..n-1
data.index = pd.RangeIndex(len(data))
len(data)
Out[7]:
1620
In [8]:
# Save the cleaned responses; the row index is written as the first (unnamed) column
data.to_csv("cleaned.csv", index=True)