A Tutorial and Introduction to the Data Science Pipeline by Gabrielle Baniqued
Our walkthrough of the Data Science Pipeline will consist of the following:
Although this tutorial follows these steps in order, it is important to remember that the 'pipeline' is not a strict step-by-step, one-and-done process. Often we will need to circle back, repeat steps, rethink methods, and sometimes even start from square one.
In 2012, the Centers for Disease Control and Prevention (CDC) released a report, Attitudes Toward Mental Illness, highlighting the value of tracking attitudes towards mental health. Some of their conclusions include:
At the state level, the CDC found that limited access to mental health resources and negative stigma around the topic were correlated with fewer individuals seeking mental health treatment. On the other hand, states where attitudes towards mental health were positive (i.e., treatment was believed to be effective and important) and resources were easily accessible saw an increase in the number of individuals benefiting from treatment for mental disorders.
The following project takes the reasoning from the aforementioned CDC study and applies it to a different environment.
We will be exploring the following inquiry: do workplace attitudes about mental health influence employee productivity?
To answer our inquiry, we will be using OSMI (Open Sourcing Mental Illness, LTD) survey data from 2014 and 2016 regarding mental health in the tech workplace.
Sometimes, data must first be collected (e.g., via web scraping), parsed, and organized before we can actually start using it. In this case, however, the data has been collected for us.
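When collection is needed, it usually means fetching raw content and parsing it into rows before pandas ever sees it. Below is a toy sketch of that parse-and-organize step using only the standard library; the HTML snippet and the Age/Gender column names are invented purely for illustration and are not part of our survey data.

```python
from html.parser import HTMLParser

class CellParser(HTMLParser):
    """Collects the text of every <td> cell in a scraped HTML table."""
    def __init__(self):
        super().__init__()
        self.in_td = False
        self.cells = []

    def handle_starttag(self, tag, attrs):
        if tag == 'td':
            self.in_td = True

    def handle_endtag(self, tag):
        if tag == 'td':
            self.in_td = False

    def handle_data(self, data):
        if self.in_td:
            self.cells.append(data.strip())

# Pretend this string came back from a web request (invented data).
html = ('<table><tr><td>37</td><td>Female</td></tr>'
        '<tr><td>29</td><td>Male</td></tr></table>')
parser = CellParser()
parser.feed(html)
# Organize the flat cell list into records, ready for a DataFrame.
records = [dict(zip(['Age', 'Gender'], parser.cells[i:i + 2]))
           for i in range(0, len(parser.cells), 2)]
```

Since our CSVs arrive pre-collected, we get to skip this step entirely and go straight to pd.read_csv.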
The survey data is publicly available and can be downloaded as CSV files from Kaggle:
The code below loads the data into pandas DataFrames. For the majority of this project, we will be using pandas DataFrame methods. The documentation can be found here.
#import statements
import pandas as pd
import numpy as np
from collections import defaultdict
import random
import warnings
from sklearn.preprocessing import LabelEncoder
# suppresses warnings because we're dangerous
warnings.filterwarnings('ignore')
# removes max display for rows and cols
pd.set_option("display.max_rows", None, "display.max_columns", None)
# import 2014 survey using pandas
survey_2014 = pd.read_csv('survey.csv')
Summary of the 2014 dataset: The 2014 survey dataset (survey_2014) consists of 1259 valid responses (rows) with 27 attributes (columns). The target audience for this survey was employees in the tech workplace.
# displays first 3 rows of 2014 survey data in a DataFrame
survey_2014.head(3)
Timestamp | Age | Gender | Country | state | self_employed | family_history | treatment | work_interfere | no_employees | remote_work | tech_company | benefits | care_options | wellness_program | seek_help | anonymity | leave | mental_health_consequence | phys_health_consequence | coworkers | supervisor | mental_health_interview | phys_health_interview | mental_vs_physical | obs_consequence | comments | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2014-08-27 11:29:31 | 37 | Female | United States | IL | NaN | No | Yes | Often | 6-25 | No | Yes | Yes | Not sure | No | Yes | Yes | Somewhat easy | No | No | Some of them | Yes | No | Maybe | Yes | No | NaN |
1 | 2014-08-27 11:29:37 | 44 | M | United States | IN | NaN | No | No | Rarely | More than 1000 | No | No | Don't know | No | Don't know | Don't know | Don't know | Don't know | Maybe | No | No | No | No | No | Don't know | No | NaN |
2 | 2014-08-27 11:29:44 | 32 | Male | Canada | NaN | NaN | No | No | Rarely | 6-25 | No | Yes | No | No | No | No | Don't know | Somewhat difficult | No | No | Yes | Yes | Yes | Yes | No | No | NaN |
# Here, I decide to drop the free-response column [comments],
# since I know I don't want to include that data in my analysis.
# I also drop the Timestamp column ahead of time, since the 2016
# survey lacks Timestamp data, and we eventually want to combine the two.
subset_2014 = survey_2014.drop(columns=['comments', 'Timestamp'])
# Here, I drop any invalid rows (if present), where all the values are NaN.
subset_2014.dropna(how='all', inplace=True)
Summary of the 2016 dataset: The 2016 survey dataset (survey_2016) consists of 1433 valid responses (rows) with 63 attributes (columns). The target audience for this survey remained the same as the 2014 survey's (employees in the tech workplace).
# import 2016 survey using pandas
survey_2016 = pd.read_csv('mental-heath-in-tech-2016_20161114.csv')
# display first 2 rows of the 2016 data
survey_2016.head(2)
Are you self-employed? | How many employees does your company or organization have? | Is your employer primarily a tech company/organization? | Is your primary role within your company related to tech/IT? | Does your employer provide mental health benefits as part of healthcare coverage? | Do you know the options for mental health care available under your employer-provided coverage? | Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)? | Does your employer offer resources to learn more about mental health concerns and options for seeking help? | Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer? | If a mental health issue prompted you to request a medical leave from work, asking for that leave would be: | Do you think that discussing a mental health disorder with your employer would have negative consequences? | Do you think that discussing a physical health issue with your employer would have negative consequences? | Would you feel comfortable discussing a mental health disorder with your coworkers? | Would you feel comfortable discussing a mental health disorder with your direct supervisor(s)? | Do you feel that your employer takes mental health as seriously as physical health? | Have you heard of or observed negative consequences for co-workers who have been open about mental health issues in your workplace? | Do you have medical coverage (private insurance or state-provided) which includes treatment of mental health issues? | Do you know local or online resources to seek help for a mental health disorder? | If you have been diagnosed or treated for a mental health disorder, do you ever reveal this to clients or business contacts? | If you have revealed a mental health issue to a client or business contact, do you believe this has impacted you negatively? 
| If you have been diagnosed or treated for a mental health disorder, do you ever reveal this to coworkers or employees? | If you have revealed a mental health issue to a coworker or employee, do you believe this has impacted you negatively? | Do you believe your productivity is ever affected by a mental health issue? | If yes, what percentage of your work time (time performing primary or secondary job functions) is affected by a mental health issue? | Do you have previous employers? | Have your previous employers provided mental health benefits? | Were you aware of the options for mental health care provided by your previous employers? | Did your previous employers ever formally discuss mental health (as part of a wellness campaign or other official communication)? | Did your previous employers provide resources to learn more about mental health issues and how to seek help? | Was your anonymity protected if you chose to take advantage of mental health or substance abuse treatment resources with previous employers? | Do you think that discussing a mental health disorder with previous employers would have negative consequences? | Do you think that discussing a physical health issue with previous employers would have negative consequences? | Would you have been willing to discuss a mental health issue with your previous co-workers? | Would you have been willing to discuss a mental health issue with your direct supervisor(s)? | Did you feel that your previous employers took mental health as seriously as physical health? | Did you hear of or observe negative consequences for co-workers with mental health issues in your previous workplaces? | Would you be willing to bring up a physical health issue with a potential employer in an interview? | Why or why not? | Would you bring up a mental health issue with a potential employer in an interview? | Why or why not?.1 | Do you feel that being identified as a person with a mental health issue would hurt your career? 
| Do you think that team members/co-workers would view you more negatively if they knew you suffered from a mental health issue? | How willing would you be to share with friends and family that you have a mental illness? | Have you observed or experienced an unsupportive or badly handled response to a mental health issue in your current or previous workplace? | Have your observations of how another individual who discussed a mental health disorder made you less likely to reveal a mental health issue yourself in your current workplace? | Do you have a family history of mental illness? | Have you had a mental health disorder in the past? | Do you currently have a mental health disorder? | If yes, what condition(s) have you been diagnosed with? | If maybe, what condition(s) do you believe you have? | Have you been diagnosed with a mental health condition by a medical professional? | If so, what condition(s) were you diagnosed with? | Have you ever sought treatment for a mental health issue from a mental health professional? | If you have a mental health issue, do you feel that it interferes with your work when being treated effectively? | If you have a mental health issue, do you feel that it interferes with your work when NOT being treated effectively? | What is your age? | What is your gender? | What country do you live in? | What US state or territory do you live in? | What country do you work in? | What US state or territory do you work in? | Which of the following best describes your work position? | Do you work remotely? | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 26-100 | 1.0 | NaN | Not eligible for coverage / N/A | NaN | No | No | I don't know | Very easy | No | No | Maybe | Yes | I don't know | No | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | No, none did | N/A (not currently aware) | I don't know | None did | I don't know | Some of them | None of them | Some of my previous employers | Some of my previous employers | I don't know | None of them | Maybe | NaN | Maybe | NaN | Maybe | No, I don't think they would | Somewhat open | No | NaN | No | Yes | No | NaN | NaN | Yes | Anxiety Disorder (Generalized, Social, Phobia,... | 0 | Not applicable to me | Not applicable to me | 39 | Male | United Kingdom | NaN | United Kingdom | NaN | Back-end Developer | Sometimes |
1 | 0 | 6-25 | 1.0 | NaN | No | Yes | Yes | Yes | Yes | Somewhat easy | No | No | Maybe | Yes | Yes | No | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | Yes, they all did | I was aware of some | None did | Some did | Yes, always | None of them | None of them | No, at none of my previous employers | Some of my previous employers | Some did | None of them | Maybe | It would depend on the health issue. If there ... | No | While mental health has become a more prominen... | No, I don't think it would | No, I don't think they would | Somewhat open | No | NaN | Yes | Yes | Yes | Anxiety Disorder (Generalized, Social, Phobia,... | NaN | Yes | Anxiety Disorder (Generalized, Social, Phobia,... | 1 | Rarely | Sometimes | 29 | male | United States of America | Illinois | United States of America | Illinois | Back-end Developer|Front-end Developer | Never |
Here, you can see that the column names differ from the 2014 data; there are actually new columns in the 2016 data! In order to combine these two DataFrames, we are going to rename the 2016 columns, drop any columns that do not exist in the 2014 survey data, and then concatenate the two DataFrames.
# Short names for the 2016 columns, listed in the same order as
# survey_2016.columns. Names shared with the 2014 survey are kept
# identical so the two DataFrames can be concatenated later.
rename_list = ['self_employed', 'no_employees', 'tech_company', 'primary_role', 'benefits',
               'care_options', 'wellness_program', 'seek_help', 'anonymity',
               'leave', 'mental_health_consequence', 'phys_health_consequence',
               'coworkers', 'supervisor', 'mental_vs_physical', 'obs_consequence',
               'coverage', 'resources', 'revtoclients', 'revclient_consequence',
               'revtocoworkers', 'revcoworker_consequence', 'work_interfere', 'interfere_amt',
               'prev_employers', 'prev_benefits', 'prev_care_options', 'prev_wellness_program',
               'prev_seek_help', 'prev_anonymity', 'prev_mentalhealthconsequence',
               'prev_physhealthconsequence', 'prev_coworkers', 'prev_supervisors',
               'prev_mentalvsphysical', 'prev_obs_consequence', 'mentalhealthinterview',
               'mentalhealthinterview_why', 'physhealthinterview', 'physhealthinterview_why',
               'identify_mental', 'viewed_negatively_mental', 'willingtoshare', 'obs_unsupportive',
               'obs_unsupportive_affect', 'family_history', 'disorder_past', 'disorder_curr', 'disorder_past_yes',
               'disorder_past_maybe', 'diagnosed', 'diagnosed_conditions', 'treatment',
               'work_interfere_withtreat', 'work_interfere_notreat', 'Age', 'Gender', 'Country',
               'state', 'Country_repeat', 'state_repeat', 'work_position', 'remote_work']
# The DataFrame.rename method takes a dict of {old_name: new_name};
# zipping the existing columns with the new names builds it directly.
rename_dict = dict(zip(survey_2016.columns, rename_list))
# Based on the dict we created above, we rename the columns here.
subset_2016 = survey_2016.rename(columns=rename_dict)
# Similar to the 2014 DataFrame, I am dropping the free response columns.
# Note: drop returns a new DataFrame unless inplace=True is passed.
subset_2016.drop(columns=['mentalhealthinterview_why', 'physhealthinterview_why'], inplace=True)
# Similarly, I drop any invalid rows (if present), where all the values are NaN.
subset_2016.dropna(how='all', inplace=True)
# Outputting the modified version
subset_2016.head(3)
self_employed | no_employees | tech_company | primary_role | benefits | care_options | wellness_program | seek_help | anonymity | leave | mental_health_consequence | phys_health_consequence | coworkers | supervisor | mental_vs_physical | obs_consequence | coverage | resources | revtoclients | revclient_consequence | revtocoworkers | revcoworker_consequence | work_interfere | interfere_amt | prev_employers | prev_benefits | prev_care_options | prev_wellness_program | prev_seek_help | prev_anonymity | prev_mentalhealthconsequence | prev_physhealthconsequence | prev_coworkers | prev_supervisors | prev_mentalvsphysical | prev_obs_consequence | mentalhealthinterview | mentalhealthinterview_why | physhealthinterview | physhealthinterview_why | identify_mental | viewed_negatively_mental | willingtoshare | obs_unsupportive | obs_unsupportive_affect | family_history | disorder_past | disorder_curr | disorder_past_yes | disorder_past_maybe | diagnosed | diagnosed_conditions | treatment | work_interfere_withtreat | work_interfere_notreat | Age | Gender | Country | state | Country_repeat | state_repeat | work_position | remote_work | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 26-100 | 1.0 | NaN | Not eligible for coverage / N/A | NaN | No | No | I don't know | Very easy | No | No | Maybe | Yes | I don't know | No | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | No, none did | N/A (not currently aware) | I don't know | None did | I don't know | Some of them | None of them | Some of my previous employers | Some of my previous employers | I don't know | None of them | Maybe | NaN | Maybe | NaN | Maybe | No, I don't think they would | Somewhat open | No | NaN | No | Yes | No | NaN | NaN | Yes | Anxiety Disorder (Generalized, Social, Phobia,... | 0 | Not applicable to me | Not applicable to me | 39 | Male | United Kingdom | NaN | United Kingdom | NaN | Back-end Developer | Sometimes |
1 | 0 | 6-25 | 1.0 | NaN | No | Yes | Yes | Yes | Yes | Somewhat easy | No | No | Maybe | Yes | Yes | No | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | Yes, they all did | I was aware of some | None did | Some did | Yes, always | None of them | None of them | No, at none of my previous employers | Some of my previous employers | Some did | None of them | Maybe | It would depend on the health issue. If there ... | No | While mental health has become a more prominen... | No, I don't think it would | No, I don't think they would | Somewhat open | No | NaN | Yes | Yes | Yes | Anxiety Disorder (Generalized, Social, Phobia,... | NaN | Yes | Anxiety Disorder (Generalized, Social, Phobia,... | 1 | Rarely | Sometimes | 29 | male | United States of America | Illinois | United States of America | Illinois | Back-end Developer|Front-end Developer | Never |
2 | 0 | 6-25 | 1.0 | NaN | No | NaN | No | No | I don't know | Neither easy nor difficult | Maybe | No | Maybe | Maybe | I don't know | No | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | No, none did | N/A (not currently aware) | None did | Some did | I don't know | I don't know | Some of them | Some of my previous employers | I don't know | I don't know | Some of them | Yes | They would provable need to know, to Judge if ... | Yes | Stigma, mainly. | Maybe | Maybe | Somewhat open | Maybe/Not sure | Yes | No | Maybe | No | NaN | NaN | No | NaN | 1 | Not applicable to me | Not applicable to me | 38 | Male | United Kingdom | NaN | United Kingdom | NaN | Back-end Developer | Always |
# Dropping columns that aren't found in the 2014 survey (I know. After we did all that work to rename it all nicely!)
subset_2016.drop(columns=['Country_repeat', 'state_repeat', 'work_position','interfere_amt',
'prev_employers', 'prev_benefits', 'prev_care_options', 'prev_wellness_program',
'prev_seek_help', 'prev_anonymity', 'prev_mentalhealthconsequence',
'prev_physhealthconsequence', 'prev_coworkers', 'prev_supervisors',
'prev_mentalvsphysical', 'prev_obs_consequence', 'mentalhealthinterview', 'physhealthinterview',
'identify_mental', 'viewed_negatively_mental', 'willingtoshare', 'obs_unsupportive',
'obs_unsupportive_affect', 'disorder_past', 'disorder_curr','disorder_past_yes',
'disorder_past_maybe', 'diagnosed', 'diagnosed_conditions', 'revtoclients', 'revclient_consequence',
'revtocoworkers', 'revcoworker_consequence', 'coverage', 'primary_role', 'resources'], inplace=True)
# Now, combining the two DataFrames is easy.
survey = pd.concat([subset_2014, subset_2016])
# Outputting our new DataFrame, survey
survey.head(3)
Age | Gender | Country | state | self_employed | family_history | treatment | work_interfere | no_employees | remote_work | tech_company | benefits | care_options | wellness_program | seek_help | anonymity | leave | mental_health_consequence | phys_health_consequence | coworkers | supervisor | mental_health_interview | phys_health_interview | mental_vs_physical | obs_consequence | mentalhealthinterview_why | physhealthinterview_why | work_interfere_withtreat | work_interfere_notreat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 37 | Female | United States | IL | NaN | No | Yes | Often | 6-25 | No | Yes | Yes | Not sure | No | Yes | Yes | Somewhat easy | No | No | Some of them | Yes | No | Maybe | Yes | No | NaN | NaN | NaN | NaN |
1 | 44 | M | United States | IN | NaN | No | No | Rarely | More than 1000 | No | No | Don't know | No | Don't know | Don't know | Don't know | Don't know | Maybe | No | No | No | No | No | Don't know | No | NaN | NaN | NaN | NaN |
2 | 32 | Male | Canada | NaN | NaN | No | No | Rarely | 6-25 | No | Yes | No | No | No | No | Don't know | Somewhat difficult | No | No | Yes | Yes | Yes | Yes | No | No | NaN | NaN | NaN | NaN |
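A note on the NaNs visible above: pd.concat aligns on column names, so a column present in only one of the frames is filled with NaN for the other frame's rows (which is why the 2014 rows show NaN under the 2016-only columns). A minimal illustration with made-up two-column frames:

```python
import pandas as pd

# Two tiny frames with overlapping but unequal columns (invented values).
a = pd.DataFrame({'Age': [37], 'Gender': ['Female']})
b = pd.DataFrame({'Age': [29], 'Gender': ['male'],
                  'work_interfere_notreat': ['Rarely']})
combined = pd.concat([a, b])
# 'work_interfere_notreat' exists only in b, so a's row gets NaN there.
```

This alignment behavior is exactly why we renamed the 2016 columns first: shared column names stack, and anything unshared becomes a sparse NaN column.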
If we wanted to use all of our survey data, it would be worthwhile to clean and discuss all of it. In this case, we are looking ahead and letting the later parts of the project (machine learning, modeling, etc.) inform which data we should tidy.
Now that our two surveys are combined into one DataFrame, we can decide on our predictors and target for our modeling later on.
Out of all of the columns (in our modified DataFrame), two measures of mental wellness can be found in treatment and work_interfere. treatment records whether a respondent has ever received treatment for a mental disorder, so a response of 'Yes' implies a personal history of mental illness. work_interfere measures whether a respondent's mental state/disorder interrupts their work. Our specific inquiry (the effect of mental health attitudes on productivity) is best measured by work_interfere, so we will choose work_interfere as our target.
Intuitively, I can identify some 'obvious' predictors: gender, age, and family_history. These predictors describe the employees themselves. We will explore whether my intuition is reasonable in the Exploratory Data Analysis (EDA) phase.
Gender: This question in the survey was free response, leaving us a lot of data to clean up. Gender is also increasingly understood as non-binary, i.e., not just Male/Female. We will clean up and group the gender responses into three categories, then assign each category a numerical value to use for Logistic Regression later on (LabelEncoder assigns codes alphabetically: Female: 0, Male: 1, Nonbinary: 2).
Age: After getting rid of any invalid ages, we will keep age as a continuous variable (vs. making it a categorical variable).
family_history: This question is answered with 'Yes', 'No', or 'I don't know'. Rows answering 'I don't know' will be dropped, and 'Yes' and 'No' will be assigned numerical values: 0 for 'No', 1 for 'Yes'.
However, for my specific inquiry, I would also like to identify predictors on the workplace's side (workplace size, culture, and attitudes towards mental health) and determine their significance. So, in addition to predictors related to the employee, we will also focus on predictors describing the workplace.
no_employees: The survey already provides us with grouped company sizes: 1-5, 6-25, 26-100, 100-500, 500-1000, More than 1000. We will stick with these groupings.
Measuring workplace culture/attitude: 'Workplace culture' or 'attitudes' are more difficult to measure from the data we are given. Our plan is, for each response, to score the answers to the workplace culture/attitudes questions (which are generally answered in a 'Yes, Maybe, No' fashion) and assign a workplace_score to each employee/row. For simplicity, we will weight each question equally.
However, we can't just naively score every 'Yes' as +1 and every 'No' as -1, since the meaning of an answer depends on the phrasing of the question. So, we will determine the intention of each question, classify it accordingly, and then score 'positive answers' by adding 1, 'negative answers' by subtracting 1, and 'neutral answers' with 0, leaving the score unchanged.
# Right away, we want to remove all rows related to a non-tech company.
# This will allow any conclusions or discussions to be more focused.
survey.drop(survey[survey['tech_company'] == 'No'].index, inplace = True)
# Deciding which columns we'd like to keep in our DataFrame.
# Each list indicates the relevance of its columns; this split is purely
# for clarity, since we concatenate the lists as cols below.
employee_ind = ['Gender', 'Age', 'family_history']
workplace_ind = ['benefits', 'care_options', 'wellness_program', 'seek_help',
'anonymity', 'leave', 'mental_health_consequence', 'phys_health_consequence',
'coworkers', 'supervisor', 'mental_vs_physical', 'obs_consequence',
'no_employees']
target = ['work_interfere', 'work_interfere_withtreat', 'work_interfere_notreat']
# concatenated list of all cols to keep
cols = employee_ind + workplace_ind + target
# retrieves subset of DataFrame to be used for the remainder of the project
survey = survey[cols]
# Tidying up gender - Since this question was a free-response question, we have
# to clean up all of the options that intended to say Male or Female. I chose to handle
# this column by manually sending in a list of values to DataFrame.replace, including
# misspellings and handling different cases by converting all responses to lowercase.
survey['Gender'] = survey['Gender'].str.lower()
# Replacing values for 'Male'
survey['Gender'].replace(to_replace=['male', 'man', 'm', 'cis male',
                                     'cisdude', 'dude', 'malr',
                                     'male-ish', 'maile', 'mail'],
                         value='Male', inplace=True)
# Replace values for 'Female'
survey['Gender'].replace(to_replace=['female', 'f', 'woman', 'queer/she/they', 'trans woman',
'femail', 'cis-female/femme', 'femake', 'female (trans)',
'female (cis)', 'cis female'],
value='Female', inplace=True)
# Replacing all other values and classifying under 'Nonbinary', i.e., not under the binary of M/F.
survey['Gender'].replace(to_replace=['non-binary', 'nonbinary', 'unicorn', 'enby', 'genderqueer', 'agender',
'neuter', 'queer', 'bigender', 'androgynous', 'nb masculine', 'androgyne',
'ostensibly male', 'guy (-ish) ^_^', 'male leaning androgynous',
'ostensibly male, unsure what that really means'],
value='Nonbinary', inplace=True)
# Certain values were not classified and dropped, such as nonsensical answers ('Nah', 'p', 'A little about you')
# NaN values were also dropped. The below code updates the survey DataFrame as described.
survey = survey[survey['Gender'].isin(['Male', 'Female', 'Nonbinary'])]
# Removes I don't know values from family_history
survey = survey[survey['family_history'].isin(['Yes', 'No'])]
# Replaces the following responses with NaN values
survey.replace(to_replace=['Not applicable to me', 'Don\'t know', 'Not sure'], value=np.nan, inplace=True)
A common way to handle categorical variables is to use sklearn's LabelEncoder to transform our variables into numerical values. The documentation can be found here. We use this in the code below.
# Uses LabelEncoder to encode the Gender and family_history categories
# as integers (codes are assigned alphabetically). Here, we are turning
# our categorical variables into numerical values to use for
# our later logistic regression.
number = LabelEncoder()
survey['Gender'] = number.fit_transform(survey['Gender'].astype('str'))
survey['family_history'] = number.fit_transform(survey['family_history'].astype('str'))
# Outputs the first 3 rows of survey to see our new values
survey.head(3)
Gender | Age | family_history | benefits | care_options | wellness_program | seek_help | anonymity | leave | mental_health_consequence | phys_health_consequence | coworkers | supervisor | mental_vs_physical | obs_consequence | no_employees | work_interfere | work_interfere_withtreat | work_interfere_notreat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 37 | 0 | Yes | NaN | No | Yes | Yes | Somewhat easy | No | No | Some of them | Yes | Yes | No | 6-25 | Often | NaN | NaN |
2 | 1 | 32 | 0 | No | No | No | No | NaN | Somewhat difficult | No | No | Yes | Yes | No | No | 6-25 | Rarely | NaN | NaN |
3 | 1 | 31 | 1 | No | Yes | No | No | No | Somewhat difficult | Yes | Yes | Some of them | No | No | Yes | 26-100 | Often | NaN | NaN |
Below, we will translate the categorical value for no_employees into a numerical value. This will assist with our Logistic Regression later on.
# Casts the no_employees column to str values for replacement
# and to more easily be used as an x-axis value later
survey['no_employees'] = survey['no_employees'].astype(str)
# Replacing each employee-count range with the midpoint of its lower and
# upper bounds. There is a sacrifice of accuracy here, and it is not the
# only option for dealing with categorical variables, but it will prepare
# our predictor for our Logistic Regression later on.
# We will store these values in a separate column, since we would
# still like to inspect the no_employees ranges as given by the survey.
survey['no_employees_med'] = survey['no_employees']
# Note: since 'More than 1000' did not have an upper bound, I naively assigned
# it to be 2000, and assigned the median as 1500.
survey['no_employees_med'].replace(to_replace = ['1-5'], value=3, inplace=True)
survey['no_employees_med'].replace(to_replace = ['6-25'], value=16, inplace=True)
survey['no_employees_med'].replace(to_replace = ['26-100'], value=63, inplace=True)
survey['no_employees_med'].replace(to_replace = ['100-500'], value=300, inplace=True)
survey['no_employees_med'].replace(to_replace = ['500-1000'], value=750, inplace=True)
survey['no_employees_med'].replace(to_replace = ['More than 1000'], value=1500, inplace=True)
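The six replace calls above can also be written as a single Series.map over a dictionary of representative values; here is a compact equivalent sketch on a standalone toy Series (same values as used above, with 2000 assumed as the upper bound for 'More than 1000'):

```python
import pandas as pd

# Representative value for each surveyed company-size range,
# mirroring the replacements above.
size_midpoints = {
    '1-5': 3, '6-25': 16, '26-100': 63,
    '100-500': 300, '500-1000': 750, 'More than 1000': 1500,
}

# Toy Series standing in for survey['no_employees'] (values invented).
no_employees = pd.Series(['6-25', 'More than 1000', '26-100'])
no_employees_med = no_employees.map(size_midpoints)
```

One caveat with map: any value missing from the dictionary becomes NaN, whereas replace leaves unmatched values untouched.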
# There are a handful of invalid ages, such as values well above 100 and negative values.
# Here, we decide to omit minors (under 18) as well as ages 80 and above.
survey['Age'] = np.where(survey['Age'] < 18, np.nan, survey['Age'])
survey['Age'] = np.where(survey['Age'] >= 80, np.nan, survey['Age'])
Handling work_interfere differences between 2014 and 2016
One of the changes between the 2014 and 2016 surveys that presents a problem for our specific inquiry is that the 2016 survey asks for the level of work interference due to mental health with and without effective treatment. The 2014 survey asks for work interference without those added conditions. The code below consolidates the two 2016 columns into the one work_interfere column based on the following conditions:
# Iterate through the survey DataFrame to handle work_interfere as described above.
# Note: NaN never compares equal (or unequal) with ==/!=, so we must use
# pd.isna/pd.notna, and we write back with .loc because iterrows yields copies.
for idx, row in survey.iterrows():
    wi_t = row['work_interfere_withtreat']
    wi_nt = row['work_interfere_notreat']
    if pd.notna(wi_t) and pd.isna(wi_nt):
        survey.loc[idx, 'work_interfere'] = wi_t
    elif pd.isna(wi_t) and pd.notna(wi_nt):
        survey.loc[idx, 'work_interfere'] = wi_nt
    elif pd.notna(wi_t) and pd.notna(wi_nt):
        survey.loc[idx, 'work_interfere'] = random.choice([wi_t, wi_nt])
Afterwards, we drop the two 2016 work_interfere_with/notreat columns (we don't need them anymore!). All non-NaN values should now live under the single work_interfere column.
# drops 2016 survey work_interfere columns
survey.drop(columns=['work_interfere_withtreat', 'work_interfere_notreat'], inplace=True)
After we've handled the differences between the 2014 and 2016 work_interfere responses, we can group them into a binary classification as described below:
This binary classification is done in preparation for our Logistic Regression, to be completed later under Hypothesis Testing and Machine Learning.
# Uses DataFrame.replace to group work_interfere as described above
survey['work_interfere'].replace(to_replace=['No', 'Never', 'Rarely'], value='Little to No Interference', inplace=True)
survey['work_interfere'].replace(to_replace=['Sometimes', 'Yes', 'Often'], value='Moderate to Significant Interference', inplace=True)
survey['work_interfere'].replace(to_replace=['Unsure'], value=np.nan, inplace=True)
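On a standalone toy Series, the grouping above behaves like this (the answer values are invented; the replace calls are the same as in the cell above):

```python
import numpy as np
import pandas as pd

# Toy stand-in for survey['work_interfere'] (values invented).
interference = pd.Series(['Often', 'Rarely', 'Unsure', 'Sometimes', 'Never'])
interference.replace(to_replace=['No', 'Never', 'Rarely'],
                     value='Little to No Interference', inplace=True)
interference.replace(to_replace=['Sometimes', 'Yes', 'Often'],
                     value='Moderate to Significant Interference', inplace=True)
# 'Unsure' carries no signal for a binary target, so it becomes NaN.
interference.replace(to_replace=['Unsure'], value=np.nan, inplace=True)
```

Every remaining non-NaN value now falls into exactly one of the two classes, which is what a binary logistic regression target requires.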
As we discussed earlier, we will create our own score to capture a company's workplace culture and attitudes, which we will call workplace_score. The following questions/values will be used to determine the score.
The calculation of the scores will be reflected in the code below:
# Classifying questions into 'positive questions' and 'negative questions'
pos_qs = ['benefits', 'care_options', 'wellness_program', 'seek_help', 'anonymity', 'supervisor', 'coworkers', 'mental_vs_physical']
neg_qs = ['mental_health_consequence', 'phys_health_consequence', 'obs_consequence']
other_qs = ['leave']
# we will put all of the scores in a list to send in
# as a parameter to pd.Series to create a new column
scores = []
# Calculates a score for each row based on how we
# classified the questions above
for _, row in survey.iterrows():
    score = 0
    for col in pos_qs:
        if row[col] == 'Yes':
            score += 1
        elif row[col] == 'No':
            score -= 1
    for col in neg_qs:
        if row[col] == 'No':
            score += 1
        elif row[col] == 'Yes':
            score -= 1
    for col in other_qs:
        if row[col] in ['Somewhat easy', 'Very easy']:
            score += 1
        elif row[col] in ['Somewhat difficult', 'Very difficult']:
            score -= 1
    # Gives 'extra credit' to every company that achieves the max score
    if score == 12:
        score += 1
    # I don't want to deal with negative scores. Since the lowest possible
    # score is -12, I will add 12 to each score, making the lowest possible
    # score 0, instead of -12.
    scores.append(score + 12)
# Adds scores into the survey DataFrame as a new column
survey['workplace_score'] = pd.Series(scores)
# Since these columns have been operationalized into one score column, we no longer need them
survey.drop(columns=['benefits', 'care_options', 'wellness_program', 'seek_help', 'anonymity', 'leave',
'mental_health_consequence', 'phys_health_consequence', 'coworkers', 'supervisor',
'mental_vs_physical', 'obs_consequence'], inplace=True)
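As an aside, the row-by-row scoring loop above can also be written with vectorized comparisons. Here is a sketch on a toy DataFrame with an abbreviated question list (the real computation uses all twelve columns):

```python
import pandas as pd

# Abbreviated question lists, just for the sketch
pos_qs = ['benefits', 'care_options']
neg_qs = ['mental_health_consequence']

toy = pd.DataFrame({
    'benefits': ['Yes', 'No'],
    'care_options': ['Yes', 'Yes'],
    'mental_health_consequence': ['No', 'Yes'],
})

# +1 for each 'Yes' on positive questions, -1 for each 'No';
# the reverse for negative questions.
score = ((toy[pos_qs] == 'Yes').sum(axis=1) - (toy[pos_qs] == 'No').sum(axis=1)
         + (toy[neg_qs] == 'No').sum(axis=1) - (toy[neg_qs] == 'Yes').sum(axis=1))
print(score.tolist())  # [3, -1]
```

The first toy row answers every question favorably (score 3); the second mixes favorable and unfavorable answers (score -1). The shift by +12 and the 'extra credit' bonus would be applied afterwards, exactly as in the loop.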
Now that we've tidied up our data and organized it according to our inquiry, it will be helpful for us to understand the data we're using. The code below outputs distributions and countplots for us to make observations about our dataset and then revisit any hypotheses or thoughts we may have had in the earlier phases. The observations we make will inform how we will approach making our prediction model.
Our inquiry seeks to determine connections between mental health attitudes/culture in the tech workplace (workplace_score) and employee productivity (levels of work interference due to personal mental health, i.e., work_interfere). I would expect a high workplace_score, indicating a workplace culture where mental wellness is prioritized and discussed openly without consequence, to be associated with less work interference (i.e., the 'Little to No Interference' class).
In order to evaluate the strength of the correlation between work_interfere and workplace_score, I also want to evaluate how other predictors (Age, Gender, family_history, and no_employees) interact with work_interfere, and then compare.
I predict that, out of the four predictors, family_history and Gender will be the most significant.
The code below for the Exploratory Data Analysis (EDA) section will utilize seaborn and matplotlib to produce plots. The relevant documentation can be found here:
# import statements for EDA
import seaborn as sns
import matplotlib.pyplot as plt
Age
The distribution for age has a right skew, with most tech employees falling between the ages of their mid-20s and late 30s.
# Plots the Age column as a histogram
# kde refers to the Kernel Density Estimation, which
# gives us a rough estimate of the distribution by age.
sns.histplot(survey['Age'].dropna(), kde=True)
plt.title("Age Distribution of Survey Respondents")
Family History of Mental Illness (family_history)
The information below informs us that approximately 47% of respondents reported having a family history of mental illness, just below half.
# displays counts of family_history responses,
# where 0 is 'No', 1 is 'Yes'
survey['family_history'].value_counts()
0    1028
1     910
Name: family_history, dtype: int64
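As a quick sanity check on the "just below half" claim, we can recompute the proportion from the counts above:

```python
import pandas as pd

# Counts copied from the value_counts output above (0 = 'No', 1 = 'Yes')
counts = pd.Series({0: 1028, 1: 910}, name='family_history')

# Share of respondents reporting a family history of mental illness
share_yes = counts[1] / counts.sum()
print(round(share_yes, 2))  # 0.47
```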
# Shows count of individuals who report having a
# family history of mental illness or not
s = survey['family_history'].astype('category')
s = s.cat.rename_categories(["No", "Yes"])
sns.countplot(x=s)
plt.title("Survey Q: Do you have a family history of mental illness?")
plt.xlabel("Responses (Yes or No)")
Gender
It is not surprising that a survey about the tech workplace, a male-dominated field, is itself male-dominated. Approximately 77% of respondents identify as male, 21% identify as female, and less than 1% identify as nonbinary.
# displays counts of Gender responses,
# where 0 is 'Female', 1 is 'Male', 2 is 'Nonbinary'
survey['Gender'].value_counts()
1    1509
0     411
2      18
Name: Gender, dtype: int64
# Shows count of individuals by gender
s = survey['Gender'].dropna().astype('category')
s = s.cat.rename_categories(['Female', 'Male', 'Nonbinary'])
sns.countplot(x=s)
plt.title("Count of Survey Respondents by Gender")
Number of Employees (no_employees)
Using the grouping determined by the survey, most respondents work in a company of sizes 6-25 or 26-100, and many respondents also work in a significantly larger company of more than 1000 employees.
# Displays count of each category of no_employees
sns.countplot(x=survey['no_employees'].dropna(),
              order=['1-5', '6-25', '26-100', '100-500', '500-1000', 'More than 1000'])
plt.title("Distribution of Company Size/Employee Count")
plt.xlabel("Company Size")
Using FacetGrid from seaborn, we can see the distribution of certain variables (our predictors) based on another value (our target, work_interfere). Each of the two levels of interference displays its own distribution based on the predictor values.
Distribution of Age by Levels of Work Interference
Differences in sample size aside, the distributions both display a right skew. This appears to be more indicative of the ages alone, rather than any significant correlation.
# Displays a distribution by age for each category of work_interfere
g = sns.FacetGrid(survey, col='work_interfere', height=4)
g = g.map(sns.histplot, 'Age', kde=True)
g.set_axis_labels("Age", "Count")
Distribution of Gender by Levels of Work Interference
Here, we see that females (coded as 0) report more work interference due to mental health issues than males (coded as 1). This may imply that Gender is a strong predictor. The sample size for individuals identifying as nonbinary is not large enough to make meaningful observations.
# Displays a distribution by gender for each category of work_interfere
g = sns.FacetGrid(survey, col='work_interfere', height=4)
g = g.map(sns.countplot, 'Gender')
g.set_axis_labels("Gender\n0: Female, 1: Male, 2: Nonbinary", "Count")
Distribution of Number of Employees by Levels of Work Interference
Just visually, I would not make any inferences about the distribution here. Often, naively plotting values may not be helpful in exploring data. One option is to reconsider how you represent your data (different ranges, making it into a continuous variable, etc.).
# Displays a distribution by number of employees for each category of work_interfere
# Here, I continue to use a countplot over a histplot, since the latter lacks the order parameter,
# which allows me to manually order the x-axis.
g = sns.FacetGrid(survey, col='work_interfere', height=4)
g = g.map(sns.countplot, 'no_employees', order = ['1-5', '6-25', '26-100', '100-500', '500-1000', 'More than 1000'])
g.set_axis_labels("Company Size", "Count")
Distribution of Workplace Culture/Attitudes by Levels of Work Interference
My hypothesis, that a positive workplace culture regarding mental health would reduce work interference, is not supported by the plots below. Both plots demonstrate a concentration of moderate workplace_score values, with more employees reporting some sort of work interference (moderate to significant). This may imply that the workplace_score value is not as significant as I initially theorized.
# Displays a distribution by workplace_score for each category of work_interfere
g = sns.FacetGrid(survey, col='work_interfere', height=4)
g = g.map(sns.histplot, 'workplace_score', kde=True)
g.set_axis_labels("Workplace Score", "Count")
Counts of Family History of Mental Illness by Levels of Work Interference
Respondents who indicate having a family history of mental illness also report more incidences of work interference due to their own mental health. This may imply that family_history is a significant predictor.
# Displays a distribution by family_history for each category of work_interfere
s = survey.dropna()
g = sns.FacetGrid(s, col='work_interfere', height=4)
g = g.map(sns.countplot, 'family_history')
g.set_axis_labels("Family History of Mental Illness (MI) \n0: No MI History, 1: Has Family MI History", "Count")
Our exploratory data analysis has given us some information and predictions going into our Hypothesis Testing and Machine Learning step. Now, we will create a model, interpret its outputs, and discuss its performance. It would be nice to come out on the other end of this step with 100% accuracy...but that won't happen. So, in the next step, we will also eventually discuss possible problem areas in our data collection, management, and analysis.
Remember: The Data Science Pipeline isn't always straight-forward! More often than not, it will require that we repeat steps and fine-tune methods.
As mentioned in earlier steps, we are going to use Logistic Regression to try to predict levels of interference for employees in the tech workplace.
When to use Logistic Regression?
We should use logistic regression when we want to measure the relationship between a categorical variable (in our case, work_interfere, which expresses two categories of work interference due to mental health) and one or more independent variables (Age, Gender, workplace_score, and family_history). For our specific inquiry and the source of our data (a survey), we primarily worked with categorical variables. Logistic Regression is a classification algorithm, making it a suitable choice for our dataset.
Binary Logistic Regression
Earlier in our code, we translated work_interfere into a variable with only two possible outcomes: Little to No Interference or Moderate to Significant Interference. There are other types of Logistic Regression that can handle target variables with more than two classifications (multinomial, ordinal), but for our purposes, we will use binary logistic regression.
The code below shows how to build a predictive model in Scikit-learn. The documentation can be found here.
# Import from sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn import metrics
# Drops any NaN values
tidy_survey = survey.replace(to_replace='nan', value=np.nan)
tidy_survey.dropna(inplace=True)
# Displays a summary of our tidy_survey dataset
tidy_survey.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 795 entries, 0 to 1256
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Gender            795 non-null    int64
 1   Age               795 non-null    float64
 2   family_history    795 non-null    int64
 3   no_employees      795 non-null    object
 4   work_interfere    795 non-null    object
 5   no_employees_med  795 non-null    float64
 6   workplace_score   795 non-null    int64
dtypes: float64(2), int64(3), object(2)
memory usage: 49.7+ KB
# X will contain our predictor variables, all of which have been converted to
# number values. Y will contain our target value, work_interfere.
X = tidy_survey[['Age', 'workplace_score', 'Gender', 'family_history', 'no_employees_med']]
y = tidy_survey['work_interfere']
# Here, we use train_test_split to split the data into training and
# testing sets, with 75% for training and 25% for testing.
# Scikit-learn docs often set random_state to 42; the specific value doesn't
# matter. It seeds the random number generator so the split is reproducible,
# and has no meaningful effect on our prediction model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
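With 795 rows and test_size=0.25, 795 × 0.25 = 198.75, which rounds up to a 199-row test set and leaves 596 rows for training (this matches the support of 199 we will see in the classification report). A quick check of the arithmetic:

```python
import math

# 795 rows, 25% held out for testing; fractional rows round up into the test set
n_rows, test_size = 795, 0.25
n_test = math.ceil(n_rows * test_size)   # 198.75 -> 199
n_train = n_rows - n_test
print(n_train, n_test)  # 596 199
```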
# logreg instantiates the model
logreg = LogisticRegression()
# Fits our model with the given data
lm = logreg.fit(X_train,y_train)
# Creates an array of predictions
prediction = logreg.predict(X_test)
# Prints the coefficients for the predictor vars
# and the intercept
print(lm.coef_, lm.intercept_)
[[-1.06015793e-02 -3.64750981e-02 -2.99555231e-01 1.07580271e+00 -1.53807163e-04]] [1.21010358]
Let's put this in a DataFrame, just for clarity. Then we'll discuss what these coefficients indicate.
# Inserts predictors and corresponding coefficients into a DataFrame
preds = ['Age', 'workplace_score', 'Gender', 'family_history', 'no_employees_med']
model_results = pd.DataFrame([preds, lm.coef_[0]]).transpose()
model_results.columns = ['Predictors', 'Coefficients']
model_results.sort_values('Coefficients', inplace=True)
# display dataframe
model_results
|   | Predictors | Coefficients |
|---|---|---|
| 2 | Gender | -0.299555 |
| 1 | workplace_score | -0.0364751 |
| 0 | Age | -0.0106016 |
| 4 | no_employees_med | -0.000153807 |
| 3 | family_history | 1.0758 |
Interpreting the coefficients
This doesn't quite align with the earlier predictions. As mentioned earlier, we can't depend on our model to perform with 100% accuracy. Our next step will be to evaluate how our predictive model performed.
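One common way to read logistic regression coefficients is as changes in log-odds: exponentiating a coefficient gives an odds ratio for the positive class (with scikit-learn's alphabetical class ordering, that is 'Moderate to Significant Interference' here). The sketch below uses the coefficient values printed above, copied by hand, so treat the exact numbers as illustrative:

```python
import math

# Coefficients copied from the fitted model's output above
coefs = {'Age': -1.06015793e-02, 'workplace_score': -3.64750981e-02,
         'Gender': -2.99555231e-01, 'family_history': 1.07580271e+00,
         'no_employees_med': -1.53807163e-04}

# exp(coefficient) is an odds ratio: e.g., a family history of mental illness
# roughly triples the odds of reporting moderate-to-significant interference,
# all else held equal
odds_ratios = {name: math.exp(c) for name, c in coefs.items()}
print(round(odds_ratios['family_history'], 2))  # 2.93

# The model turns the linear combination intercept + coefs . x into a
# probability via the logistic (sigmoid) function
def sigmoid(z):
    return 1 / (1 + math.exp(-z))

print(sigmoid(0.0))  # 0.5 -- the decision boundary
```

Read this way, family_history is by far the strongest predictor in the model, which matches the family_history pattern we saw in the EDA, while the negative coefficients (Age, workplace_score, Gender, no_employees_med) each nudge the odds slightly toward the 'Little to No Interference' class.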
A confusion matrix, also known as an error matrix, can further our understanding by visualizing the number of correct and incorrect predictions for our classifications.
# Creates a Confusion Matrix to evaluate the performance of the predictive model
cnf_matrix = metrics.confusion_matrix(y_test, prediction)
# We will get a 2x2 matrix (due to our binary logistic regression)
# where 26 and 97 represent actual predictions (True Positive, and True Negative)
# and 54 and 22 representing incorrect predictions, Type 1 and Type 2 Errors, respectively.
cnf_matrix
array([[26, 54],
       [22, 97]])
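As a sanity check, the overall accuracy we will see in the classification report can be recovered from this matrix directly: correct predictions sit on the diagonal, so accuracy is the trace divided by the total.

```python
import numpy as np

# Confusion matrix from above: rows are actual classes, columns are predicted
cnf = np.array([[26, 54],
                [22, 97]])

# Accuracy = correct predictions (diagonal) over all predictions
accuracy = np.trace(cnf) / cnf.sum()
print(round(accuracy, 2))  # 0.62
```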
We can go even further and create a heatmap from our cnf_matrix, as seen below. This does not change our prediction/error counts; it simply provides a visual in which darker colors indicate higher counts for that classification. Our heatmap shows a significant concentration of incorrectly predicted values (specifically, Type I errors, generally considered the worse of the two), which should prompt us to reevaluate our model's performance.
We will use seaborn to create our heatmap. The documentation can be found here.
# Indicates our classification names
class_names=['Little to No Interference','Moderate to Significant Interference']
# Sets the axes and labels for our heatmap
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# Uses Seaborn to create the heatmap
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap=plt.cm.Greens, fmt='g')
# Sets axes layouts and labels
ax.xaxis.set_label_position("bottom")
plt.tight_layout()
plt.title('Confusion matrix for the Logistic Regression Model', y=1.05)
plt.ylabel('Actual')
plt.xlabel('Predicted')
# Outputs the key metrics for our classification
print(classification_report(y_test,prediction))
                                      precision    recall  f1-score   support

           Little to No Interference       0.54      0.33      0.41        80
Moderate to Significant Interference       0.64      0.82      0.72       119

                            accuracy                           0.62       199
                           macro avg       0.59      0.57      0.56       199
                        weighted avg       0.60      0.62      0.59       199
What does this mean?: recall tells us what fraction of each actual class our model identified correctly: only 33% of respondents with little to no work interference were classified as such, while 82% of respondents with moderate to significant interference were. precision tells us, of the respondents we predicted for each class, what fraction actually belonged to it (54% and 64%, respectively).
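The recall figures in the report can be reproduced directly from the confusion matrix, since each row of the matrix is one actual class:

```python
# Confusion matrix from above: rows are actual classes, columns are predicted
cnf = [[26, 54],
       [22, 97]]

# Recall = correctly predicted / total actual, per class
recall_little = cnf[0][0] / sum(cnf[0])     # 26 / 80  ≈ 0.33
recall_moderate = cnf[1][1] / sum(cnf[1])   # 97 / 119 ≈ 0.82
print(recall_little, recall_moderate)
```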
Our model has an overall accuracy of 0.62, which...isn't great.
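For context, a model that always predicts the majority class ('Moderate to Significant Interference') would already score about 0.60 on this test set, so our model barely improves on that naive baseline:

```python
# Support counts per class, taken from the classification report above
support = {'Little to No Interference': 80,
           'Moderate to Significant Interference': 119}

# Majority-class baseline: always predict the most common class
baseline_accuracy = max(support.values()) / sum(support.values())
print(round(baseline_accuracy, 2))  # 0.6
```

Comparing against this kind of baseline is a quick way to tell whether a classifier has learned anything beyond the class imbalance in the data.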
So, what do we do now? One option is to improve our model's accuracy by going back and handling our data differently: this may include normalizing/standardizing values, looking for class imbalances in our dataset, or reworking how we classify our predictors and targets. Important: When doing this, tune your model against the training set (or a held-out validation set), never the testing set, to avoid overfitting.
However, for our purposes, we won't be going back and making these changes; instead, we are going to discuss the likely causes of our lack of accuracy and other methods we could use to fine-tune our model.
Now that we've done one pass through the data science pipeline, our next step is to walk through it again, this time to review our methods, identify things we could have done differently, and discuss potential problem areas.
So...what about the initial inquiry? Do workplace attitudes about mental health influence productivity?
Based on the model we have: a positive workplace culture does not significantly predict lower work interference for an individual. However, the problem areas we discussed indicate numerous ways this inquiry could be re-explored, whether with a whole new dataset, different treatment of our variables, or a different ML model.
As you can see, walking through the data science pipeline is not necessarily straightforward. Though the way we collect, manage, and analyze the data is important, there is great value in the discussion aspect of the data science pipeline: understanding our methods, figuring out how we can improve, and trying the same steps again with new perspectives.