#IMPORT MODULES ------

#Stats modules
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

#Helper modules
import matplotlib.pyplot as plt   #Plotting support
import numpy as np   #Array support
import pandas as pd   #DataFrame support
import seaborn as sns   #Plotting support

#Suppress warnings in Jupyter 
import warnings
warnings.filterwarnings('ignore')

#IPython - HTML notebook formatting
from IPython.core.display import HTML


#Build an IPython HTML object wrapping a <style> block of notebook CSS:
#  - .jp-needs-light-background : block display with auto margins
#  - rendered HTML tables       : auto side margins, grey background, black border, 12px text
#  - .toc / .nest-one / .nest-two / .faint : helper classes (font size, indent, italics, opacity)
#    presumably referenced from the notebook's markdown cells - verify against the notebook
HTML("""
<style>
.jp-needs-light-background {
    display: block;
    margin: auto;
}

.jp-OutputArea-output.jp-RenderedHTMLCommon table {
    margin: 2em auto;
    background: #eae9e9;
    border: 1px solid #000;
    font-size: 12px;
}

.toc {
    font-size: 16px;
}

.nest-one {
    margin-left: 1em;
    font-style: italic;
    font-size: 14px;
}

.nest-two {
    margin-left: 3em;
    font-style: italic;
    font-size: 12px;
}

.faint {
    opacity: 0.2;
}
</style>
""")


#IMPORT DATA ------

#Load the cleaned campaign data from CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer="../data/cleaned_data.csv")


#Preview the first five rows
data.head(5)


#Check for collinearity between the candidate explanatory variables

#Keep only the numeric explanatory columns
cols = [
    "Goal",
    "Backers",
    "fund_days",
    "name_len",
    "years_since",
]
expl = data.loc[:, cols]

#Pairwise correlation matrix of those columns
expl.corr()


#Render the correlation matrix as a colour-graded table (2 decimal places).
#Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0;
#Styler.format(precision=...) is the supported equivalent.
expl.corr().style.background_gradient(cmap='coolwarm', axis=None).format(precision=2)


#Compute one variance inflation factor (VIF) per column of expl and print them
#as a single-column table indexed by variable name
#NOTE(review): expl has no intercept column when it is passed in here - confirm
#whether a constant (e.g. via sm.add_constant) should be part of the design matrix
vif = [
    variance_inflation_factor(expl.values, idx)
    for idx in range(expl.shape[1])
]
print(pd.DataFrame(vif, index=expl.columns, columns=["VIF"]))

                  VIF
Goal         1.002171
Backers      1.015185
fund_days    4.494515
name_len     4.357918
years_since  4.089828


#Report the number of observations (rows) in the dataset
print(len(data))

261358


#Separate the explanatory variables (x) from the response variable (y)
feature_cols = [
    'Backers',
    'fund_days',
    'Goal',
    'name_len',
    's_spring',
    's_summer',
    's_fall',
    'years_since',
]
x = data.loc[:, feature_cols]
y = data.loc[:, 'State']


#Give x an intercept term: add_constant inserts a 'const' column of 1's
x = sm.add_constant(x)

#Specify the logistic regression (statsmodels.discrete.discrete_model.Logit):
#y is the response (endog), x the design matrix (exog)
model = sm.Logit(y, x, missing='none')

#Estimate the model parameters
result = model.fit()

#Build and print the results summary (coefficients, p-values, confidence intervals)
res = result.summary2()
print(res)

Optimization terminated successfully.
         Current function value: 0.325689
         Iterations 12
                          Results: Logit
==================================================================
Model:              Logit            Pseudo R-squared: 0.521      
Dependent Variable: State            AIC:              170260.9627
Date:               2023-04-11 14:08 BIC:              170355.2255
No. Observations:   261358           Log-Likelihood:   -85121.    
Df Model:           8                LL-Null:          -1.7765e+05
Df Residuals:       261349           LLR p-value:      0.0000     
Converged:          1.0000           Scale:            1.0000     
No. Iterations:     12.0000                                       
-------------------------------------------------------------------
              Coef.   Std.Err.      z      P>|z|    [0.025   0.975]
-------------------------------------------------------------------
const        -0.1337    0.0304    -4.4001  0.0000  -0.1933  -0.0741
Backers       0.0563    0.0003   209.1841  0.0000   0.0558   0.0568
fund_days    -0.0134    0.0005   -28.3361  0.0000  -0.0143  -0.0125
Goal         -0.0002    0.0000  -167.6861  0.0000  -0.0002  -0.0002
name_len      0.0048    0.0004    12.3536  0.0000   0.0040   0.0055
s_spring      0.0301    0.0175     1.7184  0.0857  -0.0042   0.0644
s_summer     -0.1262    0.0175    -7.2137  0.0000  -0.1605  -0.0919
s_fall       -0.0431    0.0179    -2.4067  0.0161  -0.0782  -0.0080
years_since  -0.0889    0.0031   -28.7278  0.0000  -0.0949  -0.0828
==================================================================


#Build an odds-ratio table from the coefficient table of the results summary

#Take the coefficient table and remove the intercept row by label.
#drop() returns a new DataFrame, so the column assignments below act on an
#independent frame rather than on a slice of the summary's own table
#(the original positional slice triggered SettingWithCopy behaviour).
coef_table = res.tables[1].drop(index="const")

#Normalise the column labels: cast to str and strip surrounding spaces
cols = coef_table.columns
coef_table.columns = cols.astype(str).str.strip(" ")

#Back-transform the log-odds coefficients and their confidence bounds
#into odds ratios with exp()
coef_table["OR"] = np.exp(coef_table["Coef."].astype(float))
coef_table["CI_Lower"] = np.exp(coef_table["[0.025"].astype(float))
coef_table["CI_Higher"] = np.exp(coef_table["0.975]"].astype(float))

#Display table: odds ratio, its confidence bounds, and the p-value.
#Columns are selected by label instead of by position so the result does not
#depend on the column order of the summary table.
coef_table = coef_table[["OR", "CI_Lower", "CI_Higher", "P>|z|"]]
coef_table

	Name	Goal	Backers	State	fund_days	name_len	s_spring
0	Grace Jones Does Not Give A F$#% T-Shirt (limi...	1000	30	0	39.123056	59	1
1	CRYSTAL ANTLERS UNTITLED MOVIE	80000	3	0	87.994525	30	1
2	drawing for dollars	20	3	1	8.088854	19	1
3	Offline Wikipedia iPhone app	99	25	1	79.266424	28	1
4	Pantshirts	1900	10	0	28.409271	10	1

	Goal	Backers	fund_days	name_len	years_since
Goal	1.000000	0.006245	0.020847	-0.006465	0.015745
Backers	0.006245	1.000000	-0.001251	0.020870	0.025893
fund_days	0.020847	-0.001251	1.000000	0.017794	-0.197098
name_len	-0.006465	0.020870	0.017794	1.000000	-0.059036
years_since	0.015745	0.025893	-0.197098	-0.059036	1.000000

	OR	CI_Lower	CI_Higher	P>\|z\|
Backers	1.057927	1.057369	1.058485	0.000000e+00
fund_days	0.986670	0.985755	0.987587	1.240029e-176
Goal	0.999760	0.999757	0.999763	0.000000e+00
name_len	1.004784	1.004024	1.005546	4.657128e-35
s_spring	1.030556	0.995777	1.066550	8.572709e-02
s_summer	0.881415	0.851699	0.912168	5.444942e-13
s_fall	0.957812	0.924775	0.992031	1.609632e-02
years_since	0.914976	0.909446	0.920540	1.717033e-181

Data Analysis - Predicting Kickstarter Campaign Success¶

Summary¶

Import Modules and Data¶

Pre-Model Checks¶

Response variable has binary outcome¶

Observations are independent¶

No collinearity among explanatory variables¶

No multicollinearity among explanatory variables¶

Sufficient sample size¶

Run a Logistic Regression¶

Model Interpretation¶

	Goal	Backers	fund_days	name_len	years_since
Goal	1.00	0.01	0.02	-0.01	0.02
Backers	0.01	1.00	-0.00	0.02	0.03
fund_days	0.02	-0.00	1.00	0.02	-0.20
name_len	-0.01	0.02	0.02	1.00	-0.06
years_since	0.02	0.03	-0.20	-0.06	1.00