Author: John Salisbury / Last Updated: Apr 10, 2023
In this project, I clean and analyze data on over 250k Kickstarter crowdfunding campaigns that took place in the United States between 2009 and 2018, using logistic regression to identify factors that predict campaign success.
In this particular notebook, I explore, clean, and prepare the data for use in a logistic regression model. To view the model and final results from the analysis, visit this link (or click on "View Data Analysis" above).
--
Table of Contents:
Import Modules and Data
Data Exploration and Cleaning
Examine Data Structure
Subset Data
Search for Missing/Erroneous Data
Create New Columns
Drop Unnecessary Columns
Save Cleaned CSV for Analysis
--
Find the data for this project on Kaggle:
Kickstarter Projects
To facilitate the exploration and cleaning of our data, we import a few common Python modules (e.g., pandas, matplotlib). We also use Pandas to import our data as a dataframe.
#IMPORT MODULES ------
#Common modules
import datetime as dt #Used to handle date data
import matplotlib.pyplot as plt #Plotting support
import pandas as pd #DataFrame support
#Suppress warnings in Jupyter
import warnings
warnings.filterwarnings('ignore')
#IPython - HTML notebook formatting
from IPython.core.display import HTML
We'll also add some CSS to improve the look and feel of our notebook:
HTML("""
<style>
.jp-needs-light-background {
display: block;
margin: auto;
}
.jp-OutputArea-output.jp-RenderedHTMLCommon table {
margin: 2em auto;
background: #eae9e9;
border: 1px solid #000;
font-size: 14px;
}
.toc {
font-size: 16px;
}
.nest-one {
margin-left: 1em;
font-style: italic;
font-size: 14px;
}
.nest-two {
margin-left: 3em;
font-style: italic;
font-size: 12px;
}
.faint {
opacity: 0.2;
}
</style>
""")
And finally, we'll import our raw data as a Pandas dataframe:
#IMPORT DATA ------
#Import CSV as DataFrame
data = pd.read_csv("../data/kickstarter_projects.csv")
Now that we've imported our data, we can start to explore it. First, we'll use the Pandas info()
method to examine the following information:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374853 entries, 0 to 374852
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   ID           374853 non-null  int64
 1   Name         374853 non-null  object
 2   Category     374853 non-null  object
 3   Subcategory  374853 non-null  object
 4   Country      374853 non-null  object
 5   Launched     374853 non-null  object
 6   Deadline     374853 non-null  object
 7   Goal         374853 non-null  int64
 8   Pledged      374853 non-null  int64
 9   Backers      374853 non-null  int64
 10  State        374853 non-null  object
dtypes: int64(4), object(7)
memory usage: 31.5+ MB
From the output above, we can see that we have 11 distinct columns. Using this output, as well as metadata from the data's source on Kaggle, we can describe our columns below:
ID (integer) - a unique numeric ID assigned to each project
Name (string) - Kickstarter campaign name
Category (string) - a primary category that the campaign fits into
Subcategory (string) - a secondary category that the campaign fits into
Country (string) - the country in which the campaign was launched
Launched (string) - the launch date of the campaign
Deadline (string) - the date at which the campaign expires (and fundraising stops)
Goal (integer) - the campaign's fundraising goal in USD
Pledged (integer) - the amount of money (in USD) that the campaign raised during the fundraising period
Backers (integer) - the number of individuals who contributed money to the campaign
State (string) - the final outcome of the campaign (e.g., "Successful", "Failed", etc.)
We also see that all of our columns have the same number of non-null values, indicating that there is no missing data in our dataset. However, we will still need to explore the values within each column to ensure that there are no erroneous and/or duplicate values that could affect our future analysis.
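As a quick sanity check (a minimal sketch using standard Pandas methods), we can also count missing values and fully duplicated rows directly:
#Count missing values per column and exact duplicate rows
print(data.isnull().sum())      #Expect 0 for every column
print(data.duplicated().sum())  #Number of rows that are exact duplicates of another row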
Before we start to explore our data further, we need to subset it so that it only contains records for Kickstarter campaigns in the United States (as we are only interested in predicting factors that affect campaign success within the US).
Using the Pandas value_counts()
method below, we can see that 292,618 (or roughly 78%) of our observations are for campaigns in the US:
print(data["Country"].value_counts())
United States     292618
United Kingdom     33671
Canada             14756
Australia           7839
Germany             4171
France              2939
Italy               2878
Netherlands         2868
Spain               2276
Sweden              1757
Mexico              1752
New Zealand         1447
Denmark             1113
Ireland              811
Switzerland          760
Norway               708
Hong Kong            618
Belgium              617
Austria              597
Singapore            555
Luxembourg            62
Japan                 40
Name: Country, dtype: int64
We can go ahead and filter our dataframe so that it only includes rows in which the Country
column contains "United States":
#Subset to US data
us = data[data["Country"] == "United States"]
If we look at the values for the State
variable (which, as a reminder, gives us the final outcome of each campaign) within our subset data, we can see that there are five distinct values:
#What values are present in the "State" column?
us["State"].value_counts()
Failed        152059
Successful    109299
Canceled       28305
Live            1740
Suspended       1215
Name: State, dtype: int64
Using Kickstarter's help articles, we can determine that each value represents the following:
Failed - campaign failed to meet its fundraising goal
Successful - campaign met its fundraising goal
Canceled - campaign was canceled by the campaign's primary representative
Live - campaign is still active (as of March 2023)
Suspended - campaign was permanently suspended by Kickstarter for violation of its trust policies
In our future analysis, we are interested in identifying how well various factors in our dataset predict campaign success. We are not interested in campaigns that were canceled or suspended, as these outcomes are likely due to extraneous factors that we cannot account for in our analysis. Additionally, we are not interested in campaigns that are currently active, as these campaigns don't yet have an outcome.
Thus, we will further subset our data to remove all observations in which the State
column contains "Canceled", "Suspended", or "Live", leaving us with roughly 89% of our original US data. Our subset data has a binary outcome - either "Failed" or "Successful".
us_filt = us[us["State"].isin(["Failed", "Successful"])]
us_filt["State"].value_counts()
Failed 152059 Successful 109299 Name: State, dtype: int64
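As a quick check on the ~89% figure (a small supplementary calculation), we can compute the fraction of US campaigns retained after filtering:
#Fraction of US campaigns retained after removing Canceled, Suspended, and Live
print(round(len(us_filt) / len(us), 3))  #Should be roughly 0.89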
To facilitate future modeling and analysis, we will go ahead and convert our values in the State
column to 0
(for "Failed") or 1
(for "Successful").
#Define a mapping dict
md = {
'Failed':0,
'Successful':1
}
#Replace values using map() method
us_filt["State"] = us_filt["State"].map(md)
Now that we've verified that there are no nan
values within our dataset and have fully subset it to the specific data we're interested in, we can start searching for duplicate and/or erroneous observations.
First, let's see if there are any duplicate values using Pandas drop_duplicates()
method:
us_filt = us_filt.drop_duplicates()
us_filt.shape[0]
261358
We see that this operation did not result in any dropped rows, which indicates that there were no duplicates in our US subset.
Next, let's investigate values within each of our columns to check for erroneous data that could affect our future analysis. We'll go through each column below.
#Category variable
us_filt["Category"].value_counts()
Film & Video    46932
Music           40225
Publishing      28677
Art             20448
Games           20229
Design          18485
Technology      18058
Food            18033
Fashion         14536
Theater          8194
Comics           8159
Photography      7229
Crafts           5963
Journalism       3109
Dance            3081
Name: Category, dtype: int64
It looks like Category
is a nominal variable with 15 distinct levels. Some levels (e.g., "Film & Video" or "Music") have substantially more observations than others (such as "Journalism" or "Dance"), which contain only a few thousand observations each. It is questionable whether this column will be of much help to our future analysis, so we will probably drop it in the section below.
#Subcategory variable
us_filt["Subcategory"].value_counts()
Product Design     13732
Documentary        12349
Music              10926
Shorts              9534
Tabletop Games      8939
                   ...
Residencies           43
Letterpress           32
Chiptune              22
Literary Spaces       13
Taxidermy              6
Name: Subcategory, Length: 159, dtype: int64
It looks like Subcategory
is a nominal variable with 159 distinct values. We can probably drop this column in the section below, as it likely has too many levels to be useful in our future analysis.
#Country variable
us_filt["Country"].value_counts()
United States 261358 Name: Country, dtype: int64
We already subset our data so that it only includes observations for the United States. Thus, the Country
column should only have one value. Above, we see this is the case.
The Launched
column represents each campaign's initial launch date. Let's see what format each launch date is stored in using the code below:
us_filt["Launched"].head()
0    2009-04-21 21:02:48
1    2009-04-23 00:07:53
2    2009-04-24 21:52:03
3    2009-04-25 17:36:21
4    2009-04-27 14:10:39
Name: Launched, dtype: object
type(us_filt["Launched"][0])
str
We see that the Launched
column contains strings representing dates in "%Y-%m-%d %H:%M:%S" format. To allow us to calculate a timedelta between each campaign's fundraising deadline and launch date, we should go ahead and convert these strings into datetime objects. We can use Pandas to_datetime()
to convert each string into a Pandas datetime object:
#Convert column from str format into datetime format
us_filt["Launched"] = pd.to_datetime(us_filt["Launched"], format="%Y-%m-%d %H:%M:%S")
Now, let's look at a histogram of dates within the Launched
column to check for outliers:
#Plot histogram of launch dates
plt.hist(us_filt["Launched"])
plt.xlabel("Year")
plt.ylabel("Count")
plt.title("Distribution of Campaign Launch Dates")
plt.show()
It looks like the dates within the Launched
column follow a roughly normal distribution (perhaps slightly left-skewed) and fall between 2009 and 2018.
min(us_filt["Launched"])
Timestamp('2009-04-21 21:02:48')
Additionally, it looks like our earliest campaign was launched on April 21, 2009, which is right around the time when Kickstarter was founded. Thus, it doesn't seem like we have any outliers in the Launched
column, so no additional cleaning is necessary at this time.
The Deadline
column represents the date on which each campaign's fundraising period closed (i.e., its fundraising deadline). Let's take the same approach that we used for Launched
and see what format deadlines are stored in using the code below:
us_filt["Deadline"].head()
0    2009-05-31
1    2009-07-20
2    2009-05-03
3    2009-07-14
4    2009-05-26
Name: Deadline, dtype: object
type(us_filt["Deadline"][0])
str
Like the Launched
column, the Deadline
column contains strings representing dates, but in "%Y-%m-%d" format (date only, no time). For consistency, we will convert these dates into Pandas datetime objects:
#Convert column from str format into datetime format
us_filt["Deadline"] = pd.to_datetime(us_filt["Deadline"], format="%Y-%m-%d")
Let's look at a histogram of dates within the Deadline
column to check for outliers:
#Plot histogram of launch dates
plt.hist(us_filt["Deadline"])
plt.xlabel("Year")
plt.ylabel("Count")
plt.title("Distribution of Campaign Fundraising Deadlines")
plt.show()
We see a similar spread to the histogram for values in Launched
, as well as values between 2009 and 2018. This suggests that there are no outliers in Deadline
that need to be addressed as of yet.
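As with Launched, we can also confirm that the earliest and latest deadlines fall within the expected 2009-2018 window (a quick supplementary check):
#Check the earliest and latest fundraising deadlines
print(us_filt["Deadline"].min(), us_filt["Deadline"].max())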
Using the code above, we've seen that the values in Launched
and Deadline
look OK. However, we need to confirm that each campaign actually has a valid fundraising period - or rather, that each campaign's fundraising deadline takes place after its initial campaign launch date.
To check this, we can create a new column in our dataframe (fund_days
) that contains the timedelta between each campaign's fundraising deadline and launch date converted into days:
#Calculate time delta for each campaign period
fund_period = us_filt["Deadline"] - us_filt["Launched"]
#Convert time deltas to days (float)
us_filt["fund_days"] = fund_period.dt.total_seconds() / 3600 / 24
Let's examine our new fund_days
column to confirm that the fundraising period for each of our campaigns is valid:
#Return total number of null values, if any
print(str(us_filt["fund_days"].isnull().sum()) + " Null Values")
0 Null Values
#Return summary statistics
us_filt["fund_days"].describe()
count    261358.000000
mean         33.628154
std          13.015217
min           0.005058
25%          29.090854
50%          29.743773
75%          36.353102
max          91.962650
Name: fund_days, dtype: float64
#Plot histogram of fundraising periods
plt.hist(us_filt["fund_days"])
plt.xlabel('Fundraising Period (Days)')
plt.ylabel('Count')
plt.show()
Above, we see that the fundraising periods for our campaigns are between 0 and 92 days in duration. We do not have any negative or null fundraising periods, which suggests that no additional cleaning is necessary for this column.
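If we want to make the "no invalid fundraising periods" claim explicit, a direct count (a minimal sketch) might look like this:
#Count campaigns whose deadline does not fall after their launch date
print((us_filt["fund_days"] <= 0).sum())  #Expected to be 0, given the minimum of ~0.005 days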
Let's explore values in the fundraising goal (Goal
) column below:
#Return summary statistics
us_filt["Goal"].describe()
count    2.613580e+05
mean     3.934893e+04
std      1.035441e+06
min      0.000000e+00
25%      2.000000e+03
50%      5.000000e+03
75%      1.500000e+04
max      1.000000e+08
Name: Goal, dtype: float64
us_filt["Goal"].median()
5000.0
Based on the information above, we see that a majority (75%) of campaign fundraising goals are at or below $15,000 USD, with a mean goal of roughly $39,349 USD and a median goal of $5,000 USD. However, the values in this column have a very large range (up to $100 million USD!) and a standard deviation of over $1 million USD, indicating that there is quite a bit of spread around the mean value.
We can see this in the histogram below:
#Goal
plt.hist(us_filt["Goal"])
plt.xlabel('Fundraising Goal (USD)')
plt.ylabel('Count')
plt.title('Distribution of Fundraising Goals (USD)')
plt.show()
Let's try reducing the range of the data we plot in our histogram, excluding campaigns with fundraising goals greater than $100,000 USD:
#Goal
plt.hist(us_filt["Goal"][us_filt["Goal"] <= 100000], bins=20)
plt.xlabel('Fundraising Goal (USD)')
plt.ylabel('Count')
plt.title('Distribution of Fundraising Goals <= 100,000 USD')
plt.show()
In the histogram above, we can see that this variable is highly right-skewed, with most campaigns having relatively small fundraising goals.
Although we have a very large range for the data in the Goal
column, it is still valid and will likely help to inform our future analysis. Thus, we won't take additional cleaning steps for Goal
at this time.
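Rather than truncating the range, another common way to visualize a variable this skewed (shown here only as an optional sketch, not part of the cleaning steps) is to plot the goals on a log scale:
#Optional: histogram of log10-transformed fundraising goals
import numpy as np  #Not imported earlier in this notebook
plt.hist(np.log10(us_filt["Goal"] + 1), bins=30)  #+1 avoids log(0) for goals of 0 USD
plt.xlabel('log10(Fundraising Goal + 1)')
plt.ylabel('Count')
plt.title('Distribution of Fundraising Goals (log scale)')
plt.show()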
Next, let's explore values in the "number of backers" (Backers
) column below:
#Return summary statistics
us_filt["Backers"].describe()
count    261358.000000
mean        122.745655
std        1039.320806
min           0.000000
25%           2.000000
50%          17.000000
75%          66.000000
max      219382.000000
Name: Backers, dtype: float64
#Calculate median value
us_filt["Backers"].median()
17.0
We see that a majority (75%) of campaigns have 66 or fewer backers, with a median value of 17 backers. Similar to our Goal
column, we see that the values in this column have a very large range (with the most popular campaign having over 200,000 backers!).
We can see this in the histogram below:
#Generate histogram
plt.hist(us_filt["Backers"])
plt.xlabel('Individuals')
plt.ylabel('Count')
plt.title('Distribution of Campaign Backers')
plt.show()
Again, let's try reducing the range of the data we plot to exclude campaigns with more than 1000 backers:
#Generate histogram
plt.hist(us_filt["Backers"][us_filt["Backers"] <= 1000], bins=20)
plt.xlabel('Individuals')
plt.ylabel('Count')
plt.title('Distribution of Campaign Backers <= 1,000')
plt.show()
Similar to Goal
, we can see that this variable is highly right-skewed, with most campaigns having relatively few backers.
Again, despite having a very large range in this column, the data within Backers
is valid and will likely help to inform our future analysis. Thus, we can move forward with our exploration and cleaning.
The next step in our exploration/cleaning process is to add a few columns that might be useful for our future analysis.
Could the length of each campaign title on Kickstarter affect that campaign's success? Are shorter titles more effective at attracting donors than longer titles? This could be an interesting variable to add to our future analysis.
We can quickly calculate each campaign's title length using Pandas string methods below:
#Create column
us_filt["name_len"] = us_filt["Name"].str.len()
Let's quickly explore the distribution of name length across our campaigns:
#Explore column
plt.hist(us_filt["name_len"])
plt.xlabel("Length of Campaign Name (Characters)")
plt.ylabel("Count")
plt.show()
Surprisingly, it looks like campaign name length has a relatively normal distribution - interesting!
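For reference (a quick supplementary check), we can also look at summary statistics for the new column:
#Summary statistics for campaign name length
us_filt["name_len"].describe()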
It seems possible that campaigns launched in the early days of Kickstarter may have been either less or more popular than campaigns launched after the platform had been in existence for several years. By including "time since Kickstarter launch" as a variable, we can potentially account for this in our analysis. We use the code below to save this variable in the years_since
column:
#Create launch year column
us_filt["years_since"] = us_filt["Launched"].dt.year - 2009
It also seems plausible that campaigns launched during specific times of year could be more successful than others. For example, are campaigns in the US launched during winter more likely to succeed because people might be more generous due to proximity to the winter holidays? We can add this variable to our dataframe for future analysis.
For the purpose of our analysis (in which we'll use logistic regression), we need to add this categorical variable to our dataframe so that each season has its own column with corresponding Boolean values (i.e., 0 or 1) indicating whether or not a given observation was launched in that season:
#List of seasons
seasons = ["s_winter", "s_spring", "s_summer", "s_fall"]
#Add new columns
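#Map each launch month to a season number via month % 12 // 3 + 1:
#Dec-Feb -> 1 (winter), Mar-May -> 2 (spring), Jun-Aug -> 3 (summer), Sep-Nov -> 4 (fall)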
for season, num in zip(seasons, list(range(1,5))):
us_filt[season] = (us_filt["Launched"].dt.month%12 // 3 + 1 == num).astype(int)
In the table below, we can see that there have been relatively similar numbers of campaigns launched in each of the seasons:
#Counts by season
us_filt[seasons].sum()
s_winter    54727
s_spring    69853
s_summer    71324
s_fall      65454
dtype: int64
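As a design note, a similar encoding could likely be produced with the built-in pd.get_dummies() function; the sketch below assumes a temporary helper Series (season_num) that is not part of the original workflow:
#Alternative sketch: derive a numeric season label, then one-hot encode it
season_num = us_filt["Launched"].dt.month % 12 // 3 + 1   #1=winter, 2=spring, 3=summer, 4=fall
season_dummies = pd.get_dummies(season_num, prefix="s")   #Produces columns s_1 through s_4
The explicit loop above was kept because it yields descriptive column names (s_winter, s_spring, etc.) rather than numeric suffixes.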
We've explored our data and cleaned each column of interest. Now, we can go ahead and drop columns that won't be of use to us in our final analysis. Below, we drop the following columns:
ID - since we removed all duplicate rows and are retaining campaign names, this column is unnecessary
Category, Subcategory - these categorical variables have too many levels to be of use (and we aren't able to objectively re-categorize them into larger categories)
Country - all of our observations are for the United States, so this column is unnecessary to keep
Launched, Deadline - we have added new variables (fund_days, years_since, and the season indicator columns) that better capture the temporal information we're interested in
Pledged - we are primarily interested in the final outcome of each campaign (i.e., success vs. failure), not in how much money was pledged toward each campaign's goal
#Drop columns that won't be used in the analysis
drops = ['ID','Category', 'Subcategory','Country', 'Launched', 'Deadline', 'Pledged']
us_filt = us_filt.drop(drops, axis=1)
Our final step in the data cleaning process is to save our cleaned data as a CSV! Before we do so, let's take one last look at the data to make sure that everything looks good:
us_filt.head()
|   | Name | Goal | Backers | State | fund_days | name_len | years_since | s_winter | s_spring | s_summer | s_fall |
|---|------|------|---------|-------|-----------|----------|-------------|----------|----------|----------|--------|
| 0 | Grace Jones Does Not Give A F$#% T-Shirt (limi... | 1000 | 30 | 0 | 39.123056 | 59 | 0 | 0 | 1 | 0 | 0 |
| 1 | CRYSTAL ANTLERS UNTITLED MOVIE | 80000 | 3 | 0 | 87.994525 | 30 | 0 | 0 | 1 | 0 | 0 |
| 2 | drawing for dollars | 20 | 3 | 1 | 8.088854 | 19 | 0 | 0 | 1 | 0 | 0 |
| 3 | Offline Wikipedia iPhone app | 99 | 25 | 1 | 79.266424 | 28 | 0 | 0 | 1 | 0 | 0 |
| 4 | Pantshirts | 1900 | 10 | 0 | 28.409271 | 10 | 0 | 0 | 1 | 0 | 0 |
It looks like the data only contains the specific variables we want to include in our future analysis. Let's go ahead and save our data as a CSV file:
us_filt.to_csv("../data/cleaned_data.csv", index=False)
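As an optional final sanity check (a small sketch), we can read the file back in and confirm its shape matches the dataframe we just saved:
#Re-import the cleaned CSV and confirm the row/column counts match
check = pd.read_csv("../data/cleaned_data.csv")
print(check.shape, us_filt.shape)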
To view the analysis of this data and learn which factors in the dataset predict campaign success, visit the link below: