# NOTE: this file is a plain-script export of a Jupyter notebook.  Two
# notebook-era constructs made it invalid as a .py module and are fixed here:
#   * "%matplotlib inline" is IPython magic, not Python -- commented out;
#   * "from __future__ import ..." must be the FIRST statement in a module
#     (SyntaxError otherwise) -- moved to the top.  It is a no-op on Python 3.
from __future__ import division

# For data
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# For visualization
import matplotlib.pyplot as plt
import seaborn as sns

import datetime

# use to get info from the web
import requests

# StringIO to work with CSV text in memory.  The Python-2-only "StringIO"
# module was removed in Python 3; io.StringIO is the portable location.
from io import StringIO

sns.set_style('darkgrid')
# %matplotlib inline   # notebook magic -- re-enable only under IPython/Jupyter
# Load the FEC donation data.  Nearly every column is free-form text, so the
# dtypes are declared up front instead of letting pandas guess per-chunk.
_text_cols = ['cmte_id', 'cand_id', 'cand_nm', 'contbr_nm', 'contbr_city',
              'contbr_st', 'contbr_zip', 'contbr_employer', 'contbr_occupation',
              'contb_receipt_dt', 'receipt_desc', 'memo_cd', 'memo_text',
              'form_tp', 'tran_id', 'election_tp']
_dtypes = {col: object for col in _text_cols}
_dtypes['contb_receipt_amt'] = float
_dtypes['file_num'] = int
doner_df = pd.read_csv('Election_Doner_Data.csv', index_col=False, dtype=_dtypes)

# First look at what we loaded.
doner_df.info()
doner_df.head()
doner_df['contb_receipt_amt'].value_counts()

# Refunds show up as negative amounts; keep only real (positive) donations.
doner_df = doner_df[doner_df['contb_receipt_amt'] > 0]

# Every candidate that appears in the data.
# ('canidates' spelling kept -- later cells reference this name.)
canidates = doner_df['cand_nm'].unique()
canidates
# Party affiliation for every candidate, written county-roster style (one
# name list per party) and inverted into the candidate -> party lookup.
_party_rosters = {
    'Republican': ['Rubio, Marco', 'Santorum, Richard J.',
                   'Perry, James R. (Rick)', 'Carson, Benjamin S.',
                   "Cruz, Rafael Edward 'Ted'", 'Paul, Rand',
                   'Fiorina, Carly', 'Huckabee, Mike', 'Pataki, George E.',
                   'Graham, Lindsey O.', 'Bush, Jeb', 'Trump, Donald J.',
                   'Jindal, Bobby', 'Christie, Christopher J.',
                   'Walker, Scott', 'Kasich, John R.', 'Gilmore, James S III'],
    'Democrat': ['Clinton, Hillary Rodham', 'Sanders, Bernard',
                 "O'Malley, Martin Joseph", 'Webb, James Henry Jr.',
                 'Lessig, Lawrence'],
    '3rd Party': ['Stein, Jill', 'Johnson, Gary', 'McMullin, Evan'],
}
party_map = {name: party
             for party, names in _party_rosters.items()
             for name in names}
# Tag every donation with the receiving candidate's party.
doner_df['Party'] = doner_df['cand_nm'].map(party_map)
doner_df.head(1)
# A count of how many donations were made to each campaign
doner_df.groupby("cand_nm")["contb_receipt_amt"].count().sort_values()

# The total each candidate raised, smallest to largest.
cand_amount = doner_df.groupby('cand_nm')['contb_receipt_amt'].sum().sort_values()

# FIX: the original used the Python 2 print statement (SyntaxError on
# Python 3) and a manually-maintained index counter; iterate the
# (candidate, total) pairs directly instead.
for name, don in cand_amount.items():
    print('%s raised %.0f' % (name, don))

cand_amount.plot(kind='bar', figsize=(14, 6))
# Average donation per candidate.
# FIX: select the amount column *before* aggregating -- .mean() over the
# whole frame tries to average the many object/text columns too, which
# raises TypeError on pandas >= 2.0 (numeric_only no longer defaults True).
avg_donations = doner_df.groupby("cand_nm")["contb_receipt_amt"].mean().sort_values()
avg_donations.plot(kind="bar", figsize=(12, 5))

# Total raised per party.
doner_df.groupby('Party')['contb_receipt_amt'].sum().sort_values().plot(kind='bar')
# Donation totals by occupation, split by party (pivot table, Excel-style).
job_pt = doner_df.pivot_table('contb_receipt_amt',
                              index='contbr_occupation',
                              columns='Party',
                              aggfunc='sum')
job_pt['Total'] = job_pt.sum(axis=1)
job_pt.head()
job_pt = job_pt.sort_values(by='Total', ascending=True)
job_pt.info()

# Keep only the big-money occupations.
job_pt = job_pt[job_pt['Total'] > 2000000]
# 44 jobs are over $2,000,000
job_pt.shape
job_pt

# Drop placeholder / bookkeeping rows that are not real occupations.
junk_rows = ['INFORMATION REQUESTED PER BEST EFFORTS', 'INFORMATION REQUESTED',
             'RETIRED', 'CONDUIT TOTAL LISTED IN AGG. FIELD']
job_pt.drop(junk_rows, axis=0, inplace=True)

# Fold the un-hyphenated spelling into 'SELF-EMPLOYED', then discard it.
job_pt.loc['SELF-EMPLOYED'] += job_pt.loc['SELF EMPLOYED']
job_pt.drop('SELF EMPLOYED', inplace=True)

job_pt[['Democrat', 'Republican']].plot(kind='barh', figsize=(10, 16), cmap='bwr')
# Same occupation pivot, but counting the *number* of donations instead of
# summing dollars.
job_pt = doner_df.pivot_table('contb_receipt_amt',
                              index='contbr_occupation',
                              columns='Party',
                              aggfunc='count')
job_pt['Total'] = job_pt.sum(axis=1)
job_pt.head()
job_pt = job_pt.sort_values(by='Total', ascending=True)
job_pt = job_pt[job_pt['Total'] > 15000]
job_pt.shape

# Remove placeholder rows that are not real occupations.
job_pt.drop(['DISABLED', 'INFORMATION REQUESTED PER BEST EFFORTS', 'RETIRED',
             'INFORMATION REQUESTED', 'CONDUIT TOTAL LISTED IN AGG. FIELD'],
            axis=0, inplace=True)

# Merge duplicate spellings of the same job into one canonical row each.
for keep, toss in [('SELF-EMPLOYED', 'SELF EMPLOYED'),
                   ('REGISTERED NURSE', 'RN'),
                   ('BUSINESS OWNER', 'OWNER')]:
    job_pt.loc[keep] += job_pt.loc[toss]
    job_pt.drop(toss, inplace=True)

job_pt[['Democrat', 'Republican']].plot(kind='barh', figsize=(10, 16), cmap='bwr')
# Donation totals by employer, split by party.
employer_df = doner_df.pivot_table('contb_receipt_amt',
                                   index='contbr_employer',
                                   columns='Party',
                                   aggfunc='sum')
employer_df.shape
# Graphing 398,017 different companies would be unreadable (and slow) --
# keep only the employers whose donations top $350,000.
employer_df = employer_df[employer_df.sum(axis=1) > 350000]
employer_df.shape
employer_df

# Placeholder rows that are not real employers.
employer_df.drop(['INFORMATION REQUESTED PER BEST EFFORTS', 'INFORMATION REQUESTED',
                  'RETIRED', 'TWICE REQUESTED NOT YET RECEIVED'],
                 axis=0, inplace=True)

# Merge duplicate spellings into one canonical row each.
employer_df.loc['SELF-EMPLOYED'] += employer_df.loc['SELF EMPLOYED'] + employer_df.loc['SELF']
employer_df.loc['NOT EMPLOYED'] += employer_df.loc['NONE']
for dup in ('SELF EMPLOYED', 'SELF', 'NONE'):
    employer_df.drop(dup, inplace=True)

# These categories dwarf everything else and hide the rest of the chart.
for huge in ('HOMEMAKER', 'NOT EMPLOYED', 'SELF-EMPLOYED'):
    employer_df.drop(huge, inplace=True)

employer_df['Total'] = employer_df.sum(axis=1)
employer_df = employer_df.sort_values(by='Total', ascending=True)
employer_df.head()
employer_df[['Democrat', 'Republican']].plot(kind='barh', figsize=(10, 16), cmap='bwr')
# Donation totals by contributor state, split by party.
state_df = doner_df.pivot_table('contb_receipt_amt',
                                index='contbr_st',
                                columns='Party',
                                aggfunc='sum')
state_df.head()

# Drop everything that is not a real US state/DC code: territories, military
# mail codes (AA/AE/AP), apparent country codes, and outright garbage.
bad_codes = ['20', '30', 'FF', 'FR', 'GU', 'IS', 'LO', 'MP', 'ON', 'PR', 'SI',
             'VI', 'XX', 'AS', 'AU', 'BC', 'BR', 'EN', 'JA', 'LE', 'SW', 'U*',
             'UK', 'AA', 'BU', 'PU', 'SA', 'TE', 'ZZ', 'AB', 'BA', 'BE', 'CH',
             'DU', 'E', 'EU', 'FM', 'GR', 'HE', 'KE', 'LI', 'MB', 'NB', 'NL',
             'NO', 'NS', 'OS', 'QC', 'SH', 'SK', 'SO', 'ST', 'TO', 'W.', 'GE',
             'N.', 'C', 'PW', 'AE', 'AM', 'AP', 'QB']
state_df.drop(bad_codes, inplace=True)

state_df['Total'] = state_df.sum(axis=1)
state_df = state_df.sort_values(by='Total', ascending=True)
state_df.head()
state_df[['Democrat', 'Republican']].plot(kind='barh', figsize=(15, 16), cmap='bwr')
# Rhode Island donations only.  .copy() so the later column assignments
# modify an independent frame, not a view of doner_df.
RI_df = doner_df[doner_df.contbr_st == 'RI'].copy()
RI_df.head()
RI_df.shape
RI_df.contbr_city.unique()
RI_df['contbr_city'].value_counts()

# Drop rows whose city is clearly not in Rhode Island (or is a placeholder).
RI_df = RI_df[~RI_df['contbr_city'].isin(['CHAMBERSBURG', 'CLINTON', 'FAIR HAVEN', 'CLAREMORE', 'FTWORTH', 'HERMITAGE', 'HONOMU',
'KIAWAH ISLAND', 'LAKE FOREST', 'MADISON', 'MIDDLEFIELD', 'PARIS', 'PERU', 'PITTSBURGH', 'POOLESVILLE', 'PORT ST LUCIE',
'POTTSTOWN', 'PREONZO', 'SALT LAKE CITY', 'SCOTTSDALE', 'SPARTANBURG', 'SPEARFISH', 'VACAVILLE', 'INFO REQUESTED',
'REDMOND'])]

# Canonical-town lookup: misspellings, abbreviations, and villages -> the RI
# town they belong to.
# FIX: the original called DataFrame.replace(), which scans *every* column
# and could silently rewrite matching values in unrelated fields (donor
# names, employers, occupations...).  The mapping is now applied only to
# 'contbr_city'.  Also, 'MANVILLE' was originally listed under both LINCOLN
# and LITTLE COMPTON; Manville is a village of Lincoln (and the Lincoln
# replacement ran first anyway), so only that entry is kept.
city_fixes = {
    'BARRINGTION': 'BARRINGTON',
    'GLENDALE': 'BURRILLVILLE', 'HARRISVILLE': 'BURRILLVILLE',
    'MAPLEVILLE': 'BURRILLVILLE', 'OAKLAND': 'BURRILLVILLE',
    'PASCOAG': 'BURRILLVILLE',
    'CAROLINA': 'CHARLESTOWN', 'CHARLESTON': 'CHARLESTOWN',
    'GREENE': 'COVENTRY',
    'E GREENWICH': 'EAST GREENWICH', 'E. GREENWICH': 'EAST GREENWICH',
    'EAST GREENWICH, RI 02818': 'EAST GREENWICH',
    'E PROVIDENCE': 'EAST PROVIDENCE', 'ESAT PROVIDENCE': 'EAST PROVIDENCE',
    'RIVERSIDE': 'EAST PROVIDENCE', 'RUMFORD': 'EAST PROVIDENCE',
    'CLAYVILLE': 'FOSTER',
    'CHAPACHET': 'GLOCESTER', 'CHEPACHET': 'GLOCESTER', 'HARMONY': 'GLOCESTER',
    'ASHAWAY': 'HOPKINTON', 'HOPE VALLEY': 'HOPKINTON', 'ROCKVILLE': 'HOPKINTON',
    'ALBION': 'LINCOLN', 'FAIR LAWN': 'LINCOLN', 'LINCON': 'LINCOLN',
    'MANVILLE': 'LINCOLN',
    'ADAMSVILLE': 'LITTLE COMPTON', 'L. COMPTON': 'LITTLE COMPTON',
    'LITTLE COMPTO': 'LITTLE COMPTON',
    'MIDDLETWON': 'MIDDLETOWN',
    'NARRAGONSETT': 'NARRAGANSETT', 'SAUNDERSTOWN': 'NARRAGANSETT',
    'SAUNDERSTROWN': 'NARRAGANSETT', 'NARRAGANSET': 'NARRAGANSETT',
    'BLOCK ISLAND': 'NEW SHOREHAM',
    'N KINGSTOWN': 'NORTH KINGSTOWN', 'N. KINGSTOWN': 'NORTH KINGSTOWN',
    'NO.KINGSTOWN': 'NORTH KINGSTOWN', 'NORTH KING': 'NORTH KINGSTOWN',
    'SLOCUM': 'NORTH KINGSTOWN', 'WCKFORD': 'NORTH KINGSTOWN',
    'WICKFORD': 'NORTH KINGSTOWN',
    'N PROVIDENCE': 'NORTH PROVIDENCE', 'N. PROVIDENCE': 'NORTH PROVIDENCE',
    'N.PROV.': 'NORTH PROVIDENCE', 'NO. PROVIDENCE': 'NORTH PROVIDENCE',
    'NORTHPROVIDENCE': 'NORTH PROVIDENCE',
    'FORESTDALE': 'NORTH SMITHFIELD', 'N SMITHFIELD': 'NORTH SMITHFIELD',
    'SLATERSVILLE': 'NORTH SMITHFIELD',
    'PRUDENCE ISL': 'PORTSMOUTH',
    'SHANNOCK': 'RICHMOND', 'WOOD RIVER JUNCTION': 'RICHMOND',
    'WYOMING': 'RICHMOND',
    'HOPE': 'SCITUATE', 'N SCITUATE': 'SCITUATE', 'N. SCITUATE': 'SCITUATE',
    'N.SCITUATE': 'SCITUATE', 'NORTH SCITUATE': 'SCITUATE',
    'GREENVILLE': 'SMITHFIELD', 'SMITHFIELDSMITHFIELD': 'SMITHFIELD',
    'KINGSTON': 'SOUTH KINGSTOWN', 'PEACE DALE': 'SOUTH KINGSTOWN',
    'S KINGSTOWN': 'SOUTH KINGSTOWN', 'S. KINGSTOWN': 'SOUTH KINGSTOWN',
    'WAKEFIELD': 'SOUTH KINGSTOWN', 'WEDT KINGSTON': 'SOUTH KINGSTOWN',
    'WEST KINGSTON': 'SOUTH KINGSTOWN',
    'WARIWCK': 'WARWICK', 'WAWICK': 'WARWICK',
    'WARWICK. RHODE ISLAND': 'WARWICK',
    'W GREENWICH': 'WEST GREENWICH', 'WSET GREENWICH': 'WEST GREENWICH',
    'WEST WARWWICK': 'WEST WARWICK',
    'BRADFORD': 'WESTERLY',
}
RI_df['contbr_city'] = RI_df['contbr_city'].replace(city_fixes)
RI_df['contbr_city'].value_counts()
# RI county for every town/city, written county-first (each county lists its
# towns) and inverted into the town -> county lookup used below.
_towns_by_county = {
    'BRISTOL': ['BARRINGTON', 'BRISTOL', 'WARREN'],
    'KENT': ['COVENTRY', 'EAST GREENWICH', 'WARWICK',
             'WEST GREENWICH', 'WEST WARWICK'],
    'NEWPORT': ['JAMESTOWN', 'LITTLE COMPTON', 'MIDDLETOWN', 'NEWPORT',
                'PORTSMOUTH', 'TIVERTON'],
    'PROVIDENCE': ['BURRILLVILLE', 'CENTRAL FALLS', 'CRANSTON', 'CUMBERLAND',
                   'EAST PROVIDENCE', 'FOSTER', 'GLOCESTER', 'JOHNSTON',
                   'LINCOLN', 'NORTH PROVIDENCE', 'NORTH SMITHFIELD',
                   'PAWTUCKET', 'PROVIDENCE', 'SCITUATE', 'SMITHFIELD',
                   'WOONSOCKET'],
    'WASHINGTON': ['CHARLESTOWN', 'EXETER', 'HOPKINTON', 'NARRAGANSETT',
                   'NEW SHOREHAM', 'NORTH KINGSTOWN', 'RICHMOND',
                   'SOUTH KINGSTOWN', 'WESTERLY'],
}
county_map = {town: county
              for county, towns in _towns_by_county.items()
              for town in towns}
# Tag each RI donation with its county via the city -> county lookup
# (the original comment said "party" here, a copy-paste slip), then save
# the cleaned data for later use.
RI_df['County'] = RI_df['contbr_city'].map(county_map)
RI_df['County'].value_counts()
RI_df.to_csv('RI_Pres_Election_2016.csv')