# For data
import pandas as pd
from pandas import Series,DataFrame
import numpy as np
# For visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline
from __future__ import division
import datetime
#use to get info from web
import requests
#stringIO to work with csv
from StringIO import StringIO
df = pd.read_csv("RI_Pres_Election_2016.csv", parse_dates=['contb_receipt_dt'])
df.head()
df.info()
sum_df = df.groupby(['contb_receipt_dt'],as_index=False).sum()
sum_df.head()
sum_df.plot('contb_receipt_dt','contb_receipt_amt',figsize=(12,6),marker='',linestyle='-',color='purple',
xlim=('2016-01-01','2016-11-08'))
count_df = df.groupby(['contb_receipt_dt'],as_index=False).count()
count_df.plot('contb_receipt_dt','contb_receipt_amt',figsize=(12,6),marker='',linestyle='-',color='purple',
xlim=('2016-01-01','2016-11-08'))
mean_df = df.groupby(['contb_receipt_dt'],as_index=False).mean()
mean_df.plot('contb_receipt_dt','contb_receipt_amt',figsize=(12,6),marker='',linestyle='-',color='purple',
xlim=('2016-01-01','2016-11-08'),ylim=(0,1000))
df.groupby('Party')['contb_receipt_amt'].sum().sort_values().plot(kind='bar')
df['contb_receipt_amt'].describe()
df.groupby('cand_nm')['contb_receipt_amt'].count().sort_values(ascending=False)
df['contb_receipt_amt'].value_counts()
cand_df = df.groupby("cand_nm").sum().sort_values(by="contb_receipt_amt",ascending=False)
cand_df = cand_df.drop(['contbr_zip','file_num'], axis=1)
cand_df
count_df = df.groupby("cand_nm").count().sort_values(by="contb_receipt_amt",ascending=False)
count_df = count_df.drop(['contbr_zip','file_num'], axis=1)
count_df['cmte_id']
total_donations = df.groupby("cand_nm").sum().sort_values(by="contb_receipt_amt",ascending=True)
total_donations["contb_receipt_amt"].plot(kind="bar",figsize=(12,5))
avg_donations = df.groupby("cand_nm").mean().sort_values(by="contb_receipt_amt")
avg_donations["contb_receipt_amt"].plot(kind="bar",figsize=(12,5))
# Getting a dataframe of just bernie and Hillary
dem_df = df[df['cand_nm'].isin(['Sanders, Bernard', 'Clinton, Hillary Rodham'])]
# only getting values under 100
don_df_15 = dem_df[dem_df.contb_receipt_amt < 15]
don_df_25 = dem_df[dem_df.contb_receipt_amt < 25]
don_df_35 = dem_df[dem_df.contb_receipt_amt < 35]
don_df_50 = dem_df[dem_df.contb_receipt_amt < 50]
don_df_75 = dem_df[dem_df.contb_receipt_amt < 75]
don_df_100 = dem_df[dem_df.contb_receipt_amt < 100]
# Grouping by canidate and totaling the ammount
don_df_15 = don_df_15.groupby("cand_nm").sum().sort_values(by="contb_receipt_amt")
don_df_25 = don_df_25.groupby("cand_nm").sum().sort_values(by="contb_receipt_amt")
don_df_35 = don_df_35.groupby("cand_nm").sum().sort_values(by="contb_receipt_amt")
don_df_50 = don_df_50.groupby("cand_nm").sum().sort_values(by="contb_receipt_amt")
don_df_75 = don_df_75.groupby("cand_nm").sum().sort_values(by="contb_receipt_amt")
don_df_100 = don_df_100.groupby("cand_nm").sum().sort_values(by="contb_receipt_amt")
# Concatinating the datasets together
frames = [don_df_15, don_df_25, don_df_35, don_df_50, don_df_75, don_df_100]
dem_concat = pd.concat(frames, keys=['15', '25', '35', '50', '75', '100'])
#resetting the index and dropping the columns we don't need
dem_concat = dem_concat.reset_index()
dem_concat = dem_concat.drop('contbr_zip', axis=1)
dem_concat = dem_concat.drop('file_num', axis=1)
dem_concat
# Pivoting by the amt ranges
dem_concat = dem_concat.pivot_table('contb_receipt_amt',index='level_0',columns = 'cand_nm',aggfunc='sum')
new_index= ['15', '25', '35', '50', '75', '100']
dem_concat = dem_concat.reindex(new_index)
dem_concat.head()
dem_concat[['Clinton, Hillary Rodham','Sanders, Bernard']].plot(kind='bar',figsize=(12,4))
plt.xlabel('Ammount')
locs, labels = plt.xticks()
plt.setp(labels, rotation=360)
plt.title('Bernie vs. Clinton')
df['contbr_city'].value_counts()
city_df = df.pivot_table('contb_receipt_amt',index='contbr_city',columns = 'Party',aggfunc='sum')
city_df["Total"] = city_df.sum(axis=1)
city_df = city_df.sort_values(by="Total",ascending=True)
city_df.head()
city_df[['Democrat','Republican']].plot(kind='barh',figsize=(10,16),cmap='bwr')
city_mean = df.pivot_table('contb_receipt_amt',index='contbr_city',columns = 'Party',aggfunc='mean')
city_mean["Total"] = city_mean.sum(axis=1)
city_mean = city_mean.sort_values(by="Total",ascending=True)
city_mean[['Democrat','Republican']].plot(kind='barh',figsize=(10,16),cmap='bwr')
city_count = df.pivot_table('contb_receipt_amt',index='contbr_city',columns = 'Party',aggfunc='count')
city_count["Total"] = city_count.sum(axis=1)
city_count = city_count.sort_values(by="Total",ascending=True)
city_count[['Democrat','Republican']].plot(kind='barh',figsize=(10,16),cmap='bwr')
occupation_df = df.pivot_table('contb_receipt_amt',index='contbr_occupation',columns = 'Party',aggfunc='sum')
occupation_df["Total"] = occupation_df.sum(axis=1)
occupation_df = occupation_df.sort_values(by="Total",ascending=True)
occupation_df.shape
occupation_df = occupation_df[occupation_df['Total'] > 10000]
# remove these rows
occupation_df.drop(['INFORMATION REQUESTED', 'RETIRED'],axis=0,inplace=True)
occupation_df.shape
occupation_df[['Democrat','Republican']].plot(kind='barh',figsize=(10,16),cmap='bwr')
occupation_count = df.pivot_table('contb_receipt_amt',index='contbr_occupation',columns = 'Party',aggfunc='count')
occupation_count["Total"] = occupation_count.sum(axis=1)
occupation_count = occupation_count.sort_values(by="Total",ascending=True)
occupation_count.shape
occupation_count = occupation_count[occupation_count['Total'] > 100]
occupation_count.drop(['INFORMATION REQUESTED', 'RETIRED','DISABLED'],axis=0,inplace=True)
occupation_count.loc['REGISTERED NURSE'] = occupation_count.loc['REGISTERED NURSE'] + occupation_count.loc['RN']
occupation_count.drop('RN', inplace=True)
occupation_count.shape
occupation_count[['Democrat','Republican']].plot(kind='barh',figsize=(10,16),cmap='bwr')
employer_df = df.pivot_table('contb_receipt_amt',index='contbr_employer',columns = 'Party',aggfunc='sum')
employer_df["Total"] = employer_df.sum(axis=1)
employer_df = employer_df[employer_df['Total'] > 5000]
employer_df.shape
employer_df.drop(['INFORMATION REQUESTED PER BEST EFFORTS', 'INFORMATION REQUESTED', 'RETIRED'],axis=0,inplace=True)
# combine like rows
employer_df.loc['SELF-EMPLOYED'] = employer_df.loc['SELF-EMPLOYED'] + employer_df.loc['SELF EMPLOYED'] + employer_df.loc['SELF']
employer_df.loc['NOT EMPLOYED'] = employer_df.loc['NOT EMPLOYED'] + employer_df.loc['NONE']
# remove self employed without dash
employer_df.drop('SELF EMPLOYED',inplace=True)
employer_df.drop('SELF',inplace=True)
employer_df.drop('NONE',inplace=True)
# These were far to large to allow us to see anything else
employer_df.drop('HOMEMAKER',inplace=True)
employer_df.drop('NOT EMPLOYED',inplace=True)
employer_df.drop('SELF-EMPLOYED',inplace=True)
employer_df = employer_df.sort_values(by="Total",ascending=True)
employer_df[['Democrat','Republican']].plot(kind='barh',figsize=(10,16),cmap='bwr')
employer_cnt = df.pivot_table('contb_receipt_amt',index='contbr_employer',columns = 'Party',aggfunc='count')
employer_cnt["Total"] = employer_cnt.sum(axis=1)
employer_cnt = employer_cnt[employer_cnt['Total'] > 50]
employer_cnt.shape
employer_cnt.drop(['INFORMATION REQUESTED PER BEST EFFORTS', 'INFORMATION REQUESTED', 'RETIRED'],axis=0,inplace=True)
# combine like rows
employer_cnt.loc['SELF-EMPLOYED'] = employer_cnt.loc['SELF-EMPLOYED'] + employer_cnt.loc['SELF EMPLOYED'] + employer_cnt.loc['SELF'] + employer_cnt.loc['SELF EMPLOYED-EMPLOYED']
employer_cnt.loc['NOT EMPLOYED'] = employer_cnt.loc['NOT EMPLOYED'] + employer_cnt.loc['NONE'] + employer_cnt.loc['UNEMPLOYED']
employer_cnt.loc['UNIVERSITY OF RHODE ISLAND'] = employer_cnt.loc['UNIVERSITY OF RHODE ISLAND'] + employer_cnt.loc['UNIVERSITY OF RHODE ISLAND']
# remove self employed without dash
employer_cnt.drop('SELF EMPLOYED',inplace=True)
employer_cnt.drop('SELF EMPLOYED-EMPLOYED',inplace=True)
employer_cnt.drop('SELF',inplace=True)
employer_cnt.drop('NONE',inplace=True)
employer_cnt.drop('UNEMPLOYED',inplace=True)
employer_cnt.drop('UNIVERSITY OF RI',inplace=True)
# These were far to large to allow us to see anything else
employer_cnt.drop('NOT EMPLOYED',inplace=True)
employer_cnt.drop('SELF-EMPLOYED',inplace=True)
employer_cnt = employer_cnt.sort_values(by="Total",ascending=True)
employer_cnt[['Democrat','Republican']].plot(kind='barh',figsize=(10,16),cmap='bwr')
df['County'].value_counts()
county_df = df.pivot_table('contb_receipt_amt',index='County',columns = 'Party',aggfunc='sum')
county_df["Total"] = county_df.sum(axis=1)
county_df = county_df.sort_values(by="Total",ascending=True)
county_df[['Democrat','Republican']].plot(kind='barh',figsize=(12,6),cmap='bwr')
county_df = df.pivot_table('contb_receipt_amt',index='County',columns = 'Party',aggfunc='mean')
county_df["Total"] = county_df.mean(axis=1)
county_df = county_df.sort_values(by="Total",ascending=True)
county_df[['Democrat','Republican']].plot(kind='barh',figsize=(12,6),cmap='bwr')
county_df = df.pivot_table('contb_receipt_amt',index='County',columns = 'Party',aggfunc='count')
county_df["Total"] = county_df.sum(axis=1)
county_df = county_df.sort_values(by="Total",ascending=True)
county_df[['Democrat','Republican']].plot(kind='barh',figsize=(12,6),cmap='bwr')