# For data
import pandas as pd
from pandas import Series,DataFrame
import numpy as np
# For visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
from __future__ import division
from datetime import datetime
# used to fetch data from the web
import requests
# StringIO wraps the downloaded CSV text so it can be read like a file
from StringIO import StringIO
# getting the data
# Download the Trump-vs-Clinton poll table (a .csv endpoint) and load it into
# a DataFrame named poll_df.
url = 'http://elections.huffingtonpost.com/pollster/2016-general-election-trump-vs-clinton.csv'
# A timeout keeps the script from hanging forever on an unresponsive host.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply; otherwise the body of an error page would be
# handed to read_csv below as if it were poll data.
response.raise_for_status()
source = response.text
# Wrap the text in a file-like object so pandas can parse it.
poll_data = StringIO(source)
# creating the dataframe
poll_df = pd.read_csv(poll_data)
poll_df.info()
# previewing the data
poll_df.head()
# Build poll_avg: one row per numeric poll column, with the column's mean
# ('Average') and standard deviation ('STD') across all polls.

# Rows removed from both summaries: these columns are numeric, but their
# mean/std is not wanted in the table.
unused_rows = ['Number of Observations', 'Question Iteration']

# averaging the data
# numeric_only=True restricts the reduction to numeric columns explicitly.
# The frame is read without date parsing and presumably carries text columns
# (e.g. 'Start Date'); recent pandas raises TypeError on those instead of
# silently skipping them.
avg = pd.DataFrame(poll_df.mean(numeric_only=True))
# dropping the info we don't need
avg.drop(unused_rows, axis=0, inplace=True)

# getting the standard deviation
std = pd.DataFrame(poll_df.std(numeric_only=True))
# dropping the same rows we don't need
std.drop(unused_rows, axis=0, inplace=True)

# combining them into a dataframe (side by side, aligned on the column names)
poll_avg = pd.concat([avg, std], axis=1)
# adding header
poll_avg.columns = ['Average', 'STD']
# viewing the average and standard deviation
poll_avg
# Adding a difference column to show the difference between Clinton and Trump in every poll.
# A POSITIVE value = leaning Clinton, a NEGATIVE value = leaning Trump.
# Dividing by 100.0 (a float) guarantees true division regardless of whether
# `from __future__ import division` is in effect.
poll_df['Difference'] = (poll_df.Clinton - poll_df.Trump) / 100.0
# Grouping all the polls by start date and averaging the numeric columns.
# as_index=False keeps 'Start Date' as a regular column for plotting.
# numeric_only=True states explicitly that text columns are excluded; recent
# pandas raises TypeError when asked to average them.
poll_df = poll_df.groupby(['Start Date'], as_index=False).mean(numeric_only=True)
poll_df.head()
# previewing the difference throughout the election
# Overview of the whole series: shorter figure, no point markers, default x range.
poll_df.plot('Start Date','Difference',figsize=(12,4),marker='',linestyle='-',color='purple')

# Styling shared by every zoomed-in view below.
zoom_style = dict(figsize=(12,6), marker='o', linestyle='-', color='purple')

# One entry per zoomed figure: (x-axis limits, [(line position, line colour), ...]).
# Positions are coordinates on the plot's x axis -- presumably row positions
# in the date-grouped frame; TODO confirm against the data.
# NOTE(review): only the three debate lines are labelled in the original; the
# meaning of the other positions and of the colour coding is not recorded here.
zoomed_views = [
    ((226, 262), [
        (227, 'grey'),    # sept 26 1st debate
        (240, 'grey'),    # oct 9th 2nd debate
        (250, 'grey'),    # oct 19th 3rd debate
    ]),
    ((159, 182), [
        (162, 'blue'), (167, 'blue'),
        (171, 'red'), (174, 'red'),
        (175, 'blue'), (180, 'blue'),
    ]),
    ((183, 208), [
        (189, 'blue'), (196, 'red'),
    ]),
    ((209, 231), [
        (215, 'blue'), (216, 'blue'),
        (221, 'grey'), (227, 'grey'),
    ]),
    ((232, 262), [
        # 238 is listed twice on purpose to mirror the original: two
        # half-transparent lines are drawn on top of each other there.
        (238, 'grey'), (238, 'grey'),
        (240, 'grey'), (243, 'red'),
        (250, 'grey'), (259, 'blue'),
    ]),
    ((257, 266), [
        (259, 'blue'),
    ]),
    ((134, 159), [
        (143, 'orange'), (152, 'cyan'), (157, 'orange'),
    ]),
    ((159, 182), [
        (169, 'orange'), (170, 'green'),
    ]),
    ((183, 208), [
        (185, 'green'), (200, 'green'), (203, 'green'),
    ]),
]

for x_window, event_lines in zoomed_views:
    # Each call opens a new figure, which becomes the target of plt.axvline.
    poll_df.plot('Start Date', 'Difference', xlim=x_window, **zoom_style)
    for x_pos, line_color in event_lines:
        plt.axvline(x=x_pos, linewidth=4, color=line_color, alpha=0.5)