Data Analysis with Python

# 1. import relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas_datareader as pdr
%matplotlib inline

# 2. Fetch AMZN daily price history from 2016-08-03 onward into a DataFrame
# NOTE(review): the Google Finance reader has reportedly been deprecated in later
# pandas-datareader releases -- confirm it still works with the installed version
amzn = pdr.get_data_google('AMZN', start='2016-08-03')

# 3. Preview the first five rows of the download
amzn.head(5)

# 4. Keep only the closing-price column
amzn_close = amzn.loc[:, 'Close']

# 5. Confirm that the extracted column is a pandas Series
type(amzn_close.head())

pandas.core.series.Series

# 6. Turn consecutive closing prices into continuously compounded (log) returns, in percent;
# the first row has no prior close to compare against, so it comes out as NaN
previous_close = amzn_close.shift(1)
amzn_percent_change = 100 * np.log(amzn_close / previous_close)
amzn_percent_change.head()

Date
2016-08-03         NaN
2016-08-04    0.809026
2016-08-05    0.682498
2016-08-08    0.075691
2016-08-09    0.228032
Name: Close, dtype: float64

# 7. Series.describe() reports count, mean, std, min, quartiles and max.
# The std is the sample standard deviation, and missing values (here the leading NaN) are omitted.
amzn_percent_change.describe()

count    252.000000
mean       0.106487
std        1.194826
min       -5.273753
25%       -0.484134
50%        0.117312
75%        0.811309
max        3.881057
Name: Close, dtype: float64

# 8. scipy.stats.describe gives a fuller summary (adds variance, skewness and kurtosis);
# it does not skip NaN by default, so the leading missing return is sliced off first
from scipy import stats
stats.describe(amzn_percent_change.iloc[1:])

DescribeResult(nobs=252, minmax=(-5.2737531271995062, 3.8810569610031007), mean=0.10648736829713538, variance=1.4276100656074371, skewness=-0.5610474712532317, kurtosis=2.3600263221846784)

# 9. For comparison, draw normally distributed random numbers whose mean, sample standard
# deviation and sample size are taken from the AMZN returns themselves. Hand-copied values
# go stale as soon as the download covers a different number of trading days.
amzn_returns = amzn_percent_change.dropna()
x = np.random.normal(amzn_returns.mean(), amzn_returns.std(), len(amzn_returns))
stats.describe(x)

DescribeResult(nobs=252, minmax=(-3.530203911382729, 3.0549917950041023), mean=0.18017895760082278, variance=1.364946888190105, skewness=-0.05145305633122407, kurtosis=-0.1534361303525733)

# 10. Kurtosis test from scipy stats. Null hypothesis: the sample comes from a population
# whose kurtosis equals that of a normally distributed variable.
normal_sample_result = stats.kurtosistest(x)
amzn_result = stats.kurtosistest(amzn_percent_change[1:])
print("x: ", normal_sample_result)
print("amzn: ", amzn_result)

x:  KurtosistestResult(statistic=-0.33790480941260126, pvalue=0.73543492313774006)
amzn:  KurtosistestResult(statistic=4.2481322914200996, pvalue=2.1556009784047011e-05)

# 11. plot a density-normalised histogram of the percent changes with a normal curve overlay
import matplotlib.mlab as mlab  # kept so any later cell that still references mlab keeps working
from scipy import stats

# drop the leading NaN once so the histogram and the overlay use the same observations
amzn_returns = amzn_percent_change.dropna()

# `density=True` replaces the `normed=True` keyword, which newer Matplotlib releases removed
plt.hist(amzn_returns, edgecolor='black', density=True)

# overlay normal curve using the sample mean and the sample (ddof=1) standard deviation
mean = float(amzn_returns.mean())
sd = float(amzn_returns.std(ddof=1))
min_chng = float(amzn_returns.min())
max_chng = float(amzn_returns.max())
a = np.linspace(min_chng, max_chng, 100)
# scipy.stats.norm.pdf stands in for matplotlib.mlab.normpdf, which newer Matplotlib no longer provides
plt.plot(a, stats.norm.pdf(a, mean, sd))

[<matplotlib.lines.Line2D at 0x115a03e80>]

# 12. compare to sample drawn from a normally distributed population with same attributes as amzn price change
from scipy import stats

# `density=True` replaces the `normed=True` keyword, which newer Matplotlib releases removed
plt.hist(x, density=True)

mean = float(np.mean(x))
# ddof=1 gives the sample standard deviation, matching the AMZN overlay in step 11
sd = float(np.std(x, ddof=1))
b = np.linspace(np.min(x), np.max(x), 100)
# scipy.stats.norm.pdf stands in for matplotlib.mlab.normpdf, which newer Matplotlib no longer provides
plt.plot(b, stats.norm.pdf(b, mean, sd))
plt.show()

	Open	High	Low	Close	Volume
Date
2016-08-03	757.06	758.89	752.25	754.64	3581525
2016-08-04	753.70	765.00	750.35	760.77	3178208
2016-08-05	764.81	768.47	763.09	765.98	2704391
2016-08-08	766.81	767.00	761.02	766.56	1986272
2016-08-09	767.39	772.60	766.90	768.31	1876091