Merge pull request #2088 from stared/patch-1

Statistical computing with Python - PEP8 fixes
This commit is contained in:
Geoff Liu 2016-01-03 13:47:09 -07:00
commit 5a2af22a27

View File

@ -9,6 +9,8 @@ This is a tutorial on how to do some typical statistical programming tasks using
```python ```python
# 0. Getting set up ==== # 0. Getting set up ====
""" Get set up with IPython and pip install the following: numpy, scipy, pandas, """ Get set up with IPython and pip install the following: numpy, scipy, pandas,
@ -35,7 +37,7 @@ r.text # raw page source
print(r.text) # prettily formatted print(r.text) # prettily formatted
# save the page source in a file: # save the page source in a file:
os.getcwd() # check what's the working directory os.getcwd() # check what's the working directory
f = open("learnxinyminutes.html","wb") f = open("learnxinyminutes.html", "wb")
f.write(r.text.encode("UTF-8")) f.write(r.text.encode("UTF-8"))
f.close() f.close()
@ -44,7 +46,7 @@ fp = "https://raw.githubusercontent.com/adambard/learnxinyminutes-docs/master/"
fn = "pets.csv" fn = "pets.csv"
r = requests.get(fp + fn) r = requests.get(fp + fn)
print(r.text) print(r.text)
f = open(fn,"wb") f = open(fn, "wb")
f.write(r.text.encode("UTF-8")) f.write(r.text.encode("UTF-8"))
f.close() f.close()
@ -58,7 +60,9 @@ f.close()
you've used R, you will be familiar with the idea of the "data.frame" already. you've used R, you will be familiar with the idea of the "data.frame" already.
""" """
import pandas as pd, numpy as np, scipy as sp import pandas as pd
import numpy as np
import scipy as sp
pets = pd.read_csv(fn) pets = pd.read_csv(fn)
pets pets
# name age weight species # name age weight species
@ -86,7 +90,7 @@ pets.age[0:2]
# 0 3 # 0 3
# 1 6 # 1 6
sum(pets.age)*2 # 28 sum(pets.age) * 2 # 28
max(pets.weight) - min(pets.weight) # 20 max(pets.weight) - min(pets.weight) # 20
""" If you are doing some serious linear algebra and number-crunching, you may """ If you are doing some serious linear algebra and number-crunching, you may
@ -96,7 +100,8 @@ max(pets.weight) - min(pets.weight) # 20
# 3. Charts ==== # 3. Charts ====
import matplotlib as mpl, matplotlib.pyplot as plt import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline %matplotlib inline
# To do data visualization in Python, use matplotlib # To do data visualization in Python, use matplotlib
@ -105,13 +110,17 @@ plt.hist(pets.age);
plt.boxplot(pets.weight); plt.boxplot(pets.weight);
plt.scatter(pets.age, pets.weight); plt.xlabel("age"); plt.ylabel("weight"); plt.scatter(pets.age, pets.weight)
plt.xlabel("age")
plt.ylabel("weight");
# seaborn sits atop matplotlib and makes plots prettier # seaborn sits atop matplotlib and makes plots prettier
import seaborn as sns import seaborn as sns
plt.scatter(pets.age, pets.weight); plt.xlabel("age"); plt.ylabel("weight"); plt.scatter(pets.age, pets.weight)
plt.xlabel("age")
plt.ylabel("weight");
# there are also some seaborn-specific plotting functions # there are also some seaborn-specific plotting functions
# notice how seaborn automatically labels the x-axis on this barplot # notice how seaborn automatically labels the x-axis on this barplot
@ -141,7 +150,7 @@ ggplot(aes(x="age",y="weight"), data=pets) + geom_point() + labs(title="pets")
url = "https://raw.githubusercontent.com/e99n09/R-notes/master/data/hre.csv" url = "https://raw.githubusercontent.com/e99n09/R-notes/master/data/hre.csv"
r = requests.get(url) r = requests.get(url)
fp = "hre.csv" fp = "hre.csv"
f = open(fp,"wb") f = open(fp, "wb")
f.write(r.text.encode("UTF-8")) f.write(r.text.encode("UTF-8"))
f.close() f.close()
@ -185,8 +194,9 @@ rx = re.compile(r'\d+$') # match trailing digits
- http://stackoverflow.com/questions/11860476/how-to-unlist-a-python-list - http://stackoverflow.com/questions/11860476/how-to-unlist-a-python-list
- http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html - http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html
""" """
def extractYear(v): def extractYear(v):
return(pd.Series(reduce(lambda x,y: x+y,map(rx.findall,v),[])).astype(int)) return(pd.Series(reduce(lambda x, y: x + y, map(rx.findall, v), [])).astype(int))
hre["BirthY"] = extractYear(hre.Birth) hre["BirthY"] = extractYear(hre.Birth)
hre["DeathY"] = extractYear(hre.Death) hre["DeathY"] = extractYear(hre.Death)
@ -199,7 +209,7 @@ sns.lmplot("BirthY", "EstAge", data=hre, hue="Dynasty", fit_reg=False);
# use scipy to run a linear regression # use scipy to run a linear regression
from scipy import stats from scipy import stats
(slope,intercept,rval,pval,stderr)=stats.linregress(hre.BirthY,hre.EstAge) (slope, intercept, rval, pval, stderr) = stats.linregress(hre.BirthY, hre.EstAge)
# code source: http://wiki.scipy.org/Cookbook/LinearRegression # code source: http://wiki.scipy.org/Cookbook/LinearRegression
# check the slope # check the slope
@ -223,6 +233,7 @@ sns.lmplot("BirthY", "EstAge", data=hre);
To see a version of the Holy Roman Emperors analysis using R, see To see a version of the Holy Roman Emperors analysis using R, see
- http://github.com/e99n09/R-notes/blob/master/holy_roman_emperors_dates.R - http://github.com/e99n09/R-notes/blob/master/holy_roman_emperors_dates.R
""" """
``` ```
If you want to learn more, get _Python for Data Analysis_ by Wes McKinney. It's a superb resource and I used it as a reference when writing this tutorial. If you want to learn more, get _Python for Data Analysis_ by Wes McKinney. It's a superb resource and I used it as a reference when writing this tutorial.