Ten Thousand Characters of Grateful Transcription a Day

I get stronger by transcribing ten thousand characters of gratitude a day. Eventually I plan to leave the books behind and start building something of my own.

Python言語によるプログラミングイントロダクション 第2版: Transcription (Chapter 17)

 世界標準MIT教科書 Python言語によるプログラミングイントロダクション第2版: データサイエンスとアプリケーション

I transcribed the book above, so I'm publishing the record here. This article covers Chapter 17.

Since I'm pasting exactly what I ran in Jupyter Lab, this may also serve as a small errata.

The modules used and the pylab settings are as follows.

import pylab
import random
import statistics as stat
import scipy as sp
import scipy.integrate
import scipy.stats

pylab.rc('figure', dpi=200)  # render figures at a higher resolution

Transcription

17.1 Sampling the Boston Marathon

For open and readlines I referred to "7. Input and Output" in the Python documentation. Using the csv module would definitely be simpler, but to practice a different way of working with files I follow the text as written this time, except that I use with. (A csv-based sketch appears after code 17.2 below.)

code 17.2

def getBMData(filename):
    data = {'name': [], 'gender': [], 'age':[], 'division': [], 'country': [], 'time': []}
    with open(filename) as f:
        for line in f.readlines():
            split = line.split(',')
            data['name'].append(split[0])
            data['gender'].append(split[1])
            data['age'].append(split[2])
            data['division'].append(split[3])
            data['country'].append(split[4])
            data['time'].append(float(split[5][:-1]))  # [:-1] drops the trailing newline
    return data

def makeHist(data, bins, title, xLabel, yLabel):
    pylab.hist(data, bins)
    pylab.title(title)
    pylab.xlabel(xLabel)
    pylab.ylabel(yLabel)
    mean = stat.mean(data)
    std = stat.stdev(data)
    pylab.annotate('Mean = ' + str(round(mean, 2)) + '\nSD = ' + str(round(std, 2)), fontsize=20, xy=(0.65, 0.75), xycoords = 'axes fraction')
bmResults2012Txt = 'bm_results2012.txt'
times = getBMData(bmResults2012Txt)['time']
makeHist(times, 20, '2012 Boston Marathon', 'Minutes to Complete Race', 'Number of Runners')

f:id:koheitsutsumi223:20190217220404p:plain
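
For reference, here is a minimal sketch (my own, not from the text) of the csv-module version mentioned above, assuming the same six comma-separated columns and no header row; getBMDataCsv is a hypothetical name.

import csv

def getBMDataCsv(filename):
    # Same dict-of-lists structure as getBMData, but csv.reader handles the
    # splitting and the trailing newline for us.
    data = {'name': [], 'gender': [], 'age': [], 'division': [], 'country': [], 'time': []}
    with open(filename, newline='') as f:
        for name, gender, age, division, country, time in csv.reader(f):
            data['name'].append(name)
            data['gender'].append(gender)
            data['age'].append(age)
            data['division'].append(division)
            data['country'].append(country)
            data['time'].append(float(time))
    return data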

code 17.3

def sampleTimes(times, numExamples):
    sample = random.sample(times, numExamples)
    makeHist(sample, 10, 'Sample of Size ' + str(numExamples), 'Minutes to Complete Race', 'Number of Runners')
sampleSize = 40
sampleTimes(times, sampleSize)

f:id:koheitsutsumi223:20190217220407p:plain

code 17.4

area = round(scipy.integrate.quad(sp.stats.norm.pdf, -3, 3, (0, 1))[0], 4)
print('Probability of being within 3', 'of true mean of tight dist. =', area)
area = round(scipy.integrate.quad(sp.stats.norm.pdf, -3, 3, (0, 100))[0], 4)
print('Probability of being within 3', 'of true mean of wide dist. =', area)
Probability of being within 3 of true mean of tight dist. = 0.9973
Probability of being within 3 of true mean of wide dist. = 0.0239
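
As a quick cross-check (my addition, not in the text), the same two probabilities can be computed from the normal CDF instead of integrating the pdf; scipy.stats.norm.cdf accepts the same loc and scale arguments as the pdf used above.

for sd in (1, 100):
    # P(-3 < X < 3) for a normal distribution with mean 0 and standard deviation sd
    p = sp.stats.norm.cdf(3, 0, sd) - sp.stats.norm.cdf(-3, 0, sd)
    print('SD =', sd, 'P(|X| < 3) =', round(p, 4))

Up to rounding this should reproduce the 0.9973 and 0.0239 printed by code 17.4.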

code 17.5

def testSamples(numTrials, sampleSize):
    tightMeans, wideMeans = [], []
    for t in range(numTrials):
        sampleTight, sampleWide = [], []
        for i in range(sampleSize):
            sampleTight.append(random.gauss(0, 1))
            sampleWide.append(random.gauss(0, 100))
        tightMeans.append(stat.mean(sampleTight))
        wideMeans.append(stat.mean(sampleWide))
    return tightMeans, wideMeans


tightMeans, wideMeans = testSamples(1000, 40)
pylab.plot(wideMeans, 'y*', label='SD = 100')
pylab.plot(tightMeans, 'bo', label='SD = 1')
pylab.xlabel('Sample Number')
pylab.ylabel('Sample Mean')
pylab.title('Means of Samples of Size ' + str(40))
pylab.legend()

pylab.figure()
pylab.hist(wideMeans, bins=20, label='SD = 100')
pylab.title('Distribution of Sample Means')
pylab.xlabel('Sample Mean')
pylab.ylabel('Frequency of Occurrence')
pylab.legend()
    <matplotlib.legend.Legend at 0x17d790c4c18>

f:id:koheitsutsumi223:20190217220420p:plain f:id:koheitsutsumi223:20190217220422p:plain
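
As a small numeric companion to the plots (my addition), the spread of the two lists of sample means can also be compared directly. For samples of size 40 they should land near 1/sqrt(40), about 0.16, and 100/sqrt(40), about 15.8, which is exactly the standard-error-of-the-mean idea that shows up in 17.3.

print('SD of tight sample means =', round(stat.stdev(tightMeans), 4))
print('SD of wide sample means =', round(stat.stdev(wideMeans), 4))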

17.2 The Central Limit Theorem

code 17.6

def plotMeans(numDicePerTrial, numDiceThrown, numBins, legend, color, style):
    means = []
    numTrials = numDiceThrown//numDicePerTrial
    for i in range(numTrials):
        vals = 0
        for j in range(numDicePerTrial):
            vals += 5*random.random()
        means.append(vals/numDicePerTrial)
    pylab.hist(means, numBins, color=color, label=legend, weights=pylab.array(len(means)*[1])/len(means), hatch=style)
    return stat.mean(means), stat.variance(means)

mean, var = plotMeans(1, 100000, 11, '1 die', 'w', '*')
print('Mean of rolling 1 die =', round(mean, 4), 'Variance =', round(var, 4))
mean, var = plotMeans(100, 100000, 11, 'Mean of 100 die', 'w', '//')
print('Mean of rolling 100 die =', round(mean, 4), 'Variance =', round(var, 4))
pylab.title('Rolling Continuous Dice')
pylab.xlabel('Value')
pylab.ylabel('Probability')
pylab.legend()
    Mean of rolling 1 die = 2.0828 Variance = 2.0828
    Mean of rolling 100 die = 0.0215 Variance = 0.0215

    <matplotlib.legend.Legend at 0x215d97df7b8>

f:id:koheitsutsumi223:20190223224904p:plain

Memo

weights=pylab.array(len(means)*[1])/len(means) can also be written as len(means)*[1/len(means)]. The former applies the operation /len(means) to every element of a numpy.ndarray at once, producing an ndarray of length len(means) whose elements are all 1/len(means). The same thing can be done with the latter, len(means)*[1/len(means)], and it is easier to read.

Below is a quick check of dividing a numpy.ndarray by a scalar, followed by a version of the code that uses len(means)*[1/len(means)].

array = pylab.array([1, 2, 3])
type(array)
numpy.ndarray
array/3
array([0.33333333, 0.66666667, 1.        ])
means = [2, 2, 3, 2, 3, 3, 4, 2, 1]
len(means)*[1/len(means)]
[0.1111111111111111,
 0.1111111111111111,
 0.1111111111111111,
 0.1111111111111111,
 0.1111111111111111,
 0.1111111111111111,
 0.1111111111111111,
 0.1111111111111111,
 0.1111111111111111]
def plotMeans(numDicePerTrial, numDiceThrown, numBins, legend, color, style):
    means = []
    numTrials = numDiceThrown//numDicePerTrial
    for i in range(numTrials):
        vals = 0
        for j in range(numDicePerTrial):
            vals += 5*random.random()
        means.append(vals/numDicePerTrial)
    pylab.hist(means, numBins, color=color, label=legend, weights=len(means)*[1/len(means)], hatch=style)
    return stat.mean(means), stat.variance(means)

mean, var = plotMeans(1, 100000, 11, '1 die', 'w', '*')
print('Mean of rolling 1 die =', round(mean, 4), 'Variance =', round(var, 4))
mean, var = plotMeans(100, 100000, 11, 'Mean of 100 die', 'w', '//')
print('Mean of rolling 100 die =', round(mean, 4), 'Variance =', round(var, 4))
pylab.title('Rolling Continuous Dice')
pylab.xlabel('Value')
pylab.ylabel('Probability')
pylab.legend()
    Mean of rolling 1 die = 2.0798 Variance = 2.0798
    Mean of rolling 100 die = 0.0208 Variance = 0.0208

    <matplotlib.legend.Legend at 0x215d9e76b38>

f:id:koheitsutsumi223:20190223224907p:plain

weights takes an array with the same shape as the first argument of pylab.hist (for now I picture this as something list-like of the same length). Each value in weights weights the corresponding value of the first argument of pylab.hist.

pylab.hist([1, 1, 2], weights=[10, 0, 3])
    (array([10.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  3.]),
     array([1. , 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. ]),
     <a list of 10 Patch objects>)

f:id:koheitsutsumi223:20190223225003p:plain

Apparently, weights can also be given negative values.

pylab.hist([1, 1, 2], weights=[10, -12, 3])
    (array([-2.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  3.]),
     array([1. , 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. ]),
     <a list of 10 Patch objects>)

f:id:koheitsutsumi223:20190223225008p:plain

code 17.7

def plotEstOfMeanFinTime(minimum=50, maximum=2000, interval=200, numPerSampleSize=20):
    meanOfMeans, stdOfMeans = [], []
    sampleSizes = range(minimum, maximum, interval)
    for sampleSize in sampleSizes:
        sampleMeans = []
        for t in range(numPerSampleSize):
            sample = random.sample(times, sampleSize)
            sampleMeans.append(stat.mean(sample))
        meanOfMeans.append(stat.mean(sampleMeans))
        stdOfMeans.append(stat.stdev(sampleMeans))
    pylab.errorbar(sampleSizes, meanOfMeans, yerr=1.96*pylab.array(stdOfMeans), label='Estimated mean and 95% confidence interval')
    pylab.xlim(0, max(sampleSizes) + 50)
    pylab.axhline(stat.mean(times), linestyle='--', label='Population mean')
    pylab.title('Estimates of Mean Finishing Time')
    pylab.xlabel('Sample Size')
    pylab.ylabel('Finishing Time (minutes)')
    pylab.legend(loc='best')
plotEstOfMeanFinTime(numPerSampleSize=20)

f:id:koheitsutsumi223:20190223225049p:plain
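
The 1.96 multiplier on stdOfMeans is what makes the error bars a 95% confidence interval: for a normal distribution, roughly 95% of the probability mass lies within 1.96 standard deviations of the mean. A quick check (my addition) using the same integration approach as code 17.4:

area = round(scipy.integrate.quad(sp.stats.norm.pdf, -1.96, 1.96, (0, 1))[0], 4)
print('Probability of being within 1.96 SDs =', area)  # should come out around 0.95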

When I sampled 200 times for each sampleSize, the variance does indeed decrease steadily.

plotEstOfMeanFinTime(numPerSampleSize=200)

f:id:koheitsutsumi223:20190223225054p:plain

Memo

I tried plotting Figure 17.6 from the text.

def oneOverSquare(x):
    return 1/(x)**0.5

popStd = stat.pstdev(times)

def plotSEvsSD(minimum=1, maximum=2000, interval=50, numPerSampleSize=20):
    stdOfMeans = []
    sampleSizes = range(minimum, maximum, interval)
    for sampleSize in sampleSizes:
        sampleMeans = []
        for t in range(numPerSampleSize):
            sample = random.sample(times, sampleSize)
            sampleMeans.append(stat.mean(sample))
        stdOfMeans.append(stat.stdev(sampleMeans))
    pylab.plot(sampleSizes, stdOfMeans, label=f'Std of {numPerSampleSize} Means')
    pylab.plot(range(minimum, maximum), popStd*oneOverSquare(pylab.array([i for i in range(minimum, maximum)])), linestyle='--', label='SEM')
    pylab.xlim(min(sampleSizes), max(sampleSizes) + interval)
    pylab.title(f'SE vs. SD of {numPerSampleSize} Means')
    pylab.xlabel('Sample Size')
    pylab.ylabel('Standard Deviation')
    pylab.legend(loc='best')
plotSEvsSD()

f:id:koheitsutsumi223:20190223225100p:plain

As sampleSize grows, the standard deviation of the sample means tends to drop below the SEM. What is going on here? If you are confident in your statistics, please tell me. (One possible explanation is sketched after the next figure.)

plotSEvsSD(minimum=2000, maximum=20000, interval=1000, numPerSampleSize=100)

f:id:koheitsutsumi223:20190223225104p:plain
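
One possible explanation (my guess, not from the text): random.sample draws without replacement, so once sampleSize becomes a sizable fraction of len(times), the plain SEM formula popStd/sqrt(n) overstates the variability. Under sampling without replacement, the standard deviation of the sample mean is multiplied by the finite population correction sqrt((N - n)/(N - 1)), which pushes it below the SEM curve. A sketch of the corrected curve over the same range as the last plot:

def correctedSEM(n, popStd, popSize):
    # SEM times the finite population correction for sampling without replacement
    return (popStd/n**0.5)*((popSize - n)/(popSize - 1))**0.5

ns = pylab.array(range(2000, 20000, 1000))
pylab.plot(ns, popStd*oneOverSquare(ns), linestyle='--', label='SEM')
pylab.plot(ns, correctedSEM(ns, popStd, len(times)), linestyle=':',
           label='SEM with finite population correction')
pylab.xlabel('Sample Size')
pylab.ylabel('Standard Deviation')
pylab.legend(loc='best')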

17.3 Standard Error of the Mean

code 17.8

def plotSamSDvsPopSD(start=2, stop=200, step=2, numPerSample=100):
    popStd = stat.stdev(times)
    sampleSizes = range(start, stop, step)
    diffsMeans = []
    for sampleSize in sampleSizes:
        diffs = []
        for t in range(numPerSample):
            diffs.append(abs(popStd - stat.stdev(random.sample(times, sampleSize))))
        diffsMeans.append(stat.mean(diffs))
    pylab.plot(sampleSizes, diffsMeans)
    pylab.xlabel('Sample Size')
    pylab.ylabel('Abs(Pop. Std - Sample Std)')
    pylab.title('Sample SD vs Population SD')
plotSamSDvsPopSD()

f:id:koheitsutsumi223:20190223225108p:plain

code 17.9

def extOutsideConfInterval(sampleSize=200, numTrial=10000):
    popMean = stat.mean(times)
    numBad = 0
    for t in range(numTrial):
        sample = random.sample(times, sampleSize)
        sampleMean = stat.mean(sample)
        se = stat.stdev(sample)/sampleSize**0.5
        if abs(popMean - sampleMean) > 1.96*se:
            numBad += 1
    print(f'Fraction outside 95% confidence interval = {numBad/numTrial}')
extOutsideConfInterval()
Fraction outside 95% confidence interval = 0.0544