@justinholman/

TermProject-ModelTest

Python

No description

fork
loading
Files
  • main.py
  • colorado-profile-county.csv
  • colorado.csv
  • estimates.csv
  • hist1.png
  • hist2.png
  • out.csv
  • scatter1.png
  • scatter2.png
  • veh.csv
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import csv
mpl.use('Agg')

# Load the per-county vehicle data the model is tested against.
vehdata = pd.read_csv('veh.csv')

# Linear model under test. Its output (yhat) is compared with the
# 'Veh Per 10k' column, so it is treated as a rate per 10,000 residents.
bachc = 2.40748873043362       # coefficient for pct_bachelorsorhigher
urbanc = 0.0844764032499319    # coefficient for poppct_urban
racec = -0.323391806760662     # coefficient for pct_racewhite
yintercept = 4.91914408164027  # constant term

mycolumns = ['county','actual','yhat','resid','vehpop','predvehpop','residpop']

# One output row per input row: observed vs. predicted rate and count.
output = []
for _, row in vehdata.iterrows():
  bach = row['pct_bachelorsorhigher']
  urban = row['poppct_urban']
  race = row['pct_racewhite']
  county = row['county']
  popest = row['popest']
  vehpop = row['Veh Pop']
  actual = row['Veh Per 10k']

  # Predicted rate, rounded to one decimal *before* it feeds the count
  # below, so the count is derived from the rounded rate.
  yhat = round(bach*bachc + urban*urbanc + race*racec + yintercept,1)
  resid = round(actual - yhat,1)

  # Scale the per-10k rate by popest (presumably the county population
  # estimate) to get a predicted vehicle count.
  predvehpop = round((yhat/10000)*popest,0)
  residpop = round(vehpop-predvehpop,1)

  output.append([county,actual,yhat,resid,vehpop,predvehpop,residpop])

# Naming the columns in the constructor (instead of assigning df.columns
# afterwards) keeps this working when veh.csv has a header but no rows.
df = pd.DataFrame(output, columns=mycolumns)
df.to_csv('out.csv')

def _save_current_figure(filename):
  """Write the current pyplot figure to *filename*, then clear it.

  The file is written before plt.show() is called: with a GUI backend,
  show() can leave an empty figure behind once the window is closed,
  which would make a later savefig() write a blank image.
  """
  plt.savefig(filename)
  plt.show()
  plt.clf()


# Distribution of predicted rates and of residuals.
# NOTE: sns.distplot is deprecated in newer seaborn releases; it is kept
# here so the generated histograms do not change.
sns.distplot(df['yhat'])
_save_current_figure('hist1.png')

sns.distplot(df['resid'])
_save_current_figure('hist2.png')

# Actual vs. predicted, as rates and as counts. x and y are passed by
# keyword because newer seaborn releases no longer accept them
# positionally.
sns.regplot(x='actual', y='yhat', data=df)
_save_current_figure('scatter1.png')

sns.regplot(x='vehpop', y='predvehpop', data=df)
_save_current_figure('scatter2.png')

# sort and print residuals
print(df.sort_values(by='residpop'))

# Apply the same model (coefficients defined earlier in this script) to
# the rows of colorado.csv to produce estimates.
colorado = pd.read_csv('colorado.csv')

mycolumns = ['county','popest','yhat','predvehpop']

colorado_estimates = []
for _, row in colorado.iterrows():
  bach = row['pct_bachelorsorhigher']
  urban = row['poppct_urban']
  race = row['pct_racewhite']
  county = row['county']
  popest = row['popest']

  # Predicted rate, rounded to one decimal before it is scaled by popest.
  yhat = round(bach*bachc + urban*urbanc + race*racec + yintercept,1)
  # Rate is divided by 10,000 and multiplied by popest to get a count.
  predvehpop = round((yhat/10000)*popest,0)

  colorado_estimates.append([county,popest,yhat,predvehpop])

# Naming the columns in the constructor (instead of assigning df.columns
# afterwards) keeps this working when colorado.csv has no data rows.
df = pd.DataFrame(colorado_estimates, columns=mycolumns)
df.to_csv('estimates.csv')

print("output written to file")