Python爬取历史天气数据

因论文实验需求,需要历史天气数据作为输入参数之一,因此特使用Python编写网络爬虫,爬取数据均来自历史天气网的历史天气。

依赖包

  1. bs4:解析HTML或XML
  2. requests:发送Http请求
  3. xlwt:读写Excel文件
  4. mysql-connector-python(mysql.connector):连接MySQL数据库

Python代码

# -*- coding=utf-8 -*-
from bs4 import BeautifulSoup
import mysql.connector
import requests
import xlwt
import os
# Fetch the weather table for a single month page.
def getListByUrl(url):
    """Fetch one month's weather data from its page URL.

    Retries up to 3 extra times on a non-200 response or a network error
    (the original let a requests timeout propagate and abort the crawl).

    Returns:
        (weatherList, title): list of rows (each a list of cell strings,
        row 0 is the header) and the page's <h3> title, or (None, None)
        on failure.
    """
    res = None
    for _ in range(4):  # 1 initial attempt + up to 3 retries
        try:
            res = requests.get(url, timeout=10)
            if res.status_code == 200:
                break
        except requests.RequestException:
            res = None
    if res is None or res.status_code != 200:
        return None, None
    soup = BeautifulSoup(res.text, "html.parser")
    weathers = soup.select("#tool_site")
    # The data lives in the second #tool_site container; guard against
    # layout changes instead of raising IndexError.
    if len(weathers) < 2 or not weathers[1].select("h3"):
        return None, None
    title = weathers[1].select("h3")[0].text
    weatherList = []
    for weatherInfor in weathers[1].select("ul"):
        weatherList.append([li.text for li in weatherInfor.select('li')])
    print(title)
    return weatherList, title
# @par:addressUrl  URL of a city's index page
# @par:excelSavePath  path of the .xls file to write
def getListByAddress(addressUrl, excelSavePath):
    """Download every month's weather for one city and save it as an
    .xls workbook (one sheet per month).

    Returns None on failure to fetch the city index page.
    """
    retries = 0
    res = requests.get(addressUrl, timeout=10)
    while res.status_code != 200 and retries < 3:
        res = requests.get(addressUrl, timeout=10)
        retries += 1
    if res.status_code != 200:
        return None
    soup = BeautifulSoup(res.text, "html.parser")
    # Month links, e.g. http://lishi.tianqi.com/beijing/201101.html
    dates = soup.select(".tqtongji1 ul li a")
    workbook = xlwt.Workbook(encoding='utf-8')
    sheetCount = 0
    for d in dates:
        weatherList, title = getListByUrl(d["href"])
        if weatherList is None:
            continue
        booksheet = workbook.add_sheet(title, cell_overwrite_ok=True)
        for r, row in enumerate(weatherList):
            for c, value in enumerate(row):
                booksheet.write(r, c, value)
        sheetCount += 1
    # Save once at the end (the original re-saved the whole workbook on
    # every iteration); xlwt raises if a sheetless workbook is saved.
    if sheetCount:
        workbook.save(excelSavePath)
# Return the month links for a given city.
def getmonthlist(addressUrl):
    """Fetch a city's index page and return its month links.

    Retries up to 3 extra times on a non-200 response.

    Returns:
        A list of <a> tags (one per available month), or None when the
        page could not be fetched.
    """
    attempts = 0
    response = requests.get(addressUrl, timeout=10)
    while response.status_code != 200 and attempts < 3:
        response = requests.get(addressUrl, timeout=10)
        attempts += 1
    if response.status_code != 200:
        return None
    page = BeautifulSoup(response.text, "html.parser")
    # Each month of history is linked from this list on the city page.
    return page.select(".tqtongji1 ul li a")
# Return the weather rows for a given month page.
def getweatherlist(url):
    """Fetch the weather list of one month.

    Returns:
        (weatherList, title) on success; (None, None) on failure.
        Bug fix: the original returned a bare None on failure, but every
        caller unpacks two values (`weatherlist, title = ...`), which
        raised TypeError on the first failed month.
    """
    attempts = 0
    res = requests.get(url, timeout=10)
    while res.status_code != 200 and attempts < 3:
        res = requests.get(url, timeout=10)
        attempts += 1
    if res.status_code != 200:
        return None, None
    soup = BeautifulSoup(res.text, "html.parser")
    weathers = soup.select("#tool_site")
    # Data table is in the second #tool_site container; guard its absence.
    if len(weathers) < 2 or not weathers[1].select("h3"):
        return None, None
    title = weathers[1].select("h3")[0].text
    weatherList = []
    for weatherInfor in weathers[1].select("ul"):
        weatherList.append([li.text for li in weatherInfor.select('li')])
    print(title)
    return weatherList, title
# Create the storage table.
def createtable(conn):
    """Create the history_weather table on the given MySQL connection.

    Uses IF NOT EXISTS so a re-run is a no-op instead of an error that
    the original silently swallowed with a bare `except: print('')`.
    Real errors are now reported instead of hidden.
    """
    cursor = conn.cursor()
    # NOTE: original shadowed the builtin `str` with this DDL string.
    ddl = ('create table if not exists history_weather('
           'id int primary key not null auto_increment,'
           'city char(10),'
           'date date,'
           'Tmax int,'
           'Tmin int,'
           'weather char(10),'
           'Wdirection char(10),'
           'Wpower char(10))')
    try:
        cursor.execute(ddl)
        conn.commit()
    except mysql.connector.Error as err:
        # Surface the failure instead of swallowing every exception.
        print(err)
    finally:
        cursor.close()
# Write one month of rows into the database.
def inserttomysql(city, weatherlist, conn):
    """Insert one month's weather rows for `city` into history_weather.

    Row 0 of `weatherlist` is the table header and is skipped.
    Bug fixes vs. original:
      * `cursor.close` lacked parentheses, so the cursor was never closed;
      * the id column received '' — an empty string in an INT
        AUTO_INCREMENT column fails under MySQL strict mode; NULL (None)
        lets MySQL assign the id;
      * the original mutated the caller's rows in place via tmp.insert().
    """
    cursor = conn.cursor()
    sql = 'INSERT INTO history_weather values(%s,%s,%s,%s,%s,%s,%s,%s)'
    # Build fresh parameter rows instead of mutating weatherlist entries.
    param = [[None, city] + list(row) for row in weatherlist[1:]]
    if param:
        cursor.executemany(sql, param)
        conn.commit()
    cursor.close()
# Crawl all cities and store everything in MySQL.
def SavetoMysql(host='localhost', port=3306, user='root', passwd='', db='test'):
    """Crawl every city's monthly history weather and store it in MySQL.

    Defaults were added because main() calls SavetoMysql() with no
    arguments when the user presses enter at every prompt — the original
    signature had no defaults and raised TypeError there.

    Args:
        host/port/user/passwd/db: MySQL connection parameters; `port`
        may be a string (as read from input()) and is converted.
    """
    conn = mysql.connector.connect(host=host, port=int(port), user=user,
                                   passwd=passwd, db=db)
    try:
        createtable(conn)
        # Collect every city link from the site index page.
        index = BeautifulSoup(requests.get('http://lishi.tianqi.com/').text,
                              "html.parser")
        cityAddr = []
        for group in index.select('[class=bcity]'):
            cityAddr.extend(group.find_all('a', target='_blank'))
        for link in cityAddr:
            months = getmonthlist(link['href'])
            if months is None:
                continue
            city = link.text
            for month in months:
                data = getweatherlist(month['href'])
                # getweatherlist may return a bare None on failure
                # (original crashed unpacking it); tolerate both shapes.
                if data is None:
                    continue
                weatherlist, title = data
                if weatherlist is not None:
                    inserttomysql(city, weatherlist, conn)
    finally:
        # Original leaked the connection if any exception escaped the loop.
        conn.close()
# Save history weather to Excel files.
def SavetoExcel(addressName="all"):
    """Save history weather to .xls files, one file per city.

    Args:
        addressName: a city name, or "all" for every city on the index.

    Paths are built with os.path.join — the original concatenated
    `savePath + city`, producing a broken path whenever the user's input
    lacked a trailing separator.
    """
    addresses = BeautifulSoup(requests.get('http://lishi.tianqi.com/').text,
                              "html.parser")
    if addressName == "all":
        cityAddr = []
        for group in addresses.select('[class=bcity]'):
            cityAddr.extend(group.find_all('a', target='_blank'))
        savePath = input("请输入即将保存天气数据的路径(如若不输入,将默认保存到c:/weather/下)\n")
        # Default and create the directory once, not per city as before.
        if not savePath.strip():
            savePath = 'c:/weather/'
        if not os.path.exists(savePath):
            os.makedirs(savePath)
        for link in cityAddr:
            city = link.text
            filePath = os.path.join(savePath, city + ".xls")
            getListByAddress(link["href"], filePath)
            print(u"已保存" + city + u"的天气:" + filePath)
    else:
        queryAddress = addresses.find_all('a', text=addressName)
        if len(queryAddress):
            savePath = input("检测到有该城市数据,请输入即将保存天气数据的路径(如若不输入,将默认保存到c:/weather/" + addressName + ".xls):\n")
            if not savePath.strip():
                savePath = 'c:/weather/'
            if not os.path.exists(savePath):
                os.makedirs(savePath)
            filePath = os.path.join(savePath, addressName + ".xls")
            for q in queryAddress:
                getListByAddress(q["href"], filePath)
            # Fixed garbled message (was "已经天气数据保存到").
            print(u"已将天气数据保存到:" + filePath)
        else:
            print("不存在该城市的数据")
def main():
    """Interactive entry point: choose MySQL or Excel as the data sink."""
    print('选择数据存储到:\n')
    print('1.存储到mysql\n')
    print('2.存储到Excel\n')
    choice = input("请输入编号:")
    if choice == '1':
        host = input("输入数据库主机地址:")
        port = input("输入数据库端口:")
        user = input("输入用户名:")
        passwd = input("输入密码:")
        db = input("输入数据库名:")
        if host == '' and port == '' and user == '' and passwd == '' and db == '':
            # Bug fix: the original called SavetoMysql() with no arguments,
            # which raised TypeError against its 5-parameter signature.
            # Supply explicit local defaults instead.
            SavetoMysql('localhost', 3306, 'root', '', 'test')
        else:
            SavetoMysql(host, port, user, passwd, db)
    elif choice == '2':
        addressName = input("请输入即将获取天气的城市(输入all表示所有城市):")
        SavetoExcel(addressName)
    else:
        print('输入错误,按任意键退出。')
        input("")  # keep the console window open until a keypress
if __name__ == "__main__":
    main()

使用说明

该代码可以实现全国历史天气数据的爬取,空间尺度可以精确到区级,包含了每日天气状况、最高最低气温、风向、风力数据。时间跨度从2011年1月(部分地区数据开始时间稍晚)至今。
代码运行时可以选择数据存储到数据库或者是Excel:

当选择数据库时,应输入数据库的连接信息,包括主机地址、数据库端口、用户名、密码和数据库名:

程序首先会自动创建一个名为history_weather的表,然后自动爬取所有城市所有年月的天气数据,存储到该表中:

结果:

当选择存储到Excel时,程序首先会让用户选择即将获取的城市,输入一个城市名,程序将先检测是否有该城市数据

存在该城市数据,则要求用户输入保存路径,如果不输入(直接回车),则输入Excel文件将直接保存到c:/weather中。

如果用户想需要获取所有城市的数据,则在输入城市名时输入all

结果:

参考

Python爬取历史天气数据

若本文对您有帮助,请打赏鼓励本人!
---------------- End ----------------
扫二维码
扫一扫,使用手机查看

扫一扫,使用手机查看

QQ