requests+pyquery爬虫体验

目标网站:http://jc.bjmemc.com.cn/IndexAirReport/AirDailyReport.aspx

需要将页面上的所有数据爬取下来并存入数据库。中途遇到了各种编码问题，因此改用 requests 获取页面。

#coding=utf-8
#encoding=utf-8
import re
import requests
import MySQLdb
import hashlib
from pyquery import PyQuery as pyq
def md5(text):
    """Return the hexadecimal MD5 digest of *text* (a unicode string).

    The string is UTF-8 encoded before hashing; used below to build a
    stable primary key from date + station name.
    """
    return hashlib.md5(text.encode('utf-8')).hexdigest()

def getandupload():
    """Fetch the Beijing daily air-quality report page, parse the
    per-station AQI table, and insert one row per station into MySQL.

    The table ``aqi_bjmemc`` uses md5(date + station) as its primary
    key, so ``INSERT IGNORE`` makes re-runs idempotent: rows already
    uploaded on a previous run are silently skipped.
    """
    ret = requests.get(url="http://jc.bjmemc.com.cn/IndexAirReport/AirDailyReport.aspx")
    doc = pyq(ret.text)

    # #Label0 holds the report date as "yyyy年mm月dd日"; extract the three
    # digit groups and rebuild them as yyyy/mm/dd.
    # BUGFIX: the original pattern r'(w*[0-9]+)w*' used a literal 'w'
    # where '\d' was intended (it only worked because 'w*' matches zero
    # characters).
    parts = re.findall(r'(\d+)', doc.find('#Label0').text())
    date = parts[0] + '/' + parts[1] + '/' + parts[2]

    table = pyq(pyq(doc.find('#marqueebox')).find('table'))
    html = table.html()  # hoisted: .html() re-serializes the subtree on every call
    # Pull the wanted cells out of the raw HTML by their fixed cell widths.
    stations = re.findall(r'<td style="width:126px">(.+?)</td>', html)
    aqis = re.findall(r'<td style="width:90px">(.+?)</td>', html)

    conn = MySQLdb.connect(host='localhost', user='root', passwd='',
                           port=3306, charset='utf8')
    try:
        conn.select_db('pytest')
        cur = conn.cursor()
        try:
            # Two width:90px cells per station row, so aqis[2*i] is the AQI
            # for stations[i].  Iterate over however many stations the page
            # actually contains instead of the original hard-coded 35, which
            # would raise IndexError if the table ever shrank.
            for i in range(min(len(stations), len(aqis) // 2)):
                station = stations[i][1:]  # NOTE(review): drops the first char — presumably a stray marker in the cell; confirm against the page markup
                value = [md5(date + station), date, station, aqis[2 * i]]
                cur.execute('insert ignore into aqi_bjmemc values(%s,%s,%s,%s)', value)
            conn.commit()
        finally:
            # Always release DB resources, even if parsing/insert fails.
            cur.close()
    finally:
        conn.close()

if __name__ == '__main__':
    getandupload()