目标网站:http://jc.bjmemc.com.cn/IndexAirReport/AirDailyReport.aspx
需要将所有数据爬取下来并存入数据库。中途遇到了各种编码问题,因此使用 requests 来获取页面。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
|
import hashlib
import re

import MySQLdb
import requests
from pyquery import PyQuery as pyq


def md5(text):
    """Return the hexadecimal MD5 digest of ``text``.

    ``text`` is encoded as UTF-8 before hashing, so it must be a text
    (unicode) string rather than raw bytes.
    """
    return hashlib.md5(text.encode('utf-8')).hexdigest()

def getandupload():
    """Scrape the daily air-quality report page and store it in MySQL.

    Downloads the report page, reads the first three numbers of the
    ``#Label0`` element as the report date, extracts the station cells
    (126px wide) and AQI cells (90px wide) from the table inside
    ``#marqueebox``, and inserts one row per station into the
    ``aqi_bjmemc`` table of the ``pytest`` database.

    Each row is ``(md5(date + station), date, station, aqi)``. The statement
    is ``insert ignore``; presumably the first column is a unique key so
    re-running on the same day adds nothing -- confirm against the schema.

    Raises:
        requests.RequestException: if the page cannot be fetched or the
            server answers with an HTTP error status.
        ValueError: if fewer than three numbers are found in ``#Label0``.
    """
    url = "http://jc.bjmemc.com.cn/IndexAirReport/AirDailyReport.aspx"
    max_stations = 35  # the page is read for at most 35 stations

    # A timeout keeps the script from blocking forever on a dead server.
    ret = requests.get(url=url, timeout=30)
    ret.raise_for_status()
    doc = pyq(ret.text)

    # The date is built from the first three digit runs in the label.
    numbers = re.findall(r'[0-9]+', doc.find('#Label0').text() or '')
    if len(numbers) < 3:
        raise ValueError('could not read the report date from #Label0')
    date = '/'.join(numbers[:3])

    # html() yields None when no table matched; treat that as "no rows".
    table_html = doc.find('#marqueebox').find('table').html() or ''
    stations = re.findall(r'<td style="width:126px">(.+?)</td>', table_html)
    aqis = re.findall(r'<td style="width:90px">(.+?)</td>', table_html)

    # Station i is paired with aqis[2 * i] (only every other 90px cell is
    # stored). Clamp the count so a shorter table cannot raise IndexError.
    count = min(max_stations, len(stations), (len(aqis) + 1) // 2)
    rows = []
    for i in range(count):
        station = stations[i][1:]  # the first character of the cell is dropped
        rows.append((md5(date + station), date, station, aqis[2 * i]))

    conn = MySQLdb.connect(host='localhost', user='root', passwd='',
                           port=3306, charset='utf8')
    try:
        conn.select_db('pytest')
        cur = conn.cursor()
        try:
            if rows:
                cur.executemany(
                    'insert ignore into aqi_bjmemc values(%s,%s,%s,%s)',
                    rows)
            conn.commit()
        finally:
            cur.close()
    finally:
        # Always release the connection, even when parsing or SQL fails.
        conn.close()


getandupload()
|