简单爬虫

import re
import urllib
def getHtml(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address to open; any scheme accepted by ``urlopen``.

    Returns:
        The body exactly as returned by the response's ``read()``
        (``bytes`` on Python 3, ``str`` on Python 2).
    """
    # urlopen lives in urllib.request on Python 3; fall back to the
    # Python 2 location so the script runs on either interpreter.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    page = urlopen(url)
    try:
        return page.read()
    finally:
        # Release the connection even when read() raises.
        page.close()
# Matches the value of a src="..." attribute that ends in .jpg and is
# immediately followed by a width attribute.
_IMG_SRC_RE = re.compile(r'src="(.*?\.jpg)" width')


def _save_images(html, prefix):
    """Download every image URL matched in *html* and return the URL list.

    The n-th matched URL (counting from 0) is saved in the current working
    directory as ``'<prefix>_<n>.jpg'``.
    """
    # urlretrieve lives in urllib.request on Python 3; fall back to the
    # Python 2 location so the script runs on either interpreter.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    # On Python 3 a response body is bytes, which a str pattern cannot
    # search. On Python 2 bytes is str, so this branch is skipped there.
    if isinstance(html, bytes) and not isinstance(html, str):
        html = html.decode("utf-8", "replace")

    imglist = _IMG_SRC_RE.findall(html)
    for x, imgurl in enumerate(imglist):
        urlretrieve(imgurl, '%s_%s.jpg' % (prefix, x))
    return imglist


def getImg(html, i):
    """Save all matched .jpg images from *html* as ``'<i>_<n>.jpg'``.

    Args:
        html: Page source, as text or as a raw response body.
        i: Prefix used for the output file names.

    Returns:
        The list of matched image URLs, in document order.
    """
    return _save_images(html, i)


def getImg1(html, j):
    """Save all matched .jpg images from *html* as ``'<j>_<n>.jpg'``.

    Same behaviour as ``getImg``; kept as a separate name for existing
    callers.

    Args:
        html: Page source, as text or as a raw response body.
        j: Prefix used for the output file names.

    Returns:
        The list of matched image URLs, in document order.
    """
    return _save_images(html, j)
# Driver: walk pages 1..17 of the thread and save every matched image.
j = 17
FormatH = "http://tieba.baidu.com/p/2311323620?pn="
for i in range(1, 18):
    html = getHtml(FormatH + "%d" % i)
    # Each page is saved twice: once under the ascending page number i
    # and once under the descending counter j.
    # NOTE(review): both calls write '<prefix>_<n>.jpg' into the same
    # directory, and i and j range over the same values (1..17), so files
    # from later pages overwrite files from earlier ones. Confirm whether
    # this is intended.
    getImg(html, i)
    getImg1(html, j)
    j -= 1
    # Progress indicator; the parenthesised form works on Python 2 and 3.
    print(i)

2 thoughts on “简单爬虫”

发表评论

电子邮件地址不会被公开。 必填项已用*标注

36 + = 43