Scraper for Jingkelong (jkl.com.cn) investor-relations PDF announcements (爬京客隆PDF代码)
"""Download investor-relations announcement PDFs from jkl.com.cn.

Walks the first three pages of the listing at
https://www.jkl.com.cn/cn/invest.aspx, collects each announcement's
title and file link, and saves every file into c:/c/.
"""
import os
import re

import requests
from lxml import etree

BASE_URL = 'https://www.jkl.com.cn'
LIST_URL = BASE_URL + '/cn/invest.aspx'
SAVE_DIR = 'c:/c'  # download destination (hard-coded, as in the original)

HEADERS = {
    'user-agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'),
}


def _sanitize(name):
    """Strip surrounding whitespace and replace characters Windows forbids
    in filenames, so scraped titles can be used as file names safely."""
    return re.sub(r'[\\/:*?"<>|]', '_', name.strip())


def _collect_links(html):
    """Parse one listing page; return a list of (title, absolute_url) pairs."""
    tree = etree.HTML(html)
    hrefs = tree.xpath('//div[@class="newsLis"]//li//@href')
    titles = tree.xpath('//div[@class="newsLis"]//li/a/text()')
    # A list (not a dict keyed by title) so duplicate titles are not dropped.
    return [(_sanitize(t), BASE_URL + h) for t, h in zip(titles, hrefs)]


def main():
    """Scrape pages 1-3 of the listing and download every linked file."""
    # exist_ok avoids the check-then-create race of the original
    # os.path.exists()/os.mkdir() pair.
    os.makedirs(SAVE_DIR, exist_ok=True)

    items = []
    for page in range(1, 4):
        # ASP.NET pager postback fields select the page number.
        # NOTE(review): the original sent these as GET query params; a real
        # WebForms __EVENTTARGET postback is normally a POST with the page's
        # __VIEWSTATE — confirm that pagination actually advances here.
        payload = {'__EVENTTARGET': 'AspNetPager1', '__EVENTARGUMENT': page}
        resp = requests.get(LIST_URL, params=payload, headers=HEADERS,
                            timeout=30)  # never hang forever on a dead server
        items.extend(_collect_links(resp.text))

    for title, link in items:
        ext = link.rsplit('.', 1)[-1]  # file extension from the URL
        data = requests.get(link, headers=HEADERS, timeout=30).content
        path = os.path.join(SAVE_DIR, title + '.' + ext)
        with open(path, 'wb') as fh:
            fh.write(data)
        print(title, '下载成功')


if __name__ == '__main__':
    main()
Page: [1]