Write the results into an Excel sheet!
import requests
import xlwt
import json

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
# url = "https://movie.douban.com/j/chart/top_list?type=17&interval_id=100%3A90&action=&start=0&limit=20"
url = "https://movie.douban.com/j/chart/top_list?"
params = {
    'type': '17',
    'interval_id': '100:90',
    'action': '',
    'start': '1',
    'limit': '300',
}
response = requests.get(url=url, params=params, headers=headers)
page_text = response.text
data = json.loads(page_text)
# print(data)

title = ["title", "score", "types", "actors"]
li = []
for dic in data:
    li.append([dic["title"], dic["score"], dic["types"], dic["actors"]])

wbk = xlwt.Workbook()
sheet = wbk.add_sheet("movie")
# row 0: header row
for i in range(len(title)):
    sheet.write(0, i, title[i])
# rows 1..n: one movie per row (the original range(1, len(li)) dropped li[0])
for i in range(len(li)):
    for j in range(len(title)):
        # str(): "types" and "actors" are lists, which xlwt cannot write directly
        sheet.write(i + 1, j, str(li[i][j]))
wbk.save("douban_movie.xls")
print("ok")
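A quick way to sanity-check the saved workbook is to read it back. This is a sketch, assuming the xlrd package is installed (it is not used anywhere else in these notes); xlrd can still read legacy .xls files.

import xlrd

wbk = xlrd.open_workbook("douban_movie.xls")
sheet = wbk.sheet_by_index(0)
print(sheet.nrows, "rows written")
# print the header plus the first two data rows
for r in range(min(sheet.nrows, 3)):
    print([sheet.cell_value(r, c) for c in range(sheet.ncols)])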
requests
- Add a cell: a (above) / b (below)
- Delete a cell: x
- Double-click: enter edit mode
- Toggle a cell's type:
  - y: markdown -> code
  - m: code -> markdown
- Tab: code completion
- Run a cell: Shift+Enter
- Open the help tooltip:
  - Shift+Tab
Coding workflow:
- specify the URL
- send the request
- get the response data
- persist the data
# Scrape the Sogou homepage source
import requests

# specify the URL
url = "https://www.sogou.com/"
# send the request
response = requests.get(url=url)
# get the response data
page_text = response.text
# persist the data
with open("sogou.html", "w", encoding="utf-8") as f:
    f.write(page_text)
# Simple web page collector
# e.g. https://www.sogou.com/web?query=周杰伦
import requests

url = "https://www.sogou.com/web"
# dynamic query parameter
wd = input("enter a key word:")
params = {
    "query": wd
}
# UA spoofing
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
# params is a dict carrying the dynamic request parameters
response = requests.get(url=url, params=params, headers=headers)
# force the response encoding so Chinese text is not garbled
response.encoding = "utf-8"
page_text = response.text
fileName = wd + '.html'
with open(fileName, 'w', encoding='utf-8') as f:
    f.write(page_text)
print(fileName, 'saved successfully')
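requests percent-encodes the params dict into the query string for you. Printing response.url (a standard requests attribute; this check is an addition to the notes) shows the URL it actually fetched:

import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
response = requests.get("https://www.sogou.com/web", params={"query": "周杰伦"}, headers=headers)
# prints e.g. https://www.sogou.com/web?query=%E5%91%A8%E6%9D%B0%E4%BC%A6
print(response.url)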
# Scrape KFC restaurant locations
import requests

url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword"
city = input("enter a city:")
# UA spoofing
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
data = {
    'cname': '',
    'pid': '',
    'keyword': city,
    'pageIndex': '1',
    'pageSize': '10'
}
# POST request: the parameters travel in the form body, not the query string
response = requests.post(url=url, data=data, headers=headers)
json_data = response.json()
for dic in json_data["Table1"]:
    print(dic['storeName'] + ':' + dic["addressDetail"] + "\n")
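The form carries pageIndex/pageSize fields, so the results can be walked page by page. A sketch, assuming the response keeps the same shape and that an empty "Table1" marks the end (neither assumption is verified against the API):

import requests

url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
city = input("enter a city:")
page = 1
while True:
    data = {'cname': '', 'pid': '', 'keyword': city, 'pageIndex': str(page), 'pageSize': '10'}
    stores = requests.post(url=url, data=data, headers=headers).json().get("Table1", [])
    if not stores:  # assumption: an empty page means there is nothing left
        break
    for dic in stores:
        print(dic['storeName'] + ':' + dic["addressDetail"])
    page += 1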
# Scrape drug information
# homepage: http://125.35.6.84:81/xk/
import requests

url = "http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList"
# UA spoofing
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
page = int(input("number of pages to scrape:"))
for num in range(1, page + 1):  # range(1, page) would stop one page early
    data = {
        'on': 'true',
        'page': str(num),
        'pageSize': '15',
        'productName': '',
        'conditionType': '1',
        'applyname': '',
        'applysn': '',
    }
    response = requests.post(url=url, data=data, headers=headers)
    json_data = response.json()
    for dic in json_data["list"]:
        # print(dic["ID"], dic["EPS_NAME"])
        # second request: each list entry only carries an ID; the details
        # come from a separate endpoint queried by that ID
        url1 = "http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById"
        detail_data = {
            'id': dic["ID"]
        }
        response2 = requests.post(url=url1, data=detail_data, headers=headers)
        dic2 = response2.json()
        print(dic2["epsName"], dic2["epsProductAddress"])
# Scrape Douban movies
import requests

url = "https://movie.douban.com/j/chart/top_list?type=24&interval_id=100%3A90&action=&start=0&limit=20"
# UA spoofing
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
response = requests.get(url=url, headers=headers)
json_data = response.json()
for dic in json_data:
    # rating is a list; rating[0] is the score
    print(dic["title"], dic["rating"][0])
# Scrape an image
import requests

url = "http://img.netbian.com/file/2019/0730/14d3739bf11dd92055abb56e3f792d3f.jpg"
response = requests.get(url=url)
# .content holds the raw response bytes, so the file is opened in "wb" mode
content = response.content
with open("./meinv.jpg", "wb") as f:
    f.write(content)
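For a large file, response.content loads everything into memory at once. A streaming variant using the standard stream=True / iter_content API of requests (an addition to these notes, not part of them):

import requests

url = "http://img.netbian.com/file/2019/0730/14d3739bf11dd92055abb56e3f792d3f.jpg"
# stream=True defers downloading the body until it is iterated over
with requests.get(url=url, stream=True) as response:
    with open("./meinv.jpg", "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)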
# Scrape an image with urllib
from urllib import request

url = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1564654066124&di=47579504bdd90e51c50c387cbff21391&imgtype=0&src=http%3A%2F%2Fb-ssl.duitang.com%2Fuploads%2Fitem%2F201508%2F05%2F20150805214640_tY24E.jpeg'
# urlretrieve downloads the resource at url straight into a local file
request.urlretrieve(url, filename="./meishaonv.jpg")
The difference between the two image-scraping approaches is whether the request can carry a spoofed UA: requests.get accepts a headers argument, while urllib's urlretrieve has no headers parameter.
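That said, if a custom User-Agent is needed with urllib anyway, the standard library lets you install a global opener whose headers every subsequent urllib request (including urlretrieve) will use. A sketch, added here for comparison:

from urllib import request

# attach a custom User-Agent to all requests made through urllib
opener = request.build_opener()
opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)')]
request.install_opener(opener)

# urlretrieve now goes through the opener installed above, UA included
url = 'http://img.netbian.com/file/2019/0730/14d3739bf11dd92055abb56e3f792d3f.jpg'
request.urlretrieve(url, filename='./meinv2.jpg')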