本文共 1881 字,大约阅读时间需要 6 分钟。
1.爬取网页源代码
import requests


def getHTMLText(url: str) -> str:
    """Download a web page and return its decoded text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as text, decoded with the encoding detected
        from the content. If the request fails (connection error,
        timeout, or an HTTP error status), the string "产生异常" is
        returned instead.
    """
    try:
        r = requests.get(url, timeout=30)
        # Convert HTTP 4xx/5xx responses into exceptions so an error page
        # is reported as a failure rather than returned as content.
        r.raise_for_status()
        # Prefer the encoding guessed from the body over the header value,
        # which is often missing or wrong for non-ASCII pages.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-level failures are expected here; a bare ``except``
        # would also hide KeyboardInterrupt and genuine programming errors.
        return "产生异常"


if __name__ == "__main__":
    url = "http://www.sdust.edu.cn"
    print(getHTMLText(url))

# Next article section heading, kept verbatim from the original text:
# 2.爬取京东商品的源代码
import requests

# Fetch a JD.com product page and print the first 1000 characters of it.
url = "https://item.jd.com/5181380.html"
try:
    # A timeout keeps the script from hanging forever on a stalled server.
    r = requests.get(url, timeout=30)
    # Treat HTTP 4xx/5xx responses as failures.
    r.raise_for_status()
    # Decode using the encoding detected from the page content.
    r.encoding = r.apparent_encoding
    print(r.text[:1000])
except requests.RequestException:
    # Catch request failures only, so Ctrl-C and real bugs are not hidden.
    print("爬取失败")

# Next article section heading, kept verbatim from the original text:
# 3.爬取亚马逊商品的源代码
import requests

# Fetch an Amazon.cn product page while presenting a browser-like
# User-Agent, then print a slice of the returned HTML.
url = "https://www.amazon.cn/gp/product/B071SDP8PC"
try:
    # Replace the default python-requests User-Agent with a browser-style one.
    kv = {'user-agent': 'Mozilla/5.0'}
    # A timeout keeps the script from hanging forever on a stalled server.
    r = requests.get(url, headers=kv, timeout=30)
    # The original wrote ``r.raise_for_status`` without parentheses, which
    # only references the method and never checks the HTTP status.
    r.raise_for_status()
    # Decode using the encoding detected from the page content.
    r.encoding = r.apparent_encoding
    print(r.text[24440:25145])
except requests.RequestException:
    # Catch request failures only, so Ctrl-C and real bugs are not hidden.
    print("爬取失败")

# Next article section heading, kept verbatim from the original text:
# 4.百度、360搜索关键词
import requests

# Submit a keyword to Baidu search and report the final request URL and
# the length of the returned page.
keyword = "Python"
# The original host was misspelled as "wwww.baidu.com" (four w's).
url = "https://www.baidu.com/s"
try:
    # Baidu takes the search term in the ``wd`` query parameter.
    kv = {'wd': keyword}
    # A timeout keeps the script from hanging forever on a stalled server.
    r = requests.get(url, params=kv, timeout=30)
    # Show the URL actually sent, including the encoded query string.
    print(r.request.url)
    r.raise_for_status()
    print(len(r.text))
except requests.RequestException:
    # Catch request failures only, so Ctrl-C and real bugs are not hidden.
    print("爬取失败")
import requests

# Submit a keyword to 360 search (so.com) and report the final request URL
# and the length of the returned page.
keyword = "Python"
# 360 search takes the search term in the ``q`` query parameter.
kv = {'q': keyword}
url = "http://www.so.com/s"
try:
    # A timeout keeps the script from hanging forever on a stalled server.
    r = requests.get(url, params=kv, timeout=30)
    # Show the URL actually sent, including the encoded query string.
    print(r.request.url)
    r.raise_for_status()
    print(len(r.text))
except requests.RequestException:
    # Catch request failures only, so Ctrl-C and real bugs are not hidden.
    print("爬取失败")
5.网络图片的爬取与下载
import os

import requests

# Download one image and store it under ``root`` using the file name taken
# from the last segment of the URL. An existing file is left untouched.
url = "http://image.nationalgeographic.com.cn/2017/0730/20170730125917668.jpg"
root = "E://Python//wordplace//getHTMLText//picture//"
path = root + url.split('/')[-1]
try:
    # ``os.mkdir`` creates only the last path component and fails when the
    # parent folders are missing; ``makedirs`` builds the whole chain and
    # ``exist_ok`` makes the call a no-op when the folder is already there.
    os.makedirs(root, exist_ok=True)
    if not os.path.exists(path):
        # A timeout keeps the script from hanging on a stalled server.
        r = requests.get(url, timeout=30)
        # Without this check an HTTP error page would be written to disk
        # as if it were the image.
        r.raise_for_status()
        # The ``with`` block closes the file; no explicit close() is needed.
        with open(path, 'wb') as f:
            f.write(r.content)
        print("文件保存成功")
    else:
        print("文件已存在")
except (requests.RequestException, OSError):
    # Network failures and file-system failures are the expected errors;
    # anything else (e.g. Ctrl-C) is allowed to propagate.
    print("爬取失败")

# Next article section heading, kept verbatim from the original text:
# 6.查询IP地址
import requests

# Query the ip138 lookup page for one IP address and print the last 500
# characters of the response.
url = "http://m.ip138.com/ip.asp?ip="
try:
    # The address to look up is appended directly to the query string.
    # A timeout keeps the script from hanging forever on a stalled server.
    r = requests.get(url + '202.204.80.112', timeout=30)
    # Treat HTTP 4xx/5xx responses as failures.
    r.raise_for_status()
    # Decode using the encoding detected from the page content.
    r.encoding = r.apparent_encoding
    print(r.text[-500:])
except requests.RequestException:
    # Catch request failures only, so Ctrl-C and real bugs are not hidden.
    print("爬取失败")
转载地址:http://nawdf.baihongyu.com/