
Python crawler: a targeted crawler for stock data

  • Published: 2023-09-19 17:17

Full code
This is still data scraping from a static web page.

#www.sychzs.cn
import requests
from bs4 import BeautifulSoup
import traceback
import re

# download a page and return its text; return "" on any failure
def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""

# collect stock codes (sh/sz plus six digits) from the hrefs on the list page
def getStocksList(lst, stocksurl):
    html = getHTMLText(stocksurl)
    soup = BeautifulSoup(html, "html.parser")
    a = soup.find_all('a')
    for i in a:
        try:
            href = i.attrs['href']
            onestock = re.findall(r'[s][hz]\d{6}', href)
            lst.append(onestock[0])
        except:
            #traceback.print_exc()
            continue

# visit each stock's detail page, collect its fields into a dict, append to fpath
def StocksParse(lst, stocksurl, fpath):
    for stock in lst:
        infodic = {}
        onestockurl = 'https://www.sychzs.cn/stock/' + stock + '.html'
        #print(onestockurl)
        onestock = getHTMLText(onestockurl)
        try:
            soup = BeautifulSoup(onestock, "html.parser")
            div = soup.find('div', class_='stock-bets')
            name = div.find_all('a', class_='bets-name')[0]
            infodic.update({'股票名称': name.text.split()[0]})
            keylist = div.find_all('dt')
            valuelist = div.find_all('dd')
            for i in range(len(keylist)):
                key = keylist[i].text
                value = valuelist[i].text
                infodic[key] = value
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infodic) + '\n')
        except:
            traceback.print_exc()
            continue

def main():
    stocksurl = 'http://www.sychzs.cn/stocklist.html'
    lst = []
    getStocksList(lst, stocksurl)
    fpath = 'F://baidustocks.txt'
    StocksParse(lst, stocksurl, fpath)

main()
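The key step in getStocksList() is the regular expression r'[s][hz]\d{6}', which keeps only links whose href contains a Shanghai (sh) or Shenzhen (sz) code followed by six digits. A small self-contained check of that idea, using made-up hrefs rather than the real list page:

import re

# example hrefs in the same shape as the list-page links (made up for illustration)
hrefs = ['https://www.sychzs.cn/stock/sh600000.html',
         'https://www.sychzs.cn/stock/sz000001.html',
         'https://www.sychzs.cn/about.html']

codes = []
for href in hrefs:
    m = re.findall(r'[s][hz]\d{6}', href)   # 'sh' or 'sz' followed by six digits
    if m:
        codes.append(m[0])

print(codes)   # ['sh600000', 'sz000001']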

This exercise deepened my understanding of find() and find_all().
However, I am still not comfortable with the data-processing side, such as working with dictionaries.
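As a reminder for myself, here is a minimal sketch of the difference, run on a tiny hand-written snippet rather than on the real stock page:

from bs4 import BeautifulSoup

html = "<div class='stock-bets'><dt>今开</dt><dd>10.00</dd><dt>成交量</dt><dd>52万手</dd></div>"
soup = BeautifulSoup(html, "html.parser")

# find() returns only the first matching tag (or None if nothing matches)
print(soup.find('dt').text)                        # 今开

# find_all() returns a list of every matching tag
print([dd.text for dd in soup.find_all('dd')])     # ['10.00', '52万手']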
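One improvement I might try later (a sketch only, not part of the script above, and the field names and rows below are made up for illustration): instead of writing str(infodic) to a text file, use csv.DictWriter so the key/value pairs in each dictionary become proper columns.

import csv

# hypothetical dictionaries of the kind StocksParse() builds
rows = [
    {'股票名称': '示例股份', '今开': '10.00', '成交量': '52万手'},
    {'股票名称': '样例科技', '今开': '8.50', '成交量': '31万手'},
]

fieldnames = ['股票名称', '今开', '成交量']
with open('baidustocks.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()      # write the column names once
    writer.writerows(rows)    # each dict becomes one row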

When I ran it, the crawl went on for a long time without finishing, so a crawler like this, fetching one page at a time, is clearly not well suited to scraping a larger amount of data.
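If I come back to this, one thing worth trying (just my own sketch under assumptions, not something from the original script) is fetching the detail pages with a small thread pool and printing progress, so slow pages do not block everything and it is at least visible that the crawl is moving. The stock codes here are examples; in the real script this would be the list built by getStocksList(), and the fetched text would be parsed the same way StocksParse() does.

import concurrent.futures
import requests

def fetch(url):
    # same idea as getHTMLText() above
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        return ""

# example codes only; the real script would use the list from getStocksList()
lst = ['sh600000', 'sz000001']
urls = ['https://www.sychzs.cn/stock/' + stock + '.html' for stock in lst]

with concurrent.futures.ThreadPoolExecutor(max_workers=10) as pool:
    for i, page in enumerate(pool.map(fetch, urls), start=1):
        print(f'{i}/{len(urls)} pages fetched')   # simple progress indicator
        # `page` could then be parsed here the same way StocksParse() does

Whether parallel requests are acceptable depends on the target site, so max_workers is kept small here.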
