用python爬取东方财富上证A股并存入mysql

准备:

python、mysql

Python 用到的包:requests、pandas、re、time、random、sqlalchemy(另外,sqlalchemy 通过 `mysql+pymysql://` 连接 MySQL 时还需要安装 pymysql 驱动)

1、找目标

要爬网站得先了解网站的结构,得知道想要的内容在哪。

进入目标页面(东方财富网的“上证A股”),打开开发者工具,分析内容。

在开发者工具中可以看到,有一个请求以 jQuery 回调(JSONP)的形式返回数据,里面正好包含了我要的东西,那么就决定是它了。

在标头里找到请求的url和对应的请求标头

2、构造访问头,请求测试

import requests

# Browser-like request headers copied from the developer tools.
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36 Edg/102.0.1245.33',
}

# Request URL observed in the developer tools. It must be a bare URL: the
# surrounding '<' and '>' that a markdown autolink adds are not part of it and
# make requests fail because no URL scheme can be recognised.
# `cb` names the JSONP callback that wraps the JSON payload.
url = (
    'http://54.push2.eastmoney.com/api/qt/clist/get'
    '?cb=jQuery1124031148056087743625_1655062773152'
    '&pn=1&pz=20&po=1&np=1'
    '&ut=bd1d9ddb04089700cf9c27f6f7426281'
    '&fltt=2&invt=2&wbp2u=|0|0|0|web'
    '&fid=f3&fs=m:1+t:2,m:1+t:23'
    '&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,'
    'f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152'
    '&_=1655062773153'
)

# A timeout keeps the script from hanging on a stalled connection, and
# raise_for_status() turns an HTTP error into an exception instead of
# silently printing an error page.
reply = requests.get(url, headers=headers, timeout=10)
reply.raise_for_status()
response = reply.text
print(response)

#####返回结果#####
jQuery1124031148056087743625_1655062773152({"rc":0,"rt":6,"svr":182994821,"lt":1,"full":1,"dlmkts":"","data":{"total":2173,"diff":[{"f1":2,"f2":59.33,"f3":20.0,"f4":9.89,"f5":17586,"f6":97450222.0,"f7":21.3,"f8":19.34,"f9":67.26,"f10":1.66,"f11":0.0,"f12":"688787","f13":1,"f14":"XD海天瑞","f15":59.33,"f16":48.8,"f17":49.99,"f18":49.44,"f20":2539324000,"f21":539606350,"f22":0.0,"f23":3.16,"f24":6.23,"f25":-35.64,"f62":17757021.0,"f115":102.8,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":18.21,"f3":16.43,"f4":2.57,"f5":70019,"f6":126081393.0,"f7":20.2,"f8":4.39,"f9":609.96,"f10":3.86,"f11":0.05,"f12":"688510","f13":1,"f14":"航亚 科技","f15":18.77,"f16":15.61,"f17":15.64,"f18":15.64,"f20":4705147292,"f21":2906471969,"f22":0.22,"f23":5.02,"f24":-6.04,"f25":-37.14,"f62":13436152.0,"f115":199.88,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":339.98,"f3":14.82,"f4":43.88,"f5":43211,"f6":1392461168.0,"f7":14.16,"f8":23.69,"f9":221.33,"f10":0.44,"f11":0.06,"f12":"688348","f13":1,"f14":"C昱能","f15":341.69,"f16":299.76,"f17":299.8,"f18":296.1,"f20":27198400000,"f21":6200742229,"f22":0.14,"f23":8.1,"f24":108.58,"f25":108.58,"f62":174535077.0,"f115":226.8,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":22.91,"f3":13.3,"f4":2.69,"f5":24258,"f6":54247712.0,"f7":15.88,"f8":4.39,"f9":56.11,"f10":1.8,"f11":-0.3,"f12":"688685","f13":1,"f14":"迈信林","f15":23.5,"f16":20.29,"f17":20.48,"f18":20.22,"f20":2562865341,"f21":1265052032,"f22":-0.3,"f23":3.78,"f24":2.41,"f25":-16.08,"f62":4304122.0,"f115":44.56,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":30.08,"f3":11.66,"f4":3.14,"f5":35027,"f6":101802682.0,"f7":14.7,"f8":10.8,"f9":35.41,"f10":1.01,"f11":0.23,"f12":"688529","f13":1,"f14":"豪森股份","f15":30.78,"f16":26.82,"f17":26.82,"f18":26.94,"f20":3850240000,"f21":975325651,"f22":-0.33,"f23":3.46,"f24":17.73,"f25":-1.31,"f62":2875792.0,"f115":39.03,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":53.21,"
f3":11.32,"f4":5.41,"f5":14290,"f6":73829241.0,"f7":16.69,"f8":4.71,"f9":520.64,"f10":2.54,"f11":0.83,"f12":"688212","f13":1,"f14":"澳华内镜","f15":54.89,"f16":46.91,"f17":47.6,"f18":47.8,"f20":7095021400,"f21":1613970083,"f22":0.08,"f23":5.77,"f24":21.76,"f25":40.88,"f62":-523500.0,"f115":139.03,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":122.01,"f3":10.72,"f4":11.81,"f5":20968,"f6":245290813.0,"f7":15.41,"f8":3.05,"f9":59.87,"f10":1.4,"f11":-0.8,"f12":"688639","f13":1,"f14":"华恒生物","f15":123.98,"f16":107.0,"f17":109.82,"f18":110.2,"f20":13225884000,"f21":8398064332,"f22":-0.04,"f23":10.6,"f24":12.58,"f25":-5.6,"f62":1650293.0,"f115":68.75,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":0.53,"f3":10.42,"f4":0.05,"f5":88781,"f6":4616440.0,"f7":12.5,"f8":1.7,"f9":41.84,"f10":0.31,"f11":0.0,"f12":"600870","f13":1,"f14":"退市厦华","f15":0.53,"f16":0.47,"f17":0.47,"f18":0.48,"f20":277295822,"f21":277295822,"f22":0.0,"f23":30.85,"f24":-87.14,"f25":-88.48,"f62":-237448.0,"f115":-153.01,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":87.5,"f3":10.37,"f4":8.22,"f5":21014,"f6":178407427.0,"f7":11.34,"f8":4.81,"f9":33.45,"f10":1.02,"f11":0.7,"f12":"688077","f13":1,"f14":"大地熊","f15":87.5,"f16":78.51,"f17":79.41,"f18":79.28,"f20":7000000000,"f21":3821177500,"f22":0.0,"f23":6.64,"f24":10.89,"f25":14.33,"f62":3114180.0,"f115":40.2,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":161.1,"f3":10.32,"f4":15.07,"f5":8157,"f6":127043408.0,"f7":12.61,"f8":3.6,"f9":52.38,"f10":0.93,"f11":-0.36,"f12":"688667","f13":1,"f14":"菱电电控","f15":162.0,"f16":143.59,"f17":146.0,"f18":146.03,"f20":8312760000,"f21":3649348198,"f22":-0.04,"f23":5.75,"f24":57.59,"f25":-6.42,"f62":7344454.0,"f115":58.58,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":65.33,"f3":10.28,"f4":6.09,"f5":72596,"f6":454713120.0,"f7":11.93,"f8":9.37,"f9":21.13,"f10":1.29,"f11":0.2,"f12":"688707","f13":1,"f14":"振华新材","f15":65.65,"f16":58.58,"f
17":58.62,"f18":59.24,"f20":28936931137,"f21":5063963031,"f22":-0.08,"f23":9.16,"f24":20.71,"f25":30.32,"f62":8158994.0,"f115":42.67,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":4.26,"f3":10.08,"f4":0.39,"f5":369189,"f6":155414984.0,"f7":10.34,"f8":4.58,"f9":119.6,"f10":1.17,"f11":0.0,"f12":"600386","f13":1,"f14":"北巴传媒","f15":4.26,"f16":3.86,"f17":3.88,"f18":3.87,"f20":3435264000,"f21":3435264000,"f22":0.0,"f23":1.88,"f24":7.3,"f25":-4.48,"f62":49626176.0,"f115":28.07,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":7.76,"f3":10.07,"f4":0.71,"f5":116098,"f6":88905140.0,"f7":9.65,"f8":2.76,"f9":29.17,"f10":0.89,"f11":0.0,"f12":"603803","f13":1,"f14":"瑞斯康达","f15":7.76,"f16":7.08,"f17":7.18,"f18":7.05,"f20":3267391122,"f21":3267391122,"f22":0.0,"f23":1.75,"f24":-14.35,"f25":-2.02,"f62":47584563.0,"f115":-4.21,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":16.12,"f3":10.03,"f4":1.47,"f5":124231,"f6":193813871.0,"f7":11.74,"f8":2.54,"f9":14.03,"f10":1.63,"f11":0.0,"f12":"603556","f13":1,"f14":"海兴电力","f15":16.12,"f16":14.4,"f17":14.4,"f18":14.65,"f20":7877586725,"f21":7877586725,"f22":0.0,"f23":1.4,"f24":33.44,"f25":16.81,"f62":40984914.0,"f115":19.25,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":10.64,"f3":10.03,"f4":0.97,"f5":409114,"f6":421969808.0,"f7":10.44,"f8":4.11,"f9":24.19,"f10":1.51,"f11":0.0,"f12":"600330","f13":1,"f14":"天通股份","f15":10.64,"f16":9.63,"f17":9.66,"f18":9.67,"f20":10603459367,"f21":10603459367,"f22":0.0,"f23":2.13,"f24":-13.07,"f25":-34.24,"f62":107169873.0,"f115":25.25,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":15.69,"f3":10.03,"f4":1.43,"f5":78153,"f6":118595448.0,"f7":11.78,"f8":4.98,"f9":53.16,"f10":1.26,"f11":0.0,"f12":"603917","f13":1,"f14":"合力科技","f15":15.69,"f16":14.01,"f17":14.21,"f18":14.26,"f20":2460192000,"f21":2460192000,"f22":0.0,"f23":2.35,"f24":-1.26,"f25":-34.08,"f62":14565651.0,"f115":39.97,"f128":"-","f140":"-","f141":"-","f1
36":"-","f152":2},{"f1":2,"f2":15.69,"f3":10.03,"f4":1.43,"f5":2964833,"f6":4434435584.0,"f7":13.04,"f8":7.59,"f9":27.15,"f10":3.72,"f11":0.0,"f12":"601788","f13":1,"f14":" 光大证券","f15":15.69,"f16":13.83,"f17":14.5,"f18":14.26,"f20":72343258056,"f21":61296104784,"f22":0.0,"f23":1.35,"f24":28.92,"f25":5.09,"f62":312190816.0,"f115":20.95,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":12.18,"f3":10.03,"f4":1.11,"f5":1896619,"f6":2256302944.0,"f7":10.84,"f8":19.18,"f9":45.66,"f10":1.91,"f11":0.08,"f12":"601456","f13":1,"f14":"国联证券","f15":12.18,"f16":10.98,"f17":11.0,"f18":11.07,"f20":34490997186,"f21":12045707145,"f22":0.0,"f23":2.08,"f24":-8.97,"f25":-12.81,"f62":333106592.0,"f115":37.48,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":13.28,"f3":10.02,"f4":1.21,"f5":630288,"f6":805853088.0,"f7":11.85,"f8":11.27,"f9":10.77,"f10":1.68,"f11":0.0,"f12":"603299","f13":1,"f14":"苏盐井神","f15":13.28,"f16":11.85,"f17":11.99,"f18":12.07,"f20":10400212678,"f21":7429363200,"f22":0.0,"f23":2.3,"f24":57.72,"f25":38.91,"f62":114011575.0,"f115":19.95,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":15.48,"f3":10.02,"f4":1.41,"f5":49919,"f6":75303134.0,"f7":10.52,"f8":2.29,"f9":29.39,"f10":1.03,"f11":0.0,"f12":"603819","f13":1,"f14":"XD神力股","f15":15.48,"f16":14.0,"f17":14.05,"f18":14.07,"f20":3370463295,"f21":3370463295,"f22":0.0,"f23":4.09,"f24":27.51,"f25":5.31,"f62":18877556.0,"f115":53.26,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2}]}});

成功,接口把这一页的全部数据都返回了。

3、筛选需要的东西

import requests
import re
import pandas as pd
# Browser-like request headers copied from the developer tools.
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36 Edg/102.0.1245.33',
}

# Bare request URL (no surrounding '<' '>'; those come from a markdown
# autolink and make requests fail to recognise the URL scheme).
url = (
    'http://54.push2.eastmoney.com/api/qt/clist/get'
    '?cb=jQuery1124031148056087743625_1655062773152'
    '&pn=1&pz=20&po=1&np=1'
    '&ut=bd1d9ddb04089700cf9c27f6f7426281'
    '&fltt=2&invt=2&wbp2u=|0|0|0|web'
    '&fid=f3&fs=m:1+t:2,m:1+t:23'
    '&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,'
    'f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152'
    '&_=1655062773153'
)

# Response field id -> output column name, in the order the columns appear.
FIELD_COLUMNS = {
    'f12': '代码',        # stock code
    'f14': '名称',        # stock name
    'f2': '最新价',       # latest price
    'f3': '涨跌幅%',      # change, percent
    'f4': '涨跌额',       # change, amount
    'f5': '成交量(手)',   # volume (lots)
    'f6': '成交额',       # turnover
    'f7': '振幅%',        # amplitude, percent
    'f15': '最高',        # high
    'f16': '最低',        # low
    'f17': '今开',        # open
    'f18': '昨收',        # previous close
    'f10': '量比',        # volume ratio
    'f8': '换手率%',      # turnover rate, percent
    'f9': '市盈率_动态',  # P/E (dynamic)
    'f23': '市净率',      # P/B
    'f115': '滚动市盈率',  # trailing P/E
}


def extract_columns(text):
    """Pull the wanted fields out of the raw response text with regexes.

    For every field id in FIELD_COLUMNS, all occurrences of
    ``"<field>":<value>,`` are collected in document order, and the
    captured values become one DataFrame column.

    The captured values are the raw text between the colon and the next
    comma, so everything is a string and string fields keep their JSON
    double quotes. Every record must contain every field; otherwise the
    columns end up with different lengths and pandas raises ValueError.

    Args:
        text: Response body returned by the endpoint.

    Returns:
        A DataFrame with one column per FIELD_COLUMNS entry.
    """
    return pd.DataFrame({
        column: re.findall('"' + field + '":(.*?),', text)
        for field, column in FIELD_COLUMNS.items()
    })


eastmoney_info = []
if __name__ == '__main__':
    # The timeout avoids hanging on a stalled connection; raise_for_status()
    # stops the script on an HTTP error instead of parsing an error page.
    reply = requests.get(url, headers=headers, timeout=10)
    reply.raise_for_status()
    eastmoney_info.append(extract_columns(reply.text))
    print(eastmoney_info)

#####返回结果#####
[          代码       名称     最新价   涨跌幅%    涨跌额   成交量(手)           成交额    振幅%      最高      最低      今开      昨收    量比   换手率%  市盈率_动态    市净率    滚动市盈率
0   "688787"  "XD海天瑞"   59.33   20.0   9.89    17586    97450222.0   21.3   59.33    48.8   49.99   49.44  1.66  19.34   67.26   3.16    102.8
1   "688510"   "航亚科技"   18.21  16.43   2.57    70019   126081393.0   20.2   18.77   15.61   15.64   15.64  3.86   4.39  609.96   5.02   199.88
2   "688348"    "C昱能"  339.98  14.82  43.88    43211  1392461168.0  14.16  341.69  299.76   299.8   296.1  0.44  23.69  221.33    8.1    226.8
3   "688685"    "迈信林"   22.91   13.3   2.69    24258    54247712.0  15.88    23.5   20.29   20.48   20.22   1.8   4.39   56.11   3.78    44.56
4   "688529"   "豪森股份"   30.08  11.66   3.14    35027   101802682.0   14.7   30.78   26.82   26.82   26.94  1.01   10.8   35.41   3.46    39.03
5   "688212"   "澳华内镜"   53.21  11.32   5.41    14290    73829241.0  16.69   54.89   46.91    47.6    47.8  2.54   4.71  520.64   5.77   139.03
6   "688639"   "华恒生物"  122.01  10.72  11.81    20968   245290813.0  15.41  123.98   107.0  109.82   110.2   1.4   3.05   59.87   10.6    68.75
7   "600870"   "退市厦华"    0.53  10.42   0.05    88781     4616440.0   12.5    0.53    0.47    0.47    0.48  0.31    1.7   41.84  30.85  -153.01
8   "688077"    "大地熊"    87.5  10.37   8.22    21014   178407427.0  11.34    87.5   78.51   79.41   79.28  1.02   4.81   33.45   6.64     40.2
9   "688667"   "菱电电控"   161.1  10.32  15.07     8157   127043408.0  12.61   162.0  143.59   146.0  146.03  0.93    3.6   52.38   5.75    58.58
10  "688707"   "振华新材"   65.33  10.28   6.09    72596   454713120.0  11.93   65.65   58.58   58.62   59.24  1.29   9.37   21.13   9.16    42.67
11  "600386"   "北巴传媒"    4.26  10.08   0.39   369189   155414984.0  10.34    4.26    3.86    3.88    3.87  1.17   4.58   119.6   1.88    28.07
12  "603803"   "瑞斯康达"    7.76  10.07   0.71   116098    88905140.0   9.65    7.76    7.08    7.18    7.05  0.89   2.76   29.17   1.75    -4.21
13  "603556"   "海兴电力"   16.12  10.03   1.47   124231   193813871.0  11.74   16.12    14.4    14.4   14.65  1.63   2.54   14.03    1.4    19.25
14  "600330"   "天通股份"   10.64  10.03   0.97   409114   421969808.0  10.44   10.64    9.63    9.66    9.67  1.51   4.11   24.19   2.13    25.25
15  "603917"   "合力科技"   15.69  10.03   1.43    78153   118595448.0  11.78   15.69   14.01   14.21   14.26  1.26   4.98   53.16   2.35    39.97
16  "601788"   "光大证券"   15.69  10.03   1.43  2964833  4434435584.0  13.04   15.69   13.83    14.5   14.26  3.72   7.59   27.15   1.35    20.95
17  "601456"   "国联证券"   12.18  10.03   1.11  1896619  2256302944.0  10.84   12.18   10.98    11.0   11.07  1.91  19.18   45.66   2.08    37.48
18  "603299"   "苏盐井神"   13.28  10.02   1.21   630288   805853088.0  11.85   13.28   11.85   11.99   12.07  1.68  11.27   10.77    2.3    19.95
19  "603819"  "XD神力股"   15.48  10.02   1.41    49919    75303134.0  10.52   15.48    14.0   14.05   14.07  1.03   2.29   29.39   4.09    53.26]

但是这个只有1页的数据,我想要所有的上证A股。

再分析一下页面

当我点第二页的时候,新出现的那条请求的地址发生了变化:pn 变成了 2,说明这个参数就是页码。前面返回的结果里 total 为 2173,每页 20 条(pz=20),2173 ÷ 20 向上取整是 109 页,所以要遍历 109 次。

重新修改一下代码,再套个循环就可以遍历到所有的数据。

import requests
import re
import pandas as pd
import json

# Browser-like request headers copied from the developer tools.
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36 Edg/102.0.1245.33',
}

# Request URL with a `{pn}` placeholder for the 1-based page number. It is a
# bare URL: the '<' '>' that a markdown autolink adds would make requests
# fail to recognise the scheme. The query string is kept as one literal so it
# stays identical to what the browser sent.
URL_TEMPLATE = (
    'http://54.push2.eastmoney.com/api/qt/clist/get'
    '?cb=jQuery1124031148056087743625_1655062773152'
    '&pn={pn}&pz=20&po=1&np=1'
    '&ut=bd1d9ddb04089700cf9c27f6f7426281'
    '&fltt=2&invt=2&wbp2u=|0|0|0|web'
    '&fid=f3&fs=m:1+t:2,m:1+t:23'
    '&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,'
    'f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152'
    '&_=1655062773153'
)

# Response field id -> output column name, in the order the columns appear.
FIELD_COLUMNS = {
    'f12': '代码',        # stock code
    'f14': '名称',        # stock name
    'f2': '最新价',       # latest price
    'f3': '涨跌幅%',      # change, percent
    'f4': '涨跌额',       # change, amount
    'f5': '成交量(手)',   # volume (lots)
    'f6': '成交额',       # turnover
    'f7': '振幅%',        # amplitude, percent
    'f15': '最高',        # high
    'f16': '最低',        # low
    'f17': '今开',        # open
    'f18': '昨收',        # previous close
    'f10': '量比',        # volume ratio
    'f8': '换手率%',      # turnover rate, percent
    'f9': '市盈率_动态',  # P/E (dynamic)
    'f23': '市净率',      # P/B
    'f115': '滚动市盈率',  # trailing P/E
}
# Fields that must stay text; every other field is converted to a number.
TEXT_FIELDS = ('f12', 'f14')


def parse_page(text):
    """Decode one JSONP response into a DataFrame plus the reported total.

    The response looks like ``callback({...});``. The JSON between the
    first '(' and the last ')' is decoded with the json module, so string
    values come back without their quotes and numbers come back as numbers.

    Args:
        text: Response body returned by the endpoint.

    Returns:
        A ``(frame, total)`` tuple. ``frame`` has one column per
        FIELD_COLUMNS entry and one row per record in ``data.diff``; it is
        empty when the response carries no records. ``total`` is the
        ``data.total`` value, or 0 when it is absent.

    Raises:
        ValueError: If the text does not contain decodable JSON.
    """
    _, _, remainder = text.partition('(')
    body, _, _ = remainder.rpartition(')')
    payload = json.loads(body)

    data = payload.get('data') or {}
    records = data.get('diff') or []
    total = data.get('total') or 0

    frame = pd.DataFrame(records, columns=list(FIELD_COLUMNS))
    for field in FIELD_COLUMNS:
        if field not in TEXT_FIELDS:
            # Non-numeric placeholders (the sample response uses "-" for
            # some fields) become NaN instead of breaking the column type.
            frame[field] = pd.to_numeric(frame[field], errors='coerce')
    return frame.rename(columns=FIELD_COLUMNS), total


def fetch_all_pages():
    """Download every page of the listing and return it as one DataFrame.

    Pages are requested starting from 1 until the number of rows fetched
    reaches the total reported by the API, or a page comes back empty, so
    the page count does not have to be hard-coded.

    Returns:
        A DataFrame with the FIELD_COLUMNS columns and a fresh 0..n-1 index;
        empty when the first page has no rows.
    """
    frames = []
    fetched = 0
    pn = 1
    while True:
        reply = requests.get(URL_TEMPLATE.format(pn=pn), headers=headers, timeout=10)
        reply.raise_for_status()
        frame, total = parse_page(reply.text)
        if frame.empty:
            break
        frames.append(frame)
        fetched += len(frame)
        if total and fetched >= total:
            break
        pn += 1

    if not frames:
        # pd.concat raises on an empty list, so build the empty result here.
        return pd.DataFrame(columns=list(FIELD_COLUMNS.values()))
    return pd.concat(frames, ignore_index=True)


if __name__ == '__main__':
    eastmoney = fetch_all_pages()
    print(eastmoney)

4、导入数据库

from urllib.parse import quote_plus

from sqlalchemy import create_engine

# Placeholder credentials for a local MySQL server; replace them with your
# own and keep real passwords out of shared code.
sql_user = 'root'
sql_pwd = 'root'
# The 'mysql+pymysql' scheme makes SQLAlchemy use the PyMySQL driver.
# quote_plus() escapes characters such as '@', ':' or '/' so a password
# containing them does not corrupt the connection URL.
engine = create_engine(
    'mysql+pymysql://' + quote_plus(sql_user) + ':' + quote_plus(sql_pwd)
    + '@localhost:3306/hsmbs'
)
# Write the rows into table `eastmoney` of database `hsmbs` without the
# DataFrame index. With if_exists='append' rows are added to an existing
# table, so running the script again stores the same stocks again.
eastmoney.to_sql('eastmoney', con=engine, index=False, if_exists='append')

最后,为了减轻网站负荷、避免触发对方的反爬措施,在每次请求之间设置一下随机等待时间。

5、最终代码

import requests
import pandas as pd
import re
import time
import random
from sqlalchemy import create_engine

import json
from urllib.parse import quote_plus

# Browser-like request headers copied from the developer tools.
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36 Edg/102.0.1245.33',
}

# Request URL with a `{pn}` placeholder for the 1-based page number. It is a
# bare URL: the '<' '>' that a markdown autolink adds would make requests
# fail to recognise the scheme. The query string is kept as one literal so it
# stays identical to what the browser sent.
URL_TEMPLATE = (
    'http://54.push2.eastmoney.com/api/qt/clist/get'
    '?cb=jQuery1124031148056087743625_1655062773152'
    '&pn={pn}&pz=20&po=1&np=1'
    '&ut=bd1d9ddb04089700cf9c27f6f7426281'
    '&fltt=2&invt=2&wbp2u=|0|0|0|web'
    '&fid=f3&fs=m:1+t:2,m:1+t:23'
    '&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,'
    'f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152'
    '&_=1655062773153'
)

# Response field id -> output column name, in the order the columns appear.
FIELD_COLUMNS = {
    'f12': '代码',        # stock code
    'f14': '名称',        # stock name
    'f2': '最新价',       # latest price
    'f3': '涨跌幅%',      # change, percent
    'f4': '涨跌额',       # change, amount
    'f5': '成交量(手)',   # volume (lots)
    'f6': '成交额',       # turnover
    'f7': '振幅%',        # amplitude, percent
    'f15': '最高',        # high
    'f16': '最低',        # low
    'f17': '今开',        # open
    'f18': '昨收',        # previous close
    'f10': '量比',        # volume ratio
    'f8': '换手率%',      # turnover rate, percent
    'f9': '市盈率_动态',  # P/E (dynamic)
    'f23': '市净率',      # P/B
    'f115': '滚动市盈率',  # trailing P/E
}
# Fields that must stay text; every other field is converted to a number.
TEXT_FIELDS = ('f12', 'f14')


def parse_page(text):
    """Decode one JSONP response into a DataFrame plus the reported total.

    The response looks like ``callback({...});``. The JSON between the
    first '(' and the last ')' is decoded with the json module, so string
    values come back without their quotes and numbers come back as numbers.

    Args:
        text: Response body returned by the endpoint.

    Returns:
        A ``(frame, total)`` tuple. ``frame`` has one column per
        FIELD_COLUMNS entry and one row per record in ``data.diff``; it is
        empty when the response carries no records. ``total`` is the
        ``data.total`` value, or 0 when it is absent.

    Raises:
        ValueError: If the text does not contain decodable JSON.
    """
    _, _, remainder = text.partition('(')
    body, _, _ = remainder.rpartition(')')
    payload = json.loads(body)

    data = payload.get('data') or {}
    records = data.get('diff') or []
    total = data.get('total') or 0

    frame = pd.DataFrame(records, columns=list(FIELD_COLUMNS))
    for field in FIELD_COLUMNS:
        if field not in TEXT_FIELDS:
            # Non-numeric placeholders (the sample response uses "-" for
            # some fields) become NaN instead of breaking the column type.
            frame[field] = pd.to_numeric(frame[field], errors='coerce')
    return frame.rename(columns=FIELD_COLUMNS), total


def fetch_all_pages(min_wait=3, max_wait=6):
    """Download every page of the listing and return it as one DataFrame.

    Pages are requested starting from 1 until the number of rows fetched
    reaches the total reported by the API, or a page comes back empty, so
    the page count does not have to be hard-coded. Between two requests the
    function sleeps a random whole number of seconds to keep the load on
    the site low; it does not sleep after the last page.

    Args:
        min_wait: Shortest pause between requests, in seconds.
        max_wait: Longest pause between requests, in seconds.

    Returns:
        A DataFrame with the FIELD_COLUMNS columns and a fresh 0..n-1 index;
        empty when the first page has no rows.
    """
    frames = []
    fetched = 0
    pn = 1
    while True:
        reply = requests.get(URL_TEMPLATE.format(pn=pn), headers=headers, timeout=10)
        reply.raise_for_status()
        frame, total = parse_page(reply.text)
        if frame.empty:
            break
        frames.append(frame)
        fetched += len(frame)
        if total and fetched >= total:
            break
        pn += 1
        time.sleep(random.randint(min_wait, max_wait))

    if not frames:
        # pd.concat raises on an empty list, so build the empty result here.
        return pd.DataFrame(columns=list(FIELD_COLUMNS.values()))
    return pd.concat(frames, ignore_index=True)


if __name__ == '__main__':
    eastmoney = fetch_all_pages()
    if eastmoney.empty:
        raise SystemExit('No rows were fetched; nothing to store.')

    # Placeholder credentials for a local MySQL server; replace them with
    # your own and keep real passwords out of shared code.
    sql_user = 'root'
    sql_pwd = 'root'
    # quote_plus() escapes characters such as '@', ':' or '/' so a password
    # containing them does not corrupt the connection URL.
    engine = create_engine(
        'mysql+pymysql://' + quote_plus(sql_user) + ':' + quote_plus(sql_pwd)
        + '@localhost:3306/hsmbs'
    )
    # With if_exists='append' rows are added to an existing table, so
    # running the script again stores the same stocks again.
    eastmoney.to_sql('eastmoney', con=engine, index=False, if_exists='append')
262 views
Comments
登录后评论
Sign In
·

由于不是很懂股票的术语,这些关键词都不太懂内在联系,只能看懂字面意思和最简单的勾稽关系。需要花点时间研究一下股票的东西,才能把数据库里的这些“资料”变废为宝。

·

clap

·

这样直接爬,会不会很"刑"啊🤔

·

接上一评论,爬取资料前可以先看一下网站的 robots.txt 文件,明确一下网站管理者允许抓取的部分资料。

eastmoney.com/robots.txt 的内容如下:

User-agent: *
Allow: /
Sitemap: http://www.eastmoney.com/sitemap.xml

所以这个网站中的所有内容应该都是可以抓取的。

robots.txt 的规范可以参考 https://developers.google.com/search/docs/advanced/robots/robots_txt?hl=zh-cn。

当然,robots.txt 属于君子约定,任何一个网站只要展示出来,那就有办法去自动获取数据。自己玩而不发布出来,目前来看是约束不了的。

另外,在代码方面,Python 有可以解析网页内容的库,根据选择符找到想要的内容应该比正则表达式要快一些,比如 Beautiful Soup。你可以尝试一下 +1