首页 » Python » python定向爬虫:scrapy抓取百度m端竞价的结果,并将结果按照商家给予分类

python定向爬虫:scrapy抓取百度m端竞价的结果,并将结果按照商家给予分类

 

核心代码如下

            jushe = "m.huishou.jushewang.com"
            jushe1 = "jushe2007.com/"
            jushenuomi = "recover2.html"
            jushenuomi2 = "nuomi-"
            mingshehui = "mingshehui.cn/"
            niushe1 = "hzxyns.com/"
            niushe2 = "duoaishe.com/"
            niushe3 = "niushe1997.com/"
            niushe4 = "nsw1997.cn/"
            niushe5 = "nsw58.cn/"
            jupinhui = "zhjph.com/"
            jiangyi = "jiangyiwatch.com/"
            julimingpin = "qianhu.wejianzhan.com/"
            zhenshe = "&merchantId=99628713&"
            baoshe = "watchnj.com/"
            baoshe1 = "&sl=watchnj.com&"
            zhiduoshao = "zhids.cn/"
            baoquan = "sz24k9999.com/"
            baoquan1 = "baoquanzb.com/"
            quanshe = "quanshenet.com/"
            if (jushenuomi in lading or jushenuomi2 in lading):
                business = "聚奢网糯米"
            elif(jushe in lading or jushe1 in lading):
                business = "聚奢网"
            elif(mingshehui in lading):
                business = "名奢汇"
            elif(niushe1 in lading or niushe2 in lading or niushe3 in lading or niushe4 in lading or niushe5 in lading ):
                business = "牛奢网"
            elif(jupinhui in lading):
                business = "巨品汇"
            elif(jiangyi in lading):
                business = "匠艺"
            elif(julimingpin in lading):
                business = "聚礼名品"
            elif(zhenshe in lading):
                business = "臻奢"
            elif(baoshe in lading or baoshe1 in lading):
                business = "宝奢汇"
            elif(zhiduoshao in lading):
                business = "值哆少"
            elif(baoquan in lading or baoquan1 in lading):
                business = "宝泉珠宝"
            elif(quanshe in lading):
                business = "全奢网"
            elif(zhiduoshao in lading):
                business = "值哆少"
            else:
                business = "未知"

以下为spider的所有代码

#coding:utf-8
import scrapy,re,requests,urllib,sys,time
from pcseo.items import PcseoItem
from tld import get_fld
# import MySQLdb as mdb

# Python 2 only: re-import sys to restore the setdefaultencoding() hook
# (removed by site.py at startup) and force the process-wide default string
# encoding to UTF-8, so implicit str<->unicode conversions of the Chinese
# merchant names below do not raise UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding('utf8')

# class_dict = {
    # "公司简介":"",
    # "公司工资":"工资",
    # "公司面试":"面试",
    # "公司待遇":"待遇",
    # "公司怎么样":"怎么样",
    # "公司招聘":"招聘",
# }


# con = mdb.connect(host='47.92.200.32',user='root',passwd='123123',db='seo',charset='utf8')

# query_file = open('/root/seojiankong/pcseo/query_file','w')

# for k,v in class_dict.items():
    # with con:
        # cur = con.cursor(mdb.cursors.DictCursor)
        # cur.execute("select query from baidu_pc_rank order by rand() limit 10")
        # rows = cur.fetchall()
       
        # for row in rows:
            # query = "%s%s,%s" %(row['query'],v,k)
            # query_file.write(query + '\n')
           
# query_file.close()


def search(req, html):
    """Search *html* with the regex *req* and return its first capture
    group, or the literal string "no" when there is no match."""
    match = re.search(req, html)
    return match.group(1) if match else "no"

class Dmozspider(scrapy.Spider):
    """Spider "pcseo": issues one Baidu search per keyword read from a local
    query file, then classifies the paid (SEM) ads on each result page by
    merchant, based on markers found in the ad's landing-page URL.
    """
    name = "pcseo"

    # Build one Baidu SERP URL per keyword line of the query file.
    start_urls = []
    for line in open("D:/pyxuexi/msem/query_file"):
        word = line.strip()
        url = 'http://www.baidu.com/s?wd=%s' % urllib.quote(word)
        start_urls.append(url)

    # Ordered (markers, merchant) table replacing the old if/elif chain.
    # Order matters: the first entry whose marker occurs in the landing URL
    # wins, so specific markers (the jushe "nuomi" pages, the hs./bd.
    # sub-domains) come before the generic ones.
    # FIX: the old chain tested "hs.jushe2007.com/" inside the generic
    # 聚奢网 branch first, which made the dedicated 聚奢网小户 branch (and
    # its requests.get side effect) unreachable.
    _BUSINESS_MARKERS = [
        (("recover2.html", "nuomi-"), "聚奢网糯米"),
        (("m.huishou.jushewang.com",), "聚奢网"),
        (("hs.jushe2007.com/",), "聚奢网小户"),
        (("bd.jushe2007.com/",), "聚奢网外包"),
        (("mingshehui.cn/",), "名奢汇"),
        (("hzxyns.com/", "duoaishe.com/", "niushe1997.com/",
          "nsw1997.cn/", "nsw58.cn/"), "牛奢网"),
        (("zhjph.com/",), "巨品汇"),
        (("jiangyiwatch.com/",), "匠艺"),
        (("qianhu.wejianzhan.com/",), "聚礼名品"),
        (("&merchantId=99628713&",), "臻奢"),
        (("watchnj.com/", "&sl=watchnj.com&"), "宝奢汇"),
        (("zhids.cn/",), "值哆少"),
        (("sz24k9999.com/", "baoquanzb.com/"), "宝泉珠宝"),
        (("quanshenet.com/",), "全奢网"),
        (("shewuwang.com/",), "奢物网"),
    ]

    def _classify(self, lading):
        """Return the merchant name for landing URL *lading*, "未知" if no
        marker matches."""
        for markers, business in self._BUSINESS_MARKERS:
            for marker in markers:
                if marker in lading:
                    return business
        return "未知"

    def __get_url_query(self, url):
        """Extract the raw (still URL-encoded) keyword after "wd=" from a
        SERP URL."""
        return re.search("wd=(.*)", url).group(1)

    def parse(self, response):
        """Yield one PcseoItem per paid result (at most the first four ad
        slots) of a Baidu result page."""
        query = self.__get_url_query(response.url)
        current_time = time.strftime("%Y-%m-%d-%H-%M", time.localtime(time.time()))
        for id in xrange(0, 4):
            div = response.xpath("//*[@data-lp]")
            try:
                lading = div.xpath("@data-lp").extract()[id]
            except IndexError:
                # Fewer than four ads on this SERP — skip the missing slots.
                continue
            lianjie = search('.*?href="(.*?)"', div.xpath("//*/a[@class='c-blocka ec_title ']").extract()[id])
            # The landing URL is escaped twice by Baidu, hence two unquotes.
            lading = urllib.unquote(urllib.unquote(str(lading)))
            shortlading = get_fld(lading)
            rank = id + 1
            title = re.sub("<[^>]*?>", "", search("<h3[^>]*?>(.*?)</h3>", div.xpath("//*[@data-lp]//*[h3]").extract()[id]))
            business = self._classify(lading)
            if business == "聚奢网小户":
                # NOTE(review): presumably fetches the ad link on purpose
                # (e.g. to register the visit) — confirm this side effect
                # is wanted; the old unused `a = ...` binding was dropped.
                requests.get(lianjie)
            item = PcseoItem()
            item['title'] = title
            item['rank'] = rank
            item['lading'] = lading
            item['query'] = urllib.unquote(query)
            item['time'] = current_time
            item['shortlading'] = shortlading
            item['business'] = business
            item['lianjie'] = lianjie
            yield item
        # Throttle: one-second pause between result pages.
        time.sleep(1)
            # print query,rank,title,lading,ladingtest

原文链接:python定向爬虫:scrapy抓取百度m端竞价的结果,并将结果按照商家给予分类,转载请注明来源!
