
Python focused crawler: commonly used settings in Scrapy's settings.py

 
import random

BOT_NAME = 'seo'

SPIDER_MODULES = ['seo.spiders']
NEWSPIDER_MODULE = 'seo.spiders'


# Return a randomly chosen cookie string from the pool below
def getCookie():
    cookie_list = [
    'BAIDUID=791ED7F86F43AF44A3808AB244404E1A:FG=1; PSTM=1443524661; BIDUPSID=4B0DC2F54860625BA83681F98C507951; BDUSS=VdqVXZlaHNPVE1jRzlRU3BEMlBFcFVDQTBGV3ZGcEZTSW90Sn5vZHFQT2pvVFJXQVFBQUFBJCQAAAAAAAAAAAEAAAAJkstJv7TXvMTj1NnM-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAKMUDVajFA1WL; MCITY=-%3A; ispeed_lsm=2; SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a01942858111; H_WISE_SIDS=100043_100288; BDSFRCVID=tAAsJeC627DsPNr4QNLk-qjWNeK2EJ7TH6ao_taMbqNtTyotFjlwEG0PJ3lQpYD-gGLkogKK0mOTHUcP; H_BDCLCKID_SF=JbAjoKK5tKvbfP0kh-QJhnQH-UnLqMrZJT7Z0lOnMp05flToM6OGhP0WQqQaJ-RULIbEXqbLBnRvOKO_e6t5D5J0jN-s-bbfHDJK0b7aHJOoDDvO2j35y4LdLp7xJb5AWKLJbR7wbnj0hpcR3p3s2RIv24vQBMkeWJLfoIP2JCPMbP365ITSMtCfbfO02D62aKDs-lnx-hcqEpO9QT-aLq-gjbQgKPIL-CoObDTe5bOo8Ro6yjOsDUThDHt8J50OfR3fL-08bPoEqbjg54r5hnjH-UIS26uDJJFeo6Q2bnOHDtJpMtJ_Htu32q32DJ3J55ryL4tBan7JDTQm5bOBK-QK5MoO-TPDt5neaJ5n0-nnhn0wDj_M0tuqBnjetlQ4Q5RWhDJR2UJ2en-Ry6C-D5v0jatDq-TtHDjLQ-bqbTrjDnCr34FWKUI8LPbO05Jq5aPe_UjytUTBfMcDW-6vKfu-Ma7OKMcAt2LEoCtXJIL2MDKr5nJbq4uehU6X2D62aKDsLpjp-hcqEIL4jUO50MCXjbQwWPPL-CQU2J5ctq5kMUbSj4QoBn0_Xf5DWJ3nMCOJKJcsbh5nhMJ_DPvGKhFvqfJxWPny523ion6vQpnlHUtu-n5jHjJBjG8J3f; BDRCVFR[ltbVPlNi2ac]=mk3SLVN4HKm; BD_UPN=123253; H_PS_645EC=8871KezGVuec0l6U03EckUIiztA%2Be7LttD91u%2FB6ntENY5ucpQaoGsil%2BFmSqHBO; sug=3; sugstore=1; ORIGIN=0; bdime=21110; BDRCVFR[skC-pcPB0g_]=mk3SLVN4HKm; BD_CK_SAM=1; BDSVRTM=91; H_PS_PSSID=',
    'BAIDUID=0236A7F2BA57EAD085EEDE626343CB91:FG=1; PLUS=1; BIDUPSID=0236A7F2BA57EAD085EEDE626343CB91; PSTM=1444372071; BDRCVFR[skC-pcPB0g_]=mk3SLVN4HKm; BD_CK_SAM=1; BDSVRTM=64; H_PS_PSSID='
    ]
    cookie = random.choice(cookie_list)
    return cookie


# Disable Scrapy's built-in cookie handling
# COOKIES_ENABLED = False

# Log cookies sent in requests and received in responses (set to True when debugging)
# COOKIES_DEBUG = False


# DEFAULT_REQUEST_HEADERS defines the default headers sent with every request
DEFAULT_REQUEST_HEADERS = {
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding':'gzip, deflate, sdch',
    'Accept-Language':'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control':'max-age=0',
    'Connection':'keep-alive',
    'Host':'www.baidu.com',
    'RA-Sid':'7739A016-20140918-030243-3adabf-48f828',
    'RA-Ver':'3.0.7',
    'Upgrade-Insecure-Requests':'1',
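    # NOTE: this dict is built once when settings.py is imported, so
    # getCookie() runs only a single time -- see the middleware sketch at
    # the end of this post for true per-request rotation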
    'Cookie':getCookie(),
}


# Disable the s3 download handler (silences the boto-related warning at startup)
DOWNLOAD_HANDLERS = {
  's3': None,
}

# Download delay: seconds to wait between consecutive requests to the same website
# DOWNLOAD_DELAY = 0.5

# Maximum number of concurrent requests performed globally
CONCURRENT_REQUESTS = 10

# Maximum number of concurrent requests to any single domain
CONCURRENT_REQUESTS_PER_DOMAIN = 10

# AutoThrottle: automatically adjust the crawl speed based on load (disabled here)
AUTOTHROTTLE_ENABLED = False
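
# Related AutoThrottle knobs -- the values below are Scrapy's documented
# defaults, shown here only as a reference; they take effect only when
# AUTOTHROTTLE_ENABLED = True:
# AUTOTHROTTLE_START_DELAY = 5
# AUTOTHROTTLE_MAX_DELAY = 60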

# Download timeout: seconds the downloader waits before giving up on a request
DOWNLOAD_TIMEOUT = 60
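

One caveat with the setup above: because DEFAULT_REQUEST_HEADERS is built once when Scrapy imports settings.py, getCookie() is called exactly one time and every request carries the same cookie. To really rotate cookies per request, the usual approach is a downloader middleware. Below is a minimal sketch, not part of the original project: the module path seo/middlewares.py, the class name, the placeholder cookie strings, and the priority value 543 are all my assumptions. It also assumes COOKIES_ENABLED = False, so Scrapy's own cookie middleware does not overwrite the header.

# seo/middlewares.py -- minimal sketch, names and values are assumptions
import random

class RandomCookieMiddleware(object):
    """Attach a randomly chosen cookie string to every outgoing request."""

    # Reuse the same pool as getCookie() in settings.py
    cookie_list = [
        'BAIDUID=placeholder1; PSTM=placeholder1',  # replace with real cookie strings
        'BAIDUID=placeholder2; PSTM=placeholder2',
    ]

    def process_request(self, request, spider):
        # Runs for every request, so the cookie actually rotates per request
        request.headers['Cookie'] = random.choice(self.cookie_list)

To enable it, register the class in settings.py:

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'seo.middlewares.RandomCookieMiddleware': 543,
}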

