# Extraction artifact — original scraped-page title, kept as a comment:
# 济南网站建设_福州网站设计_咸阳网站建设_北京网站建设
# NOTE(review): the next statement is truncated in the extracted source — the
# receiver of .add_experimental_option(...) is cut off. It is presumably a
# selenium ChromeOptions instance named `chrome_options`, since that name is
# referenced again inside the class below. TODO: confirm against the full file.
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])

# Chromedriver service pointing at a local driver binary.
service = Service('D:/App Data/PythonProject/LiVideoCrawling/LiVideoCrawling/chrome_driver/chromedriver.exe')


class LivideocrawlingDownloaderMiddleware:
    """Scrapy downloader middleware for crawling pearvideo.com.

    Two responsibilities:

    * ``process_request`` — for the video-status AJAX endpoint
      (``videoStatus.jsp``), attach a ``Referer`` header pointing at the
      matching detail page so pearvideo's anti-hotlinking check passes.
    * ``process_response`` — render the spider's start page with a shared
      Selenium Chrome browser and return the rendered HTML as the response.
    """

    # Matches the video-status AJAX endpoint. Compiled once; the original
    # trailing ``.*?`` was a no-op because re.match only anchors at the start.
    VIDEO_STATUS_RE = re.compile(r'https://www\.pearvideo\.com/videoStatus\.jsp\?')
    # Captures the numeric video id from the request's query string.
    CONT_ID_RE = re.compile(r'contId=(\d+)')

    # Shared browser instance used to render JavaScript-driven pages.
    # Instantiated once at class-definition time (same as the original).
    bro = Chrome(service=service, options=chrome_options)

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create the middleware instance
        # and hook it up to crawler signals.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        """Attach the anti-hotlink ``Referer`` to videoStatus AJAX requests.

        ``spider.json_ids`` is assumed to hold detail-page slugs collected
        from the index page (e.g. ``video_1234567``) whose last ``_`` segment
        is the video id — TODO confirm against the spider.

        Fixes vs. the original:
        - the contId pattern is compiled once, not once per loop iteration;
        - a URL with no ``contId`` no longer matches everything (the original
          left ``vid = ''`` and ``fid.startswith('')`` is always True, so the
          first jid's Referer was attached to unrelated requests);
        - exact equality replaces ``startswith`` (the original's own comment
          says the two ids should be *equal*; ``'1234'.startswith('123')``
          would be a false positive);
        - the loop stops at the first match instead of letting later jids
          overwrite the header.
        """
        if self.VIDEO_STATUS_RE.match(request.url):
            m = self.CONT_ID_RE.search(request.url)
            if m is None:
                # No video id in the query string — nothing to match against.
                return None
            vid = m.group(1)
            for jid in spider.json_ids:
                # The index page stores ids like 'video_1234567'.
                fid = jid.split('_')[-1]
                if fid == vid:
                    # pearvideo validates the Referer of the status request
                    # against the detail-page URL (anti-hotlinking).
                    request.headers['Referer'] = 'https://www.pearvideo.com/' + jid
                    break
        return None  # let Scrapy continue normal download handling

    def process_response(self, request, response, spider):
        """Render the start page with Selenium; pass everything else through."""
        if request.url.startswith(spider.start_urls[0]):
            # Fetch the index page in the real browser so JS-injected
            # content is present in the page source.
            self.bro.get(request.url)
            html_data = self.bro.page_source
            # Wrap the rendered HTML in a fresh response object.
            return HtmlResponse(url=request.url, body=html_data.encode(),
                                encoding='utf-8', request=request)
        return response

    def process_exception(self, request, exception, spider):
        # Returning None continues normal exception processing
        # (other middlewares / default error handling).
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)