
Crawler Project 01: Shandong Province Bidding and Procurement Website

2019-01-28 00:00

Summary: a simple Scrapy spider for the Shandong Province Procurement and Bidding Website (www.sdbidding.org.cn). It pages through four announcement lists (tenders, winning bids, procurement, transactions) by POSTing a pageNo form field, then follows each detail link and prints the title, publication time, and body of the announcement.

      # -*- coding: utf-8 -*-
      import scrapy
      from scrapy.cmdline import execute
      from urllib.parse import urljoin
      """
      Shandong Province Procurement and Bidding Website (山東省采購(gòu)與招標(biāo)網(wǎng))
      """
      class SdbSpider(scrapy.Spider):
          name = 'sdb'
          allowed_domains = ['www.sdbidding.org.cn']
          # tender announcements / winning-bid notices / procurement announcements / transaction notices
          start_urls = ['http://www.sdbidding.org.cn/bulletins?infoType=11',
                        'http://www.sdbidding.org.cn/bulletins?infoType=12',
                        'http://www.sdbidding.org.cn/bulletins?infoType=13&type=1',
                        'http://www.sdbidding.org.cn/bulletins?infoType=14&type=2']

          def start_requests(self):
              for start_url in self.start_urls:
                  num = 0  # reset the page counter for each list, so every list gets crawled
                  while num < 20:  # number of pages to crawl per list; 20 by default
                      num += 1
                      # the site pages its lists via a POSTed pageNo field
                      yield scrapy.FormRequest(start_url, formdata={"pageNo": str(num)},
                                               callback=self.get_page)

          def get_page(self, response):
              # collect the detail-page links from the current list page
              start_url = response.url
              urls = response.xpath('//td[@class="tit"]//a//@href').extract()
              for url in urls:
                  end_url = urljoin(start_url, url)  # resolve relative links
                  yield scrapy.Request(url=end_url, callback=self.get_content)

          def get_content(self, response):
              # parse the detail page; extract_first() returns None instead of
              # raising an IndexError when an XPath matches nothing
              title = response.xpath('//h3//text()').extract_first()
              print(title)
              ctime = response.xpath('//div[@class="detail-title"]//p//text()').extract_first()
              print(ctime)
              content = response.xpath('//div[@class="details"]//p//text()').extract_first()
              print(content)
              content_xml = response.xpath('//div[@class="details"]')
              print(content_xml)

      if __name__ == '__main__':
          execute(["scrapy", "crawl", "sdb"])
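
      A quick note on running it: the __main__ block above starts the crawl programmatically via scrapy.cmdline.execute, which assumes the file lives inside a Scrapy project (for example as spiders/sdb.py next to a scrapy.cfg). Equivalently, from the project root:

      scrapy crawl sdb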

      This is based on the Scrapy framework and is only a quick, simple write-up.
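
      Since the spider only print()s what it scrapes, one natural next step is to collect the fields into a Scrapy Item and yield it, so a pipeline or a feed export (e.g. scrapy crawl sdb -o sdb.json) can persist the results. This is a minimal sketch, not part of the original post; the SdbItem name and its fields are made up for illustration:

      import scrapy

      class SdbItem(scrapy.Item):
          # illustrative field names; adjust to whatever actually needs storing
          title = scrapy.Field()    # announcement title
          ctime = scrapy.Field()    # publication time
          content = scrapy.Field()  # body text
          url = scrapy.Field()      # detail-page URL

      # inside get_content(), instead of the print() calls, one could write:
      #     item = SdbItem()
      #     item['title'] = response.xpath('//h3//text()').extract_first()
      #     item['ctime'] = response.xpath('//div[@class="detail-title"]//p//text()').extract_first()
      #     item['content'] = response.xpath('//div[@class="details"]//p//text()').extract_first()
      #     item['url'] = response.url
      #     yield item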

      If anything here could be done better, or if you spot any mistakes, please point them out. Thank you all.

      Thanks for following and supporting the author.

      Tags: crawler, project, Shandong Province, bidding, procurement