taobao.py 1.3 KB

123456789101112131415161718192021222324252627282930313233343536
  1. # -*- coding: utf-8 -*-
  2. from io import StringIO
  3. from urllib.parse import urlencode
  4. import re
  5. import scrapy
  6. from image360.items import GoodsItem
  7. class TaobaoSpider(scrapy.Spider):
  8. name = 'taobao'
  9. allowed_domains = ['www.taobao.com']
  10. def start_requests(self):
  11. base_url = 'https://s.taobao.com/search?'
  12. params = {}
  13. for keyword in ['ipad', 'iphone', '小米手机']:
  14. params['q'] = keyword
  15. for page in range(10):
  16. params['s'] = page * 44
  17. full_url = base_url + urlencode(params)
  18. yield scrapy.Request(url=full_url, callback=self.parse)
  19. def parse(self, response):
  20. goods_list = response.xpath('//*[@id="mainsrp-itemlist"]/div/div/div[1]')
  21. for goods in goods_list:
  22. item = GoodsItem()
  23. item['price'] = goods.xpath('div[5]/div[2]/div[1]/div[1]/strong/text()').extract_first()
  24. item['deal'] = goods.xpath('div[5]/div[2]/div[1]/div[2]/text()').extract_first()
  25. segments = goods.xpath('div[6]/div[2]/div[2]/a/text()').extract()
  26. title = StringIO()
  27. for segment in segments:
  28. title.write(re.sub('\s', '', segment))
  29. item['title'] = title.getvalue()
  30. yield item