pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
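#
# For example, both pipelines below could be enabled in settings.py like
# this (the project path and priority values are illustrative, not taken
# from the source):
#
# ITEM_PIPELINES = {
#     'myproject.pipelines.SaveImagePipeline': 300,
#     'myproject.pipelines.SaveToMongoPipeline': 400,
# }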

import logging

from pymongo import MongoClient
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline

logger = logging.getLogger('SaveImagePipeline')


class SaveImagePipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        # Download the image referenced by the item's 'url' field.
        yield Request(url=item['url'])

    def item_completed(self, results, item, info):
        logger.debug('Image download finished!')
        # results holds (ok, result_info) tuples; drop the item if the
        # (single) download failed.
        if not results[0][0]:
            raise DropItem('Download failed')
        return item

    def file_path(self, request, response=None, info=None):
        # Store the image under its original file name.
        return request.url.split('/')[-1]


class SaveToMongoPipeline(object):

    def __init__(self, mongo_url, db_name):
        self.mongo_url = mongo_url
        self.db_name = db_name
        self.client = None
        self.db = None

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings.get('MONGO_URL'),
                   crawler.settings.get('MONGO_DB'))

    def open_spider(self, spider):
        self.client = MongoClient(self.mongo_url)
        self.db = self.client[self.db_name]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # The original body only returned the item; persisting it is this
        # pipeline's stated purpose. The collection name is an assumption:
        # the spider's name is used since the source names no collection.
        self.db[spider.name].insert_one(dict(item))
        return item
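
# A minimal sketch of the settings these pipelines read, assuming a local
# MongoDB instance; the values below are illustrative:
#
# IMAGES_STORE = './images'          # required by ImagesPipeline
# MONGO_URL = 'mongodb://localhost:27017'
# MONGO_DB = 'scrapy_data'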