# pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
from scrapy.exceptions import DropItem
from scrapy.conf import settings
from scrapy import log
  10. class DoubanPipeline(object):
  11. def __init__(self):
  12. connection = pymongo.MongoClient(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
  13. db = connection[settings['MONGODB_DB']]
  14. self.collection = db[settings['MONGODB_COLLECTION']]
  15. def process_item(self, item, spider):
  16. #Remove invalid data
  17. valid = True
  18. for data in item:
  19. if not data:
  20. valid = False
  21. raise DropItem("Missing %s of blogpost from %s" %(data, item['url']))
  22. if valid:
  23. #Insert data into database
  24. new_moive=[{
  25. "name":item['name'][0],
  26. "year":item['year'][0],
  27. "score":item['score'],
  28. "director":item['director'],
  29. "classification":item['classification'],
  30. "actor":item['actor']
  31. }]
  32. self.collection.insert(new_moive)
  33. log.msg("Item wrote to MongoDB database %s/%s" %
  34. (settings['MONGODB_DB'], settings['MONGODB_COLLECTION']),
  35. level=log.DEBUG, spider=spider)
  36. return item