main.py

"""A simple multi-threaded crawler that walks links on m.sohu.com."""

from enum import Enum, unique
from queue import Queue
from random import random
from threading import Thread, current_thread
from time import sleep
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup


@unique
class SpiderStatus(Enum):
    """States a spider can be in."""

    IDLE = 0
    WORKING = 1


def decode_page(page_bytes, charsets=('utf-8',)):
    """Decode raw page bytes, trying each charset until one succeeds."""
    page_html = None
    for charset in charsets:
        try:
            page_html = page_bytes.decode(charset)
            break
        except UnicodeDecodeError:
            pass
    return page_html


class Retry(object):
    """Decorator that retries a function on failure with a randomized delay."""

    def __init__(self, *, retry_times=3,
                 wait_secs=5, errors=(Exception, )):
        self.retry_times = retry_times
        self.wait_secs = wait_secs
        self.errors = errors

    def __call__(self, fn):

        def wrapper(*args, **kwargs):
            for _ in range(self.retry_times):
                try:
                    return fn(*args, **kwargs)
                except self.errors as e:
                    print(e)
                    sleep((random() + 1) * self.wait_secs)
            return None

        return wrapper


class Spider(object):
    """Fetches pages, parses links, and (eventually) extracts and stores data."""

    def __init__(self):
        self.status = SpiderStatus.IDLE

    @Retry()
    def fetch(self, current_url, *, charsets=('utf-8', ),
              user_agent=None, proxies=None):
        """Download a page and return its decoded HTML, or None on failure."""
        thread_name = current_thread().name
        print(f'[{thread_name}]: {current_url}')
        headers = {'user-agent': user_agent} if user_agent else {}
        resp = requests.get(current_url,
                            headers=headers, proxies=proxies)
        return decode_page(resp.content, charsets) \
            if resp.status_code == 200 else None

    def parse(self, html_page, *, domain='m.sohu.com'):
        """Collect in-domain links from a page that have not been visited yet."""
        soup = BeautifulSoup(html_page, 'lxml')
        url_links = []
        for a_tag in soup.body.select('a[href]'):
            parser = urlparse(a_tag.attrs['href'])
            scheme = parser.scheme or 'http'
            netloc = parser.netloc or domain
            if scheme != 'javascript' and netloc == domain:
                path = parser.path
                query = '?' + parser.query if parser.query else ''
                full_url = f'{scheme}://{netloc}{path}{query}'
                if full_url not in visited_urls:
                    url_links.append(full_url)
        return url_links

    def extract(self, html_page):
        """Extract structured data from a page (not implemented yet)."""
        pass

    def store(self, data_dict):
        """Persist extracted data (not implemented yet)."""
        pass


class SpiderThread(Thread):
    """Worker thread that repeatedly takes a URL from the queue and crawls it."""

    def __init__(self, name, spider, tasks_queue):
        super().__init__(name=name, daemon=True)
        self.spider = spider
        self.tasks_queue = tasks_queue

    def run(self):
        while True:
            current_url = self.tasks_queue.get()
            visited_urls.add(current_url)
            self.spider.status = SpiderStatus.WORKING
            html_page = self.spider.fetch(current_url)
            if html_page not in [None, '']:
                url_links = self.spider.parse(html_page)
                for url_link in url_links:
                    self.tasks_queue.put(url_link)
            self.spider.status = SpiderStatus.IDLE


def is_any_alive(spider_threads):
    """Return True if any spider is still working on a page."""
    return any([spider_thread.spider.status == SpiderStatus.WORKING
                for spider_thread in spider_threads])


visited_urls = set()


def main():
    task_queue = Queue()
    task_queue.put('http://m.sohu.com/')
    # Ten daemon worker threads share the same task queue.
    spider_threads = [SpiderThread('thread-%d' % i, Spider(), task_queue)
                      for i in range(10)]
    for spider_thread in spider_threads:
        spider_thread.start()

    # Busy-wait until the queue is drained and every spider is idle;
    # the daemon threads are then discarded when the process exits.
    while not task_queue.empty() or is_any_alive(spider_threads):
        pass
    print('Over!')


if __name__ == '__main__':
    main()