
Updated the code for the crawler section

jackfrued 7 years ago
parent
commit
ccae929b4c

+ 78 - 1
Day66-75/Scrapy的应用01.md → Day66-75/Scrapy爬虫框架的应用.md

@@ -1,4 +1,4 @@
-## Applications of Scrapy (01)
+## Applications of the Scrapy Crawler Framework
 
 ### Scrapy Overview
 
@@ -101,6 +101,11 @@ $
 
 2. Write your own spider in the spiders folder.
 
+   ```Shell
+   
+   (venv) $ scrapy genspider movie movie.douban.com --template=crawl
+   ```
+
    ```Python
    
    # -*- coding: utf-8 -*-
@@ -287,5 +292,77 @@ $
    HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
    ```
 
+### Additional Notes
+
+#### XPath Syntax
+
+1. XPath path expressions: XPath uses path expressions to select nodes or node sets from an XML document.
+
+2. XPath node types: element, attribute, text, namespace, processing instruction, comment, and the root node.
+
+3. XPath syntax. (Note: the examples below come from the [XPath tutorial](http://www.runoob.com/xpath/xpath-syntax.html) on the [runoob.com](http://www.runoob.com/) site.)
+
+   The sample XML file.
+
+   ```XML
+   
+   <?xml version="1.0" encoding="UTF-8"?>
+   
+   <bookstore>
+   
+       <book>
+         <title lang="eng">Harry Potter</title>
+         <price>29.99</price>
+       </book>
+   
+       <book>
+         <title lang="eng">Learning XML</title>
+         <price>39.95</price>
+       </book>
+   
+   </bookstore>
+   ```
+   XPath syntax.
+
+   | Path expression | Result                                                       |
+   | --------------- | ------------------------------------------------------------ |
+   | bookstore       | Selects all child nodes of the bookstore element.            |
+   | /bookstore      | Selects the root element bookstore. Note: if a path starts with a forward slash ( / ), it always represents an absolute path to an element! |
+   | bookstore/book  | Selects all book elements that are children of bookstore.    |
+   | //book          | Selects all book elements, no matter where they are in the document. |
+   | bookstore//book | Selects all book elements that are descendants of the bookstore element, no matter where they sit below bookstore. |
+   | //@lang         | Selects all attributes named lang.                           |
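+
+   The expressions above can be tried out with Scrapy's `Selector` (the same class the spider uses). A minimal sketch, where `xml_doc` just holds the `<bookstore>` sample shown earlier (the XML declaration is omitted for brevity):
+
+   ```Python
+   
+   from scrapy.selector import Selector
+   
+   xml_doc = """<bookstore>
+       <book>
+           <title lang="eng">Harry Potter</title>
+           <price>29.99</price>
+       </book>
+       <book>
+           <title lang="eng">Learning XML</title>
+           <price>39.95</price>
+       </book>
+   </bookstore>"""
+   
+   sel = Selector(text=xml_doc, type='xml')
+   # all book elements, no matter where they are in the document
+   print(sel.xpath('//book').extract())
+   # all lang attributes in the document
+   print(sel.xpath('//@lang').extract())
+   ```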
+
+   XPath predicates.
+
+   | Path expression                    | Result                                                       |
+   | ---------------------------------- | ------------------------------------------------------------ |
+   | /bookstore/book[1]                 | Selects the first book element that is a child of bookstore. |
+   | /bookstore/book[last()]            | Selects the last book element that is a child of bookstore.  |
+   | /bookstore/book[last()-1]          | Selects the last but one book element that is a child of bookstore. |
+   | /bookstore/book[position()<3]      | Selects the first two book elements that are children of bookstore. |
+   | //title[@lang]                     | Selects all title elements that have an attribute named lang. |
+   | //title[@lang='eng']               | Selects all title elements that have a lang attribute with the value eng. |
+   | /bookstore/book[price>35.00]       | Selects all book elements of the bookstore element whose price element has a value greater than 35.00. |
+   | /bookstore/book[price>35.00]/title | Selects all title elements of the book elements of the bookstore element whose price element has a value greater than 35.00. |
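+
+   Predicates in action, continuing with the `sel` object from the previous sketch:
+
+   ```Python
+   
+   # title of the first book
+   print(sel.xpath('/bookstore/book[1]/title/text()').extract_first())
+   # titles of books priced above 35.00
+   print(sel.xpath('/bookstore/book[price>35.00]/title/text()').extract())
+   # titles that carry lang="eng"
+   print(sel.xpath('//title[@lang="eng"]/text()').extract())
+   ```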
+
+   Wildcards.
+
+   | Path expression | Result                                                |
+   | --------------- | ----------------------------------------------------- |
+   | /bookstore/*    | Selects all child elements of the bookstore element.  |
+   | //*             | Selects all elements in the document.                 |
+   | //title[@*]     | Selects all title elements that have at least one attribute. |
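+
+   Wildcards, with the same `sel` object:
+
+   ```Python
+   
+   # every element in the document
+   print(sel.xpath('//*').extract())
+   # title elements that have at least one attribute
+   print(sel.xpath('//title[@*]/text()').extract())
+   ```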
+
+   Selecting several paths.
+
+   | Path expression                  | Result                                                       |
+   | -------------------------------- | ------------------------------------------------------------ |
+   | //book/title \| //book/price     | Selects all title and price elements of all book elements.   |
+   | //title \| //price               | Selects all title and price elements in the document.        |
+   | /bookstore/book/title \| //price | Selects all title elements of the book elements of the bookstore element, and all price elements in the document. |
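+
+   And the union operator:
+
+   ```Python
+   
+   # all title and price texts in one query
+   print(sel.xpath('//book/title/text() | //book/price/text()').extract())
+   ```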
+
+#### Viewing an Element's XPath in the Chrome Browser
+
+![](./res/douban-xpath.png)
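+
+Right-clicking an element in Chrome's developer tools and choosing Copy → Copy XPath gives an expression that can be checked in the Scrapy shell before it goes into the spider. A minimal sketch (the movie ID in the URL is only a placeholder):
+
+```Shell
+
+(venv) $ scrapy shell "https://movie.douban.com/subject/1292052/"
+>>> response.xpath('//*[@id="content"]/h1/span[1]/text()').extract_first()
+```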
 

+ 0 - 0
Day66-75/Scrapy的应用03.md


+ 0 - 0
Day66-75/Scrapy的应用02.md → Day66-75/code/douban/douban/__init__.py


+ 18 - 0
Day66-75/code/douban/douban/items.py

@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class DoubanItem(scrapy.Item):
+
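+    # Fields extracted from a Douban movie detail page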
+    name = scrapy.Field()
+    year = scrapy.Field()
+    score = scrapy.Field()
+    director = scrapy.Field()
+    classification = scrapy.Field()
+    actor = scrapy.Field()

+ 103 - 0
Day66-75/code/douban/douban/middlewares.py

@@ -0,0 +1,103 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+
+class DoubanSpiderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, dict or Item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Response, dict
+        # or Item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class DoubanDownloaderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either:
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)

+ 43 - 0
Day66-75/code/douban/douban/pipelines.py

@@ -0,0 +1,43 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import pymongo
+
+from scrapy.exceptions import DropItem
+from scrapy.conf import settings
+from scrapy import log
+
+
+class DoubanPipeline(object):
+
+    def __init__(self):
+        connection = pymongo.MongoClient(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
+        db = connection[settings['MONGODB_DB']]
+        self.collection = db[settings['MONGODB_COLLECTION']]
+
+    def process_item(self, item, spider):
+        # Drop the item if any of its fields is missing or empty
+        for field in item:
+            if not item.get(field):
+                raise DropItem("Missing %s in movie item" % field)
+        # Insert the movie into the MongoDB collection
+        new_movie = {
+            'name': item['name'][0],
+            'year': item['year'][0],
+            'score': item['score'],
+            'director': item['director'],
+            'classification': item['classification'],
+            'actor': item['actor']
+        }
+        self.collection.insert(new_movie)
+        log.msg("Item written to MongoDB database %s/%s" %
+                (settings['MONGODB_DB'], settings['MONGODB_COLLECTION']),
+                level=log.DEBUG, spider=spider)
+        return item
+

+ 98 - 0
Day66-75/code/douban/douban/settings.py

@@ -0,0 +1,98 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for douban project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://doc.scrapy.org/en/latest/topics/settings.html
+#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'douban'
+
+SPIDER_MODULES = ['douban.spiders']
+NEWSPIDER_MODULE = 'douban.spiders'
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = True
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+DOWNLOAD_DELAY = 3
+RANDOMIZE_DOWNLOAD_DELAY = True
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+COOKIES_ENABLED = True
+
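+# MongoDB connection settings read by DoubanPipeline (project-specific, not Scrapy built-ins)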
+MONGODB_SERVER = '120.77.222.217'
+MONGODB_PORT = 27017
+MONGODB_DB = 'douban'
+MONGODB_COLLECTION = 'movie'
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+
+# Enable or disable spider middlewares
+# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'douban.middlewares.DoubanSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'douban.middlewares.DoubanDownloaderMiddleware': 543,
+#}
+
+# Enable or disable extensions
+# See https://doc.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+    'douban.pipelines.DoubanPipeline': 400,
+}
+
+LOG_LEVEL = 'DEBUG'
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

+ 4 - 0
Day66-75/code/douban/douban/spiders/__init__.py

@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.

+ 32 - 0
Day66-75/code/douban/douban/spiders/movie.py

@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from scrapy.selector import Selector
+from scrapy.linkextractors import LinkExtractor
+from scrapy.spiders import CrawlSpider, Rule
+
+from douban.items import DoubanItem
+
+
+class MovieSpider(CrawlSpider):
+    name = 'movie'
+    allowed_domains = ['movie.douban.com']
+    start_urls = ['https://movie.douban.com/top250']
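+    # The first rule follows the Top250 pagination links; the second
+    # matches movie detail pages and hands them to parse_item.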
+    rules = (
+        Rule(LinkExtractor(allow=(r'https://movie.douban.com/top250\?start=\d+.*'))),
+        Rule(LinkExtractor(allow=(r'https://movie.douban.com/subject/\d+')), callback='parse_item'),
+    )
+
+    def parse_item(self, response):
+        sel = Selector(response)
+        item = DoubanItem()
+        item['name'] = sel.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
+        item['year'] = sel.xpath('//*[@id="content"]/h1/span[2]/text()').re(r'\((\d+)\)')
+        item['score'] = sel.xpath('//*[@id="interest_sectl"]/div/p[1]/strong/text()').extract()
+        item['director'] = sel.xpath('//*[@id="info"]/span[1]/a/text()').extract()
+        item['classification'] = sel.xpath('//span[@property="v:genre"]/text()').extract()
+        item['actor'] = sel.xpath('//*[@id="info"]/span[3]/a[1]/text()').extract()
+        return item
+

+ 11 - 0
Day66-75/code/douban/scrapy.cfg

@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = douban.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = douban

+ 37 - 0
Day66-75/code/example07.py

@@ -0,0 +1,37 @@
+import pymongo
+
+
+# BSON - Binary JSON - dict
+def main():
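+    # Connect to a remote MongoDB server and use a collection as a simple
+    # web-page cache; documents are stored as BSON (binary JSON), which
+    # maps naturally onto Python dicts, including nested ones.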
+    # client = pymongo.MongoClient('mongodb://120.77.222.217:27017')
+    client = pymongo.MongoClient(host='120.77.222.217', port=27017)
+    db = client.zhihu
+    pages_cache = db.webpages
+    """
+    pages_cache.insert_many([
+        {'_id': 1, 'url': 'http://www.baidu.com', 'content': 'shit'},
+        {'_id': 2, 'url': 'http://www.qq.com', 'content': 'another shit'},
+        {'_id': 3, 'url': 'http://www.qfedu.com', 'content': 'biggest shit'}
+    ])
+    
+    print(pages_cache.update({'_id': 5}, {'$set': {'content': 'hello, world!'}}, upsert=True))
+    # page_id = pages_cache.insert_one({'url': 'http://www.baidu.com', 'content': '<html></html>'})
+    # print(page_id.inserted_id)
+    # print(pages_cache.remove({'url': 'http://www.baidu.com'}))
+    print(pages_cache.find().count())
+    for doc in pages_cache.find().sort('_id'):
+        print(doc)
+    """
+    pages_cache.insert_one({
+        'url': 'http://www.baidu.com',
+        'content': 'bull shit!',
+        'owner': {
+            'name': 'Lee Yanhong',
+            'age': 50,
+            'idcard': '110220196804091203'
+        }
+    })
+
+
+if __name__ == '__main__':
+    main()

+ 28 - 0
Day66-75/code/example08.py

@@ -0,0 +1,28 @@
+import requests
+from bs4 import BeautifulSoup
+
+
+def main():
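+    # Fetch the GitHub login page, pull the hidden CSRF fields (utf8 and
+    # authenticity_token) out of the form, then POST them together with
+    # the credentials to the session endpoint.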
+    resp = requests.get('https://github.com/login')
+    if resp.status_code != 200:
+        return
+    cookies = resp.cookies.get_dict()
+    print(cookies)
+    soup = BeautifulSoup(resp.text, 'lxml')
+    utf8_value = \
+        soup.select_one('form input[name=utf8]').attrs['value']
+    authenticity_token_value = \
+        soup.select_one('form input[name=authenticity_token]').attrs['value']
+    data = {
+        'utf8': utf8_value,
+        'authenticity_token': authenticity_token_value,
+        'login': 'jackfrued@gmail.com',
+        'password': 'yourpassword'
+    }
+    resp = requests.post('https://github.com/session',
+                         data=data, cookies=cookies)
+    print(resp.text)
+
+
+if __name__ == '__main__':
+    main()

+ 16 - 0
Day66-75/code/example09.py

@@ -0,0 +1,16 @@
+import robobrowser
+
+
+def main():
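+    # RoboBrowser combines requests and BeautifulSoup: open the login
+    # page, fill in the form fields, submit the form, and the session
+    # cookies are carried along automatically.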
+    b = robobrowser.RoboBrowser(parser='lxml')
+    b.open('https://github.com/login')
+    f = b.get_form(action='/session')
+    f['login'].value = 'jackfrued@gmail.com'
+    f['password'].value = 'yourpassword'
+    b.submit_form(f)
+    for a_tag in b.select('a[href]'):
+        print(a_tag.attrs['href'])
+
+
+if __name__ == '__main__':
+    main()

+ 6 - 27
Day66-75/code/example10.py

@@ -1,33 +1,12 @@
-import requests
-from bs4 import BeautifulSoup
-# selenium is an automated testing tool
-# it can drive a browser to visit web pages
-from selenium import webdriver
+import robobrowser
 
 
 def main():
-    # Download chromedriver first and put the executable somewhere on the PATH
-    # Create a Google Chrome browser instance
-    driver = webdriver.Chrome()
-    # Load the page through the browser (this picks up dynamically generated content)
-    driver.get('https://www.taobao.com/markets/mm/mm2017')
-    # driver.page_source contains content created dynamically by JavaScript
-    soup = BeautifulSoup(driver.page_source, 'lxml')
-    all_images = soup.select('img[src]')
-    for image in all_images:
-        url = image.get('src')
-        try:
-            if not str(url).startswith('http'):
-                url = 'http:' + url
-            filename = url[url.rfind('/') + 1:]
-            print(filename)
-            resp = requests.get(url)
-            with open('c:/images/' + filename, 'wb') as f:
-                f.write(resp.content)
-        except OSError:
-            print(filename + ' download failed!')
-    print('Image download finished!')
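+    # RoboBrowser does not execute JavaScript, so only images present in
+    # the static HTML of the page will show up here.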
+    b = robobrowser.RoboBrowser(parser='lxml')
+    b.open('https://v.taobao.com/v/content/live?catetype=704&from=taonvlang')
+    for img_tag in b.select('img[src]'):
+        print(img_tag.attrs['src'])
 
 
 if __name__ == '__main__':
-    main()
+    main()

+ 18 - 0
Day66-75/code/example11.py

@@ -0,0 +1,18 @@
+from bs4 import BeautifulSoup
+from selenium import webdriver
+from selenium.webdriver.common.keys import Keys
+
+
+def main():
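+    # Drive a real Chrome browser so JavaScript-rendered content is
+    # available, type a keyword into the search box, press Enter, and
+    # parse the resulting page source with BeautifulSoup.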
+    driver = webdriver.Chrome()
+    driver.get('https://v.taobao.com/v/content/live?catetype=704&from=taonvlang')
+    elem = driver.find_element_by_css_selector('input[placeholder=输入关键词搜索]')
+    elem.send_keys('运动')
+    elem.send_keys(Keys.ENTER)
+    soup = BeautifulSoup(driver.page_source, 'lxml')
+    for img_tag in soup.body.select('img[src]'):
+        print(img_tag.attrs['src'])
+
+
+if __name__ == '__main__':
+    main()

+ 29 - 0
Day66-75/code/example12.py

@@ -0,0 +1,29 @@
+import base64
+
+from PIL import Image, ImageFilter
+from pytesseract import image_to_string
+
+import requests
+from io import BytesIO
+
+
+def main():
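+    # Blur a photo with a Gaussian filter, binarize a scanned image so
+    # Tesseract can OCR it more reliably, then download a CAPTCHA image
+    # and try to recognize it as well.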
+    guido_img = Image.open(open('guido.jpg', 'rb'))
+    guido2_img = guido_img.filter(ImageFilter.GaussianBlur)
+    guido2_img.save(open('guido2.jpg', 'wb'))
+
+    img1 = Image.open(open('tesseract.png', 'rb'))
+    img2 = img1.point(lambda x: 0 if x < 128 else 255)
+    img2.save(open('tesseract2.png', 'wb'))
+
+    print(image_to_string(img2))
+
+    resp = requests.get('https://pin2.aliyun.com/get_img?type=150_40&identity=mailsso.mxhichina.com&sessionid=k0xHyBxU3K3dGXb59mP9cdeTXxL9gLHSTKhRZCryHxpOoyk4lAVuJhgw==')
+    img3 = Image.open(BytesIO(resp.content))
+    img3.save('captcha.jpg')
+    print(image_to_string(img3))
+    print(base64.b64encode(resp.content))
+
+
+if __name__ == '__main__':
+    main()

BIN
Day66-75/code/guido.jpg


BIN
Day66-75/code/tesseract.png


BIN
Day66-75/res/douban-xpath.png