Updated the Day 1 crawler code

jackfrued 7 years ago
parent commit 452b6f1441
3 changed files with 137 additions and 0 deletions
  1. Day66-75/code/example01.py (+60 -0)
  2. Day66-75/code/example02.py (+50 -0)
  3. Day66-75/code/example03.py (+27 -0)

+ 60 - 0
Day66-75/code/example01.py

@@ -0,0 +1,60 @@
+from urllib.error import URLError
+from urllib.request import urlopen
+
+import re
+import pymysql
+
+
+def get_page_code(start_url, *, retry_times=3, charsets=('utf-8', )):
+    """Fetch a page and decode it, trying each candidate charset in turn."""
+    html = None
+    try:
+        # Read the raw bytes once, then try to decode with each charset
+        content = urlopen(start_url).read()
+        for charset in charsets:
+            try:
+                html = content.decode(charset)
+                break
+            except UnicodeDecodeError:
+                pass
+    except URLError as ex:
+        print('Error:', ex)
+        # Retry on network errors until the retry budget is used up
+        return get_page_code(start_url, retry_times=retry_times - 1,
+                             charsets=charsets) if retry_times > 0 else None
+    return html
+
+
+def main():
+    url_list = ['http://sports.sohu.com/nba_a.shtml']
+    visited_set = set()
+    while len(url_list) > 0:
+        current_url = url_list.pop(0)
+        visited_set.add(current_url)
+        print(current_url)
+        html = get_page_code(current_url, charsets=('utf-8', 'gbk', 'gb2312'))
+        if html:
+            link_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
+            # Keep only absolute links; relative hrefs would make urlopen fail
+            link_list = [link for link in re.findall(link_regex, html)
+                         if link.startswith('http')]
+            url_list += link_list
+            conn = pymysql.connect(host='localhost', port=3306,
+                                   db='crawler', user='root',
+                                   passwd='123456', charset='utf8')
+            try:
+                for link in link_list:
+                    if link not in visited_set:
+                        visited_set.add(link)
+                        print(link)
+                        html = get_page_code(link, charsets=('utf-8', 'gbk', 'gb2312'))
+                        if html:
+                            title_regex = re.compile(r'<h1>(.*)<span', re.IGNORECASE)
+                            match_list = title_regex.findall(html)
+                            if len(match_list) > 0:
+                                title = match_list[0]
+                                with conn.cursor() as cursor:
+                                    cursor.execute('insert into tb_result (rtitle, rurl) values (%s, %s)',
+                                                   (title, link))
+                                conn.commit()
+            finally:
+                conn.close()
+    print('Done!')
+
+
+if __name__ == '__main__':
+    main()
+
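Note: example01.py assumes a local MySQL database named crawler that already contains a tb_result table. The commit does not include the schema, so the sketch below is an assumption reconstructed from the INSERT statement (the rid key and column sizes are guesses):

import pymysql

# Hypothetical schema for the tb_result table used by example01.py;
# the column names come from the INSERT statement, everything else is assumed.
def create_schema():
    conn = pymysql.connect(host='localhost', port=3306,
                           user='root', passwd='123456', charset='utf8')
    try:
        with conn.cursor() as cursor:
            cursor.execute('create database if not exists crawler default charset utf8')
            cursor.execute('use crawler')
            cursor.execute("""
                create table if not exists tb_result (
                rid int auto_increment primary key,
                rtitle varchar(200) not null,
                rurl varchar(1024) not null)
            """)
        conn.commit()
    finally:
        conn.close()

if __name__ == '__main__':
    create_schema()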

+ 50 - 0
Day66-75/code/example02.py

@@ -0,0 +1,50 @@
+from bs4 import BeautifulSoup
+
+import re
+
+
+def main():
+    html = """
+    <!DOCTYPE html>
+    <html lang="en">
+    <head>
+        <meta charset="UTF-8">
+        <title>首页</title>
+    </head>
+    <body>
+        <h1>Hello, world!</h1>
+        <p>Good!!!</p>
+        <hr>
+        <div>
+            <h2>这是一个例子程序</h2>
+            <p>静夜思</p>
+            <p class="foo">床前明月光</p>
+            <p id="bar">疑似地上霜</p>
+            <p class="foo">举头望明月</p>
+            <div><a href="http://www.baidu.com"><p>低头思故乡</p></a></div>
+        </div>
+        <a class="foo" href="http://www.qq.com">腾讯网</a>
+        <img src="./img/pretty-girl.png" alt="美女">
+        <img src="./img/hellokitty.png" alt="凯蒂猫">
+        <img src="./static/img/pretty-girl.png" alt="美女">
+        <goup>Hello, Goup!</goup>
+    </body>
+    </html>
+    """
+    # resp = requests.get('http://sports.sohu.com/nba_a.shtml')
+    # html = resp.content.decode('gbk')
+    soup = BeautifulSoup(html, 'lxml')
+    # Access the first matching element through attribute-style navigation
+    print(soup.title)
+    # JavaScript: document.body.h1
+    # JavaScript: document.forms[0]
+    print(soup.body.h1)
+    # Find all tags whose names end with 'p' (matches <p> and <goup>)
+    print(soup.find_all(re.compile(r'p$')))
+    # Find <img> tags whose src attribute matches the regular expression
+    print(soup.find_all('img', {'src': re.compile(r'\./img/\w+\.png')}))
+    # Find elements that carry exactly two attributes
+    print(soup.find_all(lambda x: len(x.attrs) == 2))
+    # Find <p> tags with class "foo"
+    print(soup.find_all('p', {'class': 'foo'}))
+    # CSS selector: all <a> tags that have an href attribute
+    for elem in soup.select('a[href]'):
+        print(elem.attrs['href'])
+
+
+if __name__ == '__main__':
+    main()
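Most of the find_all calls in example02.py can also be written as CSS selectors via select; a minimal sketch with a tiny stand-in for the sample HTML above (the stand-in markup is illustrative, not part of the commit):

from bs4 import BeautifulSoup

# CSS-selector equivalents of some find_all queries from example02.py;
# the short HTML string below stands in for the sample page above.
html = '<p class="foo">A</p><p id="bar">B</p><img src="./img/x.png" alt="x">'
soup = BeautifulSoup(html, 'lxml')
print(soup.select('p.foo'))               # <p> tags with class "foo"
print(soup.select('p#bar'))               # the <p> tag with id "bar"
print(soup.select('img[src^="./img"]'))   # <img> whose src starts with ./img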

+ 27 - 0
Day66-75/code/example03.py

@@ -0,0 +1,27 @@
+from bs4 import BeautifulSoup
+
+import requests
+
+import re
+
+
+def main():
+    # Fetch the page with the get function from the third-party requests library
+    resp = requests.get('http://sports.sohu.com/nba_a.shtml')
+    # Decode the response bytes (parts of the Sohu site use GBK encoding)
+    html = resp.content.decode('gbk')
+    # Create a BeautifulSoup object to parse the page (comparable to the DOM in JavaScript)
+    bs = BeautifulSoup(html, 'lxml')
+    # Look up elements with CSS selector syntax and process them in a loop
+    # (the links on this page carry a custom 'test' attribute)
+    # for elem in bs.find_all(lambda x: 'test' in x.attrs):
+    for elem in bs.select('a[test]'):
+        # Read the attribute value from the attrs property (a dict)
+        link_url = elem.attrs['href']
+        resp = requests.get(link_url)
+        bs_sub = BeautifulSoup(resp.text, 'lxml')
+        title = bs_sub.find('h1')
+        # Guard against detail pages that have no <h1> element
+        if title:
+            # Strip line breaks from the extracted text with a regular expression
+            print(re.sub(r'[\r\n]', '', title.text))
+
+
+if __name__ == '__main__':
+    main()
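
example03.py hard-codes GBK when decoding. When the target encoding is not known in advance, requests can detect it; a minimal sketch assuming the same URL (apparent_encoding runs charset detection on the response body, which is slower but more robust than guessing):

import requests

# Let requests detect the charset instead of hard-coding 'gbk';
# apparent_encoding inspects the response body to guess the encoding.
resp = requests.get('http://sports.sohu.com/nba_a.shtml')
resp.encoding = resp.apparent_encoding
html = resp.text  # text is decoded using the encoding set above
print(html[:100])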