├── chapter_2
    ├── books.csv
    ├── class.py
    ├── def.py
    ├── for_and_while.py
    ├── greet-with-comments.py
    ├── greet.py
    ├── if.py
    ├── import.py
    ├── python_scraper.py
    ├── save.py
    ├── save_csv.py
    ├── save_csv_dict.py
    ├── save_csv_join.py
    ├── save_sqlite3.py
    ├── scrape_re.py
    ├── scrape_rss.py
    ├── try_and_with.py
    ├── urlopen_encoding.py
    └── urlopen_meta.py
├── chapter_3
    ├── python_crawler_1.py
    ├── python_crawler_2.py
    ├── python_crawler_3.py
    ├── python_crawler_4.py
    ├── python_crawler_5.py
    ├── python_crawler_6.py
    ├── python_crawler_final.py
    ├── save_mongo.py
    ├── save_mysql.py
    ├── scrape_by_bs4.py
    ├── scrape_by_feedparser.py
    └── scrape_by_lxml.py
├── chapter_4
    ├── error_handling.py
    ├── error_handling_with_retrying.py
    ├── request_with_cache.py
    ├── send_email.py
    ├── validate_with_re.py
    └── validate_with_voluptuous.py
├── chapter_5
    ├── get_museums.py
    ├── get_museums_with_location.py
    ├── import_from_stream_api_to_bigquery.py
    ├── konlpy_sample.py
    ├── museums.html
    ├── naver_order_history.py
    ├── plot_advanced_graph.py
    ├── plot_historical_data.py
    ├── print_pdf_textboxes.py
    ├── rest_api_with_requests_oauthlib.py
    ├── rest_api_with_tweepy.py
    ├── robobrowser_google.py
    ├── save_youtube_video_metadata.py
    ├── search_youtube_videos.py
    ├── selenium_google.py
    ├── shopping_rss.py
    ├── shopping_selenium.py
    ├── streaming_api_with_tweepy.py
    └── word_frequency.py
├── chapter_6
    ├── 6-1
    │   └── myspider.py
    ├── 6-2
    │   ├── myproject
    │   │   ├── __init__.py
    │   │   ├── items.py
    │   │   ├── pipelines.py
    │   │   ├── settings.py
    │   │   └── spiders
    │   │   │   └── news.py
    │   └── scrapy.cfg
    ├── 6-3
    │   ├── myproject
    │   │   ├── __init__.py
    │   │   ├── items.py
    │   │   ├── pipelines.py
    │   │   ├── settings.py
    │   │   └── spiders
    │   │   │   ├── hanbit.py
    │   │   │   └── news_crawl.py
    │   └── scrapy.cfg
    ├── 6-4
    │   └── pipelines.py
    ├── 6-7
    │   ├── myproject
    │   │   ├── __init__.py
    │   │   ├── items.py
    │   │   ├── pipelines.py
    │   │   ├── settings.py
    │   │   ├── spiders
    │   │   │   ├── broad.py
    │   │   │   └── visitseoul.py
    │   │   └── utils.py
    │   └── scrapy.cfg
    └── 6-8
    │   ├── extract_faces.py
    │   ├── myproject
    │       ├── __init__.py
    │       ├── items.py
    │       ├── pipelines.py
    │       ├── settings.py
    │       ├── spiders
    │       │   └── flickr.py
    │       └── utils.py
    │   └── scrapy.cfg
├── chapter_7
    ├── crawl.py
    ├── crawl_images.py
    ├── crawl_with_aiohttp.py
    ├── crawl_with_multi_thread.py
    ├── enqueue.py
    ├── scraper_tasks.py
    ├── slow_jobs_async.py
    ├── slow_jobs_sync.py
    └── tasks.py
└── readme.md


/chapter_2/books.csv:
--------------------------------------------------------------------------------
 1 | No,도서명,가격
 2 | 1,처음 시작하는 R 데이터 분석,19800원
 3 | 2,데이터 과학을 위한 통계,32000원
 4 | 3,오준석의 안드로이드 생존코딩(코틀린 편),32000원
 5 | 4,처음 배우는 스프링 부트 2,22000원
 6 | 5,회사에서 바로 통하는 실무 엑셀+파워포인트+워드&한글,22000원
 7 | 6,RxJS 프로그래밍: 75가지 핵심 문법과 예제로 익히는 RxJS 기초,32000원
 8 | 7,스프링 5 레시피(4판),60000원
 9 | 8,자바를 활용한 딥러닝,38000원
10 | 9,회사에서 바로 통하는 엑셀 실무 강의,21000원
11 | 10,모던 스타트업,20000원
12 | 11,파이썬 웹 프로그래밍(개정판),22000원
13 | 12,피 땀 픽셀 : 트리플 A 게임은 어떻게 만들어지는가,18000원
14 | 13,엔트리 피지컬 컴퓨팅을 만나다,18000원
15 | 14,회사에서 바로 통하는 오토캐드 2019,28000원
16 | 15,맛있는 디자인 프리미어 프로&애프터 이펙트 CC 2018,24000원
17 | 16,맛있는 디자인 포토샵&인디자인 CC 2018,24000원
18 | 17,만들면서 배우는 워드프레스(개정판),26000원
19 | 18,처음 배우는 암호화,29000원
20 | 19,나의 첫 안드로이드 : 처음 시작하는 개발자를 위한,32000원
21 | 20,윤피티의 SNS 콘텐츠 만들기 with 파워포인트,18000원
22 | 21,Head First Android Development : 개념과 구조를 머릿속에 그려주는 안드로이드 개발 입문서(개정판),40000원
23 | 22,그것이 R고 싶다,32000원
24 | 23,이것이 C#이다,30000원
25 | 24,맛있는 디자인 애프터 이펙트 CC 2018,24000원
26 | 25,Hello Coding 한입에 쏙 파이썬,15000원
27 | 26,고객이 보이는 구글 애널리틱스,28000원
28 | 27,머신러닝 실무 프로젝트,18000원
29 | 28,맛있는 디자인 일러스트레이터 CC 2018,25000원
30 | 29,좋은 사진을 만드는 박승근의 드론 사진 강의,28000원
31 | 30,엔지니어를 위한 블록체인 프로그래밍,26000원
32 | 31,우아한 사이파이,28000원
33 | 32,파이썬을 활용한 금융공학 레시피,28000원
34 | 33,자바로 배우는 핵심 자료구조와 알고리즘,16000원
35 | 34,처음 배우는 블록체인,28000원
36 | 35,아무것도 모르고 시작하는 인공지능 첫걸음,22000원
37 | 36,인공지능 콘텐츠 혁명,18000원
38 | 37,맛있는 디자인 프리미어 프로 CC 2018,23000원
39 | 38,엑셀 2016 함수&수식 바이블,35000원
40 | 39,러닝 텐서플로,23000원
41 | 40,Java 9 모듈 프로그래밍,21000원
42 | 41,자바 프로젝트 필수 유틸리티,35000원
43 | 42,달인과 함께 하는 마인크래프트 세계 건축 여행 : 아시아와 아프리카,10000원
44 | 43,달인과 함께 하는 마인크래프트 세계 건축 여행: 유럽과 아메리카,10000원
45 | 44,핸즈온 머신러닝,33000원
46 | 45,NDC ART BOOK 2018,20000원
47 | 46,짧은 애니메이션 만들기 with 클립 스튜디오,25000원
48 | 47,맛있는 디자인 포토샵&일러스트레이터 CC 2018,23000원
49 | 48,파이썬 정복,22000원
50 | 49,Vue.js 첫걸음,22000원
51 | 50,데이터 분석을 위한 SQL 레시피,36000원


--------------------------------------------------------------------------------
/chapter_2/class.py:
--------------------------------------------------------------------------------
 1 | # Rect라는 이름의 클래스를 지정합니다.
 2 | class Rect:
 3 |     # 인스턴스가 생성될 때 호출되는 특수한 메서드를 정의합니다.
 4 |     def __init__(self, width, height):
 5 |         self.width = width    # width 속성에 값을 할당합니다.
 6 |         self.height = height  # height 속성에 값을 할당합니다.
 7 |     # 사각형의 넓이를 계산하는 메서드를 정의합니다.
 8 |     def area(self):
 9 |         return self.width * self.height
10 | 
11 | r = Rect(100, 20)
12 | print(r.width, r.height, r.area())   # 100 20 2000을 출력합니다.
13 | 
14 | # Rect를 상속받아 Square 클래스를 정의합니다.
15 | class Square(Rect):
16 |     def __init__(self, width):
17 |         # 부모 클래스의 메서드를 호출합니다.
18 |         super().__init__(width, width)


--------------------------------------------------------------------------------
/chapter_2/def.py:
--------------------------------------------------------------------------------
 1 | # add라는 이름의 함수를 정의합니다.
 2 | # 이 함수는 매개변수로 a와 b를 받고 더한 뒤 반환합니다.
 3 | def add(a, b):
 4 |     return a + b  # return 구문으로 값을 반환합니다.
 5 | 
 6 | # 함수를 호출할 때는 함수 이름 뒤에 괄호를 입력하고
 7 | # 내부에 매개변수를 지정합니다.
 8 | print(add(1, 2))  # 3라고 출력합니다.
 9 | 
10 | # <매개변수>=<값>이라는 형태로도 매개변수를 지정할 수 있습니다.
11 | # 이를 키워드 매개변수라고 합니다.
12 | print(add(1, b=3))  # 4라고 출력합니다.


--------------------------------------------------------------------------------
/chapter_2/for_and_while.py:
--------------------------------------------------------------------------------
 1 | # 변수 x에 in의 오른쪽 리스트가 차례대로 들어갑니다.
 2 | # 따라서 블록 내부의 처리가 3번 반복됩니다.
 3 | for x in [1, 2, 3]:
 4 |     # 1, 2, 3이 차례대로 출력됩니다.
 5 |     print(x)  
 6 | 
 7 | # 횟수를 지정해서 반복할 때는 range()를 사용합니다
 8 | for i in range(10):
 9 |     # 0부터 9까지 차례대로 출력됩니다.
10 |     print(i)  
11 | 
12 | # for 구문으로 dict를 지정하면 키를 기반으로 순회합니다.
13 | d = {'a': 1, 'b': 2}
14 | for key in d:
15 |     value = d[key]
16 |     print(key, value)
17 | 
18 | # dict의 items() 메서드로 dict 키와 값을 순회합니다.
19 | for key, value in d.items():
20 |     print(key, value)
21 | 
22 | # while 구문으로 식이 참일 때 반복 처리합니다.
23 | s = 1
24 | while s < 1000:
25 |     # 1, 2, 4, 8, 16, 32, 64, 128, 256, 512가 차례대로 출력됩니다.
26 |     print(s)
27 |     s = s * 2


--------------------------------------------------------------------------------
/chapter_2/greet-with-comments.py:
--------------------------------------------------------------------------------
 1 | # import 구문으로 sys 모듈을 읽어 들입니다.
 2 | import sys
 3 | 
 4 | # def 구문으로 greet() 함수를 정의합니다.
 5 | # 들여쓰기돼 있는 줄이 함수의 내용을 나타냅니다.
 6 | def greet(name):
 7 |     # print() 함수를 사용해 문자열을 출력합니다.
 8 |     print('Hello, {0}!'.format(name))  
 9 | 
10 | # if 구문도 들여쓰기로 범위를 나타냅니다.
11 | # sys.argv는 명령줄 매개변수를 나타내는 리스트 형식의 변수입니다.
12 | if len(sys.argv) > 1:
13 |     # if 구문의 조건이 참일 때
14 |     # 변수는 정의하지 않고 곧바로 사용할 수 있습니다.
15 |     name = sys.argv[1]
16 |     # greet() 함수를 호출합니다.
17 |     greet(name)
18 | else:
19 |     # if 구문의 조건이 거짓일 때
20 |     # greet 함수를 호출합니다.
21 |     greet('world')


--------------------------------------------------------------------------------
/chapter_2/greet.py:
--------------------------------------------------------------------------------
1 | import sys
2 | 
3 | def greet(name):
4 |     print('Hello, {0}!'.format(name))
5 | if len(sys.argv) > 1:
6 |     name = sys.argv[1]
7 |     greet(name)
8 | else:
9 |     greet('world')


--------------------------------------------------------------------------------
/chapter_2/if.py:
--------------------------------------------------------------------------------
 1 | # 변수를 선언합니다.
 2 | a = 1
 3 | 
 4 | # if 구문으로 처리를 분기합니다.
 5 | if a == 1:
 6 |     # if 구문의 식이 참일 때 실행합니다.
 7 |     print('a is 1')
 8 | elif a == 2:
 9 |     # elif 절의 식이 참일 때 실행합니다.
10 |     print('a is 2')
11 | else:
12 |     # 어떠한 조건에도 해당하지 않을 때 실행합니다.
13 |     print('a is not 1 nor 2')
14 | 
15 | # 조건문을 한 줄로 적을 수 있지만 읽기 어려우므로 사용하지 않는 것이 좋습니다.
16 | print('a is 1' if a == 1 else 'a is not 1')


--------------------------------------------------------------------------------
/chapter_2/import.py:
--------------------------------------------------------------------------------
 1 | # sys 모듈을 현재 이름 공간으로 읽어 들입니다.
 2 | import sys 
 3 | 
 4 | # datetime 모듈에서 date 클래스를 읽어 들입니다.
 5 | from datetime import date
 6 | 
 7 | # sys 모듈의 argv라는 변수로 명령줄 매개변수 리스트를 추출하고 출력합니다.
 8 | print(sys.argv)
 9 | # date 클래스의 today() 메서드로 현재 날짜를 추출합니다.
10 | print(date.today())


--------------------------------------------------------------------------------
/chapter_2/python_scraper.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import sqlite3
 3 | from urllib.request import urlopen
 4 | from html import unescape
 5 | 
 6 | def main():
 7 |     """
 8 |     메인 처리입니다.
 9 |     fetch(), scrape(), save() 함수를 호출합니다.
10 |     """
11 |     html = fetch('http://www.hanbit.co.kr/store/books/full_book_list.html')
12 |     books = scrape(html)
13 |     save('books.db', books)
14 | 
15 | def fetch(url):
16 |     """
17 |     매개변수로 전달받을 url을 기반으로 웹 페이지를 추출합니다.
18 |     웹 페이지의 인코딩 형식은 Content-Type 헤더를 기반으로 알아냅니다.
19 |     반환값: str 자료형의 HTML
20 |     """
21 |     f = urlopen(url)
22 |     # HTTP 헤더를 기반으로 인코딩 형식을 추출합니다.
23 |     encoding = f.info().get_content_charset(failobj="utf-8")
24 |     # 추출한 인코딩 형식을 기반으로 문자열을 디코딩합니다.
25 |     html = f.read().decode(encoding)
26 |     return html
27 | 
28 | def scrape(html):
29 |     """
30 |     매개변수 html로 받은 HTML을 기반으로 정규 표현식을 사용해 도서 정보를 추출합니다.
31 |     반환값: 도서(dict) 리스트
32 |     """
33 |     books = []
34 |     # re.findall()을 사용해 도서 하나에 해당하는 HTML을 추출합니다.
35 |     for partial_html in re.findall(r'<td class="left"><a.*?</td>', html, re.DOTALL):
36 |         # 도서의 URL을 추출합니다.
37 |         url = re.search(r'<a href="(.*?)">', partial_html).group(1)
38 |         url = 'http://www.hanbit.co.kr' + url
39 |         # 태그를 제거해서 도서의 제목을 추출합니다.
40 |         title = re.sub(r'<.*?>', '', partial_html)
41 |         title = unescape(title)
42 |         books.append({'url': url, 'title': title})
43 |     
44 |     return books
45 | 
46 | def save(db_path, books):
47 |     """
48 |     매개변수 books로 전달된 도서 목록을 SQLite 데이터베이스에 저장합니다.
49 |     데이터베이스의 경로는 매개변수 dp_path로 지정합니다.
50 |     반환값: None(없음)
51 |     """
52 |     # 데이터베이스를 열고 연결을 확립합니다.
53 |     conn = sqlite3.connect(db_path)
54 |     # 커서를 추출합니다.
55 |     c = conn.cursor()
56 |     # execute() 메서드로 SQL을 실행합니다.
57 |     # 스크립트를 여러 번 실행할 수 있으므로 기존의 books 테이블을 제거합니다.
58 |     c.execute('DROP TABLE IF EXISTS books')
59 |     # books 테이블을 생성합니다.
60 |     c.execute('''
61 |         CREATE TABLE books (
62 |             title text,
63 |             url text
64 |         )
65 |     ''')
66 |     # executemany() 메서드를 사용하면 매개변수로 리스트를 지정할 수 있습니다.
67 |     c.executemany('INSERT INTO books VALUES (:title, :url)', books)
68 |     # 변경사항을 커밋(저장)합니다.
69 |     conn.commit()
70 |     # 연결을 종료합니다.
71 |     conn.close()
72 | 
73 | # python 명령어로 실행한 경우 main() 함수를 호출합니다.
74 | # 이는 모듈로써 다른 파일에서 읽어 들였을 때 main() 함수가 호출되지 않게 하는 것입니다.
75 | # 파이썬 프로그램의 일반적인 작성 방식입니다.
76 | if __name__ == '__main__':
77 |     main()


--------------------------------------------------------------------------------
/chapter_2/save.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | cities = [
 4 |     {'rank': 1, 'city': '상하이', 'population': 24150000},
 5 |     {'rank': 2, 'city': '카라치', 'population': 23500000},
 6 |     {'rank': 3, 'city': '베이징', 'population': 21516000},
 7 |     {'rank': 4, 'city': '텐진', 'population': 14722100},
 8 |     {'rank': 5, 'city': '이스탄불', 'population': 14160467},
 9 | ]
10 | 
11 | print(json.dumps(cities))


--------------------------------------------------------------------------------
/chapter_2/save_csv.py:
--------------------------------------------------------------------------------
 1 | import csv
 2 | 
 3 | # 파일을 엽니다. newline=''으로 줄바꿈 코드의 자동 변환을 제어합니다.
 4 | with open('top_cities.csv', 'w', newline='') as f:
 5 |     # csv.writer는 파일 객체를 매개변수로 지정합니다.
 6 |     writer = csv.writer(f)  
 7 |     # 첫 번째 줄에는 헤더를 작성합니다.
 8 |     writer.writerow(['rank', 'city', 'population'])  
 9 |     # writerows()에 리스트를 전달하면 여러 개의 값을 출력합니다.
10 |     writer.writerows([
11 |         [1, '상하이', 24150000],
12 |         [2, '카라치', 23500000],
13 |         [3, '베이징', 21516000],
14 |         [4, '텐진', 14722100],
15 |         [5, '이스탄불', 14160467],
16 |     ])


--------------------------------------------------------------------------------
/chapter_2/save_csv_dict.py:
--------------------------------------------------------------------------------
 1 | import csv
 2 | 
 3 | with open('top_cities.csv', 'w', newline='') as f:
 4 |     # 첫 번째 매개변수에 파일 객체
 5 |     # 두 번째 매개변수에 필드 이름 리스트를 지정합니다.
 6 |     writer = csv.DictWriter(f, ['rank', 'city', 'population'])
 7 |     # 첫 번째 줄에 헤더를 입력합니다.
 8 |     writer.writeheader()
 9 |     # writerows()로 여러 개의 데이터를 딕셔너리 형태로 작성합니다.
10 |     writer.writerows([
11 |         {'rank': 1, 'city': '상하이', 'population': 24150000},
12 |         {'rank': 2, 'city': '카라치', 'population': 23500000},
13 |         {'rank': 3, 'city': '베이징', 'population': 21516000},
14 |         {'rank': 4, 'city': '텐진', 'population': 14722100},
15 |         {'rank': 5, 'city': '이스탄불', 'population': 14160467},
16 |     ])


--------------------------------------------------------------------------------
/chapter_2/save_csv_join.py:
--------------------------------------------------------------------------------
1 | # 첫 번째 줄에 헤더를 작성합니다.
2 | print('rank,city,population')  
3 | 
4 | # join() 메서드의 매개변수로 전달한 list의 요소는 모두 str이어야 하므로 주의해 주세요.
5 | print(','.join(['1', '상하이', '24150000']))
6 | print(','.join(['2', '카라치', '23500000']))
7 | print(','.join(['3', '베이징', '21516000']))
8 | print(','.join(['4', '텐진', '14722100']))
9 | print(','.join(['5', '이스탄불', '14160467']))


--------------------------------------------------------------------------------
/chapter_2/save_sqlite3.py:
--------------------------------------------------------------------------------
 1 | import sqlite3
 2 | 
 3 | # top_cities.db 파일을 열고 연결을 변수에 저장합니다.
 4 | conn = sqlite3.connect('top_cities.db')
 5 | 
 6 | # 커서를 추출합니다.
 7 | c = conn.cursor()
 8 | 
 9 | # execute() 메서드로 SQL 구문을 실행합니다.
10 | # 스크립트를 여러 번 사용해도 같은 결과를 출력할 수 있게 cities 테이블이 존재하는 경우 제거합니다.
11 | c.execute('DROP TABLE IF EXISTS cities')
12 | # cities 테이블을 생성합니다.
13 | c.execute('''
14 |     CREATE TABLE cities (
15 |         rank integer,
16 |         city text,
17 |         population integer
18 |     )
19 | ''')
20 | 
21 | # execute() 메서드의 두 번째 매개변수에는 파라미터를 지정할 수 있습니다.
22 | # SQL 내부에서 파라미터로 변경할 부분(플레이스홀더)은 ?로 지정합니다.
23 | c.execute('INSERT INTO cities VALUES (?, ?, ?)', (1, '상하이', 24150000))
24 | 
25 | # 파라미터가 딕셔너리일 때는 플레이스홀더를 :<이름> 형태로 지정합니다.
26 | c.execute('INSERT INTO cities VALUES (:rank, :city, :population)',
27 |           {'rank': 2, 'city': '카라치', 'population': 23500000})
28 | 
29 | # executemany() 메서드를 사용하면 여러 개의 파라미터를 리스트로 지정해서
30 | # 여러 개(현재 예제에서는 3개)의 SQL 구문을 실행할 수 있습니다.
31 | c.executemany('INSERT INTO cities VALUES (:rank, :city, :population)', [
32 |     {'rank': 3, 'city': '베이징', 'population': 21516000},
33 |     {'rank': 4, 'city': '텐진', 'population': 14722100},
34 |     {'rank': 5, 'city': '이스탄불', 'population': 14160467},
35 | ])
36 | 
37 | # 변경사항을 커밋(저장)합니다.
38 | conn.commit()
39 | 
40 | # 저장한 데이터를 추출합니다.
41 | c.execute('SELECT * FROM cities')
42 | # 쿼리의 결과는 fetchall() 메서드로 추출할 수 있습니다.
43 | for row in c.fetchall():
44 |     # 추출한 데이터를 출력합니다.
45 |     print(row)
46 | 
47 | # 연결을 닫습니다.
48 | conn.close()


--------------------------------------------------------------------------------
/chapter_2/scrape_re.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | from html import unescape
 3 | 
 4 | # 이전 절에서 다운로드한 파일을 열고 html이라는 변수에 저장합니다.
 5 | with open('dp.html') as f:
 6 |     html = f.read()
 7 | 
 8 | # re.findall()을 사용해 도서 하나에 해당하는 HTML을 추출합니다.
 9 | for partial_html in re.findall(r'<td class="left"><a.*?</td>', html, re.DOTALL):
10 |     # 도서의 URL을 추출합니다.
11 |     url = re.search(r'<a href="(.*?)">', partial_html).group(1)
12 |     url = 'http://www.hanbit.co.kr' + url
13 |     # 태그를 제거해서 도서의 제목을 추출합니다.
14 |     title = re.sub(r'<.*?>', '', partial_html)
15 |     title = unescape(title)
16 |     print('url:', url)
17 |     print('title:', title)
18 |     print('---')


--------------------------------------------------------------------------------
/chapter_2/scrape_rss.py:
--------------------------------------------------------------------------------
 1 | # ElementTree 모듈을 읽어 들입니다.
 2 | from xml.etree import ElementTree
 3 | 
 4 | # parse() 함수로 파일을 읽어 들이고 ElementTree 객체를 만듭니다.
 5 | tree = ElementTree.parse('rss.xml')
 6 | 
 7 | # getroot() 메서드로 XML의 루트 요소를 추출합니다.
 8 | root = tree.getroot()
 9 | 
10 | # findall() 메서드로 요소 목록을 추출합니다.
11 | # 아래 경로로 지정한 data 태그를 찾습니다(자세한 내용은 RSS를 열어 참고해주세요).
12 | for item in root.findall('channel/item/description/body/location/data'):
13 |     # find() 메서드로 요소를 찾고 text 속성으로 값을 추출합니다.
14 |     tm_ef = item.find('tmEf').text
15 |     tmn = item.find('tmn').text
16 |     tmx = item.find('tmx').text
17 |     wf = item.find('wf').text
18 |     print(tm_ef, tmn, tmx, wf) # 출력합니다.


--------------------------------------------------------------------------------
/chapter_2/try_and_with.py:
--------------------------------------------------------------------------------
 1 | d = {'a': 1, 'b': 2}
 2 | try:
 3 |     # 예외가 발생할 가능성이 있는 처리를 넣습니다.
 4 |     print(d['x'])
 5 | except KeyError:
 6 |     # try 절 내부에서 except 절에 작성된 예외(현재 예제에서는 KeyError)가 발생하면
 7 |     # except 절이 실행됩니다. 여기서는 키가 존재하지 않을 때의 처리 내용을 지정했습니다.
 8 |     print('x is not found')
 9 | 
10 | # open() 함수의 반환값은 변수 f에 할당되며 with 블록 내부에서 사용합니다.
11 | # 이렇게 사용하면 블록을 벗어날 때 f.close()가 자동으로 호출됩니다.
12 | with open('index.html') as f:
13 |     print(f.read())


--------------------------------------------------------------------------------
/chapter_2/urlopen_encoding.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from urllib.request import urlopen
 3 | f = urlopen('http://www.hanbit.co.kr/store/books/full_book_list.html')
 4 | 
 5 | # HTTP 헤더를 기반으로 인코딩 방식을 추출합니다(명시돼 있지 않을 경우 utf-8을 사용하게 합니다).
 6 | encoding = f.info().get_content_charset(failobj="utf-8")
 7 | # 인코딩 방식을 표준 오류에 출력합니다.
 8 | print('encoding:', encoding, file=sys.stderr)
 9 | 
10 | # 추출한 인코딩 방식으로 디코딩합니다.
11 | text = f.read().decode(encoding)
12 | # 웹 페이지의 내용을 표준 출력에 출력합니다.
13 | print(text)


--------------------------------------------------------------------------------
/chapter_2/urlopen_meta.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import sys
 3 | from urllib.request import urlopen
 4 | 
 5 | f = urlopen('http://www.hanbit.co.kr/store/books/full_book_list.html')
 6 | # bytes 자료형의 응답 본문을 일단 변수에 저장합니다.
 7 | bytes_content = f.read()  
 8 | 
 9 | # charset은 HTML의 앞부분에 적혀 있는 경우가 많으므로
10 | # 응답 본문의 앞부분 1024바이트를 ASCII 문자로 디코딩해 둡니다.
11 | # ASCII 범위 이외의 문자는 U+FFFD(REPLACEMENT CHARACTER)로 변환되어 예외가 발생하지 않습니다.
12 | scanned_text = bytes_content[:1024].decode('ascii', errors='replace')
13 | 
14 | # 디코딩한 문자열에서 정규 표현식으로 charset 값을 추출합니다.
15 | match = re.search(r'charset=["\']?([\w-]+)', scanned_text)
16 | if match:
17 |     encoding = match.group(1)
18 | else:
19 |     # charset이 명시돼 있지 않으면 UTF-8을 사용합니다.
20 |     encoding = 'utf-8'
21 | 
22 | # 추출한 인코딩을 표준 오류에 출력합니다.
23 | print('encoding:', encoding, file=sys.stderr)
24 | 
25 | # 추출한 인코딩으로 다시 디코딩합니다.
26 | text = bytes_content.decode(encoding)
27 | # 응답 본문을 표준 출력에 출력합니다.
28 | print(text)


--------------------------------------------------------------------------------
/chapter_3/python_crawler_1.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import lxml.html
3 | 
4 | response = requests.get('http://www.hanbit.co.kr/store/books/new_book_list.html')
5 | root = lxml.html.fromstring(response.content)
6 | for a in root.cssselect('.view_box a'):
7 |     url = a.get('href')
8 |     print(url)


--------------------------------------------------------------------------------
/chapter_3/python_crawler_2.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import lxml.html
 3 | 
 4 | response = requests.get('http://www.hanbit.co.kr/store/books/new_book_list.html')
 5 | root = lxml.html.fromstring(response.content)
 6 | 
 7 | # 모든 링크를 절대 URL로 변환합니다.
 8 | root.make_links_absolute(response.url)
 9 | 
10 | # 선택자를 추가해서 명확한 선택을 할 수 있게 합니다.
11 | for a in root.cssselect('.view_box .book_tit a'):
12 |     url = a.get('href')
13 |     print(url)


--------------------------------------------------------------------------------
/chapter_3/python_crawler_3.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import lxml.html
 3 | def main():
 4 |     """
 5 |     크롤러의 메인 처리
 6 |     """
 7 |     # 여러 페이지에서 크롤링할 것이므로 Session을 사용합니다.
 8 |     session = requests.Session()  
 9 |     # scrape_list_page() 함수를 호출해서 제너레이터를 추출합니다.
10 |     response = session.get('http://www.hanbit.co.kr/store/books/new_book_list.html')
11 |     urls = scrape_list_page(response)
12 |     # 제너레이터는 list처럼 사용할 수 있습니다.
13 |     for url in urls:
14 |         print(url)
15 | 
16 | def scrape_list_page(response):
17 |     root = lxml.html.fromstring(response.content)
18 |     root.make_links_absolute(response.url)
19 |     for a in root.cssselect('.view_box .book_tit a'):
20 |         url = a.get('href')
21 |         # yield 구문으로 제너레이터의 요소 반환
22 |         yield url
23 | 
24 | if __name__ == '__main__':
25 |     main()


--------------------------------------------------------------------------------
/chapter_3/python_crawler_4.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import lxml.html
 3 | 
 4 | def main():
 5 |     # 여러 페이지에서 크롤링할 것이므로 Session을 사용합니다.
 6 |     session = requests.Session()  
 7 |     response = session.get('http://www.hanbit.co.kr/store/books/new_book_list.html')
 8 |     urls = scrape_list_page(response)
 9 |     for url in urls:
10 |         response = session.get(url)  # Session을 사용해 상세 페이지를 추출합니다.
11 |         ebook = scrape_detail_page(response)  # 상세 페이지에서 상세 정보를 추출합니다.
12 |         print(ebook)  # 책 관련 정보를 출력합니다.
13 |         break  # 책 한 권이 제대로 되는지 확인하고 종료합니다.
14 | 
15 | def scrape_list_page(response):
16 |     root = lxml.html.fromstring(response.content)
17 |     root.make_links_absolute(response.url)
18 |     for a in root.cssselect('.view_box .book_tit a'):
19 |         url = a.get('href')
20 |         yield url
21 | 
22 | def scrape_detail_page(response):
23 |     """
24 |     상세 페이지의 Response에서 책 정보를 dict로 추출합니다.
25 |     """
26 |     root = lxml.html.fromstring(response.content)
27 |     ebook = {
28 |         'url': response.url,
29 |         'title': root.cssselect('.store_product_info_box h3')[0].text_content(),
30 |         'price': root.cssselect('.pbr strong')[0].text_content(),
31 |         'content': [p.text_content()\
32 |             for p in root.cssselect('#tabs_3 .hanbit_edit_view p')]
33 |     }
34 |     return ebook
35 | 
36 | if __name__ == '__main__':
37 |     main()


--------------------------------------------------------------------------------
/chapter_3/python_crawler_5.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import lxml.html
 3 | 
 4 | def main():
 5 |     # 여러 페이지에서 크롤링할 것이므로 Session을 사용합니다.
 6 |     session = requests.Session()  
 7 |     response = session.get('http://www.hanbit.co.kr/store/books/new_book_list.html')
 8 |     urls = scrape_list_page(response)
 9 |     for url in urls:
10 |         response = session.get(url)  # Session을 사용해 상세 페이지를 추출합니다.
11 |         ebook = scrape_detail_page(response)  # 상세 페이지에서 상세 정보를 추출합니다.
12 |         print(ebook)  # 책 관련 정보를 출력합니다.
13 |         break  # 책 한 권이 제대로 되는지 확인하고 종료합니다.
14 | 
15 | def scrape_list_page(response):
16 |     root = lxml.html.fromstring(response.content)
17 |     root.make_links_absolute(response.url)
18 |     for a in root.cssselect('.view_box .book_tit a'):
19 |         url = a.get('href')
20 |         yield url
21 | 
22 | def scrape_detail_page(response):
23 |     """
24 |     상세 페이지의 Response에서 책 정보를 dict로 추출합니다.
25 |     """
26 |     root = lxml.html.fromstring(response.content)
27 |     ebook = {
28 |         'url': response.url,
29 |         'title': root.cssselect('.store_product_info_box h3')[0].text_content(),
30 |         'price': root.cssselect('.pbr strong')[0].text_content(),
31 |         'content': [normalize_spaces(p.text_content())
32 |             for p in root.cssselect('#tabs_3 .hanbit_edit_view p')
33 |             if normalize_spaces(p.text_content()) != '']
34 |     }
35 |     return ebook
36 | 
37 | def normalize_spaces(s):
38 |     """
39 |     연결돼 있는 공백을 하나의 공백으로 변경합니다.
40 |     """
41 |     return re.sub(r'\s+', ' ', s).strip()
42 | 
43 | if __name__ == '__main__':
44 |     main()


--------------------------------------------------------------------------------
/chapter_3/python_crawler_6.py:
--------------------------------------------------------------------------------
 1 | import time # time 모듈을 임포트합니다.
 2 | import re 
 3 | import requests
 4 | import lxml.html
 5 | 
 6 | def main():
 7 |     session = requests.Session()
 8 |     response = session.get('http://www.hanbit.co.kr/store/books/new_book_list.html')
 9 |     urls = scrape_list_page(response)
10 |     for url in urls:
11 |         time.sleep(1) # 1초 동안 휴식합니다.
12 |         response = session.get(url)
13 |         ebook = scrape_detail_page(response)
14 |         print(ebook)
15 | 
16 | def scrape_list_page(response):
17 |     root = lxml.html.fromstring(response.content)
18 |     root.make_links_absolute(response.url)
19 |     for a in root.cssselect('.view_box .book_tit a'):
20 |         url = a.get('href')
21 |         yield url
22 | 
23 | def scrape_detail_page(response):
24 |     """
25 |     상세 페이지의 Response에서 책 정보를 dict로 추출합니다.
26 |     """
27 |     root = lxml.html.fromstring(response.content)
28 |     ebook = {
29 |         'url': response.url,
30 |         'title': root.cssselect('.store_product_info_box h3')[0].text_content(),
31 |         'price': root.cssselect('.pbr strong')[0].text_content(),
32 |         'content': [normalize_spaces(p.text_content())
33 |             for p in root.cssselect('#tabs_3 .hanbit_edit_view p')
34 |             if normalize_spaces(p.text_content()) != '']
35 |     }
36 |     return ebook
37 | 
38 | def normalize_spaces(s):
39 |     """
40 |     연결돼 있는 공백을 하나의 공백으로 변경합니다.
41 |     """
42 |     return re.sub(r'\s+', ' ', s).strip()
43 | 
44 | if __name__ == '__main__':
45 |     main()


--------------------------------------------------------------------------------
/chapter_3/python_crawler_final.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | import re 
 3 | import requests
 4 | import lxml.html
 5 | from pymongo import MongoClient
 6 | 
 7 | def main():
 8 |     """
 9 |     크롤러의 메인 처리
10 |     """
11 |     # 크롤러 호스트의 MongoDB에 접속합니다.
12 |     client = MongoClient('localhost', 27017)
13 |     # scraping 데이터베이스의 ebooks 콜렉션
14 |     collection = client.scraping.ebooks 
15 |     # 데이터를 식별할 수 있는 유일키를 저장할 key 필드에 인덱스를 생성합니다.
16 |     collection.create_index('key', unique=True)
17 | 
18 |     # 목록 페이지를 추출합니다.
19 |     response = requests.get('http://www.hanbit.co.kr/store/books/new_book_list.html')
20 |     # 상세 페이지의 URL 목록을 추출합니다.
21 |     urls = scrape_list_page(response)
22 |     for url in urls:
23 |         # URL로 키를 추출합니다.
24 |         key = extract_key(url)
25 |         # MongoDB에서 key에 해당하는 데이터를 검색합니다.
26 |         ebook = collection.find_one({'key': key})
27 |         # MongoDB에 존재하지 않는 경우만 상세 페이지를 크롤링합니다.
28 |         if not ebook:
29 |             time.sleep(1)
30 |             response = requests.get(url)
31 |             ebook = scrape_detail_page(response)
32 |             # 책 정보를 MongoDB에 저장합니다.
33 |             collection.insert_one(ebook)
34 |         # 책 정보를 출력합니다.
35 |         print(ebook)
36 | 
37 | def scrape_list_page(response):
38 |     """
39 |     목록 페이지의 Response에서 상세 페이지의 URL을 추출합니다.
40 |     """
41 |     root = lxml.html.fromstring(response.content)
42 |     root.make_links_absolute(response.url)
43 |     for a in root.cssselect('.view_box .book_tit a'):
44 |         url = a.get('href')
45 |         yield url
46 | 
47 | def scrape_detail_page(response):
48 |     """
49 |     상세 페이지의 Response에서 책 정보를 dict로 추출합니다.
50 |     """
51 |     root = lxml.html.fromstring(response.content)
52 |     ebook = {
53 |         'url': response.url,
54 |         'key': extract_key(response.url),
55 |         'title': root.cssselect('.store_product_info_box h3')[0].text_content(),
56 |         'price': root.cssselect('.pbr strong')[0].text_content(),
57 |         'content': "생략"
58 |     }
59 |     return ebook
60 | 
61 | def extract_key(url):
62 |     """
63 |     URL에서 키(URL 끝의 p_code)를 추출합니다.
64 |     """
65 |     m = re.search(r"p_code=(.+)", url)
66 |     return m.group(1)
67 | 
68 | def normalize_spaces(s):
69 |     """
70 |     연결돼 있는 공백을 하나의 공백으로 변경합니다.
71 |     """
72 |     return re.sub(r'\s+', ' ', s).strip()
73 |     
74 | if __name__ == '__main__':
75 |     main()


--------------------------------------------------------------------------------
/chapter_3/save_mongo.py:
--------------------------------------------------------------------------------
 1 | import lxml.html
 2 | from pymongo import MongoClient
 3 | 
 4 | # HTML 파일을 읽어 들이고 
 5 | # getroot() 메서드를 사용해 HtmlElement 객체를 추출합니다.
 6 | tree = lxml.html.parse('full_book_list.html')
 7 | html = tree.getroot()
 8 | 
 9 | client = MongoClient('localhost', 27017)
10 | db = client.scraping  # scraping 데이터베이스를 추출합니다.
11 | collection = db.links  # links 콜렉션을 추출합니다.
12 | 
13 | # 스크립트를 여러 번 사용해도 같은 결과를 출력할 수 있게 콜렉션의 문서를 제거합니다.
14 | collection.delete_many({})
15 | 
16 | # cssselect() 메서드로 a 요소의 목록을 추출합니다.
17 | for a in html.cssselect('a'):
18 |     # href 속성과 링크의 글자를 추출해서 저장합니다.
19 |     collection.insert_one({
20 |         'url': a.get('href'),
21 |         'title': a.text.strip(),
22 |     })
23 | 
24 | # 콜렉션의 모든 문서를 _id 순서로 정렬해서 추출합니다.
25 | for link in collection.find().sort('_id'):
26 |     print(link['_id'], link['url'], link['title'])


--------------------------------------------------------------------------------
/chapter_3/save_mysql.py:
--------------------------------------------------------------------------------
 1 | import MySQLdb
 2 | 
 3 | # MySQL 서버에 접속하고 연결을 변수에 저장합니다.
 4 | # 사용자 이름과 비밀번호를 지정한 뒤 scraping 데이터베이스를 사용(USE)합니다.
 5 | # 접속에 사용할 문자 코드는 utf8mb4로 지정합니다.
 6 | conn = MySQLdb.connect(db='scraping', user='scraper', passwd='password', charset='utf8mb4')
 7 | 
 8 | # 커서를 추출합니다.
 9 | c = conn.cursor()
10 | 
11 | # execute() 메서드로 SQL 구문을 실행합니다.
12 | # 스크립트를 여러 번 사용해도 같은 결과를 출력할 수 있게 cities 테이블이 존재하는 경우 제거합니다.
13 | c.execute('DROP TABLE IF EXISTS cities')
14 | # cities 테이블을 생성합니다.
15 | c.execute('''
16 |     CREATE TABLE cities (
17 |         rank integer,
18 |         city text,
19 |         population integer
20 |     )
21 | ''')
22 | 
23 | # execute() 메서드의 두 번째 매개변수에는 파라미터를 지정할 수 있습니다.
24 | # SQL 내부에서 파라미터로 변경할 부분(플레이스홀더)은 %s로 지정합니다.
25 | c.execute('INSERT INTO cities VALUES (%s, %s, %s)', (1, '상하이', 24150000))
26 | 
27 | # 파라미터가 딕셔너리일 때는 플레이스홀더를 %(<이름>)s 형태로 지정합니다.
28 | c.execute('INSERT INTO cities VALUES (%(rank)s, %(city)s, %(population)s)',
29 |           {'rank': 2, 'city': '카라치', 'population': 23500000})
30 | 
31 | # executemany() 메서드를 사용하면 여러 개의 파라미터를 리스트로 지정해서
32 | # 여러 개(현재 예제에서는 3개)의 SQL 구문을 실행할 수 있습니다.
33 | c.executemany('INSERT INTO cities VALUES (%(rank)s, %(city)s, %(population)s)', [
34 |     {'rank': 3, 'city': '베이징', 'population': 21516000},
35 |     {'rank': 4, 'city': '텐진', 'population': 14722100},
36 |     {'rank': 5, 'city': '이스탄불', 'population': 14160467},
37 | ])
38 | 
39 | # 변경사항을 커밋(저장)합니다.
40 | conn.commit() 
41 | 
42 | # 저장한 데이터를 추출합니다.
43 | c.execute('SELECT * FROM cities')
44 | # 쿼리의 결과는 fetchall() 메서드로 추출할 수 있습니다.
45 | for row in c.fetchall():
46 |     # 추출한 데이터를 출력합니다.
47 |     print(row)
48 | 
49 | # 연결을 닫습니다.
50 | conn.close()


--------------------------------------------------------------------------------
/chapter_3/scrape_by_bs4.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup

# Read the HTML file and build a BeautifulSoup object from it.
with open('full_book_list.html') as f:
    soup = BeautifulSoup(f, 'html.parser')

# Iterate over every <a> element returned by find_all().
for a in soup.find_all('a'):
    # Print the href attribute and the link text.
    print(a.get('href'), a.text)


--------------------------------------------------------------------------------
/chapter_3/scrape_by_feedparser.py:
--------------------------------------------------------------------------------
import feedparser

# Load the Aladin new-books RSS feed.
d = feedparser.parse('http://www.aladin.co.kr/rss/special_new/351')

# Iterate over the feed entries.
for entry in d.entries:
    # NOTE(review): both lines print entry.title under different labels
    # ('이름' = name, '타이틀' = title); one of them was probably meant to
    # show a different field -- confirm the intended output.
    print('이름:', entry.title)
    print('타이틀:', entry.title)
    print()


--------------------------------------------------------------------------------
/chapter_3/scrape_by_lxml.py:
--------------------------------------------------------------------------------
import lxml.html

# Read the HTML file and obtain the root HtmlElement via getroot().
tree = lxml.html.parse('full_book_list.html')
html = tree.getroot()

# Iterate over the list of <a> elements returned by cssselect().
for a in html.cssselect('a'):
    # Print the href attribute and the link text.
    print(a.get('href'), a.text)


--------------------------------------------------------------------------------
/chapter_4/error_handling.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | 
 3 | import requests
 4 | # 일시적인 오류를 나타내는 상태 코드를 지정합니다.
 5 | TEMPORARY_ERROR_CODES = (408, 500, 502, 503, 504)  
 6 | 
 7 | def main():
 8 |     """
 9 |     메인 처리입니다.
10 |     """
11 |     response = fetch('http://httpbin.org/status/200,404,503')
12 |     if 200 <= response.status_code < 300:
13 |         print('Success!')
14 |     else:
15 |         print('Error!')
16 | 
def fetch(url):
    """
    Request the given URL and return the Response object.

    A response whose status code is listed in TEMPORARY_ERROR_CODES, or a
    network-level requests.exceptions.RequestException, counts as a temporary
    failure. After each temporary failure the function waits (1 second, then
    2 seconds) and tries again; after the third failed attempt it raises
    Exception('Too many retries.').
    """
    max_retries = 3  # Give up once this many attempts have failed.
    retries = 0  # Number of failed attempts so far.
    while True:
        try:
            print('Retrieving {0}...'.format(url))
            response = requests.get(url)
            print('Status: {0}'.format(response.status_code))
            if response.status_code not in TEMPORARY_ERROR_CODES:
                return response  # Not a temporary error: hand the response back.
        except requests.exceptions.RequestException as ex:
            # Network-level errors are retried as well.
            print('Exception occured: {0}'.format(ex))
        # Reached for BOTH temporary status codes and network errors, so a
        # run of 5xx responses is counted and backed off instead of being
        # retried immediately and forever.
        retries += 1
        if retries >= max_retries:
            # The retry limit has been reached: give up.
            raise Exception('Too many retries.')
        # Exponential back-off: 1, 2, 4, ... seconds (** is the power operator).
        wait = 2**(retries - 1)
        print('Waiting {0} seconds...'.format(wait))
        time.sleep(wait)  # Wait before the next attempt.
42 | 
43 | if __name__ == '__main__':
44 |     main()


--------------------------------------------------------------------------------
/chapter_4/error_handling_with_retrying.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | from retrying import retry  # pip install retrying
 3 | import time
 4 | # 일시적인 오류를 나타내는 상태 코드를 지정합니다.
 5 | TEMPORARY_ERROR_CODES = (408, 500, 502, 503, 504)  
 6 | 
 7 | def main():
 8 |     """
 9 |     메인 처리입니다.
10 |     """
11 |     response = fetch('http://httpbin.org/status/200,404,503')
12 |     if 200 <= response.status_code < 300:
13 |         print('Success!')
14 |     else:
15 |         print('Error!')
16 | 
# stop_max_attempt_number caps the number of attempts.
# wait_exponential_multiplier enables exponentially growing waits between
# attempts; the value is given in milliseconds.
@retry(stop_max_attempt_number=3, wait_exponential_multiplier=1000)
def fetch(url):
    """
    Request the given URL and return the Response object.

    A status code listed in TEMPORARY_ERROR_CODES raises an exception, which
    makes the @retry decorator run the function again.
    """
    print('Retrieving {0}...'.format(url))
    response = requests.get(url)
    print('Status: {0}'.format(response.status_code))
    if response.status_code in TEMPORARY_ERROR_CODES:
        # Temporary failure: raise so that the decorator retries.
        raise Exception('Temporary Error: {0}'.format(response.status_code))
    # Anything else is returned to the caller as is.
    return response
33 | 
34 | if __name__ == '__main__':
35 |     main()


--------------------------------------------------------------------------------
/chapter_4/request_with_cache.py:
--------------------------------------------------------------------------------
import requests
# pip install CacheControl
from cachecontrol import CacheControl  

session = requests.session()
# Create cached_session, which wraps the plain session.
cached_session = CacheControl(session)

# The first request is not cached yet, so it is fetched from the server
# and then stored in the cache.
response = cached_session.get('https://docs.python.org/3/')
print(response.from_cache)  # False

# The second request uses the ETag and Last-Modified values to check whether
# the resource has been updated. If nothing changed, the content is taken
# from the cache, which makes the request fast.
response = cached_session.get('https://docs.python.org/3/')
print(response.from_cache)  # True


--------------------------------------------------------------------------------
/chapter_4/send_email.py:
--------------------------------------------------------------------------------
import smtplib
from email.mime.text import MIMEText
from email.header import Header

# Build the mail as a MIMEText object.
msg = MIMEText('메일 본분입니다.')  

# Use a Header object for the subject because it contains Korean text.
msg['Subject'] = Header('메일 제목입니다.', 'utf-8') 
msg['From'] = 'me@example.com'
msg['To'] = 'you@example.com'

# The first argument of SMTP() is the host name of the SMTP server.
with smtplib.SMTP('localhost') as smtp:
    # Send the mail.
    smtp.send_message(msg)

# The string literal below holds a disabled alternative that sends the mail
# through Gmail over SSL; as a bare string it is never executed.
'''
with smtplib.SMTP_SSL('smtp.gmail.com') as smtp:
    # 구글 계정의 사용자 이름과 비밀번호를 지정해서 로그인합니다.
    # 2단계 인증을 설정한 경우 애플리케이션 비밀번호를 사용해 주세요.
    smtp.login('사용자 이름', '비밀번호')
    # send_message() 메서드로 메일을 전송합니다.
    smtp.send_message(msg)
'''
26 | 


--------------------------------------------------------------------------------
/chapter_4/validate_with_re.py:
--------------------------------------------------------------------------------
import re
value = '3,000'

# Accept the value only when it consists solely of digits and commas;
# anything else is rejected with an exception.
if re.search(r'^[0-9,]+$', value) is None:
    raise ValueError('Invalid price')


--------------------------------------------------------------------------------
/chapter_4/validate_with_voluptuous.py:
--------------------------------------------------------------------------------
# pip install voluptuous
from voluptuous import Schema, Match  

# Define a schema with the following four rules.
schema = Schema({                  # Rule 1: the object is a dict
    'name': str,                   # Rule 2: name is a str
    'price': Match(r'^[0-9,]+$'),  # Rule 3: price must match the regular expression
}, required=True)                  # Rule 4: every key of the dict is required

# A Schema object is used by calling it like a function.
# The argument is the object to validate.
schema({
    'name': '포도',
    'price': '3,000',
})  # Passes validation, so nothing happens.

schema({
    'name': None,
    'price': '3,000',
})  # Fails validation, so a MultipleInvalid exception is raised.


--------------------------------------------------------------------------------
/chapter_5/get_museums.py:
--------------------------------------------------------------------------------
# pip install SPARQLWrapper
from SPARQLWrapper import SPARQLWrapper  

# Create an instance bound to the SPARQL endpoint.
sparql = SPARQLWrapper('http://ko.dbpedia.org/sparql')

# Query that selects museums together with their address
# (the prop-ko:소재지 property).
sparql.setQuery('''
SELECT * WHERE {
    ?s rdf:type dbpedia-owl:Museum .
    ?s prop-ko:소재지 ?address .
} ORDER BY ?s
''')

# Ask for the result in JSON format.
sparql.setReturnFormat('json')

# query() runs the query and convert() parses the response.
response = sparql.query().convert()
for result in response['results']['bindings']:
    # Print the subject URI and the address of each result.
    print(result['s']['value'], result['address']['value'])


--------------------------------------------------------------------------------
/chapter_5/get_museums_with_location.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | import sys
  3 | import os
  4 | import json
  5 | import dbm
  6 | from urllib.request import urlopen
  7 | from urllib.parse import urlencode
  8 | from SPARQLWrapper import SPARQLWrapper
  9 | 
 10 | def main():
 11 |     features = []  # 박물관 정보 저장을 위한 리스트
 12 |     for museum in get_museums():
 13 |         # 레이블이 있는 경우에는 레이블, 없는 경우에는 s를 추출합니다.
 14 |         label = museum.get('label', museum['s'])
 15 |         address = museum['address']
 16 |         lng, lat = geocode(address)
 17 |         
 18 |         # 값을 출력해 봅니다.
 19 |         print(label, address, lng, lat)
 20 |         # 위치 정보를 추출하지 못 했을 경우 리스트에 추가하지 않습니다.
 21 |         if lng is None:
 22 |             continue
 23 |         
 24 |         # features에 박물관 정보를 GeoJSON Feature 형식으로 추가합니다.
 25 |         features.append({
 26 |             'type': 'Feature',
 27 |             'geometry': {'type': 'Point', 'coordinates': [lng, lat]},
 28 |             'properties': {'label': label, 'address': address},
 29 |         })
 30 | 
 31 |     # GeoJSON FeatureCollection 형식으로 dict를 생성합니다.
 32 |     feature_collection = {
 33 |         'type': 'FeatureCollection',
 34 |         'features': features,
 35 |     }
 36 |     # FeatureCollection을 .geojson이라는 확장자의 파일로 저장합니다.
 37 |     with open('museums.geojson', 'w') as f:
 38 |         json.dump(feature_collection, f)
 39 | 
 40 | def get_museums():
 41 |     """
 42 |     SPARQL을 사용해 DBpedia에서 박물관 정보 추출하기
 43 |     """
 44 |     print('Executing SPARQL query...', file=sys.stderr)
 45 |     
 46 |     # SPARQL 엔드 포인트를 지정해서 인스턴스를 생성합니다.
 47 |     sparql = SPARQLWrapper('http://ko.dbpedia.org/sparql')
 48 |     
 49 |     # 한국의 박물관을 추출하는 쿼리입니다.
 50 |     sparql.setQuery('''
 51 |     SELECT * WHERE {
 52 |         ?s rdf:type dbpedia-owl:Museum .
 53 |         ?s prop-ko:소재지 ?address .
 54 |         OPTIONAL { ?s rdfs:label ?label . }
 55 |     } ORDER BY ?s
 56 |     ''')
 57 | 
 58 |     # 반환 형식을 JSON으로 지정합니다.
 59 |     sparql.setReturnFormat('json')
 60 | 
 61 |     # query()로 쿼리를 실행한 뒤 convert()로 파싱합니다.
 62 |     response = sparql.query().convert()
 63 |     print('Got {0} results'.format(len(response['results']['bindings']), file=sys.stderr))
 64 |     # 쿼리 결과를 반복 처리합니다.
 65 |     for result in response['results']['bindings']:
 66 |         # 다루기 쉽게 dict 형태로 변환해서 yield합니다.
 67 |         yield {name: binding['value'] for name, binding in result.items()}
 68 | 
 69 | # Google Geolocation API
 70 | GOOGLE_GEOCODER_API_URL = 'https://maps.googleapis.com/maps/api/geocode/json'
 71 | # DBM(파일을 사용한 Key-Value 데이터베이스)로 지오코딩 결과를 캐시합니다.
 72 | # 이 변수는 dict처럼 다룰 수 있습니다.
 73 | geocoding_cache = dbm.open('geocoding.db', 'c')
 74 | 
 75 | def geocode(address):
 76 |     """
 77 |     매개변수로 지정한 주소를 지오코딩해서 위도와 경도를 반환합니다.
 78 |     """
 79 |     if address not in geocoding_cache:
 80 |         # 주소가 캐시에 존재하지 않는 경우 지오코딩합니다.
 81 |         print('Geocoding {0}...'.format(address), file=sys.stderr)
 82 |         time.sleep(1)
 83 |         url = GOOGLE_GEOCODER_API_URL + '?' + urlencode({
 84 |             'key': os.environ['GOOGLE_API_ID'],
 85 |             'language': 'ko',
 86 |             'address': address,
 87 |         })
 88 |         response_text = urlopen(url).read()
 89 |         # API 응답을 캐시에 저장합니다.
 90 |         # 문자열을 키와 값에 넣으면 자동으로 bytes로 변환합니다.
 91 |         geocoding_cache[address] = response_text
 92 |     
 93 |     # 캐시 내의 API 응답을 dict로 변환합니다.
 94 |     # 값은 bytes 자료형이므로 문자열로 변환합니다.
 95 |     response = json.loads(geocoding_cache[address].decode('utf-8'))
 96 |     try:
 97 |         # JSON 형식에서 값을 추출합니다.
 98 |         lng = response['results'][0]['geometry']['location']['lng']
 99 |         lat = response['results'][0]['geometry']['location']['lat']
100 |         # float 형태로 변환한 뒤 튜플을 반환합니다.
101 |         return (float(lng), float(lat))
102 |     except:
103 |         return (None, None)
104 | 
105 | if __name__ == '__main__':
106 |     main()


--------------------------------------------------------------------------------
/chapter_5/import_from_stream_api_to_bigquery.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | from datetime import timezone
 4 | import tweepy
 5 | import bigquery
 6 | 
# Read the Twitter credentials from environment variables.
CONSUMER_KEY = os.environ['CONSUMER_KEY']
CONSUMER_SECRET = os.environ['CONSUMER_SECRET']
ACCESS_TOKEN = os.environ['ACCESS_TOKEN']
ACCESS_TOKEN_SECRET = os.environ['ACCESS_TOKEN_SECRET']
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

# Create the BigQuery client from the credentials file (credentials.json).
# Writing is only possible when readonly=False is passed explicitly.
client = bigquery.get_client(json_key_file='credentials.json', readonly=False)

# BigQuery dataset name.
DATASET_NAME = 'twitter'

# BigQuery table name.
TABLE_NAME = 'tweets'

# Create the table if it does not exist yet.
if not client.check_table(DATASET_NAME, TABLE_NAME):
    print('Creating table {0}.{1}'.format(DATASET_NAME, TABLE_NAME), file=sys.stderr)
    # The third argument of create_table() is the table schema.
    client.create_table(DATASET_NAME, TABLE_NAME, [
        {'name': 'id',          'type': 'string',    'description': '트윗 ID'},
        {'name': 'lang',        'type': 'string',    'description': '트윗 언어'},
        {'name': 'screen_name', 'type': 'string',    'description': '사용자 이름'},
        {'name': 'text',        'type': 'string',    'description': '트윗 문장'},
        {'name': 'created_at',  'type': 'timestamp', 'description': '트윗 날짜'},
    ])
36 | 
class MyStreamListener(tweepy.streaming.StreamListener):
    """
    Stream listener that buffers the tweets received from the Streaming API
    and imports them into BigQuery in batches of 500.
    """
    # Class-level defaults, kept so that attribute access on the class keeps
    # working; __init__ gives every instance its own buffer and counter so
    # listeners do not share one mutable list.
    status_list = []
    num_imported = 0

    def __init__(self, *args, **kwargs):
        """
        Initialise the base listener and the per-instance tweet buffer.
        """
        super().__init__(*args, **kwargs)
        self.status_list = []
        self.num_imported = 0

    def on_status(self, status):
        """
        Called whenever a tweet is received.

        Parameter: the Status object representing the tweet.
        Returns False to stop the stream (import failure or import limit
        reached); otherwise returns None and the stream continues.
        """
        # Buffer the Status object.
        self.status_list.append(status)
        if len(self.status_list) >= 500:
            # Once 500 tweets are buffered, import them into BigQuery.
            # push_to_bigquery lives on the class, so it must be reached
            # through self; a bare call would raise NameError.
            if not self.push_to_bigquery(self.status_list):
                # A falsy result means the import failed: report and stop.
                print('Failed to send to bigquery', file=sys.stderr)
                return False
            # Update the counter and empty the buffer.
            self.num_imported += len(self.status_list)
            self.status_list = []
            print('Imported {0} rows'.format(self.num_imported), file=sys.stderr)
            # Stop after 5000 imported rows to keep the bill small.
            # Comment out the next two lines to keep importing.
            if self.num_imported >= 5000:
                return False

    @staticmethod
    def push_to_bigquery(status_list):
        """
        Import a list of tweets into BigQuery and return the result of
        client.push_rows().
        """
        # Convert the Tweepy Status objects into a list of dicts.
        rows = []
        for status in status_list:
            rows.append({
                'id': status.id_str,
                'lang': status.lang,
                'screen_name': status.author.screen_name,
                'text': status.text,
                # Convert the datetime into a UTC POSIX timestamp.
                'created_at': status.created_at.replace(tzinfo=timezone.utc).timestamp(),
            })
        # Import the list of dicts into BigQuery. The arguments are, in order:
        # <dataset name>, <table name>, <list of rows>, <field identifying a row>.
        # insert_id_key is used to avoid inserting the same row twice.
        return client.push_rows(DATASET_NAME, TABLE_NAME, rows, insert_id_key='id')
85 | 
# Start reading from the Streaming API.
print('Collecting tweets...', file=sys.stderr)
stream = tweepy.Stream(auth, MyStreamListener())

# Receive the sampled stream of public tweets.
# No language is specified, so tweets in any language are received.
stream.sample()


--------------------------------------------------------------------------------
/chapter_5/konlpy_sample.py:
--------------------------------------------------------------------------------
from konlpy.tag import Kkma

# Create a Kkma tagger, run pos() on a sample Korean sentence and print
# what it returns.
kkma = Kkma()
malist = kkma.pos("아버지 가방에 들어가신다.")
print(malist)


--------------------------------------------------------------------------------
/chapter_5/museums.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML>
 2 | <meta charset="utf-8">
 3 | <title>한국의 박물관</title>
 4 | <style>
 5 | html, body, #map { height: 100%; margin: 0; padding: 0; }
 6 | </style>
 7 | <div id="map"></div>
 8 | <script>
 9 | function initMap() {
10 |     // 지도를 초기화합니다.
11 |     var map = new google.maps.Map(document.getElementById('map'), {
12 |         center: { lat: 35.7, lng: 137.7 },
13 |         zoom: 7
14 |     });
15 |     // InfoWindow 객체를 생성합니다.
16 |     var infowindow = new google.maps.InfoWindow();
17 |     // geojson 파일의 상대 URL을 지정합니다.
18 |     var geojsonUrl = './museums.geojson';
19 |     // geojson 파일을 읽어 들이고 출력합니다.
20 |     map.data.loadGeoJson(geojsonUrl);
21 |     // 마커를 클릭했을 때 실행할 이벤트를 등록합니다.
22 |     map.data.addListener('click', function(e) {
23 |         // 생성하고 박물관 이름(labe)을 추가합니다.
24 |         var h2 = document.createElement('h2');
25 |         h2.textContent = e.feature.getProperty('label');
26 |         // div 요소를 생성하고, h2 요소와 박물관 주소(address)를 추가합니다.
27 |         var div = document.createElement('div');
28 |         div.appendChild(h2);
29 |         div.appendChild(document.createTextNode('주소: ' + e.feature.getProperty('address')));
30 |         // InfoWindow에 출력할 내용으로 div 요소를 지정합니다.
31 |         infowindow.setContent(div);
32 |         // 출력 위치로 마커의 위치를 지정합니다.
33 |         infowindow.setPosition(e.feature.getGeometry().get());
34 |         // 지정한 지점에서 38픽셀 위에 출력하게 합니다.
35 |         infowindow.setOptions({pixelOffset: new google.maps.Size(0, -38)});
36 |         // InfoWindow를 출력합니다.
37 |         infowindow.open(map);
38 |     });
39 | }
40 | </script>
41 | <!-- Google Maps JavaScript API 스크립트를 읽어 들입니다. 완료했을 때 initMap() 함수를 호출합니다. -->
42 | <script async defer src="https://maps.googleapis.com/maps/api/js?callback=initMap"></script>


--------------------------------------------------------------------------------
/chapter_5/naver_order_history.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | import sys
 3 | import os
 4 | from robobrowser import RoboBrowser
 5 | 
# Read the credentials from environment variables.
NAVER_ID = os.environ['NAVER_ID']
NAVER_PASSWORD = os.environ['NAVER_PASSWORD']

# Create the RoboBrowser object.
browser = RoboBrowser(
    # Parser that Beautiful Soup should use.
    parser='html.parser',
    # Send the User-Agent of an ordinary web browser (Firefox).
    user_agent='Mozilla/5.0 (Macintosh; Intel Mac macOS 10.10; rv:45.0) Gecko/20100101 Firefox/45.0')
16 | 
def main():
    """
    Sign in to Naver, open the order history page and print the orders.
    """
    # Open the sign-in page.
    print('Accessing to sign in page....', file=sys.stderr)
    browser.open('https://nid.naver.com/nidlogin.login')
    
    # Make sure the sign-in page was actually reached.
    assert '네이버 : 로그인' in browser.parsed.title.string
    
    # Get the form whose name attribute is 'frmNIDLogin'.
    # The name attribute of the form can be looked up with the developer tools.
    form = browser.get_form(attrs={'name': 'frmNIDLogin'})
    
    # Fill in the field named 'id'.
    form['id'] = NAVER_ID
    # Fill in the field named 'pw'.
    form['pw'] = NAVER_PASSWORD
    
    # Submit the form.
    # A few extra headers are sent so the sign-in is not blocked.
    print('Signing in...', file=sys.stderr)
    browser.submit_form(form, headers={
        'Referer': browser.url,
        'Accept-Language': 'ko,en-US;q=0.7,en;q=0.3',
    })
    
    # Open the order history page.
    browser.open('https://order.pay.naver.com/home?tabMenu=SHOPPING&frm=s_order')
    
    # Uncomment the next line to dump the HTML source when something goes wrong.
    # print(browser.parsed.prettify())
    # Make sure this really is the order history page.
    assert '네이버페이' in browser.parsed.title.string
    # Print the order history.
    print_order_history()
51 | 
def print_order_history():
    """
    Print the date and price of every order on the currently open page.
    """
    # Walk over the order entries; the class name was found with the developer tools.
    for entry in browser.select('.p_info'):
        name_el = entry.select_one('span')
        date_el = entry.select_one('.date')
        price_el = entry.select_one('em')
        # Entries that lack any of the three elements are skipped.
        if not (name_el and date_el and price_el):
            continue
        order_date = date_el.get_text().strip()
        order_price = price_el.get_text().strip()
        print(order_date, '-', order_price + '원')
74 | 
75 | if __name__ == '__main__':
76 |     main()


--------------------------------------------------------------------------------
/chapter_5/plot_advanced_graph.py:
--------------------------------------------------------------------------------
import matplotlib

# Use the Agg rendering backend, which does not need a desktop environment.
matplotlib.use('Agg')

# Set fonts that can render Korean text.
# Two fonts are listed so that the output works on both macOS and Ubuntu.
# With the default settings Korean characters are drawn as empty boxes.
matplotlib.rcParams['font.sans-serif'] = 'NanumGothic,AppleGothic'
import matplotlib.pyplot as plt

# The third argument of plot() is a string describing the series style.
# 'b' means blue, 'x' a cross marker and '-' a solid line joining the markers.
# The series name given with the label keyword is used in the legend.
plt.plot([1, 2, 3, 4, 5], [1, 2, 3, 4, 5], 'bx-', label='첫 번째 함수')

# 'r' means red, 'o' a circle marker and '--' a dashed line.
plt.plot([1, 2, 3, 4, 5], [1, 4, 9, 16, 25], 'ro--', label='두 번째 함수')
# xlabel() sets the label of the X axis.
plt.xlabel('X 값')
# ylabel() sets the label of the Y axis.
plt.ylabel('Y 값')
# title() sets the title of the graph.
plt.title('matplotlib 샘플')
# legend() draws the legend; loc='best' lets matplotlib pick a suitable position.
plt.legend(loc='best')

# Limit the X axis to 0..6. ylim() does the same for the Y axis.
plt.xlim(0, 6)

# Render the graph and save it to a file.
plt.savefig('advanced_graph.png', dpi=300)


--------------------------------------------------------------------------------
/chapter_5/plot_historical_data.py:
--------------------------------------------------------------------------------
from datetime import datetime
import pandas as pd
import matplotlib

# Select the Agg backend and fonts able to render Korean text before
# pyplot is imported.
matplotlib.use('Agg') 
matplotlib.rcParams['font.sans-serif'] = 'NanumGothic,AppleGothic' 
import matplotlib.pyplot as plt
 8 | 
 9 | def main():
10 |     # 1981년과 2014년 사이의 환율과 고용률을 출력해 봅니다. 
11 |     # 조금 이해하기 쉽게 Pandas 대신 기본 숫자 비교와 문자열 비교를 사용해 봤습니다.
12 |     # 환율 정보 읽어 들이기
13 |     df_exchange = pd.read_csv('DEXKOUS.csv', header=1, 
14 |         names=['DATE', 'DEXKOUS'], skipinitialspace=True, index_col=0)
15 |     years = {}
16 |     output = []
17 |     for index in df_exchange.index:
18 |         year = int(index.split('-')[0])
19 |         if (year not in years) and (1981 < year < 2014):
20 |             if df_exchange.DEXKOUS[index] != ".":
21 |                 years[year] = True
22 |                 output.append([year, float(df_exchange.DEXKOUS[index])])
23 |     df_exchange = pd.DataFrame(output)
24 | 
25 |     # 고용률 통계를 구합니다.
26 |     df_jobs = pd.read_excel('gugik.xlsx') 
27 |     output = []
28 |     stacked = df_jobs.stack()[7]
29 |     for index in stacked.index:
30 |         try:
31 |             if 1981 <= int(index) <= 2014:
32 |                 output.append([int(index), float(stacked[index])])
33 |         except:
34 |             pass
35 |     s_jobs = pd.DataFrame(output)
36 | 
37 |     # 첫 번째 그래프 그리기
38 |     plt.subplot(2, 1, 1)
39 |     plt.plot(df_exchange[0], df_exchange[1], label='원/달러') 
40 |     plt.xlim(1981, 2014) # X축의 범위를 설정합니다.
41 |     plt.ylim(500, 2500)
42 |     plt.legend(loc='best')
43 |     
44 |     # 두 번째 그래프 그리기
45 |     print(s_jobs)
46 |     plt.subplot(2, 1, 2) # 3 1 の3 のサブプロットを作成。 
47 |     plt.plot(s_jobs[0], s_jobs[1], label='고용률(%)') 
48 |     plt.xlim(1981, 2014) # X축의 범위를 설정합니다.
49 |     plt.ylim(0, 100) # Y축의 범위를 설정합니다.
50 |     plt.legend(loc='best')
51 |     plt.savefig('historical_data.png', dpi=300) # 이미지를 저장합니다.
52 | 
53 | if __name__ == '__main__': 
54 |     main()


--------------------------------------------------------------------------------
/chapter_5/print_pdf_textboxes.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from pdfminer.converter import PDFPageAggregator
 3 | from pdfminer.layout import LAParams, LTContainer, LTTextBox
 4 | from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
 5 | from pdfminer.pdfpage import PDFPage
 6 | 
 7 | def find_textboxes_recursively(layout_obj):
 8 |     """
 9 |     재귀적으로 텍스트 박스(LTTextBox)를 찾고
10 |     텍스트 박스들을 리스트로 반환합니다.
11 |     """
12 |     # LTTextBox를 상속받은 객체의 경우 리스트에 곧바로 넣어서 반환합니다.
13 |     if isinstance(layout_obj, LTTextBox):
14 |         return [layout_obj]
15 |     # LTContainer를 상속받은 객체의 경우 자식 요소를 포함하고 있다는 의미이므로
16 |     # 재귀적으로 자식 요소를 계속 찾습니다.
17 |     if isinstance(layout_obj, LTContainer):
18 |         boxes = []
19 |         for child in layout_obj:
20 |             boxes.extend(find_textboxes_recursively(child))
21 |         return boxes
22 |     # 아무것도 없다면 빈 리스트를 반환합니다.
23 |     return []
24 | 
# Create the layout-analysis parameters and the resource manager that
# manages shared resources.
laparams = LAParams()
resource_manager = PDFResourceManager()

# Create the PDFPageAggregator object that collects the pages.
device = PDFPageAggregator(resource_manager, laparams=laparams)

# Create the interpreter object.
interpreter = PDFPageInterpreter(resource_manager, device)

# Open the file named by the first command-line argument in binary mode.
with open(sys.argv[1], 'rb') as f:
    # PDFPage.get_pages() takes the file object and yields PDFPage objects
    # one after another.
    # The pagenos keyword argument can restrict processing to a list of
    # page numbers (0-based).
    for page in PDFPage.get_pages(f):
        # Process the page.
        interpreter.process_page(page)
        # Fetch the resulting LTPage object.
        layout = device.get_result()
        # Collect the text boxes of the page into a list.
        boxes = find_textboxes_recursively(layout)
        # Sort the text boxes starting from the top-left corner.
        # y1 (the Y coordinate) grows upwards, so it is negated for the comparison.
        boxes.sort(key=lambda b: (-b.y1, b.x0))
        for box in boxes:
            # Print a separator line for readability.
            print('-' * 10)
            # Print the content of the text box.
            print(box.get_text().strip())


--------------------------------------------------------------------------------
/chapter_5/rest_api_with_requests_oauthlib.py:
--------------------------------------------------------------------------------
import os
from requests_oauthlib import OAuth1Session

# Read the credentials from environment variables.
CONSUMER_KEY = os.environ['CONSUMER_KEY']
CONSUMER_SECRET = os.environ['CONSUMER_SECRET']
ACCESS_TOKEN = os.environ['ACCESS_TOKEN']
ACCESS_TOKEN_SECRET = os.environ['ACCESS_TOKEN_SECRET']

# Create an OAuth1Session object from the credentials.
twitter = OAuth1Session(CONSUMER_KEY,
                        client_secret=CONSUMER_SECRET,
                        resource_owner_key=ACCESS_TOKEN,
                        resource_owner_secret=ACCESS_TOKEN_SECRET)

# Fetch the user's home timeline.
response = twitter.get('https://api.twitter.com/1.1/statuses/home_timeline.json')

# The API response is a JSON string, so parse it with response.json().
# Each status is a dict describing one tweet (the Twitter API calls it a Status).
for status in response.json():
    # Print the user name and the tweet text.
    print('@' + status['user']['screen_name'], status['text'])


--------------------------------------------------------------------------------
/chapter_5/rest_api_with_tweepy.py:
--------------------------------------------------------------------------------
import os
# pip install tweepy
import tweepy

# Read the credentials from environment variables.
CONSUMER_KEY = os.environ['CONSUMER_KEY']
CONSUMER_SECRET = os.environ['CONSUMER_SECRET']
ACCESS_TOKEN = os.environ['ACCESS_TOKEN']
ACCESS_TOKEN_SECRET = os.environ['ACCESS_TOKEN_SECRET']

# Configure the authentication handler.
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

# Create the API client.
api = tweepy.API(auth)

# Fetch the user's home timeline.
public_tweets = api.home_timeline()
for status in public_tweets:
    # Print the user name and the tweet text.
    print('@' + status.user.screen_name, status.text)


--------------------------------------------------------------------------------
/chapter_5/robobrowser_google.py:
--------------------------------------------------------------------------------
from robobrowser import RoboBrowser

# Create the RoboBrowser object.
# The parser keyword argument is the same as the second argument of BeautifulSoup().
browser = RoboBrowser(parser='html.parser')

# Open the Google main page with open().
browser.open('https://www.google.co.kr/')

# Enter the search keyword and submit the form using its first submit field.
form = browser.get_form(action='/search')
form['q'] = 'Python'
browser.submit_form(form, list(form.submit_fields.values())[0])

# Extract the titles of the search results.
# select() works like the select() method of BeautifulSoup.
for a in browser.select('h3 > a'):
    print(a.text)
    print(a.get('href'))
    print()


--------------------------------------------------------------------------------
/chapter_5/save_youtube_video_metadata.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | 
 4 | from apiclient.discovery import build
 5 | from pymongo import MongoClient, DESCENDING
 6 | 
 7 | # 환경변수에서 API 키를 추출합니다.
 8 | YOUTUBE_API_KEY = os.environ['YOUTUBE_API_KEY']
 9 | 
def main():
    """
    Entry point: refresh the MongoDB collection with YouTube search
    results and print the most viewed videos.
    """
    # Connect to MongoDB and pick the `videos` collection of the
    # `youtube` database.
    videos = MongoClient('localhost', 27017).youtube.videos
    # Start from an empty collection.
    videos.delete_many({})

    # Search and store the results one page at a time.
    for page_items in search_videos('요리'):
        save_to_mongodb(videos, page_items)

    # Print the videos with the highest view counts.
    show_top_videos(videos)
27 | 
def search_videos(query, max_pages=5):
    """
    Search YouTube for videos and yield their details one page at a time.

    Args:
        query: Keyword passed as `q` to the search.list API method.
        max_pages: Maximum number of result pages to fetch. The limit only
            exists to keep the run time short; larger values are fine.

    Yields:
        The `items` list of the videos.list response (requested with
        part='snippet,statistics') for each page of search results.
    """
    # Create the YouTube API client.
    youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)
    # Build the search.list request for the first page.
    search_request = youtube.search().list(
        part='id',  # only the video IDs are needed from search.list
        q=query,
        type='video',
        maxResults=50,  # request up to 50 videos per page
    )
    # Keep going while there is a follow-up request and the page limit
    # has not been reached.
    pages_fetched = 0
    while search_request and pages_fetched < max_pages:
        # Send the request.
        search_response = search_request.execute()
        # Collect the video IDs of this page.
        video_ids = [item['id']['videoId'] for item in search_response['items']]
        # Fetch the detailed information of those videos with videos.list.
        videos_response = youtube.videos().list(
            part='snippet,statistics',
            id=','.join(video_ids)
        ).execute()
        # Hand the items of the current page to the caller.
        yield videos_response['items']

        # Build the request for the next page from the current
        # request/response pair.
        search_request = youtube.search().list_next(search_request, search_response)
        pages_fetched += 1
61 | 
def save_to_mongodb(collection, items):
    """
    Normalize video items in place and insert them into MongoDB.

    Args:
        collection: Object exposing `insert_many(documents)` whose result
            has an `inserted_ids` attribute (a PyMongo collection).
        items: List of dicts, each with an 'id' key and a 'statistics'
            dict whose values can be converted with int().

    An empty `items` list is skipped without calling `insert_many`,
    because PyMongo rejects an empty document list.
    """
    # Reshape every item so it is easier to query later.
    for item in items:
        # Reuse the item's id as the MongoDB primary key.
        item['_id'] = item['id']
        # Values such as viewCount arrive as strings; store them as ints.
        stats = item['statistics']
        for key, value in stats.items():
            stats[key] = int(value)

    if not items:
        # Nothing to insert for this page; keep the per-page log line.
        print('Inserted {0} documents'.format(0), file=sys.stderr)
        return

    # Add the items to the collection.
    result = collection.insert_many(items)
    print('Inserted {0} documents'.format(len(result.inserted_ids)), file=sys.stderr)
77 | 
def show_top_videos(collection):
    """
    Print the view count and title of the five documents in the
    collection with the highest `statistics.viewCount`.
    """
    top_five = collection.find().sort('statistics.viewCount', DESCENDING).limit(5)
    for video in top_five:
        views = video['statistics']['viewCount']
        print(views, video['snippet']['title'])
84 | 
85 | if __name__ == '__main__':
86 |     main()


--------------------------------------------------------------------------------
/chapter_5/search_youtube_videos.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | # pip install google-api-python-client
 4 | from apiclient.discovery import build
 5 | 
 6 | # 환경변수에서 API 키 추출하기
 7 | YOUTUBE_API_KEY = os.environ['YOUTUBE_API_KEY']
 8 | 
 9 | # YouTube API 클라이언트를 생성합니다.
10 | # build() 함수의 첫 번째 매개변수에는 API 이름
11 | # 두 번째 매개변수에는 API 버전을 지정합니다.
12 | # 키워드 매개변수 developerKey에는 API 키를 지정합니다.
13 | # 이 함수는 내부적으로 https://www.googleapis.com/discovery/v1/apis/youtube/v3/rest라는
14 | # URL에 접근하고 API 리소스와 메서드 정보를 추출합니다.
15 | youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)
16 | 
17 | # 키워드 매개변수로 매개변수를 지정하고
18 | # search.list 메서드를 호출합니다.
19 | # list() 메서드를 실행하면 googleapiclient.http.HttpRequest가 반환됩니다. 
20 | # execute() 메서드를 실행하면 실제 HTTP 요청이 보내지며, API 응답이 반환됩니다.
21 | search_response = youtube.search().list(
22 |     part='snippet',
23 |     q='요리',
24 |     type='video',
25 | ).execute()
26 | 
27 | # search_response는 API 응답을 JSON으로 나타낸 dict 객체입니다.
28 | for item in search_response['items']:
29 |     # 동영상 제목을 출력합니다.
30 |     print(item['snippet']['title'])


--------------------------------------------------------------------------------
/chapter_5/selenium_google.py:
--------------------------------------------------------------------------------
 1 | from selenium import webdriver
 2 | from selenium.webdriver.common.keys import Keys
 3 | 
 4 | # PhantomJS 모듈의 WebDriver 객체를 생성합니다.
 5 | driver = webdriver.PhantomJS()
 6 | 
 7 | # Google 메인 페이지를 엽니다.
 8 | driver.get('https://www.google.co.kr/')
 9 | 
10 | # 타이틀에 'Google'이 포함돼 있는지 확인합니다.
11 | assert 'Google' in driver.title
12 | 
13 | # 검색어를 입력하고 검색합니다.
14 | input_element = driver.find_element_by_name('q')
15 | input_element.send_keys('Python')
16 | input_element.send_keys(Keys.RETURN)
17 | 
18 | # 타이틀에 'Python'이 포함돼 있는지 확인합니다.
19 | assert 'Python' in driver.title
20 | 
21 | # 스크린샷을 찍습니다.
22 | driver.save_screenshot('search_results.png')
23 | 
24 | # 검색 결과를 출력합니다.
25 | for a in driver.find_elements_by_css_selector('h3 > a'):
26 |     print(a.text)
27 |     print(a.get_attribute('href'))
28 |     print()


--------------------------------------------------------------------------------
/chapter_5/shopping_rss.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import time
  3 | 
  4 | from selenium import webdriver
  5 | from selenium.webdriver.common.by import By
  6 | from selenium.webdriver.support import expected_conditions as EC
  7 | from selenium.webdriver.support.ui import WebDriverWait
  8 | import feedgenerator
  9 | 
 10 | # 인증 정보를 환경변수에서 추출합니다.
 11 | NAVER_ID = os.environ['NAVER_ID']
 12 | NAVER_PASSWORD = os.environ['NAVER_PASSWORD']
 13 | 
 14 | def main():
 15 |     """
 16 |     메인 처리
 17 |     """
 18 |     # PhantomJS의 WebDriver 객체를 생성합니다.
 19 |     driver = webdriver.PhantomJS()
 20 |     
 21 |     # 화면 크기를 설정합니다.
 22 |     driver.set_window_size(800, 600)
 23 |     
 24 |     # 로그인하고 이동한 뒤 주문 이력을 가져옵니다.
 25 |     sign_in(driver)
 26 |     navigate(driver)
 27 |     goods = scrape_history(driver)
 28 |     
 29 |     # RSS 피드로 저장합니다.
 30 |     with open('shopping_history.rss', 'w') as f:
 31 |         save_as_feed(f, goods)
 32 | 
 33 | def sign_in(driver):
 34 |     """
 35 |     로그인합니다
 36 |     """
 37 |     print('Navigating...', file=sys.stderr)
 38 |     print('Waiting for sign in page loaded...', file=sys.stderr)
 39 |     time.sleep(2)
 40 |     
 41 |     # 입력 양식을 입력하고 전송합니다.
 42 |     driver.get('https://nid.naver.com/nidlogin.login')
 43 |     e = driver.find_element_by_id('id')
 44 |     e.clear()
 45 |     e.send_keys(NAVER_ID)
 46 |     e = driver.find_element_by_id('pw')
 47 |     e.clear()
 48 |     e.send_keys(NAVER_PASSWORD)
 49 |     form = driver.find_element_by_css_selector("input.btn_global[type=submit]")
 50 |     form.submit()
 51 | 
 52 | def navigate(driver):
 53 |     """
 54 |     적절한 페이지로 이동한 뒤 
 55 |     """
 56 |     print('Navigating...', file=sys.stderr)
 57 |     driver.get("https://order.pay.naver.com/home?tabMenu=SHOPPING")
 58 |     print('Waiting for contents to be loaded...', file=sys.stderr)
 59 |     time.sleep(2)
 60 |     # 페이지를 아래로 스크롤합니다.
 61 |     # 사실 현재 예제에서는 필요 없지만 활용 예를 위해 넣어봤습니다.
 62 |     # 스크롤을 해서 데이터를 가져오는 페이지의 경우 활용할 수 있습니다.
 63 |     driver.execute_script('scroll(0, document.body.scrollHeight)')
 64 |     wait = WebDriverWait(driver, 10)
 65 |     
 66 |     # [더보기] 버튼을 클릭할 수 있는 상태가 될 때까지 대기하고 클릭합니다.
 67 |     # 두 번 클릭해서 과거의 정보까지 들고옵니다.
 68 |     driver.save_screenshot('note-1.png')
 69 |     button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#_moreButton a')))
 70 |     button.click()
 71 |     button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#_moreButton a')))
 72 |     button.click()
 73 |     # 2초 대기합니다.
 74 |     print('Waiting for contents to be loaded...', file=sys.stderr)
 75 |     time.sleep(2)
 76 | 
 77 | def scrape_history(driver):
 78 |     """
 79 |     페이지에서 주문 이력을 추출합니다.
 80 |     """
 81 |     goods = []
 82 |     for info in driver.find_elements_by_css_selector('.p_info'):
 83 |         # 요소를 추출합니다.
 84 |         link_element = info.find_element_by_css_selector('a')
 85 |         title_element = info.find_element_by_css_selector('span')
 86 |         date_element = info.find_element_by_css_selector('.date')
 87 |         price_element = info.find_element_by_css_selector('em')
 88 |         # 텍스트를 추출합니다.
 89 |         goods.append({
 90 |             'url': link_element.get_attribute('.a'),
 91 |             'title': title_element.text,
 92 |             'description': date_element.text + " - " + price_element.text + "원"
 93 |         })
 94 |     return goods
 95 | 
 96 | def save_as_feed(f, posts):
 97 |     """
 98 |     주문 내역을 피드로 저장합니다.
 99 |     """
100 |     # Rss201rev2Feed 객체를 생성합니다.
101 |     feed = feedgenerator.Rss201rev2Feed(
102 |         title='네이버페이 주문 이력',
103 |         link='https://order.pay.naver.com/',
104 |         description='주문 이력')
105 |     
106 |     # 피드를 추가합니다.
107 |     for post in posts:
108 |         feed.add_item(title=post['title'],
109 |                       link=post['url'],
110 |                       description=post['description'],
111 |                       unique_id=post['url'])
112 |     
113 |     # 피드를 저장합니다.
114 |     feed.write(f, 'utf-8')
115 | 
116 | if __name__ == '__main__':
117 |     main()


--------------------------------------------------------------------------------
/chapter_5/shopping_selenium.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import time
 3 | 
 4 | from selenium import webdriver
 5 | from selenium.webdriver.common.by import By
 6 | from selenium.webdriver.support import expected_conditions as EC
 7 | from selenium.webdriver.support.ui import WebDriverWait
 8 | 
 9 | # 인증 정보를 환경변수에서 추출합니다.
10 | NAVER_ID = os.environ['NAVER_ID']
11 | NAVER_PASSWORD = os.environ['NAVER_PASSWORD']
12 | 
def main():
    """
    Entry point: sign in, collect the order history and print it.
    """
    # Create a PhantomJS WebDriver.
    driver = webdriver.PhantomJS()
    try:
        # Set the window size.
        driver.set_window_size(800, 600)

        # Sign in, navigate to the order page and scrape the history.
        sign_in(driver)
        navigate(driver)
        goods = scrape_history(driver)
    finally:
        # Always shut the browser down so no PhantomJS process is left
        # behind, even when a step above fails.
        driver.quit()
    # Print the result.
    print(goods)
29 | 
def sign_in(driver):
    """
    Sign in to Naver with NAVER_ID / NAVER_PASSWORD.

    Args:
        driver: Selenium WebDriver used to load and fill the login form.
    """
    # Request the login page first, then wait; the original slept before
    # the page was requested, so the wait had nothing to wait for.
    print('Navigating...', file=sys.stderr)
    driver.get('https://nid.naver.com/nidlogin.login')
    print('Waiting for sign in page loaded...', file=sys.stderr)
    time.sleep(2)

    # Fill in the form fields and submit.
    e = driver.find_element_by_id('id')
    e.clear()
    e.send_keys(NAVER_ID)
    e = driver.find_element_by_id('pw')
    e.clear()
    e.send_keys(NAVER_PASSWORD)
    form = driver.find_element_by_css_selector("input.btn_global[type=submit]")
    form.submit()
48 | 
def navigate(driver):
    """
    Open the Naver Pay shopping order page and expand the order list by
    clicking the "more" button twice.
    """
    print('Navigating...', file=sys.stderr)
    driver.get("https://order.pay.naver.com/home?tabMenu=SHOPPING")
    print('Waiting for contents to be loaded...', file=sys.stderr)
    time.sleep(2)

    # Scroll to the bottom of the page.
    # Not needed for this page (per the original notes); kept as an
    # example for pages that load data on scroll.
    driver.execute_script('scroll(0, document.body.scrollHeight)')
    wait = WebDriverWait(driver, 10)

    # Wait until the "more" button is clickable and click it; doing so
    # twice pulls in older orders as well.
    driver.save_screenshot('note-1.png')
    more_button = (By.CSS_SELECTOR, '#_moreButton a')
    for _ in range(2):
        wait.until(EC.element_to_be_clickable(more_button)).click()
    # Give the newly requested content two seconds to load.
    print('Waiting for contents to be loaded...', file=sys.stderr)
    time.sleep(2)
74 | 
def scrape_history(driver):
    """
    Extract the order history from the current page.

    Args:
        driver: Selenium WebDriver positioned on the order list page.

    Returns:
        A list of dicts with the keys 'url', 'title' and 'description',
        one per element matching '.p_info'.
    """
    goods = []
    for info in driver.find_elements_by_css_selector('.p_info'):
        # Locate the child elements.
        link_element = info.find_element_by_css_selector('a')
        title_element = info.find_element_by_css_selector('span')
        date_element = info.find_element_by_css_selector('.date')
        price_element = info.find_element_by_css_selector('em')
        # Extract the values.
        goods.append({
            # The link target is the anchor's `href` attribute; '.a' is
            # not an attribute name and never yields a URL.
            'url': link_element.get_attribute('href'),
            'title': title_element.text,
            'description': date_element.text + " - " + price_element.text + "원"
        })
    return goods
93 |     
94 | if __name__ == '__main__':
95 |     main()


--------------------------------------------------------------------------------
/chapter_5/streaming_api_with_tweepy.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import tweepy
 3 | 
 4 | # 환경변수에서 인증 정보를 추출합니다.
 5 | CONSUMER_KEY = os.environ['CONSUMER_KEY']
 6 | CONSUMER_SECRET = os.environ['CONSUMER_SECRET']
 7 | ACCESS_TOKEN = os.environ['ACCESS_TOKEN']
 8 | ACCESS_TOKEN_SECRET = os.environ['ACCESS_TOKEN_SECRET']
 9 | 
10 | # 인증 정보를 설정합니다.
11 | auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
12 | auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
class MyStreamListener(tweepy.StreamListener):
    """
    Listener that handles tweets delivered by the Streaming API.
    """

    def on_status(self, status):
        """
        Print the author's handle and the text of a received tweet.

        `status` is the Status object representing the tweet.
        """
        handle = '@' + status.author.screen_name
        print(handle, status.text)
# Build the Stream from the auth handler and a listener instance.
listener = MyStreamListener()
stream = tweepy.Stream(auth, listener)

# Receive a sample of public tweets; the `languages` keyword restricts
# the sample to Korean tweets.
stream.sample(languages=['ko'])


--------------------------------------------------------------------------------
/chapter_5/word_frequency.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os
 3 | from glob import glob
 4 | from collections import Counter
 5 | from konlpy.tag import Kkma
 6 | 
 7 | def main():
 8 |     """
 9 |     명령라인 매개변수로 지정한
10 |     디렉터리 내부의 파일을 읽어 들이고
11 |     빈출 단어를 출력합니다.
12 |     """
13 |     # 명령어의 첫 번째 매개변수로
14 |     # WikiExtractor의 출력 디렉터리를 지정합니다.
15 |     input_dir = sys.argv[1]
16 |     kkma = Kkma()
17 |     # 단어의 빈도를 저장하기 위한 Counter 객체를 생성합니다.
18 |     # Counter 클래스는 dict를 상속받는 클래스입니다.
19 |     frequency = Counter()
20 |     count_proccessed = 0
21 |     # glob()으로 와일드카드 매치 파일 목록을 추출하고
22 |     # 매치한 모든 파일을 처리합니다.
23 |     for path in glob(os.path.join(input_dir, '*', 'wiki_*')):
24 |         print('Processing {0}...'.format(path), file=sys.stderr)
25 |         # 파일을 엽니다.
26 |         with open(path) as file:
27 |             # 파일 내부의 모든 기사에 반복을 돌립니다.
28 |             for content in iter_docs(file):
29 |                 # 페이지에서 명사 리스트를 추출합니다.
30 |                 tokens = get_tokens(kkma, content)
31 |                 # Counter의 update() 메서드로 리스트 등의 반복 가능 객체를 지정하면
32 |                 # 리스트에 포함된 값의 출현 빈도를 세어줍니다.
33 |                 frequency.update(tokens)
34 |                 # 10,000개의 글을 읽을 때마다 간단하게 출력합니다.
35 |                 count_proccessed += 1
36 |                 if count_proccessed % 10000 == 0:
37 |                     print('{0} documents were processed.'
38 |                         .format(count_proccessed),file=sys.stderr)
39 |     
40 |     # 모든 기사의 처리가 끝나면 상위 30개의 단어를 출력합니다
41 |     for token, count in frequency.most_common(30):
42 |         print(token, count)
43 | 
def iter_docs(file):
    """
    Yield the text of each article found in an iterable of lines.

    An article is the text between a line starting with '<doc ' and the
    next line starting with '</doc>'; the tag lines themselves are not
    included.

    Args:
        file: Iterable of text lines (for example an open text file).

    Yields:
        The concatenated lines of each article.

    Lines outside of any article (before the first start tag or after an
    end tag) are ignored instead of raising an error.
    """
    # None means "not inside an article"; a list collects article lines.
    buffer = None
    for line in file:
        if line.startswith('<doc '):
            # Start tag: begin a fresh buffer.
            buffer = []
        elif line.startswith('</doc>'):
            # End tag: emit the collected text, if an article was open.
            if buffer is not None:
                yield ''.join(buffer)
                buffer = None
        elif buffer is not None:
            # Any other line inside an article is part of its content.
            buffer.append(line)
61 | 
def get_tokens(kkma, content):
    """
    Return the nouns that appear in `content`.

    `kkma.pos(content)` is expected to yield (morpheme, tag) pairs; only
    morphemes tagged 'NNG' or 'NNP' (common and proper nouns per the
    original notes) are kept, in order of appearance.
    """
    noun_tags = ('NNG', 'NNP')
    return [morpheme for morpheme, tag in kkma.pos(content) if tag in noun_tags]
74 | 
75 | if __name__ == '__main__':
76 |     main()


--------------------------------------------------------------------------------
/chapter_6/6-1/myspider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 |  
 3 | class BlogSpider(scrapy.Spider):
 4 |     # spider의 이름
 5 |     name = 'blogspider'
 6 | 
 7 |     # 크롤링을 시작할 URL 리스트
 8 |     start_urls = ['https://blog.scrapinghub.com']
 9 |  
10 |     def parse(self, response):
11 |         """
12 |         최상위 페이지에서 카테고리 페이지의 링크를 추출합니다.
13 |         """
14 |         for url in response.css('ul li a::attr("href")').re('.*/tag/.*'):
15 |             yield scrapy.Request(response.urljoin(url), self.parse_titles)
16 | 
17 |     def parse_titles(self, response):
18 |         """
19 |         카페고리 페이지에서 카테고리 타이틀을 모두 추출합니다.
20 |         """
21 |         for post_title in response.css('div.post-header > h2 > a::text').extract():
22 |             yield {'title': post_title}
23 | 


--------------------------------------------------------------------------------
/chapter_6/6-2/myproject/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wikibook/python-for-web-scraping/ee720e5453456650e67febc3cb7ce2bdc21b46d6/chapter_6/6-2/myproject/__init__.py


--------------------------------------------------------------------------------
/chapter_6/6-2/myproject/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # http://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
class Headline(scrapy.Item):
    """Item representing a news headline."""

    # Value stored by the spiders under item['title'].
    title = scrapy.Field()
    # Value stored by the spiders under item['body'].
    body = scrapy.Field()


--------------------------------------------------------------------------------
/chapter_6/6-2/myproject/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | 
 8 | 
 9 | class MyprojectPipeline(object):
10 |     def process_item(self, item, spider):
11 |         return item
12 | 


--------------------------------------------------------------------------------
/chapter_6/6-2/myproject/settings.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Scrapy settings for myproject project
 4 | #
 5 | # For simplicity, this file contains only settings considered important or
 6 | # commonly used. You can find more settings consulting the documentation:
 7 | #
 8 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 9 | #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 | 
12 | BOT_NAME = 'myproject'
13 | 
14 | SPIDER_MODULES = ['myproject.spiders']
15 | NEWSPIDER_MODULE = 'myproject.spiders'
16 | 
17 | 
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'myproject (+http://www.yourdomain.com)'
20 | 
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = True
23 | 
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 | 
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | DOWNLOAD_DELAY = 1
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 | 
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 | 
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 | 
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | #   'Accept-Language': 'en',
45 | #}
46 | 
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | #    'myproject.middlewares.MyCustomSpiderMiddleware': 543,
51 | #}
52 | 
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | #    'myproject.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 | 
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | #    'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 | 
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | #ITEM_PIPELINES = {
68 | #    'myproject.pipelines.SomePipeline': 300,
69 | #}
70 | 
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 | 
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 | 


--------------------------------------------------------------------------------
/chapter_6/6-2/myproject/spiders/news.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | 
 3 | # Item의 Headline 클래스를 읽어 들입니다.
 4 | from myproject.items import Headline
 5 | 
 6 | class NewsSpider(scrapy.Spider):
 7 |     name = 'news'
 8 |     # 크롤링 대상 도메인 리스트
 9 |     allowed_domains = ['engadget.com']
10 |     # 크롤링을 시작할 URL 리스트
11 |     start_urls = ['http://engadget.com/']
12 |     def parse(self, response):
13 |         """
14 |         메인 페이지의 토픽 목록에서 링크를 추출하고 출력합니다.
15 |         """
16 |         link = response.css('a.o-hit__link::attr("href")').extract()
17 |         for url in link:
18 |             # 광고 페이지 제외
19 |             if url.find("products") == 1: 
20 |                 continue
21 |             # 의미 없는 페이지 제외
22 |             if url == "#": 
23 |                 continue
24 |             # 기사 페이지
25 |             yield scrapy.Request(response.urljoin(url), self.parse_topics)
26 | 
27 |     def parse_topics(self, response):
28 |         item = Headline()
29 |         item['title'] = response.css('head title::text').extract_first()
30 |         item['body'] = " ".join(response.css('.o-article_block p')\
31 |             .xpath('string()')\
32 |             .extract())
33 |         yield item


--------------------------------------------------------------------------------
/chapter_6/6-2/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = myproject.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = myproject
12 | 


--------------------------------------------------------------------------------
/chapter_6/6-3/myproject/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wikibook/python-for-web-scraping/ee720e5453456650e67febc3cb7ce2bdc21b46d6/chapter_6/6-3/myproject/__init__.py


--------------------------------------------------------------------------------
/chapter_6/6-3/myproject/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # http://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
class Headline(scrapy.Item):
    """Item representing a news headline."""

    # Value stored by the spiders under item['title'].
    title = scrapy.Field()
    # Value stored by the spiders under item['body'].
    body = scrapy.Field()


--------------------------------------------------------------------------------
/chapter_6/6-3/myproject/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | 
 8 | 
 9 | class MyprojectPipeline(object):
10 |     def process_item(self, item, spider):
11 |         return item
12 | 


--------------------------------------------------------------------------------
/chapter_6/6-3/myproject/settings.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Scrapy settings for myproject project
 4 | #
 5 | # For simplicity, this file contains only settings considered important or
 6 | # commonly used. You can find more settings consulting the documentation:
 7 | #
 8 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 9 | #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 | 
12 | BOT_NAME = 'myproject'
13 | 
14 | SPIDER_MODULES = ['myproject.spiders']
15 | NEWSPIDER_MODULE = 'myproject.spiders'
16 | 
17 | 
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'myproject (+http://www.yourdomain.com)'
20 | 
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = True
23 | 
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 | 
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | DOWNLOAD_DELAY = 1  
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 | 
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 | 
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 | 
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | #   'Accept-Language': 'en',
45 | #}
46 | 
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | #    'myproject.middlewares.MyCustomSpiderMiddleware': 543,
51 | #}
52 | 
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | #    'myproject.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 | 
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | #    'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 | 
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | #ITEM_PIPELINES = {
68 | #    'myproject.pipelines.SomePipeline': 300,
69 | #}
70 | 
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 | 
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 | 


--------------------------------------------------------------------------------
/chapter_6/6-3/myproject/spiders/hanbit.py:
--------------------------------------------------------------------------------
 1 | from scrapy.spiders import SitemapSpider
 2 | 
 3 | class HanbitSpider(SitemapSpider):
 4 |     name = "hanbit"
 5 |     allowed_domains = ["hanbit.co.kr"]
 6 |     # XML 사이트맵을 지정합니다.
 7 |     # robots.txt에서 Sitemap 디렉티브를 사용하고 있다면
 8 |     # robots.txt의 링크를 지정해도 됩니다.
 9 |     sitemap_urls = [
10 |         "http://hanbit.co.kr/sitemap.xml",
11 |     ]
12 |     # 사이트맵 디렉티브에서 순회할 링크의 정규 표현식을 지정합니다.
13 |     # sitemap_follow를 지정하지 않으면 모든 링크를 순회합니다.
14 |     sitemap_follow = [
15 |         r'post-2015-',
16 |     ]
17 |     # 사이트맵에 포함돼 있는 URL을 처리할 콜백을 지정합니다.
18 |     # 규칙은 (<정규 표현식>, <처리할 콜백 함수>) 형태의 튜플을 지정합니다.
19 |     # sitemap_rules를 지정하지 않으면 모든 URL을 parse() 메서드에 전달합니다.
20 |     sitemap_rules = [
21 |         (r'/2015/\d\d/\d\d/', 'parse_book'),
22 |     ]
23 | 
24 |     def parse_post(self, response):
25 |         # 책 페이지에서 제목을 추출합니다.
26 |         yield {
27 |             'title': response.css('.store_product_info_box h3::text').extract_first(),
28 |         }


--------------------------------------------------------------------------------
/chapter_6/6-3/myproject/spiders/news_crawl.py:
--------------------------------------------------------------------------------
 1 | from scrapy.spiders import CrawlSpider, Rule
 2 | from scrapy.linkextractors import LinkExtractor
 3 | 
 4 | # Item의 Headline 클래스를 읽어 들입니다.
 5 | from myproject.items import Headline
 6 | 
 7 | class NewsSpider(scrapy.Spider):
 8 |     name = 'news'
 9 |     # 크롤링 대상 도메인 리스트
10 |     allowed_domains = ['engadget.com']
11 |     # 크롤링을 시작할 URL 리스트
12 |     start_urls = ['http://engadget.com/']
13 |     # 링크 순회를 위한 규칙 리스트
14 |     rules = [
15 |         # 토픽 페이지를 추출한 뒤 응답을 parse_topics() 메서드에 전달합니다.
16 |         Rule(LinkExtractor(allow=r'/\d{4}/\d{2}/\d{2}/.+$'), callback='parse_topics'),
17 |     ]
18 | 
19 |     def parse_topics(self, response):
20 |         item = Headline()
21 |         item['title'] = response.css('head title::text').extract_first()
22 |         item['body'] = " ".join(response.css('.o-article_block p')\
23 |             .xpath('string()')\
24 |             .extract())
25 |         yield item


--------------------------------------------------------------------------------
/chapter_6/6-3/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = myproject.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = myproject
12 | 


--------------------------------------------------------------------------------
/chapter_6/6-4/pipelines.py:
--------------------------------------------------------------------------------
 1 | from scrapy.exceptions import DropItem
 2 | 
 3 | from pymongo import MongoClient
 4 | import MySQLdb
 5 | 
 6 | 
 7 | class ValidationPipeline(object):
 8 |     """
 9 |     Item을 검증하는 Pipeline
10 |     """
11 |     def process_item(self, item, spider):
12 |         if not item['title']:
13 |             # title 필드가 추출되지 않으면 제거합니다.
14 |             # DropItem()의 매개변수로 제거 이유를 나타내는 메시지를 입력합니다.
15 |             raise DropItem('Missing title')
16 |         # title 필드가 제대로 추출된 경우
17 |         return item  
18 |         
class MongoPipeline(object):
    """
    Pipeline that stores each item in MongoDB.
    """

    def process_item(self, item, spider):
        """
        Insert the item into the collection and hand it on unchanged.
        """
        # insert_one() receives a plain dict built from the item
        # (a shallow copy), not the item object itself.
        document = dict(item)
        self.collection.insert_one(document)
        return item

    def open_spider(self, spider):
        """
        Connect to MongoDB when the spider starts.
        """
        # Create the client for the given host and port.
        client = self.client = MongoClient('localhost', 27017)
        # Select the scraping-book database.
        database = self.db = client['scraping-book']
        # Select the items collection.
        self.collection = database['items']

    def close_spider(self, spider):
        """
        Close the MongoDB connection when the spider finishes.
        """
        self.client.close()
46 | 
class MySQLPipeline(object):
    """
    Pipeline that stores each item in MySQL.
    """

    def process_item(self, item, spider):
        """
        Insert the item into the items table and commit.
        """
        row = dict(item)
        self.c.execute('INSERT INTO items (title) VALUES (%(title)s)', row)
        self.conn.commit()
        return item

    def close_spider(self, spider):
        """
        Close the MySQL connection when the spider finishes.
        """
        self.conn.close()

    def open_spider(self, spider):
        """
        Connect to the MySQL server when the spider starts and
        create the items table if it does not exist yet.
        """
        # Connection parameters come from settings.py, each with a fallback:
        # (connect() keyword, setting name, default value).
        settings = spider.settings
        option_table = (
            ('host', 'MYSQL_HOST', 'localhost'),
            ('db', 'MYSQL_DATABASE', 'scraping'),
            ('user', 'MYSQL_USER', ''),
            ('passwd', 'MYSQL_PASSWORD', ''),
            ('charset', 'MYSQL_CHARSET', 'utf8mb4'),
        )
        params = {
            keyword: settings.get(setting_name, fallback)
            for keyword, setting_name, fallback in option_table
        }
        # Connect and keep a cursor for later inserts.
        self.conn = MySQLdb.connect(**params)
        self.c = self.conn.cursor()
        # Create the items table on first use.
        self.c.execute('''
            CREATE TABLE IF NOT EXISTS items (
                id INTEGER NOT NULL AUTO_INCREMENT,
                title CHAR(200) NOT NULL,
                PRIMARY KEY (id)
            )
        ''')
        # Commit the change.
        self.conn.commit()


--------------------------------------------------------------------------------
/chapter_6/6-7/myproject/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wikibook/python-for-web-scraping/ee720e5453456650e67febc3cb7ce2bdc21b46d6/chapter_6/6-7/myproject/__init__.py


--------------------------------------------------------------------------------
/chapter_6/6-7/myproject/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # http://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
class Restaurant(scrapy.Item):
    """
    Information about a restaurant in Seoul.
    """
    name = scrapy.Field()
    address = scrapy.Field()
    phone = scrapy.Field()
    station = scrapy.Field()
    latitude = scrapy.Field()
    longitude = scrapy.Field()
21 | 
class Page(scrapy.Item):
    """
    A web page.
    """
    url = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()

    def __repr__(self):
        """
        Return the item's repr with ``content`` shortened so that log
        output does not become excessively long.

        Works on a copy, so the item itself is never modified. A missing or
        empty ``content`` is left as is instead of raising, since __repr__
        is called from logging and should not fail.
        """
        # Copy this page.
        p = Page(self)
        content = p.get('content')
        if content and len(content) > 100:
            # Drop everything after the first 100 characters.
            p['content'] = content[:100] + '...'
        # Return the repr of the copy.
        return super(Page, p).__repr__()


--------------------------------------------------------------------------------
/chapter_6/6-7/myproject/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | 
 8 | 
class MyprojectPipeline(object):
    """Placeholder pipeline that passes every item through untouched."""

    def process_item(self, item, spider):
        """Return ``item`` unchanged; ``spider`` is not used."""
        return item
12 | 


--------------------------------------------------------------------------------
/chapter_6/6-7/myproject/settings.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Scrapy settings for myproject project
 4 | #
 5 | # For simplicity, this file contains only settings considered important or
 6 | # commonly used. You can find more settings consulting the documentation:
 7 | #
 8 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 9 | #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 | 
12 | BOT_NAME = 'myproject'
13 | 
14 | SPIDER_MODULES = ['myproject.spiders']
15 | NEWSPIDER_MODULE = 'myproject.spiders'
16 | 
17 | 
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'myproject (+http://www.yourdomain.com)'
20 | 
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = True
23 | 
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 | 
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | DOWNLOAD_DELAY = 1   
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 | 
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 | 
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 | 
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | #   'Accept-Language': 'en',
45 | #}
46 | 
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | #    'myproject.middlewares.MyCustomSpiderMiddleware': 543,
51 | #}
52 | 
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | #    'myproject.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 | 
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | #    'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 | 
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | #ITEM_PIPELINES = {
68 | #    'myproject.pipelines.SomePipeline': 300,
69 | #}
70 | 
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 | 
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 | 


--------------------------------------------------------------------------------
/chapter_6/6-7/myproject/spiders/broad.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | from myproject.items import Page
 3 | from myproject.utils import get_content
 4 | 
 5 | class BroadSpider(scrapy.Spider):
 6 |     name = "broad"
 7 |     start_urls = (
 8 |         # 하테나 북마크 엔트리 페이지
 9 |         'http://b.hatena.ne.jp/entrylist',
10 |     )
11 |     
12 |     def parse(self, response):
13 |         """
14 |         하테나 북마크의 엔트리 페이지를 파싱합니다.
15 |         """
16 |         # 각각의 웹 페이지 링크를 추출합니다.
17 |         for url in response.css('a.entry-link::attr("href")').extract():
18 |             # parse_page() 메서드를 콜백 함수로 지정합니다.
19 |             yield scrapy.Request(url, callback=self.parse_page)
20 |         # of 뒤의 숫자를 두 자리로 지정해 5페이지(첫 페이지, 20, 40, 60, 80)만 추출하게 합니다.
21 |         url_more = response.css('a::attr("href")').re_first(r'.*\?of=\d{2}$')
22 |         if url_more:
23 |             # url_more의 값은 /entrylist로 시작하는 상대 URL이므로
24 |             # response.urljoiin() 메서드를 사용해 절대 URL로 변경합니다.
25 |             # 콜백 함수를 지정하지 않았으므로 응답은 기본적으로
26 |             # parse() 메서드에서 처리하게 됩니다.
27 |             yield scrapy.Request(response.urljoin(url_more))
28 |     
29 |     def parse_page(self, response):
30 |         """
31 |         각 페이지를 파싱합니다.
32 |         """
33 |         # utils.py에 정의돼 있는 get_content() 함수로 타이틀과 본문을 추출합니다.
34 |         title, content = get_content(response.text)
35 |         # Page 객체로 반환합니다.
36 |         yield Page(url=response.url, title=title, content=content)


--------------------------------------------------------------------------------
/chapter_6/6-7/myproject/spiders/visitseoul.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | from scrapy.spiders import CrawlSpider, Rule
 3 | from scrapy.linkextractors import LinkExtractor
 4 | from myproject.items import Restaurant
 5 | 
 6 | class VisitSeoulSpider(CrawlSpider):
 7 |     name = "visitseoul"
 8 |     allowed_domains = ["korean.visitseoul.net"]
 9 |     start_urls = ['http://korean.visitseoul.net/eat?curPage=1']
10 |     rules = [
11 |         # 9페이지까지 순회합니다.
12 |         # 정규 표현식 \d를 \d+로 지정하면 모든 페이지를 순회합니다.
13 |         Rule(LinkExtractor(allow=r'/eat\?curPage=\d$')),
14 |         # 음식점 상세 페이지를 분석합니다.
15 |         Rule(LinkExtractor(allow=r'/eat/\w+/\d+'),
16 |              callback='parse_restaurant'),
17 |     ]
18 | 
19 |     def parse_restaurant(self, response):
20 |         """
21 |         음식점 정보 페이지를 파싱합니다.
22 |         """
23 |         # 정보를 추출합니다.
24 |         name = response.css("#pageheader h3")\
25 |             .xpath("string()").extract_first().strip()
26 |         address = response.css("dt:contains('주소') + dd")\
27 |             .xpath("string()").extract_first().strip()
28 |         phone = response.css("dt:contains('전화번호') + dd")\
29 |             .xpath("string()").extract_first().strip()
30 |         station = response.css("th:contains('지하철') + td")\
31 |             .xpath("string()").extract_first().strip()
32 |         
33 |         # 위도 경도를 추출합니다.
34 |         try:
35 |             scripts = response.css("script:contains('var lat')").xpath("string()").extract_first()
36 |             latitude = re.findall(r"var lat = '(.+)'", scripts)[0]
37 |             longitude = re.findall(r"var lng = '(.+)'", scripts)[0]
38 |         except Exception as exception:
39 |             print("예외 발생")
40 |             print(exception)
41 |             print()
42 |             
43 |         # 음식점 객체를 생성합니다.
44 |         item = Restaurant(
45 |             name=name,
46 |             address=address,
47 |             phone=phone,
48 |             latitude=latitude,
49 |             longitude=longitude,
50 |             station=station
51 |         )
52 |         yield item


--------------------------------------------------------------------------------
/chapter_6/6-7/myproject/utils.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import lxml.html
 3 | import readability
 4 | # Readability의 DEBUG/INFO 수준의 로그를 출력하지 않게 합니다.
 5 | # Spider를 실행할 때 Readability의 로그가 많이 출력되므로
 6 | # 출력이 보기 힘들어지는 것을 막는 것입니다.
 7 | logging.getLogger('readability.readability').setLevel(logging.WARNING)
 8 | def get_content(html):
 9 |     """
10 |     HTML 문자열에서 (<제목>, <본문>) 형태의 튜플을 찾은 뒤 반환합니다. 
11 |     """
12 |     document = readability.Document(html)
13 |     content_html = document.summary()
14 |     # HTM 태그를 제거하고 텍스트만 추출합니다.
15 |     content_text = lxml.html.fromstring(content_html).text_content().strip()
16 |     short_title = document.short_title()
17 |     
18 |     return short_title, content_text


--------------------------------------------------------------------------------
/chapter_6/6-7/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = myproject.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = myproject
12 | 


--------------------------------------------------------------------------------
/chapter_6/6-8/extract_faces.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os
 3 | import cv2
 4 | 
 5 | try:
 6 |     # 얼굴 검출 전용 특징량 파일의 경로
 7 |     cascade_path = sys.argv[1]
 8 | except IndexError:
 9 |     # 명령어 매개변수가 부족한 경우에는 사용법을 출력하고 곧바로 종료합니다.
10 |     print('Usage: python extract_faces.py CASCADE_PATH IMAGE_PATH...', file=sys.stderr)
11 |     exit(1)
12 | 
13 | # 얼굴 이미지 출력 대상 디렉터리가 존재하지 않으면 생성해 둡니다.
14 | output_dir = 'faces'
15 | if not os.path.exists(output_dir):
16 |     os.makedirs(output_dir)
17 | 
18 | # 특징량 파일이 존재하는지 확인합니다.
19 | assert os.path.exists(cascade_path)
20 | # 특징량 파일의 경로를 지정해 분석 객체를 생성합니다.
21 | classifier = cv2.CascadeClassifier(cascade_path)
22 | 
23 | # 두 번째 이후의 매개변수 파일 경로를 반복 처리합니다.
24 | for image_path in sys.argv[2:]:
25 |     print('Processing', image_path, file=sys.stderr)
26 |     
27 |     # 명령어 매개변수에서 얻은 경로의 이미지 파일을 읽어 들입니다.
28 |     image = cv2.imread(image_path)
29 |     # 얼굴 검출을 빠르게 할 수 있게 이미지를 그레이스케일로 변환합니다.
30 |     gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
31 |     # 얼굴을 검출합니다.
32 |     faces = classifier.detectMultiScale(gray_image)
33 |     
34 |     # 이미지 파일 이름의 확장자를 제거합니다.
35 |     image_name = os.path.splitext(os.path.basename(image_path))[0]
36 |     
37 |     # 추출된 얼굴의 리스트를 반복 처리합니다.
38 |     # i는 0부터 시작되는 순번입니다.
39 |     for i, (x, y, w, h) in enumerate(faces):
40 |         # 얼굴 부분만 자릅니다.
41 |         face_image = image[y:y + h, x: x + w]
42 |         # 출력 대상 파일 경로를 생성합니다.
43 |         output_path = os.path.join(output_dir, '{0}_{1}.jpg'.format(image_name, i))
44 |         # 얼굴 이미지를 저장합니다.
45 |         cv2.imwrite(output_path, face_image)


--------------------------------------------------------------------------------
/chapter_6/6-8/myproject/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wikibook/python-for-web-scraping/ee720e5453456650e67febc3cb7ce2bdc21b46d6/chapter_6/6-8/myproject/__init__.py


--------------------------------------------------------------------------------
/chapter_6/6-8/myproject/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # http://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
class MyprojectItem(scrapy.Item):
    """Empty item template; no fields are declared yet."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
15 | 


--------------------------------------------------------------------------------
/chapter_6/6-8/myproject/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | 
 8 | 
class MyprojectPipeline(object):
    """Placeholder pipeline that passes every item through untouched."""

    def process_item(self, item, spider):
        """Return ``item`` unchanged; ``spider`` is not used."""
        return item
12 | 


--------------------------------------------------------------------------------
/chapter_6/6-8/myproject/settings.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Scrapy settings for myproject project
 4 | #
 5 | # For simplicity, this file contains only settings considered important or
 6 | # commonly used. You can find more settings consulting the documentation:
 7 | #
 8 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 9 | #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 | 
12 | BOT_NAME = 'myproject'
13 | 
14 | SPIDER_MODULES = ['myproject.spiders']
15 | NEWSPIDER_MODULE = 'myproject.spiders'
16 | 
17 | 
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'myproject (+http://www.yourdomain.com)'
20 | 
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = True
23 | 
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 | 
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | DOWNLOAD_DELAY = 1 
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 | 
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 | 
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 | 
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | #   'Accept-Language': 'en',
45 | #}
46 | 
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | #    'myproject.middlewares.MyCustomSpiderMiddleware': 543,
51 | #}
52 | 
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | #    'myproject.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 | 
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | #    'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 | 
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | #ITEM_PIPELINES = {
68 | #    'myproject.pipelines.SomePipeline': 300,
69 | #}
70 | 
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 | 
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 | 
# Directory where downloaded image files are stored
93 | FILES_STORE = 'images'
# Pipelines that process the items yielded by the spider
95 | ITEM_PIPELINES = {
96 |     'scrapy.pipelines.files.FilesPipeline': 1
97 | }
98 | 


--------------------------------------------------------------------------------
/chapter_6/6-8/myproject/spiders/flickr.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from urllib.parse import urlencode
 3 | import scrapy
 4 | class FlickrSpider(scrapy.Spider):
 5 |     name = "flickr"
 6 |     # Files Pipeline으로 다운로드하는 이미지 파일은 allowed_domains에
 7 |     # 제한을 받으므로 allowed_domains에 'staticflickr.com'을 추가해야 합니다
 8 |     allowed_domains = ["api.flickr.com"]
 9 |     
10 |     # 키워드 매개변수로 Spider 매개변수 값을 받습니다.
11 |     def __init__(self, text='sushi'):
12 |         # 부모 클래스의 __init__()을 실행합니다.
13 |         super().__init__()
14 |         # 환경변수와 Spider 매개변수 값을 사용해 start_urls를 조합합니다.
15 |         # urlencode() 함수는 매개변수로 지정한 dict의 키와 값을 URI 인코드해서
16 |         # key1=value1&key2=value2라는 문자열로 반환해 줍니다.
17 |         self.start_urls = [
18 |             'https://api.flickr.com/services/rest/?' + urlencode({
19 |                 'method': 'flickr.photos.search',
20 |                 'api_key': os.environ['FLICKR_API_KEY'],
21 |                 'text': text,
22 |                 'sort': 'relevance',
23 |                 # CC BY 2.0, CC BY-SA 2.0, CC0를 지정합니다.
24 |                 'license': '4,5,9',  
25 |             }),
26 |         ]
27 |     def parse(self, response):
28 |         """
29 |         API의 응답을 파싱해서 file_urls라는 키를 포함한 dict를 생성하고 yield합니다.
30 |         """
31 |         for photo in response.css('photo'):
32 |             yield {'file_urls': [flickr_photo_url(photo)]}
33 | 
def flickr_photo_url(photo):
    """
    Assemble the URL of a Flickr photo from the attributes of a <photo> node.
    Reference: https://www.flickr.com/services/api/misc.urls.html
    """
    # Attributes are read with XPath, which is simpler than CSS selectors here.
    parts = {}
    for attribute in ('farm', 'server', 'id', 'secret'):
        parts[attribute] = photo.xpath('@' + attribute).extract_first()
    template = 'https://farm{farm}.staticflickr.com/{server}/{id}_{secret}_{size}.jpg'
    return template.format(size='b', **parts)


--------------------------------------------------------------------------------
/chapter_6/6-8/myproject/utils.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | import lxml.html
 4 | import readability
 5 | 
 6 | 
 7 | logging.getLogger('readability.readability').setLevel(logging.WARNING)
 8 | 
 9 | 
def get_content(html):
    """Return a (short title, body text) tuple extracted from an HTML string."""
    doc = readability.Document(html)
    # Parse the summary HTML and keep only its stripped text content.
    summary_tree = lxml.html.fromstring(doc.summary())
    body_text = summary_tree.text_content().strip()
    return doc.short_title(), body_text
17 | 


--------------------------------------------------------------------------------
/chapter_6/6-8/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = myproject.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = myproject
12 | 


--------------------------------------------------------------------------------
/chapter_7/crawl.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | import re
 3 | import sys
 4 | 
 5 | import requests
 6 | import lxml.html
 7 | from pymongo import MongoClient
 8 | from redis import Redis
 9 | from rq import Queue
10 | 
def main():
    """
    Main routine of the crawler.

    Fetches the new-book list page, stores the HTML of every detail page not
    yet present in MongoDB, and enqueues a scraping job for each stored page.
    """
    q = Queue(connection=Redis())
    # Connect to MongoDB on localhost.
    client = MongoClient('localhost', 27017)
    # Use the ebook_htmls collection of the scraping database.
    collection = client.scraping.ebook_htmls
    # Unique index on key so lookups by key are fast.
    collection.create_index('key', unique=True)

    session = requests.Session()
    # Fetch the list page through the same session as the detail pages so the
    # connection (and any cookies) is reused; the original bypassed the
    # session here with a bare requests.get().
    response = session.get('http://www.hanbit.co.kr/store/books/new_book_list.html')
    # Extract the detail page URLs.
    urls = scrape_list_page(response)
    for url in urls:
        # Derive the key from the URL.
        key = extract_key(url)
        # Look the key up in MongoDB.
        ebook_html = collection.find_one({'key': key})
        # Crawl the detail page only when it is not stored yet.
        if not ebook_html:
            # Wait one second between requests.
            time.sleep(1)
            print('Fetching {0}'.format(url), file=sys.stderr)
            # Fetch the detail page.
            response = session.get(url)
            # Store the HTML in MongoDB.
            collection.insert_one({
                'url': url,
                'key': key,
                'html': response.content,
            })
            # Add a job to the queue.
            # result_ttl=0 keeps the task's return value from being stored.
            q.enqueue('scraper_tasks.scrape', key, result_ttl=0)
49 | 
def scrape_list_page(response):
    """
    Yield the detail page URLs found in the Response of the list page.
    """
    document = lxml.html.fromstring(response.content)
    # Resolve relative hrefs against the URL of the list page.
    document.make_links_absolute(response.url)
    anchors = document.cssselect('.view_box .book_tit a')
    yield from (anchor.get('href') for anchor in anchors)
59 | 
def extract_key(url):
    """
    Extract the key (the p_code value at the end of the URL) from a URL.

    Returns everything following ``p_code=``.
    Raises ValueError when the URL contains no p_code parameter, instead of
    failing with an AttributeError on the missing match object.
    """
    m = re.search(r"p_code=(.+)", url)
    if m is None:
        raise ValueError('No p_code found in URL: {0}'.format(url))
    return m.group(1)
66 | 
67 | if __name__ == '__main__':
68 |     main()


--------------------------------------------------------------------------------
/chapter_7/crawl_images.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import time
 3 | import requests
 4 | import lxml.html
 5 | import boto3
 6 | 
 7 | # S3 버킷 이름[자신이 생성한 버킷 이름으로 변경해 주세요]
 8 | S3_BUCKET_NAME = 'scraping-book'
 9 | 
def main():
    """Download the images listed on a Wikimedia Commons category page and store them in S3."""
    # Collect the image URLs from the Wikimedia Commons page.
    image_urls = get_image_urls('https://commons.wikimedia.org/wiki/Category:Mountain_glaciers')
    # Get the S3 Bucket object.
    bucket = boto3.resource('s3').Bucket(S3_BUCKET_NAME)

    for image_url in image_urls:
        # Wait two seconds before each download.
        time.sleep(2)

        # Download the image file.
        print('Downloading', image_url, file=sys.stderr)
        download = requests.get(image_url)

        # The file name is the last path segment of the URL.
        _directory, filename = image_url.rsplit('/', maxsplit=1)

        # Store the downloaded file in S3.
        print('Putting', filename, file=sys.stderr)
        bucket.put_object(Key=filename, Body=download.content)
31 | 
def get_image_urls(page_url):
    """
    Return the original image URLs of the thumbnails shown on the given page.
    """
    page = requests.get(page_url)
    tree = lxml.html.fromstring(page.text)
    # Map the src of every thumbnail <img> to its original image URL.
    return [
        get_original_url(thumbnail.get('src'))
        for thumbnail in tree.cssselect('.thumb img')
    ]
45 | 
def get_original_url(thumbnail_url):
    """
    Derive the original image URL from a thumbnail URL.
    """
    # Keep everything before the last "/" (the directory part of the URL);
    # rindex() raises ValueError when the URL contains no "/".
    last_slash = thumbnail_url.rindex('/')
    directory_url = thumbnail_url[:last_slash]
    # Replacing /thumb/ with / gives the original image URL.
    return directory_url.replace('/thumb/', '/')
55 | 
56 | if __name__ == '__main__':
57 |     main()


--------------------------------------------------------------------------------
/chapter_7/crawl_with_aiohttp.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import asyncio
 3 | 
 4 | import aiohttp
 5 | import feedparser
 6 | from bs4 import BeautifulSoup
 7 | 
 8 | # 최대 동시 다운로드 수를 3개로 제한하기 위한 세마포어를 생성합니다.
 9 | semaphore = asyncio.Semaphore(3)
10 | 
async def main():
    """Fetch every entry of the r/python RSS feed concurrently and print its URL and title."""
    # Collect the entry URLs from the RSS feed.
    d = feedparser.parse('http://www.reddit.com/r/python/.rss')
    urls = [entry.link for entry in d.entries]
    # Create the session. ClientSession must be entered with "async with";
    # current aiohttp releases reject the plain "with" form.
    async with aiohttp.ClientSession() as session:
        # One coroutine per URL.
        coroutines = [fetch_and_scrape(session, url) for url in urls]
        # Handle the results in the order the coroutines complete.
        for coroutine in asyncio.as_completed(coroutines):
            # Print the result; the URL is shortened to keep the output compact.
            output = await coroutine
            output['url'] = output['url'].replace('https://www.reddit.com/r/Python/comments', '')
            print(output)
28 | 
async def fetch_and_scrape(session, url):
    """
    Download ``url`` and return a dict with the URL and the page title.

    ``session`` is the aiohttp client session used for the request. The
    download is performed while holding the module-level ``semaphore``, so
    the number of simultaneous downloads is bounded by it.
    """
    # ``async with`` replaces ``with await semaphore``, which was removed
    # from asyncio in Python 3.9. Wait here until a slot becomes free.
    async with semaphore:
        print('Start downloading',
              url.replace('https://www.reddit.com/r/Python/comments', ''),
              file=sys.stderr)
        # Send the request asynchronously; using the response as a context
        # manager releases the connection even if reading the body fails.
        async with session.get(url) as response:
            # Read the response body asynchronously.
            body = await response.read()
    # Parsing does not need a download slot, so it happens after release.
    soup = BeautifulSoup(body, 'lxml')
    return {
        'url': url,
        'title': soup.title.text.strip(),
    }
46 |     
if __name__ == "__main__":
    # Obtain the event loop, then block until main() has run to completion.
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(main())


--------------------------------------------------------------------------------
/chapter_7/crawl_with_multi_thread.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from concurrent.futures import ThreadPoolExecutor
 3 | import feedparser
 4 | import requests
 5 | from bs4 import BeautifulSoup
 6 | 
 7 | def main():
 8 |     # URL을 추출합니다.
 9 |     d = feedparser.parse('http://www.aladin.co.kr/rss/special_new/351')
10 |     urls = [entry.link for entry in d.entries]
11 |     # 최대 3개의 스레드로 병렬 처리하는 Executor를 생성합니다.
12 |     executer = ThreadPoolExecutor(max_workers=3)
13 |     # Future 객체를 저장할 리스트를 선언합니다.
14 |     futures = []
15 |     for url in urls:
16 |         # 함수의 실행을 스케줄링하고, Future 객체를 저장합니다.
17 |         # submit()의 두 번째 이후 매개변수는 getch_and_scrape() 함수의 매개변수로써 전달됩니다.
18 |         future = executer.submit(fetch_and_scrape, url)
19 |         futures.append(future)
20 |     
21 |     for future in futures:
22 |         # Future 객체의 결과를 출력합니다.
23 |         print(future.result())
24 | 
def fetch_and_scrape(url):
    """
    Follow an RSS entry link to its book page and return the page title.

    Returns a dict with the original ``url`` and the stripped ``title`` of
    the book page.
    """
    # First hop: the page the RSS entry points at.
    print('Parse Link', url.split('itemId=')[-1], file=sys.stderr)
    entry_page = BeautifulSoup(requests.get(url).content, 'lxml')
    # The real book URL is taken from the last line of the <noscript> text;
    # the RSS link does not lead to the book page directly.
    noscript_lines = entry_page.select_one('noscript').text.strip().split('\n')
    book_url = noscript_lines[-1]

    # Second hop: the book page itself.
    print('Parse Book Link', book_url.split('ISBN=')[-1], file=sys.stderr)
    book_page = BeautifulSoup(requests.get(book_url).content, 'lxml')
    title = book_page.title.text.strip()
    return {'url': url, 'title': title}
44 | 
# Run the crawler only when this file is executed as a script.
if __name__ == "__main__":
    main()


--------------------------------------------------------------------------------
/chapter_7/enqueue.py:
--------------------------------------------------------------------------------
 1 | from redis import Redis
 2 | from rq import Queue
 3 | from tasks import add
 4 | 
 5 | # localhost의 TCP 포트 6379에 있는 Redis에 접속합니다.
 6 | # 이러한 매개변수는 기본값이므로 생략해도 됩니다.
 7 | conn = Redis('localhost', 6379)
 8 | 
 9 | # default라는 이름의 Queue 객체를 추출합니다.
10 | # 이 이름도 기본값이므로 생략해도 됩니다
11 | q = Queue('default', connection=conn)
12 | 
13 | # 함수와 매개변수를 지정하고 잡을 추가합니다.
14 | q.enqueue(add, 3, 4)


--------------------------------------------------------------------------------
/chapter_7/scraper_tasks.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import lxml.html
 3 | from pymongo import MongoClient
 4 | 
 5 | def scrape(key):
 6 |     """
 7 |     워커로 실행할 대상
 8 |     """
 9 |     # 로컬 호스트의 MongoDB에 접속합니다.
10 |     client = MongoClient('localhost', 27017)
11 | 
12 |     # scraping 데이터베이스의 ebook_htmls 콜렉션을 추출합니다.
13 |     html_collection = client.scraping.ebook_htmls
14 | 
15 |     # MongoDB에서 key에 해당하는 데이터를 찾습니다.
16 |     ebook_html = html_collection.find_one({'key': key})
17 |     ebook = scrape_detail_page(key, ebook_html['url'], ebook_html['html'])
18 | 
19 |     # ebooks 콜렉션을 추출합니다.
20 |     ebook_collection = client.scraping.ebooks
21 | 
22 |     # key로 빠르게 검색할 수 있게 유니크 인덱스를 생성합니다.
23 |     ebook_collection.create_index('key', unique=True)
24 | 
25 |     # ebook을 저장합니다.
26 |     ebook_collection.insert_one(ebook)
27 | 
def scrape_detail_page(key, url, html):
    """
    Extract book information from a detail page as a dict.

    ``key`` and ``url`` are copied into the result unchanged; ``html`` is
    the page markup that title, price and content are selected from.
    The returned dict has the keys 'url', 'key', 'title', 'price' and
    'content' (a list of non-empty, whitespace-normalized paragraphs).
    """
    root = lxml.html.fromstring(html)
    # Normalize each paragraph once; empty results are filtered out below.
    paragraphs = (normalize_spaces(p.text_content())
                  for p in root.cssselect('#tabs_3 .hanbit_edit_view p'))
    ebook = {
        # Use the url parameter: no ``response`` object exists in this function.
        'url': url,
        'key': key,
        'title': root.cssselect('.store_product_info_box h3')[0].text_content(),
        'price': root.cssselect('.pbr strong')[0].text_content(),
        'content': [text for text in paragraphs if text != ""],
    }
    return ebook
43 | 
def normalize_spaces(s):
    """
    Collapse every run of whitespace in ``s`` into a single space and
    remove leading and trailing whitespace.
    """
    collapsed = re.sub(r'\s+', ' ', s)
    return collapsed.strip()


--------------------------------------------------------------------------------
/chapter_7/slow_jobs_async.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | 
 3 | async def slow_job(n):
 4 |     """
 5 |     매개변수로 지정한 시간 만큼 시간이 걸리는 처리를
 6 |     비동기적으로 수행하는 코루틴입니다.
 7 |     asyncio.sleep()을 사용해 시간이 걸리는 처리를 비슷하게 재현해 봤습니다.
 8 |     """
 9 |     print('Job {0} will take {0} seconds'.format(n))
10 |     # n초 동안 정지
11 |     # await는 처리가 끝날 때까지 대기하는 구문입니다.
12 |     await asyncio.sleep(n) 
13 |     print('Job {0} finished'.format(n))
14 | 
# Create the three coroutines. They do not start running at this point.
coroutines = [slow_job(1), slow_job(2), slow_job(3)]


async def _run_all(jobs):
    """Run the given coroutines concurrently and wait for all of them."""
    # gather() accepts bare coroutines; asyncio.wait() rejects them on
    # Python 3.11 and later.
    await asyncio.gather(*jobs)


# Create an event loop explicitly: asyncio.get_event_loop() is deprecated
# when no loop is running.
loop = asyncio.new_event_loop()
try:
    # Run the three coroutines on the loop; this line blocks until all finish.
    loop.run_until_complete(_run_all(coroutines))
finally:
    loop.close()


--------------------------------------------------------------------------------
/chapter_7/slow_jobs_sync.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | 
 3 | def slow_job(n):
 4 |     """
 5 |     매개변수로 지정한 시간 만큼 시간이 걸리는 처리를 수행하는 함수입니다.
 6 |     time.sleep()을 사용해 시간이 걸리는 처리를 비슷하게 재현해 봤습니다.
 7 |     """
 8 |     print('Job {0} will take {0} seconds'.format(n))
 9 |     # n초 대기합니다.
10 |     time.sleep(n)
11 |     print('Job {0} finished'.format(n))
12 | 
# Run the jobs one after another; each call blocks until it is done.
for seconds in (1, 2, 3):
    slow_job(seconds)


--------------------------------------------------------------------------------
/chapter_7/tasks.py:
--------------------------------------------------------------------------------
def add(x, y):
    """Print the sum of ``x`` and ``y``."""
    total = x + y
    print(total)


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # 파이썬을 이용한 웹 크롤링과 스크레이핑
2 | 
3 | 『파이썬을 이용한 웹 크롤링과 스크레이핑』의 예제 파일입니다.


--------------------------------------------------------------------------------