├── tests
│   └── __init__.py
├── web_crawler
│   ├── __init__.py
│   ├── drop_collection.py
│   └── crawler.py
├── .gitignore
├── requirements.txt
├── docker-compose-db.yml
├── config.py
├── manage.py
├── search_engine
│   ├── __init__.py
│   └── templates
│       └── index.html
├── constraints.txt
└── README.md

/tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/web_crawler/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
__pycache__/
*.py[cod]

# To hide mongo url
.idea/
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
flask
janome
pymongo
requests
beautifulsoup4
--------------------------------------------------------------------------------
/docker-compose-db.yml:
--------------------------------------------------------------------------------
mongo:
  image: mongo
  container_name: pysearch_mongo
  ports:
    - "27017:27017"
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
import os

# application settings
MONGO_URL = 'MONGO_URL'

# Generate a random secret key
SECRET_KEY = os.urandom(24)
CSRF_ENABLED = True
--------------------------------------------------------------------------------
/web_crawler/drop_collection.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
__author__ = 'masashi'

from web_crawler.crawler import col


def drop_collection():
    """Drop the "Index" collection so the crawler can rebuild it from scratch."""
    col.drop()


if __name__ == '__main__':
    drop_collection()
--------------------------------------------------------------------------------
/manage.py:
--------------------------------------------------------------------------------
import argparse
__author__ = 'c-bata'


if __name__ == '__main__':
    parser = argparse.ArgumentParser("Runner")
    parser.add_argument('action', type=str, help="Target to run: 'crawler' or 'webpage'")
    args = parser.parse_args()

    if args.action == 'crawler':
        from web_crawler.crawler import crawl_web
        crawl_web('http://docs.sphinx-users.jp/contents.html', 2)
    elif args.action == 'webpage':
        from search_engine import app
        app.run(debug=True, port=9000)
    else:
        raise ValueError('Please select "crawler" or "webpage".')
--------------------------------------------------------------------------------
/search_engine/__init__.py:
--------------------------------------------------------------------------------
from urllib.parse import urlparse
from flask import Flask, render_template, request
from pymongo import MongoClient

app = Flask(__name__)
app.config.from_object('config')

# DB settings: the database name is the path component of MONGO_URL
MONGO_URL = app.config['MONGO_URL']
client = MongoClient(MONGO_URL)
db = client[urlparse(MONGO_URL).path[1:]]
col = db["Index"]


@app.route('/', methods=['GET', 'POST'])
def index():
    """Return index.html.

    On POST, look the submitted keyword up in the "Index" collection and
    pass the matching entry to the template; otherwise just show the form.
    """
    if request.method == 'POST':
        keyword = request.form['keyword']
        if keyword:
            return render_template(
                'index.html',
                query=col.find_one({'keyword': keyword}),
                keyword=keyword)
    return render_template('index.html')
--------------------------------------------------------------------------------
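Note that `config.py` ships with the placeholder value `'MONGO_URL'`, and both `search_engine/__init__.py` above and `web_crawler/crawler.py` below derive the database name from the path component of that URL, so the placeholder has to be replaced with a real connection string before the app or the crawler will run. A minimal sketch of what a working value could look like, assuming the Mongo container from `docker-compose-db.yml` and a made-up database name `searchengine`:

```
from urllib.parse import urlparse

# Hypothetical connection string; "searchengine" is an example database name,
# not something defined anywhere in this repository.
MONGO_URL = 'mongodb://localhost:27017/searchengine'

# This mirrors how search_engine/__init__.py and web_crawler/crawler.py pick
# the database: everything after the leading "/" of the URL path.
print(urlparse(MONGO_URL).path[1:])  # -> 'searchengine'
```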
""" 19 | if request.method == 'POST': 20 | keyword = request.form['keyword'] 21 | if keyword: 22 | return render_template( 23 | 'index.html', 24 | query=col.find_one({'keyword': keyword}), 25 | keyword=keyword) 26 | return render_template('index.html') 27 | -------------------------------------------------------------------------------- /constraints.txt: -------------------------------------------------------------------------------- 1 | appnope==0.1.0 2 | backports-abc==0.4 3 | backports.shutil-get-terminal-size==1.0.0 4 | backports.ssl-match-hostname==3.5.0.1 5 | certifi==2016.2.28 6 | configparser==3.3.0.post2 7 | decorator==4.0.9 8 | entrypoints==0.2.1 9 | functools32==3.2.3.post2 10 | gnureadline==6.3.3 11 | ipykernel==4.3.1 12 | ipython==4.2.0 13 | ipython-genutils==0.1.0 14 | ipywidgets==5.1.4 15 | Jinja2==2.8 16 | jsonschema==2.5.1 17 | jupyter==1.0.0 18 | jupyter-client==4.2.2 19 | jupyter-console==4.1.1 20 | jupyter-core==4.1.0 21 | MarkupSafe==0.23 22 | mercurial==3.7.3 23 | mistune==0.7.2 24 | nbconvert==4.2.0 25 | nbformat==4.0.1 26 | notebook==4.2.0 27 | pathlib2==2.1.0 28 | pexpect==4.0.1 29 | pickleshare==0.7.2 30 | pluggy==0.3.1 31 | ptyprocess==0.5.1 32 | py==1.4.31 33 | Pygments==2.1.3 34 | pyzmq==15.2.0 35 | qtconsole==4.2.1 36 | redis==2.10.5 37 | simplegeneric==0.8.1 38 | singledispatch==3.4.0.3 39 | six==1.10.0 40 | terminado==0.6 41 | tornado==4.3 42 | tox==2.3.1 43 | traitlets==4.2.1 44 | vboxapi==1.0 45 | virtualenv==14.0.6 46 | widgetsnbextension==1.2.2 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Search Engine and Web Crawler in Python 2 | 3 | ![Screenshot](https://qiita-image-store.s3.amazonaws.com/0/29989/786c36ad-4de7-43a7-75a0-98c82e412fa3.png "Screenshot") 4 | 5 | - Implement a web crawler 6 | - japanese morphological analysis using [janome](https://github.com/mocobeta/janome) 7 | - Implement search engine 8 | - Store in MongoDB 9 | - Web frontend using [Flask](http://flask.pocoo.org/) 10 | 11 | More details are avairable from [My Tech Blog(Japanese)](http://nwpct1.hatenablog.com/entry/python-search-engine). 12 | 13 | ## Requirements 14 | 15 | - Python 3.5 16 | 17 | ## Setup 18 | 19 | 1. Clone repository 20 | 21 | ``` 22 | $ git clone git@github.com:mejiro/SearchEngine.git 23 | ``` 24 | 25 | 2. Install python packages 26 | 27 | ``` 28 | $ cd SearchEngine 29 | $ pip install -r requirements.txt -c constraints.txt 30 | ``` 31 | 32 | 3. MongoDB settings 33 | 4. Run 34 | 35 | ``` 36 | $ python manage.py crawler # build a index 37 | $ python manage.py webpage # access to http://127.0.0.1:5000 38 | ``` 39 | -------------------------------------------------------------------------------- /search_engine/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Pythonで作る検索エンジン 8 | 9 | 10 | 11 |
/search_engine/templates/index.html:
--------------------------------------------------------------------------------
<!DOCTYPE html>
<html lang="ja">
<head>
    <meta charset="UTF-8">
    <title>Pythonで作る検索エンジン</title>
</head>
<body>
<div class="container">
    <h1>Pythonで作る検索エンジン</h1>

    <form method="post" action="/">
        <input type="text" name="keyword">
        <button type="submit">検索</button>
    </form>

    {% if keyword %}
    <h2>「{{keyword}}」の検索結果...</h2>
    {% if query %}
    <ul>
        {% for url in query.url %}
        <li><a href="{{ url }}">{{ url }}</a></li>
        {% endfor %}
    </ul>
    {% endif %}
    {% else %}
    <p>キーワードを入れて検索ボタンを押して下さい.</p>
    {% endif %}
</div>
</body>
</html>
--------------------------------------------------------------------------------
/web_crawler/crawler.py:
--------------------------------------------------------------------------------
import requests
from urllib.parse import urljoin, urlparse

from pymongo import MongoClient
from janome.tokenizer import Tokenizer
from bs4 import BeautifulSoup

from config import MONGO_URL

client = MongoClient(MONGO_URL)
db = client[urlparse(MONGO_URL).path[1:]]
col = db["Index"]

# Reuse a single Tokenizer; constructing one per call is very slow.
tokenizer = Tokenizer()


def _split_to_word(text):
    """Japanese morphological analysis with janome.

    Split the text and return the list of surface forms.
    """
    return [token.surface for token in tokenizer.tokenize(text)]


def _get_page(url):
    r = requests.get(url)
    if r.status_code == 200:
        return r.text
    return None


def _extract_url_links(html):
    """Extract the href targets of all links in the page.

    >>> _extract_url_links('aa<a href="link1">link1</a>bb<a href="link2">link2</a>cc')
    ['link1', 'link2']
    """
    soup = BeautifulSoup(html, "html.parser")
    return [a['href'] for a in soup.find_all('a', href=True)]


def add_to_index(keyword, url):
    entry = col.find_one({'keyword': keyword})
    if entry:
        if url not in entry['url']:
            entry['url'].append(url)
            col.replace_one({'_id': entry['_id']}, entry)
        return
    # not found, add new keyword to index
    col.insert_one({'keyword': keyword, 'url': [url]})


def add_page_to_index(url, html):
    body_soup = BeautifulSoup(html, "html.parser").find('body')
    if body_soup is None:
        return
    for child_tag in body_soup.findChildren():
        if child_tag.name == 'script':
            continue
        for line in child_tag.text.split('\n'):
            line = line.strip()
            for word in _split_to_word(line):
                add_to_index(word, url)


def crawl_web(seed, max_depth):
    to_crawl = {seed}
    crawled = []
    next_depth = set()
    depth = 0
    while to_crawl and depth <= max_depth:
        page_url = to_crawl.pop()
        if page_url not in crawled:
            html = _get_page(page_url)
            if html:
                add_page_to_index(page_url, html)
                # Resolve relative links against the current page and queue
                # them for the next depth level.
                next_depth |= {urljoin(page_url, link)
                               for link in _extract_url_links(html)}
            crawled.append(page_url)
        if not to_crawl:
            to_crawl, next_depth = next_depth, set()
            depth += 1
--------------------------------------------------------------------------------