├── tests
│   └── __init__.py
├── web_crawler
│   ├── __init__.py
│   ├── drop_collection.py
│   └── crawler.py
├── .gitignore
├── requirements.txt
├── docker-compose-db.yml
├── config.py
├── manage.py
├── search_engine
│   ├── __init__.py
│   └── templates
│       └── index.html
├── constraints.txt
└── README.md

/tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/web_crawler/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
__pycache__/
*.py[cod]

# To hide mongo url
.idea/
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
flask
janome
pymongo
requests
beautifulsoup4
--------------------------------------------------------------------------------
/docker-compose-db.yml:
--------------------------------------------------------------------------------
mongo:
  image: mongo
  container_name: pysearch_mongo
  ports:
    - "27017:27017"
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
import os

# application settings
MONGO_URL = 'MONGO_URL'

# Generate a random secret key
SECRET_KEY = os.urandom(24)
CSRF_ENABLED = True
--------------------------------------------------------------------------------
/web_crawler/drop_collection.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
__author__ = 'masashi'

from web_crawler.crawler import col


def drop_collection():
    """Drop the "Index" collection so the crawler can rebuild it from scratch."""
    col.drop()


if __name__ == '__main__':
    drop_collection()
--------------------------------------------------------------------------------
/manage.py:
--------------------------------------------------------------------------------
import argparse
__author__ = 'c-bata'


if __name__ == '__main__':
    parser = argparse.ArgumentParser("Runner")
    parser.add_argument('action', type=str, help="Target to run: 'crawler' or 'webpage'")
    args = parser.parse_args()

    if args.action == 'crawler':
        from web_crawler.crawler import crawl_web
        crawl_web('http://docs.sphinx-users.jp/contents.html', 2)
    elif args.action == 'webpage':
        from search_engine import app
        app.run(debug=True, port=9000)
    else:
        raise ValueError('Please select "crawler" or "webpage".')
--------------------------------------------------------------------------------
/search_engine/__init__.py:
--------------------------------------------------------------------------------
from urllib.parse import urlparse
from flask import Flask, render_template, request
from pymongo import MongoClient

app = Flask(__name__)
app.config.from_object('config')

# DB settings: the database name is the path component of MONGO_URL
MONGO_URL = app.config['MONGO_URL']
client = MongoClient(MONGO_URL)
db = client[urlparse(MONGO_URL).path[1:]]
col = db["Index"]


@app.route('/', methods=['GET', 'POST'])
def index():
    """Return index.html.

    On POST, look the submitted keyword up in the "Index" collection and
    pass the matching entry to the template; otherwise just show the form.
    """
    if request.method == 'POST':
        keyword = request.form['keyword']
        if keyword:
            return render_template(
                'index.html',
                query=col.find_one({'keyword': keyword}),
                keyword=keyword)
    return render_template('index.html')
--------------------------------------------------------------------------------
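Note that `config.py` ships with the placeholder value `'MONGO_URL'`, and both `search_engine/__init__.py` above and `web_crawler/crawler.py` below derive the database name from the path component of that URL, so the placeholder has to be replaced with a real connection string before the app or the crawler will run. A minimal sketch of what a working value could look like, assuming the Mongo container from `docker-compose-db.yml` and a made-up database name `searchengine`:

```
from urllib.parse import urlparse

# Hypothetical connection string; "searchengine" is an example database name,
# not something defined anywhere in this repository.
MONGO_URL = 'mongodb://localhost:27017/searchengine'

# This mirrors how search_engine/__init__.py and web_crawler/crawler.py pick
# the database: everything after the leading "/" of the URL path.
print(urlparse(MONGO_URL).path[1:])  # -> 'searchengine'
```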
""" 19 | if request.method == 'POST': 20 | keyword = request.form['keyword'] 21 | if keyword: 22 | return render_template( 23 | 'index.html', 24 | query=col.find_one({'keyword': keyword}), 25 | keyword=keyword) 26 | return render_template('index.html') 27 | -------------------------------------------------------------------------------- /constraints.txt: -------------------------------------------------------------------------------- 1 | appnope==0.1.0 2 | backports-abc==0.4 3 | backports.shutil-get-terminal-size==1.0.0 4 | backports.ssl-match-hostname==3.5.0.1 5 | certifi==2016.2.28 6 | configparser==3.3.0.post2 7 | decorator==4.0.9 8 | entrypoints==0.2.1 9 | functools32==3.2.3.post2 10 | gnureadline==6.3.3 11 | ipykernel==4.3.1 12 | ipython==4.2.0 13 | ipython-genutils==0.1.0 14 | ipywidgets==5.1.4 15 | Jinja2==2.8 16 | jsonschema==2.5.1 17 | jupyter==1.0.0 18 | jupyter-client==4.2.2 19 | jupyter-console==4.1.1 20 | jupyter-core==4.1.0 21 | MarkupSafe==0.23 22 | mercurial==3.7.3 23 | mistune==0.7.2 24 | nbconvert==4.2.0 25 | nbformat==4.0.1 26 | notebook==4.2.0 27 | pathlib2==2.1.0 28 | pexpect==4.0.1 29 | pickleshare==0.7.2 30 | pluggy==0.3.1 31 | ptyprocess==0.5.1 32 | py==1.4.31 33 | Pygments==2.1.3 34 | pyzmq==15.2.0 35 | qtconsole==4.2.1 36 | redis==2.10.5 37 | simplegeneric==0.8.1 38 | singledispatch==3.4.0.3 39 | six==1.10.0 40 | terminado==0.6 41 | tornado==4.3 42 | tox==2.3.1 43 | traitlets==4.2.1 44 | vboxapi==1.0 45 | virtualenv==14.0.6 46 | widgetsnbextension==1.2.2 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Search Engine and Web Crawler in Python 2 | 3 | ![Screenshot](https://qiita-image-store.s3.amazonaws.com/0/29989/786c36ad-4de7-43a7-75a0-98c82e412fa3.png "Screenshot") 4 | 5 | - Implement a web crawler 6 | - japanese morphological analysis using [janome](https://github.com/mocobeta/janome) 7 | - Implement search engine 8 | - Store in MongoDB 9 | - Web frontend using [Flask](http://flask.pocoo.org/) 10 | 11 | More details are avairable from [My Tech Blog(Japanese)](http://nwpct1.hatenablog.com/entry/python-search-engine). 12 | 13 | ## Requirements 14 | 15 | - Python 3.5 16 | 17 | ## Setup 18 | 19 | 1. Clone repository 20 | 21 | ``` 22 | $ git clone git@github.com:mejiro/SearchEngine.git 23 | ``` 24 | 25 | 2. Install python packages 26 | 27 | ``` 28 | $ cd SearchEngine 29 | $ pip install -r requirements.txt -c constraints.txt 30 | ``` 31 | 32 | 3. MongoDB settings 33 | 4. Run 34 | 35 | ``` 36 | $ python manage.py crawler # build a index 37 | $ python manage.py webpage # access to http://127.0.0.1:5000 38 | ``` 39 | -------------------------------------------------------------------------------- /search_engine/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Pythonで作る検索エンジン 8 | 9 | 10 | 11 |
/search_engine/templates/index.html:
--------------------------------------------------------------------------------
<!DOCTYPE html>
<html lang="ja">
<head>
    <meta charset="UTF-8">
    <title>Pythonで作る検索エンジン</title>
</head>
<body>
<div class="container">
    <h1>Pythonで作る検索エンジン</h1>

    <form method="post" action="/">
        <input type="text" name="keyword">
        <button type="submit">検索</button>
    </form>

    {% if keyword %}
    <h2>「{{keyword}}」の検索結果...</h2>
    {% if query %}
    <ul>
        {% for url in query.url %}
        <li><a href="{{ url }}">{{ url }}</a></li>
        {% endfor %}
    </ul>
    {% endif %}
    {% else %}
    <p>キーワードを入れて検索ボタンを押して下さい.</p>
    {% endif %}
</div>
</body>
</html>
--------------------------------------------------------------------------------
/web_crawler/crawler.py:
--------------------------------------------------------------------------------
import requests
from urllib.parse import urljoin, urlparse

from pymongo import MongoClient
from janome.tokenizer import Tokenizer
from bs4 import BeautifulSoup

from config import MONGO_URL

client = MongoClient(MONGO_URL)
db = client[urlparse(MONGO_URL).path[1:]]
col = db["Index"]

# Reuse a single Tokenizer; constructing one per call is very slow.
tokenizer = Tokenizer()


def _split_to_word(text):
    """Japanese morphological analysis with janome.

    Split the text and return the list of surface forms.
    """
    return [token.surface for token in tokenizer.tokenize(text)]


def _get_page(url):
    r = requests.get(url)
    if r.status_code == 200:
        return r.text
    return None


def _extract_url_links(html):
    """Extract the href targets of all links in the page.

    >>> _extract_url_links('aa<a href="link1">link1</a>bb<a href="link2">link2</a>cc')
    ['link1', 'link2']
    """
    soup = BeautifulSoup(html, "html.parser")
    return [a['href'] for a in soup.find_all('a', href=True)]


def add_to_index(keyword, url):
    entry = col.find_one({'keyword': keyword})
    if entry:
        if url not in entry['url']:
            entry['url'].append(url)
            col.replace_one({'_id': entry['_id']}, entry)
        return
    # not found, add new keyword to index
    col.insert_one({'keyword': keyword, 'url': [url]})


def add_page_to_index(url, html):
    body_soup = BeautifulSoup(html, "html.parser").find('body')
    if body_soup is None:
        return
    for child_tag in body_soup.findChildren():
        if child_tag.name == 'script':
            continue
        for line in child_tag.text.split('\n'):
            line = line.strip()
            for word in _split_to_word(line):
                add_to_index(word, url)


def crawl_web(seed, max_depth):
    to_crawl = {seed}
    crawled = []
    next_depth = set()
    depth = 0
    while to_crawl and depth <= max_depth:
        page_url = to_crawl.pop()
        if page_url not in crawled:
            html = _get_page(page_url)
            if html:
                add_page_to_index(page_url, html)
                # Resolve relative links against the current page and queue
                # them for the next depth level.
                next_depth |= {urljoin(page_url, link)
                               for link in _extract_url_links(html)}
            crawled.append(page_url)
        if not to_crawl:
            to_crawl, next_depth = next_depth, set()
            depth += 1
--------------------------------------------------------------------------------