├── tests └── __init__.py ├── web_crawler ├── __init__.py ├── drop_collection.py └── crawler.py ├── .gitignore ├── requirements.txt ├── docker-compose-db.yml ├── config.py ├── manage.py ├── search_engine ├── __init__.py └── templates │ └── index.html ├── constraints.txt └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /web_crawler/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | 4 | # To hide mongo url 5 | .idea/ -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | flask 2 | janome 3 | pymongo 4 | requests 5 | beautifulsoup4 6 | -------------------------------------------------------------------------------- /docker-compose-db.yml: -------------------------------------------------------------------------------- 1 | mongo: 2 | image: mongo 3 | container_name: pysearch_mongo 4 | ports: 5 | - "27017:27017" 6 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # application settings 4 | MONGO_URL = 'MONGO_URL' 5 | 6 | # Generate a random secret key 7 | SECRET_KEY = os.urandom(24) 8 | CSRF_ENABLED = True 9 | -------------------------------------------------------------------------------- /web_crawler/drop_collection.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'masashi' 3 | 4 | from web_crawler import collection 
as col 5 | 6 | 7 | def drop_collection(): 8 | col.drop() 9 | 10 | if __name__ == '__main__': 11 | drop_collection() 12 | -------------------------------------------------------------------------------- /manage.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | __author__ = 'c-bata' 3 | 4 | 5 | if __name__ == '__main__': 6 | parser = argparse.ArgumentParser("Runner") 7 | parser.add_argument('action', type=str, nargs=None, help="Select target 'crawler' or 'webpage'?") 8 | args = parser.parse_args() 9 | 10 | if args.action == 'crawler': 11 | from web_crawler.crawler import crawl_web 12 | crawl_web('http://docs.sphinx-users.jp/contents.html', 2) 13 | elif args.action == 'webpage': 14 | from search_engine import app 15 | app.run(debug=True, port=9000) 16 | else: 17 | raise ValueError('Please select "crawler" or "webpage".') 18 | -------------------------------------------------------------------------------- /search_engine/__init__.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse 2 | from flask import Flask, render_template, request 3 | from pymongo import MongoClient 4 | 5 | app = Flask(__name__) 6 | app.config.from_object('config') 7 | 8 | # DB settings 9 | MONGO_URL = app.config['MONGO_URL'] 10 | client = MongoClient(MONGO_URL) 11 | db = client[urlparse(MONGO_URL).path[1:]] 12 | col = db["Index"] 13 | 14 | 15 | @app.route('/', methods=['GET', 'POST']) 16 | def index(): 17 | """Return index.html 18 | """ 19 | if request.method == 'POST': 20 | keyword = request.form['keyword'] 21 | if keyword: 22 | return render_template( 23 | 'index.html', 24 | query=col.find_one({'keyword': keyword}), 25 | keyword=keyword) 26 | return render_template('index.html') 27 | -------------------------------------------------------------------------------- /constraints.txt: -------------------------------------------------------------------------------- 1 | 
appnope==0.1.0 2 | backports-abc==0.4 3 | backports.shutil-get-terminal-size==1.0.0 4 | backports.ssl-match-hostname==3.5.0.1 5 | certifi==2016.2.28 6 | configparser==3.3.0.post2 7 | decorator==4.0.9 8 | entrypoints==0.2.1 9 | functools32==3.2.3.post2 10 | gnureadline==6.3.3 11 | ipykernel==4.3.1 12 | ipython==4.2.0 13 | ipython-genutils==0.1.0 14 | ipywidgets==5.1.4 15 | Jinja2==2.8 16 | jsonschema==2.5.1 17 | jupyter==1.0.0 18 | jupyter-client==4.2.2 19 | jupyter-console==4.1.1 20 | jupyter-core==4.1.0 21 | MarkupSafe==0.23 22 | mercurial==3.7.3 23 | mistune==0.7.2 24 | nbconvert==4.2.0 25 | nbformat==4.0.1 26 | notebook==4.2.0 27 | pathlib2==2.1.0 28 | pexpect==4.0.1 29 | pickleshare==0.7.2 30 | pluggy==0.3.1 31 | ptyprocess==0.5.1 32 | py==1.4.31 33 | Pygments==2.1.3 34 | pyzmq==15.2.0 35 | qtconsole==4.2.1 36 | redis==2.10.5 37 | simplegeneric==0.8.1 38 | singledispatch==3.4.0.3 39 | six==1.10.0 40 | terminado==0.6 41 | tornado==4.3 42 | tox==2.3.1 43 | traitlets==4.2.1 44 | vboxapi==1.0 45 | virtualenv==14.0.6 46 | widgetsnbextension==1.2.2 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Search Engine and Web Crawler in Python 2 | 3 |  4 | 5 | - Implement a web crawler 6 | - Japanese morphological analysis using [janome](https://github.com/mocobeta/janome) 7 | - Implement search engine 8 | - Store in MongoDB 9 | - Web frontend using [Flask](http://flask.pocoo.org/) 10 | 11 | More details are available from [My Tech Blog (Japanese)](http://nwpct1.hatenablog.com/entry/python-search-engine). 12 | 13 | ## Requirements 14 | 15 | - Python 3.5 16 | 17 | ## Setup 18 | 19 | 1. Clone repository 20 | 21 | ``` 22 | $ git clone git@github.com:mejiro/SearchEngine.git 23 | ``` 24 | 25 | 2. Install python packages 26 | 27 | ``` 28 | $ cd SearchEngine 29 | $ pip install -r requirements.txt -c constraints.txt 30 | ``` 31 | 32 | 3. 
MongoDB settings 33 | 4. Run 34 | 35 | ``` 36 | $ python manage.py crawler # build an index 37 | $ python manage.py webpage # then open http://127.0.0.1:9000 38 | ``` 39 | -------------------------------------------------------------------------------- /search_engine/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 7 |そのキーワードを持つページはありませんでした.
34 | {% endfor %} 35 |キーワードを入れて検索ボタンを押して下さい.
38 | {% endif %} 39 |