├── .gitignore ├── .travis.yml ├── CHANGES.md ├── LICENSE ├── Makefile.buildbot ├── README.md ├── page_finder ├── __init__.py ├── edit_distance.c ├── page_finder.py └── url_distance.py ├── requirements.txt ├── setup.py ├── tests ├── data │ ├── Hacker News 1.html │ ├── Hacker News 2.html │ ├── tablet | eBay 1.html │ ├── tablet | eBay 2.html │ └── tablet | eBay 3.html ├── demo.py ├── requirements.txt ├── test_link_annotation.py ├── test_url_distance.py └── util.py └── tox.ini /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.so 3 | .tox 4 | build 5 | dist 6 | page_finder.egg-info 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/page_finder/HEAD/.travis.yml -------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/page_finder/HEAD/CHANGES.md -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/page_finder/HEAD/LICENSE -------------------------------------------------------------------------------- /Makefile.buildbot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/page_finder/HEAD/Makefile.buildbot -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/page_finder/HEAD/README.md -------------------------------------------------------------------------------- /page_finder/__init__.py: -------------------------------------------------------------------------------- 1 | from .page_finder import * 2 | -------------------------------------------------------------------------------- /page_finder/edit_distance.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/page_finder/HEAD/page_finder/edit_distance.c -------------------------------------------------------------------------------- /page_finder/page_finder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/page_finder/HEAD/page_finder/page_finder.py -------------------------------------------------------------------------------- /page_finder/url_distance.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/page_finder/HEAD/page_finder/url_distance.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/page_finder/HEAD/requirements.txt -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/page_finder/HEAD/setup.py -------------------------------------------------------------------------------- /tests/data/Hacker News 1.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/page_finder/HEAD/tests/data/Hacker News 1.html -------------------------------------------------------------------------------- /tests/data/Hacker News 2.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/page_finder/HEAD/tests/data/Hacker News 2.html -------------------------------------------------------------------------------- /tests/data/tablet | eBay 1.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/page_finder/HEAD/tests/data/tablet | eBay 1.html -------------------------------------------------------------------------------- /tests/data/tablet | eBay 2.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/page_finder/HEAD/tests/data/tablet | eBay 2.html -------------------------------------------------------------------------------- /tests/data/tablet | eBay 3.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/page_finder/HEAD/tests/data/tablet | eBay 3.html -------------------------------------------------------------------------------- /tests/demo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/page_finder/HEAD/tests/demo.py -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest>=2.8.5 2 | scrapely>=0.13.2 3 | pyreadline>=2.1 4 | -------------------------------------------------------------------------------- /tests/test_link_annotation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/page_finder/HEAD/tests/test_link_annotation.py -------------------------------------------------------------------------------- /tests/test_url_distance.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/page_finder/HEAD/tests/test_url_distance.py -------------------------------------------------------------------------------- /tests/util.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/page_finder/HEAD/tests/util.py -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/page_finder/HEAD/tox.ini --------------------------------------------------------------------------------