├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ └── python-package.yml ├── .gitignore ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── gerapy_auto_extractor ├── __init__.py ├── __version__.py ├── classifiers │ ├── __init__.py │ ├── base.py │ ├── detail.py │ ├── list.py │ └── models │ │ ├── list_model.pkl │ │ └── list_scaler.pkl ├── extractors │ ├── __init__.py │ ├── base.py │ ├── content.py │ ├── datetime.py │ ├── list.py │ └── title.py ├── helpers.py ├── patterns │ ├── __init__.py │ ├── datetime.py │ └── title.py ├── schemas │ ├── __init__.py │ ├── element.py │ └── tag.py ├── settings.py └── utils │ ├── __init__.py │ ├── cluster.py │ ├── element.py │ ├── helper.py │ ├── lcs.py │ ├── preprocess.py │ └── similarity.py ├── main.py ├── requirements.txt ├── samples ├── detail │ ├── china_news1.html │ ├── ifeng_news1.html │ ├── ifeng_news1_detail_20220630.html │ ├── netease_news1.html │ └── sample.html └── list │ ├── dfa66_announcement.html │ ├── hrfund_announcement.html │ ├── hsqhfunds_announcement.html │ ├── netease_international_news.html │ ├── netease_leaderboard_news.html │ ├── netease_rolling_news.html │ ├── rtfund_xxpl.html │ ├── sample.html │ ├── tencent_important_news.html │ └── zhihu_search_result.html ├── setup.py └── tests ├── __init__.py ├── settings.py ├── test_base.py ├── test_classify_detail.py ├── test_classify_list.py ├── test_extract_list.py ├── test_extract_title.py └── test_prod_case.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/.github/ISSUE_TEMPLATE/bug_report.md -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/.github/ISSUE_TEMPLATE/feature_request.md -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/.github/workflows/python-package.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/.gitignore -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/CHANGELOG.md -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/CODE_OF_CONDUCT.md -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/CONTRIBUTING.md -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/LICENSE -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include gerapy_auto_extractor/classifiers/models/*.pkl -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/README.md -------------------------------------------------------------------------------- /gerapy_auto_extractor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/gerapy_auto_extractor/__init__.py -------------------------------------------------------------------------------- /gerapy_auto_extractor/__version__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/gerapy_auto_extractor/__version__.py -------------------------------------------------------------------------------- /gerapy_auto_extractor/classifiers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/gerapy_auto_extractor/classifiers/__init__.py -------------------------------------------------------------------------------- /gerapy_auto_extractor/classifiers/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/gerapy_auto_extractor/classifiers/base.py -------------------------------------------------------------------------------- /gerapy_auto_extractor/classifiers/detail.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/gerapy_auto_extractor/classifiers/detail.py -------------------------------------------------------------------------------- /gerapy_auto_extractor/classifiers/list.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/gerapy_auto_extractor/classifiers/list.py -------------------------------------------------------------------------------- /gerapy_auto_extractor/classifiers/models/list_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/gerapy_auto_extractor/classifiers/models/list_model.pkl -------------------------------------------------------------------------------- /gerapy_auto_extractor/classifiers/models/list_scaler.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/gerapy_auto_extractor/classifiers/models/list_scaler.pkl -------------------------------------------------------------------------------- /gerapy_auto_extractor/extractors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/gerapy_auto_extractor/extractors/__init__.py -------------------------------------------------------------------------------- /gerapy_auto_extractor/extractors/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/gerapy_auto_extractor/extractors/base.py -------------------------------------------------------------------------------- /gerapy_auto_extractor/extractors/content.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/gerapy_auto_extractor/extractors/content.py -------------------------------------------------------------------------------- /gerapy_auto_extractor/extractors/datetime.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/gerapy_auto_extractor/extractors/datetime.py -------------------------------------------------------------------------------- /gerapy_auto_extractor/extractors/list.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/gerapy_auto_extractor/extractors/list.py -------------------------------------------------------------------------------- /gerapy_auto_extractor/extractors/title.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/gerapy_auto_extractor/extractors/title.py -------------------------------------------------------------------------------- /gerapy_auto_extractor/helpers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/gerapy_auto_extractor/helpers.py -------------------------------------------------------------------------------- /gerapy_auto_extractor/patterns/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gerapy_auto_extractor/patterns/datetime.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/gerapy_auto_extractor/patterns/datetime.py -------------------------------------------------------------------------------- /gerapy_auto_extractor/patterns/title.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/gerapy_auto_extractor/patterns/title.py -------------------------------------------------------------------------------- /gerapy_auto_extractor/schemas/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gerapy_auto_extractor/schemas/element.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/gerapy_auto_extractor/schemas/element.py -------------------------------------------------------------------------------- /gerapy_auto_extractor/schemas/tag.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/gerapy_auto_extractor/schemas/tag.py -------------------------------------------------------------------------------- /gerapy_auto_extractor/settings.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/gerapy_auto_extractor/settings.py -------------------------------------------------------------------------------- /gerapy_auto_extractor/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gerapy_auto_extractor/utils/cluster.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/gerapy_auto_extractor/utils/cluster.py -------------------------------------------------------------------------------- /gerapy_auto_extractor/utils/element.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/gerapy_auto_extractor/utils/element.py -------------------------------------------------------------------------------- /gerapy_auto_extractor/utils/helper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/gerapy_auto_extractor/utils/helper.py -------------------------------------------------------------------------------- /gerapy_auto_extractor/utils/lcs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/gerapy_auto_extractor/utils/lcs.py -------------------------------------------------------------------------------- /gerapy_auto_extractor/utils/preprocess.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/gerapy_auto_extractor/utils/preprocess.py -------------------------------------------------------------------------------- /gerapy_auto_extractor/utils/similarity.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/gerapy_auto_extractor/utils/similarity.py -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/main.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/requirements.txt -------------------------------------------------------------------------------- /samples/detail/china_news1.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/samples/detail/china_news1.html -------------------------------------------------------------------------------- /samples/detail/ifeng_news1.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/samples/detail/ifeng_news1.html -------------------------------------------------------------------------------- /samples/detail/ifeng_news1_detail_20220630.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/samples/detail/ifeng_news1_detail_20220630.html -------------------------------------------------------------------------------- /samples/detail/netease_news1.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/samples/detail/netease_news1.html -------------------------------------------------------------------------------- /samples/detail/sample.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/samples/detail/sample.html -------------------------------------------------------------------------------- /samples/list/dfa66_announcement.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/samples/list/dfa66_announcement.html -------------------------------------------------------------------------------- /samples/list/hrfund_announcement.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/samples/list/hrfund_announcement.html -------------------------------------------------------------------------------- /samples/list/hsqhfunds_announcement.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/samples/list/hsqhfunds_announcement.html -------------------------------------------------------------------------------- /samples/list/netease_international_news.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/samples/list/netease_international_news.html -------------------------------------------------------------------------------- /samples/list/netease_leaderboard_news.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/samples/list/netease_leaderboard_news.html -------------------------------------------------------------------------------- /samples/list/netease_rolling_news.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/samples/list/netease_rolling_news.html -------------------------------------------------------------------------------- /samples/list/rtfund_xxpl.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/samples/list/rtfund_xxpl.html -------------------------------------------------------------------------------- /samples/list/sample.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/samples/list/sample.html -------------------------------------------------------------------------------- /samples/list/tencent_important_news.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/samples/list/tencent_important_news.html -------------------------------------------------------------------------------- /samples/list/zhihu_search_result.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/samples/list/zhihu_search_result.html -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/setup.py -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/settings.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/tests/settings.py -------------------------------------------------------------------------------- /tests/test_base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/tests/test_base.py -------------------------------------------------------------------------------- /tests/test_classify_detail.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/tests/test_classify_detail.py -------------------------------------------------------------------------------- /tests/test_classify_list.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/tests/test_classify_list.py -------------------------------------------------------------------------------- /tests/test_extract_list.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/tests/test_extract_list.py -------------------------------------------------------------------------------- /tests/test_extract_title.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/tests/test_extract_title.py -------------------------------------------------------------------------------- /tests/test_prod_case.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapyAutoExtractor/HEAD/tests/test_prod_case.py --------------------------------------------------------------------------------