├── .gitignore ├── captcha ├── captcha-mapping │ ├── captcha_decoder.py │ ├── croped.jpg │ ├── filtered.jpg │ ├── geckodriver.log │ ├── ghostdriver.log │ ├── letters.bmp │ ├── login.png │ ├── test.jpg │ ├── test2.jpg │ └── yaochen.png └── captcha │ ├── 0325.png │ ├── 54xe.jpeg │ ├── 5enn.png │ ├── __init__.py │ ├── bw.png │ ├── captcha_gray.png │ ├── ea6d.jpeg │ ├── fwuo.png │ ├── ke8m.png │ ├── kwdg.jpeg │ ├── m3hn.png │ ├── mkek.jpeg │ ├── nkng.jpeg │ ├── ocr.conf │ ├── ocr.py │ ├── online_recog_api.py │ ├── teew.png │ ├── w3lh.jpeg │ └── whgn.jpeg ├── classification ├── comment │ ├── negative │ │ ├── 1.txt │ │ ├── 2.txt │ │ ├── 3.txt │ │ ├── 4.txt │ │ ├── 5.txt │ │ ├── 6.txt │ │ ├── 7.txt │ │ ├── 8.txt │ │ └── 9.txt │ └── positive │ │ ├── 1.txt │ │ ├── 10.txt │ │ ├── 11.txt │ │ ├── 2.txt │ │ ├── 3.txt │ │ ├── 4.txt │ │ ├── 5.txt │ │ ├── 6.txt │ │ ├── 7.txt │ │ ├── 8.txt │ │ └── 9.txt ├── features.py ├── kmeans.py ├── lr.py ├── stopwords.txt ├── svm.py └── word_cut.py ├── databases ├── mongo_redis_mgr.py ├── mongomgr.py ├── mysqlmanager.py └── spider_process_mongo.py ├── distributed ├── List of Google domains - Wikipedia.htm ├── __init__.py ├── client_crawler.py ├── domains.py ├── heartbeat_client.py ├── location_spider.py ├── master.py ├── mongo_mgr.py ├── protocol_constants.py ├── socket_client.py └── socket_server.py ├── elastic ├── HtmlRetrival.py ├── command.py ├── indice.py ├── insert.py ├── list_indices.py └── query.py ├── hbase └── hbasemgr.py ├── jd ├── client_crawler.py ├── hbasemgr.py ├── jd_price.py ├── master.py ├── mongo_redis_mgr.py ├── protocol_constants.py ├── socket_client.py └── socket_server.py ├── logger ├── Python logging.FileHandler Examples.htm ├── Python logging.FileHandler Examples_files │ ├── analytics.js │ ├── code.css │ ├── jquery.highlight-3.js │ ├── jquery.js │ ├── prettify.css │ ├── prettify.js │ ├── search.png │ ├── style.css │ ├── vote_down.png │ └── vote_up.png ├── conf.py ├── dictlog.py ├── logfile.conf ├── spider.log └── standard.py ├── login ├── form_info.py ├── login.py ├── normal_login.py ├── proxy.py ├── request_login.py └── web │ ├── chlogo.png │ ├── homepage.php │ ├── login.php │ ├── main.php │ ├── style.css │ └── userhome.html ├── lxml └── lxml.py ├── mafengwo ├── demo.html └── mfw_url_feed.py ├── multi-process ├── dbmanager.py └── process_crawl.py ├── multithread └── multi_thread_mfw.py ├── phantomjs ├── follows.html ├── follows.py ├── ghostdriver.log ├── install_phantomjs.txt └── weibo.py ├── taobao ├── taobao.py └── taobao_urllib.py ├── text_extraction ├── HtmlRetrival.py ├── __init__.py ├── cleaned.txt ├── extract_demo1.py ├── extract_demo_use_tag.py ├── html_sample.html ├── python-goose │ ├── .travis.yml │ ├── LICENSE.txt │ ├── MANIFEST.in │ ├── README.rst │ ├── THANKS │ ├── build │ │ └── lib │ │ │ ├── goose │ │ │ ├── __init__.py │ │ │ ├── article.py │ │ │ ├── cleaners.py │ │ │ ├── configuration.py │ │ │ ├── crawler.py │ │ │ ├── extractors │ │ │ │ ├── __init__.py │ │ │ │ ├── authors.py │ │ │ │ ├── content.py │ │ │ │ ├── images.py │ │ │ │ ├── links.py │ │ │ │ ├── metas.py │ │ │ │ ├── opengraph.py │ │ │ │ ├── publishdate.py │ │ │ │ ├── tags.py │ │ │ │ ├── title.py │ │ │ │ ├── tweets.py │ │ │ │ └── videos.py │ │ │ ├── image.py │ │ │ ├── network.py │ │ │ ├── outputformatters.py │ │ │ ├── parsers.py │ │ │ ├── resources │ │ │ │ ├── images │ │ │ │ │ └── known-image-css.txt │ │ │ │ └── text │ │ │ │ │ ├── stopwords-ar.txt │ │ │ │ │ ├── stopwords-da.txt │ │ │ │ │ ├── stopwords-de.txt │ │ │ │ │ ├── stopwords-en.txt │ │ │ │ │ ├── stopwords-es.txt │ │ │ │ │ ├── stopwords-fi.txt │ │ │ │ │ ├── stopwords-fr.txt │ │ │ │ │ ├── stopwords-hu.txt │ │ │ │ │ ├── stopwords-id.txt │ │ │ │ │ ├── stopwords-it.txt │ │ │ │ │ ├── stopwords-ko.txt │ │ │ │ │ ├── stopwords-nb.txt │ │ │ │ │ ├── stopwords-nl.txt │ │ │ │ │ ├── stopwords-no.txt │ │ │ │ │ ├── stopwords-pl.txt │ │ │ │ │ ├── stopwords-pt.txt │ │ │ │ │ ├── stopwords-ru.txt │ │ │ │ │ ├── stopwords-sv.txt │ │ │ │ │ └── stopwords-zh.txt │ │ │ ├── text.py │ │ │ ├── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── encoding.py │ │ │ │ └── images.py │ │ │ ├── version.py │ │ │ └── video.py │ │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── article.py │ │ │ ├── configuration.py │ │ │ ├── extractors │ │ │ ├── __init__.py │ │ │ ├── authors.py │ │ │ ├── base.py │ │ │ ├── content.py │ │ │ ├── images.py │ │ │ ├── links.py │ │ │ ├── metas.py │ │ │ ├── opengraph.py │ │ │ ├── publishdate.py │ │ │ ├── tags.py │ │ │ ├── title.py │ │ │ ├── tweets.py │ │ │ └── videos.py │ │ │ └── parsers.py │ ├── dist │ │ └── goose_extractor-1.0.25-py2.7.egg │ ├── goose │ │ ├── __init__.py │ │ ├── article.py │ │ ├── cleaners.py │ │ ├── configuration.py │ │ ├── crawler.py │ │ ├── extractors │ │ │ ├── __init__.py │ │ │ ├── authors.py │ │ │ ├── content.py │ │ │ ├── images.py │ │ │ ├── links.py │ │ │ ├── metas.py │ │ │ ├── opengraph.py │ │ │ ├── publishdate.py │ │ │ ├── tags.py │ │ │ ├── title.py │ │ │ ├── tweets.py │ │ │ └── videos.py │ │ ├── image.py │ │ ├── network.py │ │ ├── outputformatters.py │ │ ├── parsers.py │ │ ├── resources │ │ │ ├── images │ │ │ │ └── known-image-css.txt │ │ │ └── text │ │ │ │ ├── stopwords-ar.txt │ │ │ │ ├── stopwords-da.txt │ │ │ │ ├── stopwords-de.txt │ │ │ │ ├── stopwords-en.txt │ │ │ │ ├── stopwords-es.txt │ │ │ │ ├── stopwords-fi.txt │ │ │ │ ├── stopwords-fr.txt │ │ │ │ ├── stopwords-hu.txt │ │ │ │ ├── stopwords-id.txt │ │ │ │ ├── stopwords-it.txt │ │ │ │ ├── stopwords-ko.txt │ │ │ │ ├── stopwords-nb.txt │ │ │ │ ├── stopwords-nl.txt │ │ │ │ ├── stopwords-no.txt │ │ │ │ ├── stopwords-pl.txt │ │ │ │ ├── stopwords-pt.txt │ │ │ │ ├── stopwords-ru.txt │ │ │ │ ├── stopwords-sv.txt │ │ │ │ └── stopwords-zh.txt │ │ ├── text.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ ├── encoding.py │ │ │ └── images.py │ │ ├── version.py │ │ └── video.py │ ├── goose_extractor.egg-info │ │ ├── PKG-INFO │ │ ├── SOURCES.txt │ │ ├── dependency_links.txt │ │ ├── not-zip-safe │ │ ├── requires.txt │ │ └── top_level.txt │ ├── requirements.txt │ ├── setup.py │ └── tests │ │ ├── __init__.py │ │ ├── article.py │ │ ├── configuration.py │ │ ├── data │ │ ├── extractors │ │ │ ├── authors │ │ │ │ ├── test_author_schema.html │ │ │ │ └── test_author_schema.json │ │ │ ├── content │ │ │ │ ├── test_allnewlyrics1.html │ │ │ │ ├── test_allnewlyrics1.json │ │ │ │ ├── test_aolNews.html │ │ │ │ ├── test_aolNews.json │ │ │ │ ├── test_articlebody_attribute.html │ │ │ │ ├── test_articlebody_attribute.json │ │ │ │ ├── test_articlebody_itemprop.html │ │ │ │ ├── test_articlebody_itemprop.json │ │ │ │ ├── test_articlebody_tag.html │ │ │ │ ├── test_articlebody_tag.json │ │ │ │ ├── test_bbc_chinese.html │ │ │ │ ├── test_bbc_chinese.json │ │ │ │ ├── test_businessWeek1.html │ │ │ │ ├── test_businessWeek1.json │ │ │ │ ├── test_businessWeek2.html │ │ │ │ ├── test_businessWeek2.json │ │ │ │ ├── test_businessWeek3.html │ │ │ │ ├── test_businessWeek3.json │ │ │ │ ├── test_businessinsider3.html │ │ │ │ ├── test_businessinsider3.json │ │ │ │ ├── test_cbslocal.html │ │ │ │ ├── test_cbslocal.json │ │ │ │ ├── test_cnbc1.html │ │ │ │ ├── test_cnbc1.json │ │ │ │ ├── test_cnet.html │ │ │ │ ├── test_cnet.json │ │ │ │ ├── test_cnn1.html │ │ │ │ ├── test_cnn1.json │ │ │ │ ├── test_cnn_arabic.html │ │ │ │ ├── test_cnn_arabic.json │ │ │ │ ├── test_donga_korean.html │ │ │ │ ├── test_donga_korean.json │ │ │ │ ├── test_elmondo1.html │ │ │ │ ├── test_elmondo1.json │ │ │ │ ├── test_elpais.html │ │ │ │ ├── test_elpais.json │ │ │ │ ├── test_engadget.html │ │ │ │ ├── test_engadget.json │ │ │ │ ├── test_espn.html │ │ │ │ ├── test_espn.json │ │ │ │ ├── test_foxNews.html │ │ │ │ ├── test_foxNews.json │ │ │ │ ├── test_get_canonical_url.html │ │ │ │ ├── test_get_canonical_url.json │ │ │ │ ├── test_gizmodo1.html │ │ │ │ ├── test_gizmodo1.json │ │ │ │ ├── test_guardian1.html │ │ │ │ ├── test_guardian1.json │ │ │ │ ├── test_huffingtonPost2.html │ │ │ │ ├── test_huffingtonPost2.json │ │ │ │ ├── test_issue115.html │ │ │ │ ├── test_issue115.json │ │ │ │ ├── test_issue129.html │ │ │ │ ├── test_issue129.json │ │ │ │ ├── test_issue24.html │ │ │ │ ├── test_issue24.json │ │ │ │ ├── test_issue25.html │ │ │ │ ├── test_issue25.json │ │ │ │ ├── test_issue28.html │ │ │ │ ├── test_issue28.json │ │ │ │ ├── test_issue32.html │ │ │ │ ├── test_issue32.json │ │ │ │ ├── test_issue4.html │ │ │ │ ├── test_issue4.json │ │ │ │ ├── test_lefigaro.html │ │ │ │ ├── test_lefigaro.json │ │ │ │ ├── test_liberation.html │ │ │ │ ├── test_liberation.json │ │ │ │ ├── test_marketplace.html │ │ │ │ ├── test_marketplace.json │ │ │ │ ├── test_mashable_issue_74.html │ │ │ │ ├── test_mashable_issue_74.json │ │ │ │ ├── test_msn1.html │ │ │ │ ├── test_msn1.json │ │ │ │ ├── test_okaymarketing.html │ │ │ │ ├── test_okaymarketing.json │ │ │ │ ├── test_politico.html │ │ │ │ ├── test_politico.json │ │ │ │ ├── test_techcrunch1.html │ │ │ │ ├── test_techcrunch1.json │ │ │ │ ├── test_testHuffingtonPost.html │ │ │ │ ├── test_testHuffingtonPost.json │ │ │ │ ├── test_time.html │ │ │ │ ├── test_time.json │ │ │ │ ├── test_time2.html │ │ │ │ ├── test_time2.json │ │ │ │ ├── test_usatoday_issue_74.html │ │ │ │ ├── test_usatoday_issue_74.json │ │ │ │ ├── test_yahoo.html │ │ │ │ └── test_yahoo.json │ │ │ ├── images │ │ │ │ ├── test_basic_image │ │ │ │ │ ├── 50850547cc7310bc53e30e802c6318f1 │ │ │ │ │ ├── test_basic_image.html │ │ │ │ │ └── test_basic_image.json │ │ │ │ ├── test_known_image_css_class │ │ │ │ │ ├── test_known_image_css_class.html │ │ │ │ │ └── test_known_image_css_class.json │ │ │ │ ├── test_known_image_css_id │ │ │ │ │ ├── test_known_image_css_id.html │ │ │ │ │ └── test_known_image_css_id.json │ │ │ │ ├── test_known_image_css_parent_class │ │ │ │ │ ├── test_known_image_css_parent_class.html │ │ │ │ │ └── test_known_image_css_parent_class.json │ │ │ │ ├── test_known_image_css_parent_id │ │ │ │ │ ├── test_known_image_css_parent_id.html │ │ │ │ │ └── test_known_image_css_parent_id.json │ │ │ │ ├── test_known_image_empty_src │ │ │ │ │ ├── test_known_image_empty_src.html │ │ │ │ │ └── test_known_image_empty_src.json │ │ │ │ ├── test_known_image_name_parent │ │ │ │ │ ├── test_known_image_name_parent.html │ │ │ │ │ └── test_known_image_name_parent.json │ │ │ │ └── test_opengraph_tag │ │ │ │ │ ├── test_opengraph_tag.html │ │ │ │ │ └── test_opengraph_tag.json │ │ │ ├── links │ │ │ │ ├── test_links.html │ │ │ │ └── test_links.json │ │ │ ├── opengraph │ │ │ │ ├── test_opengraph.html │ │ │ │ └── test_opengraph.json │ │ │ ├── publishdate │ │ │ │ ├── test_publish_date.html │ │ │ │ ├── test_publish_date.json │ │ │ │ ├── test_publish_date_article.html │ │ │ │ ├── test_publish_date_article.json │ │ │ │ ├── test_publish_date_rnews.html │ │ │ │ ├── test_publish_date_rnews.json │ │ │ │ ├── test_publish_date_schema.html │ │ │ │ └── test_publish_date_schema.json │ │ │ ├── tags │ │ │ │ ├── test_tags_abcau.html │ │ │ │ ├── test_tags_abcau.json │ │ │ │ ├── test_tags_cnet.html │ │ │ │ ├── test_tags_cnet.json │ │ │ │ ├── test_tags_deadline.html │ │ │ │ ├── test_tags_deadline.json │ │ │ │ ├── test_tags_kexp.html │ │ │ │ ├── test_tags_kexp.json │ │ │ │ ├── test_tags_wnyc.html │ │ │ │ └── test_tags_wnyc.json │ │ │ ├── title │ │ │ │ ├── test_title_empty.html │ │ │ │ ├── test_title_empty.json │ │ │ │ ├── test_title_opengraph.html │ │ │ │ └── test_title_opengraph.json │ │ │ ├── tweets │ │ │ │ ├── test_tweet.html │ │ │ │ └── test_tweet.json │ │ │ └── videos │ │ │ │ ├── test_embed.html │ │ │ │ ├── test_embed.json │ │ │ │ ├── test_iframe.html │ │ │ │ ├── test_iframe.json │ │ │ │ ├── test_object.html │ │ │ │ └── test_object.json │ │ └── parser │ │ │ └── test1.html │ │ ├── extractors │ │ ├── __init__.py │ │ ├── authors.py │ │ ├── base.py │ │ ├── content.py │ │ ├── images.py │ │ ├── links.py │ │ ├── metas.py │ │ ├── opengraph.py │ │ ├── publishdate.py │ │ ├── tags.py │ │ ├── title.py │ │ ├── tweets.py │ │ └── videos.py │ │ └── parsers.py ├── pythongoose.txt ├── te_goose.py ├── template.txt ├── test.txt └── word_tag_ratio.py ├── weibo ├── __init__.py ├── feeds_crawler.py ├── mongo_db_manager.py ├── mysql_db_manager.py ├── single_demo │ ├── feeds_test.py │ └── user_test.py └── user_crawler.py └── wikipedia ├── __init__.py ├── articles.py ├── categories.py ├── infobox_parse_city.py └── redis_manager.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/.gitignore -------------------------------------------------------------------------------- /captcha/captcha-mapping/captcha_decoder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/captcha/captcha-mapping/captcha_decoder.py -------------------------------------------------------------------------------- /captcha/captcha-mapping/croped.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/captcha/captcha-mapping/croped.jpg -------------------------------------------------------------------------------- /captcha/captcha-mapping/filtered.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/captcha/captcha-mapping/filtered.jpg -------------------------------------------------------------------------------- /captcha/captcha-mapping/geckodriver.log: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /captcha/captcha-mapping/ghostdriver.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/captcha/captcha-mapping/ghostdriver.log -------------------------------------------------------------------------------- /captcha/captcha-mapping/letters.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/captcha/captcha-mapping/letters.bmp -------------------------------------------------------------------------------- /captcha/captcha-mapping/login.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/captcha/captcha-mapping/login.png -------------------------------------------------------------------------------- /captcha/captcha-mapping/test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/captcha/captcha-mapping/test.jpg -------------------------------------------------------------------------------- /captcha/captcha-mapping/test2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/captcha/captcha-mapping/test2.jpg -------------------------------------------------------------------------------- /captcha/captcha-mapping/yaochen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/captcha/captcha-mapping/yaochen.png -------------------------------------------------------------------------------- /captcha/captcha/0325.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/captcha/captcha/0325.png -------------------------------------------------------------------------------- /captcha/captcha/54xe.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/captcha/captcha/54xe.jpeg -------------------------------------------------------------------------------- /captcha/captcha/5enn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/captcha/captcha/5enn.png -------------------------------------------------------------------------------- /captcha/captcha/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /captcha/captcha/bw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/captcha/captcha/bw.png -------------------------------------------------------------------------------- /captcha/captcha/captcha_gray.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/captcha/captcha/captcha_gray.png -------------------------------------------------------------------------------- /captcha/captcha/ea6d.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/captcha/captcha/ea6d.jpeg -------------------------------------------------------------------------------- /captcha/captcha/fwuo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/captcha/captcha/fwuo.png -------------------------------------------------------------------------------- /captcha/captcha/ke8m.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/captcha/captcha/ke8m.png -------------------------------------------------------------------------------- /captcha/captcha/kwdg.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/captcha/captcha/kwdg.jpeg -------------------------------------------------------------------------------- /captcha/captcha/m3hn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/captcha/captcha/m3hn.png -------------------------------------------------------------------------------- /captcha/captcha/mkek.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/captcha/captcha/mkek.jpeg -------------------------------------------------------------------------------- /captcha/captcha/nkng.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/captcha/captcha/nkng.jpeg -------------------------------------------------------------------------------- /captcha/captcha/ocr.conf: -------------------------------------------------------------------------------- 1 | tessedit_char_whitelist 1234567890abdefghijklmnoprstuvwxyzABDEFGHIJKLMNOPQRSTUVWXYZ -------------------------------------------------------------------------------- /captcha/captcha/ocr.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/captcha/captcha/ocr.py -------------------------------------------------------------------------------- /captcha/captcha/online_recog_api.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/captcha/captcha/online_recog_api.py -------------------------------------------------------------------------------- /captcha/captcha/teew.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/captcha/captcha/teew.png -------------------------------------------------------------------------------- /captcha/captcha/w3lh.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/captcha/captcha/w3lh.jpeg -------------------------------------------------------------------------------- /captcha/captcha/whgn.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/captcha/captcha/whgn.jpeg -------------------------------------------------------------------------------- /classification/comment/negative/1.txt: -------------------------------------------------------------------------------- 1 | 烂片 -------------------------------------------------------------------------------- /classification/comment/negative/2.txt: -------------------------------------------------------------------------------- 1 | 太垃圾的电影,不好看 -------------------------------------------------------------------------------- /classification/comment/negative/3.txt: -------------------------------------------------------------------------------- 1 | 没上一部好看。大场面太多不见得是好事。 -------------------------------------------------------------------------------- /classification/comment/negative/4.txt: -------------------------------------------------------------------------------- 1 | 垃圾电影,烂片 -------------------------------------------------------------------------------- /classification/comment/negative/5.txt: -------------------------------------------------------------------------------- 1 | 完全看不懂,超级烂 -------------------------------------------------------------------------------- /classification/comment/negative/6.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/classification/comment/negative/6.txt -------------------------------------------------------------------------------- /classification/comment/negative/7.txt: -------------------------------------------------------------------------------- 1 | 不好看,太失望了 -------------------------------------------------------------------------------- /classification/comment/negative/8.txt: -------------------------------------------------------------------------------- 1 | 让人失望,完全没心情了 -------------------------------------------------------------------------------- /classification/comment/negative/9.txt: -------------------------------------------------------------------------------- 1 | 很让人失望的电影,情节简单,叙事混乱 -------------------------------------------------------------------------------- /classification/comment/positive/1.txt: -------------------------------------------------------------------------------- 1 | 很不错呢 -------------------------------------------------------------------------------- /classification/comment/positive/10.txt: -------------------------------------------------------------------------------- 1 | 等了很久了,大爱 -------------------------------------------------------------------------------- /classification/comment/positive/11.txt: -------------------------------------------------------------------------------- 1 | 等了好久,终于上映了,最爱马特达蒙,没有让我失望 -------------------------------------------------------------------------------- /classification/comment/positive/2.txt: -------------------------------------------------------------------------------- 1 | 很好看的电影,超爱马特达蒙 -------------------------------------------------------------------------------- /classification/comment/positive/3.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/classification/comment/positive/3.txt -------------------------------------------------------------------------------- /classification/comment/positive/4.txt: -------------------------------------------------------------------------------- 1 | 电影不错,值得看 -------------------------------------------------------------------------------- /classification/comment/positive/5.txt: -------------------------------------------------------------------------------- 1 | 喜欢这部,没有美女vs野兽这种性暗示。更为尊重自然。 -------------------------------------------------------------------------------- /classification/comment/positive/6.txt: -------------------------------------------------------------------------------- 1 | 特效牛逼 故事平淡 中途有点走神 -------------------------------------------------------------------------------- /classification/comment/positive/7.txt: -------------------------------------------------------------------------------- 1 | 还行,挺好的 -------------------------------------------------------------------------------- /classification/comment/positive/8.txt: -------------------------------------------------------------------------------- 1 | 太赞了,演技很好 -------------------------------------------------------------------------------- /classification/comment/positive/9.txt: -------------------------------------------------------------------------------- 1 | 专门跑来看,非常满意 -------------------------------------------------------------------------------- /classification/features.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/classification/features.py -------------------------------------------------------------------------------- /classification/kmeans.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/classification/kmeans.py -------------------------------------------------------------------------------- /classification/lr.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/classification/lr.py -------------------------------------------------------------------------------- /classification/stopwords.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/classification/stopwords.txt -------------------------------------------------------------------------------- /classification/svm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/classification/svm.py -------------------------------------------------------------------------------- /classification/word_cut.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/classification/word_cut.py -------------------------------------------------------------------------------- /databases/mongo_redis_mgr.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/databases/mongo_redis_mgr.py -------------------------------------------------------------------------------- /databases/mongomgr.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/databases/mongomgr.py -------------------------------------------------------------------------------- /databases/mysqlmanager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/databases/mysqlmanager.py -------------------------------------------------------------------------------- /databases/spider_process_mongo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/databases/spider_process_mongo.py -------------------------------------------------------------------------------- /distributed/List of Google domains - Wikipedia.htm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/distributed/List of Google domains - Wikipedia.htm -------------------------------------------------------------------------------- /distributed/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /distributed/client_crawler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/distributed/client_crawler.py -------------------------------------------------------------------------------- /distributed/domains.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/distributed/domains.py -------------------------------------------------------------------------------- /distributed/heartbeat_client.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/distributed/heartbeat_client.py -------------------------------------------------------------------------------- /distributed/location_spider.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/distributed/location_spider.py -------------------------------------------------------------------------------- /distributed/master.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/distributed/master.py -------------------------------------------------------------------------------- /distributed/mongo_mgr.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/distributed/mongo_mgr.py -------------------------------------------------------------------------------- /distributed/protocol_constants.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/distributed/protocol_constants.py -------------------------------------------------------------------------------- /distributed/socket_client.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/distributed/socket_client.py -------------------------------------------------------------------------------- /distributed/socket_server.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/distributed/socket_server.py -------------------------------------------------------------------------------- /elastic/HtmlRetrival.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/elastic/HtmlRetrival.py -------------------------------------------------------------------------------- /elastic/command.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/elastic/command.py -------------------------------------------------------------------------------- /elastic/indice.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/elastic/indice.py -------------------------------------------------------------------------------- /elastic/insert.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/elastic/insert.py -------------------------------------------------------------------------------- /elastic/list_indices.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /elastic/query.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/elastic/query.py -------------------------------------------------------------------------------- /hbase/hbasemgr.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/hbase/hbasemgr.py -------------------------------------------------------------------------------- /jd/client_crawler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/jd/client_crawler.py -------------------------------------------------------------------------------- /jd/hbasemgr.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/jd/hbasemgr.py -------------------------------------------------------------------------------- /jd/jd_price.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/jd/jd_price.py -------------------------------------------------------------------------------- /jd/master.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/jd/master.py -------------------------------------------------------------------------------- /jd/mongo_redis_mgr.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/jd/mongo_redis_mgr.py -------------------------------------------------------------------------------- /jd/protocol_constants.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/jd/protocol_constants.py -------------------------------------------------------------------------------- /jd/socket_client.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/jd/socket_client.py -------------------------------------------------------------------------------- /jd/socket_server.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/jd/socket_server.py -------------------------------------------------------------------------------- /logger/Python logging.FileHandler Examples.htm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/logger/Python logging.FileHandler Examples.htm -------------------------------------------------------------------------------- /logger/Python logging.FileHandler Examples_files/analytics.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/logger/Python logging.FileHandler Examples_files/analytics.js -------------------------------------------------------------------------------- /logger/Python logging.FileHandler Examples_files/code.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/logger/Python logging.FileHandler Examples_files/code.css -------------------------------------------------------------------------------- /logger/Python logging.FileHandler Examples_files/jquery.highlight-3.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/logger/Python logging.FileHandler Examples_files/jquery.highlight-3.js -------------------------------------------------------------------------------- /logger/Python logging.FileHandler Examples_files/jquery.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/logger/Python logging.FileHandler Examples_files/jquery.js -------------------------------------------------------------------------------- /logger/Python logging.FileHandler Examples_files/prettify.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/logger/Python logging.FileHandler Examples_files/prettify.css -------------------------------------------------------------------------------- /logger/Python logging.FileHandler Examples_files/prettify.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/logger/Python logging.FileHandler Examples_files/prettify.js -------------------------------------------------------------------------------- /logger/Python logging.FileHandler Examples_files/search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/logger/Python logging.FileHandler Examples_files/search.png -------------------------------------------------------------------------------- /logger/Python logging.FileHandler Examples_files/style.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/logger/Python logging.FileHandler Examples_files/style.css -------------------------------------------------------------------------------- /logger/Python logging.FileHandler Examples_files/vote_down.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/logger/Python logging.FileHandler Examples_files/vote_down.png -------------------------------------------------------------------------------- /logger/Python logging.FileHandler Examples_files/vote_up.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/logger/Python logging.FileHandler Examples_files/vote_up.png -------------------------------------------------------------------------------- /logger/conf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/logger/conf.py -------------------------------------------------------------------------------- /logger/dictlog.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/logger/dictlog.py -------------------------------------------------------------------------------- /logger/logfile.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/logger/logfile.conf -------------------------------------------------------------------------------- /logger/spider.log: -------------------------------------------------------------------------------- 1 | 2017-07-11 18:42:05,258 -- redis !!!DEBUG!!!: Simple Log Test! 2 | -------------------------------------------------------------------------------- /logger/standard.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/logger/standard.py -------------------------------------------------------------------------------- /login/form_info.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/login/form_info.py -------------------------------------------------------------------------------- /login/login.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/login/login.py -------------------------------------------------------------------------------- /login/normal_login.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/login/normal_login.py -------------------------------------------------------------------------------- /login/proxy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/login/proxy.py -------------------------------------------------------------------------------- /login/request_login.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/login/request_login.py -------------------------------------------------------------------------------- /login/web/chlogo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/login/web/chlogo.png -------------------------------------------------------------------------------- /login/web/homepage.php: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/login/web/homepage.php -------------------------------------------------------------------------------- /login/web/login.php: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/login/web/login.php -------------------------------------------------------------------------------- /login/web/main.php: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/login/web/main.php -------------------------------------------------------------------------------- /login/web/style.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/login/web/style.css -------------------------------------------------------------------------------- /login/web/userhome.html: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lxml/lxml.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/lxml/lxml.py -------------------------------------------------------------------------------- /mafengwo/demo.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/mafengwo/demo.html -------------------------------------------------------------------------------- /mafengwo/mfw_url_feed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/mafengwo/mfw_url_feed.py -------------------------------------------------------------------------------- /multi-process/dbmanager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/multi-process/dbmanager.py -------------------------------------------------------------------------------- /multi-process/process_crawl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/multi-process/process_crawl.py -------------------------------------------------------------------------------- /multithread/multi_thread_mfw.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/multithread/multi_thread_mfw.py -------------------------------------------------------------------------------- /phantomjs/follows.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/phantomjs/follows.html -------------------------------------------------------------------------------- /phantomjs/follows.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/phantomjs/follows.py -------------------------------------------------------------------------------- /phantomjs/ghostdriver.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/phantomjs/ghostdriver.log -------------------------------------------------------------------------------- /phantomjs/install_phantomjs.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/phantomjs/install_phantomjs.txt -------------------------------------------------------------------------------- /phantomjs/weibo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/phantomjs/weibo.py -------------------------------------------------------------------------------- /taobao/taobao.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/taobao/taobao.py -------------------------------------------------------------------------------- /taobao/taobao_urllib.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/taobao/taobao_urllib.py -------------------------------------------------------------------------------- /text_extraction/HtmlRetrival.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/HtmlRetrival.py -------------------------------------------------------------------------------- /text_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /text_extraction/cleaned.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/cleaned.txt -------------------------------------------------------------------------------- /text_extraction/extract_demo1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/extract_demo1.py -------------------------------------------------------------------------------- /text_extraction/extract_demo_use_tag.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/extract_demo_use_tag.py -------------------------------------------------------------------------------- /text_extraction/html_sample.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/html_sample.html -------------------------------------------------------------------------------- /text_extraction/python-goose/.travis.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/.travis.yml -------------------------------------------------------------------------------- /text_extraction/python-goose/LICENSE.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/LICENSE.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/MANIFEST.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/MANIFEST.in -------------------------------------------------------------------------------- /text_extraction/python-goose/README.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/README.rst -------------------------------------------------------------------------------- /text_extraction/python-goose/THANKS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/THANKS -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/__init__.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/article.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/article.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/cleaners.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/cleaners.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/configuration.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/configuration.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/crawler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/crawler.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/extractors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/extractors/__init__.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/extractors/authors.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/extractors/authors.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/extractors/content.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/extractors/content.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/extractors/images.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/extractors/images.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/extractors/links.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/extractors/links.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/extractors/metas.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/extractors/metas.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/extractors/opengraph.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/extractors/opengraph.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/extractors/publishdate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/extractors/publishdate.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/extractors/tags.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/extractors/tags.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/extractors/title.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/extractors/title.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/extractors/tweets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/extractors/tweets.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/extractors/videos.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/extractors/videos.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/image.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/image.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/network.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/network.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/outputformatters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/outputformatters.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/parsers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/parsers.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/resources/images/known-image-css.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/resources/images/known-image-css.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/resources/text/stopwords-ar.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/resources/text/stopwords-ar.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/resources/text/stopwords-da.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/resources/text/stopwords-da.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/resources/text/stopwords-de.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/resources/text/stopwords-de.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/resources/text/stopwords-en.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/resources/text/stopwords-en.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/resources/text/stopwords-es.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/resources/text/stopwords-es.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/resources/text/stopwords-fi.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/resources/text/stopwords-fi.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/resources/text/stopwords-fr.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/resources/text/stopwords-fr.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/resources/text/stopwords-hu.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/resources/text/stopwords-hu.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/resources/text/stopwords-id.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/resources/text/stopwords-id.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/resources/text/stopwords-it.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/resources/text/stopwords-it.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/resources/text/stopwords-ko.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/resources/text/stopwords-ko.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/resources/text/stopwords-nb.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/resources/text/stopwords-nb.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/resources/text/stopwords-nl.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/resources/text/stopwords-nl.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/resources/text/stopwords-no.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/resources/text/stopwords-no.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/resources/text/stopwords-pl.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/resources/text/stopwords-pl.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/resources/text/stopwords-pt.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/resources/text/stopwords-pt.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/resources/text/stopwords-ru.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/resources/text/stopwords-ru.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/resources/text/stopwords-sv.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/resources/text/stopwords-sv.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/resources/text/stopwords-zh.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/resources/text/stopwords-zh.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/text.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/text.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/utils/__init__.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/utils/encoding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/utils/encoding.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/utils/images.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/utils/images.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/version.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/version.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/goose/video.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/goose/video.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/tests/__init__.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/tests/article.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/tests/article.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/tests/configuration.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/tests/configuration.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/tests/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/tests/extractors/authors.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/tests/extractors/authors.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/tests/extractors/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/tests/extractors/base.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/tests/extractors/content.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/tests/extractors/content.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/tests/extractors/images.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/tests/extractors/images.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/tests/extractors/links.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/tests/extractors/links.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/tests/extractors/metas.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/tests/extractors/metas.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/tests/extractors/opengraph.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/tests/extractors/opengraph.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/tests/extractors/publishdate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/tests/extractors/publishdate.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/tests/extractors/tags.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/tests/extractors/tags.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/tests/extractors/title.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/tests/extractors/title.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/tests/extractors/tweets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/tests/extractors/tweets.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/tests/extractors/videos.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/tests/extractors/videos.py -------------------------------------------------------------------------------- /text_extraction/python-goose/build/lib/tests/parsers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/build/lib/tests/parsers.py -------------------------------------------------------------------------------- /text_extraction/python-goose/dist/goose_extractor-1.0.25-py2.7.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/dist/goose_extractor-1.0.25-py2.7.egg -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/__init__.py -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/article.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/article.py -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/cleaners.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/cleaners.py -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/configuration.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/configuration.py -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/crawler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/crawler.py -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/extractors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/extractors/__init__.py -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/extractors/authors.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/extractors/authors.py -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/extractors/content.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/extractors/content.py -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/extractors/images.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/extractors/images.py -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/extractors/links.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/extractors/links.py -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/extractors/metas.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/extractors/metas.py -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/extractors/opengraph.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/extractors/opengraph.py -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/extractors/publishdate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/extractors/publishdate.py -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/extractors/tags.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/extractors/tags.py -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/extractors/title.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/extractors/title.py -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/extractors/tweets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/extractors/tweets.py -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/extractors/videos.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/extractors/videos.py -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/image.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/image.py -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/network.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/network.py -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/outputformatters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/outputformatters.py -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/parsers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/parsers.py -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/resources/images/known-image-css.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/resources/images/known-image-css.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/resources/text/stopwords-ar.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/resources/text/stopwords-ar.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/resources/text/stopwords-da.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/resources/text/stopwords-da.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/resources/text/stopwords-de.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/resources/text/stopwords-de.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/resources/text/stopwords-en.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/resources/text/stopwords-en.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/resources/text/stopwords-es.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/resources/text/stopwords-es.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/resources/text/stopwords-fi.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/resources/text/stopwords-fi.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/resources/text/stopwords-fr.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/resources/text/stopwords-fr.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/resources/text/stopwords-hu.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/resources/text/stopwords-hu.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/resources/text/stopwords-id.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/resources/text/stopwords-id.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/resources/text/stopwords-it.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/resources/text/stopwords-it.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/resources/text/stopwords-ko.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/resources/text/stopwords-ko.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/resources/text/stopwords-nb.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/resources/text/stopwords-nb.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/resources/text/stopwords-nl.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/resources/text/stopwords-nl.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/resources/text/stopwords-no.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/resources/text/stopwords-no.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/resources/text/stopwords-pl.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/resources/text/stopwords-pl.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/resources/text/stopwords-pt.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/resources/text/stopwords-pt.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/resources/text/stopwords-ru.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/resources/text/stopwords-ru.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/resources/text/stopwords-sv.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/resources/text/stopwords-sv.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/resources/text/stopwords-zh.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/resources/text/stopwords-zh.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/text.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/text.py -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/utils/__init__.py -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/utils/encoding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/utils/encoding.py -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/utils/images.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/utils/images.py -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/version.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/version.py -------------------------------------------------------------------------------- /text_extraction/python-goose/goose/video.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose/video.py -------------------------------------------------------------------------------- /text_extraction/python-goose/goose_extractor.egg-info/PKG-INFO: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose_extractor.egg-info/PKG-INFO -------------------------------------------------------------------------------- /text_extraction/python-goose/goose_extractor.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/goose_extractor.egg-info/SOURCES.txt -------------------------------------------------------------------------------- /text_extraction/python-goose/goose_extractor.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /text_extraction/python-goose/goose_extractor.egg-info/not-zip-safe: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /text_extraction/python-goose/goose_extractor.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | Pillow 2 | lxml 3 | cssselect 4 | jieba 5 | beautifulsoup 6 | nltk 7 | -------------------------------------------------------------------------------- /text_extraction/python-goose/goose_extractor.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | goose 2 | tests 3 | -------------------------------------------------------------------------------- /text_extraction/python-goose/requirements.txt: -------------------------------------------------------------------------------- 1 | Pillow 2 | lxml 3 | cssselect 4 | jieba 5 | beautifulsoup 6 | nltk 7 | -------------------------------------------------------------------------------- /text_extraction/python-goose/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/setup.py -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/__init__.py -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/article.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/article.py -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/configuration.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/configuration.py -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/authors/test_author_schema.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/authors/test_author_schema.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/authors/test_author_schema.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/authors/test_author_schema.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_allnewlyrics1.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_allnewlyrics1.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_allnewlyrics1.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_allnewlyrics1.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_aolNews.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_aolNews.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_aolNews.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_aolNews.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_articlebody_attribute.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_articlebody_attribute.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_articlebody_attribute.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_articlebody_attribute.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_articlebody_itemprop.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_articlebody_itemprop.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_articlebody_itemprop.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_articlebody_itemprop.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_articlebody_tag.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_articlebody_tag.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_articlebody_tag.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_articlebody_tag.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_bbc_chinese.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_bbc_chinese.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_bbc_chinese.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_bbc_chinese.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_businessWeek1.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_businessWeek1.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_businessWeek1.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_businessWeek1.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_businessWeek2.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_businessWeek2.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_businessWeek2.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_businessWeek2.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_businessWeek3.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_businessWeek3.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_businessWeek3.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_businessWeek3.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_businessinsider3.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_businessinsider3.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_businessinsider3.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_businessinsider3.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_cbslocal.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_cbslocal.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_cbslocal.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_cbslocal.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_cnbc1.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_cnbc1.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_cnbc1.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_cnbc1.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_cnet.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_cnet.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_cnet.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_cnet.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_cnn1.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_cnn1.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_cnn1.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_cnn1.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_cnn_arabic.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_cnn_arabic.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_cnn_arabic.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_cnn_arabic.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_donga_korean.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_donga_korean.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_donga_korean.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_donga_korean.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_elmondo1.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_elmondo1.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_elmondo1.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_elmondo1.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_elpais.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_elpais.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_elpais.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_elpais.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_engadget.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_engadget.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_engadget.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_engadget.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_espn.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_espn.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_espn.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_espn.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_foxNews.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_foxNews.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_foxNews.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_foxNews.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_get_canonical_url.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_get_canonical_url.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_get_canonical_url.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_get_canonical_url.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_gizmodo1.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_gizmodo1.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_gizmodo1.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_gizmodo1.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_guardian1.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_guardian1.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_guardian1.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_guardian1.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_huffingtonPost2.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_huffingtonPost2.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_huffingtonPost2.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_huffingtonPost2.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_issue115.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_issue115.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_issue115.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_issue115.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_issue129.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_issue129.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_issue129.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_issue129.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_issue24.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_issue24.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_issue24.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_issue24.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_issue25.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_issue25.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_issue25.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_issue25.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_issue28.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_issue28.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_issue28.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_issue28.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_issue32.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_issue32.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_issue32.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_issue32.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_issue4.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_issue4.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_issue4.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_issue4.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_lefigaro.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_lefigaro.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_lefigaro.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_lefigaro.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_liberation.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_liberation.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_liberation.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_liberation.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_marketplace.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_marketplace.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_marketplace.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_marketplace.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_mashable_issue_74.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_mashable_issue_74.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_mashable_issue_74.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_mashable_issue_74.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_msn1.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_msn1.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_msn1.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_msn1.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_okaymarketing.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_okaymarketing.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_okaymarketing.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_okaymarketing.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_politico.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_politico.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_politico.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_politico.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_techcrunch1.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_techcrunch1.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_techcrunch1.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_techcrunch1.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_testHuffingtonPost.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_testHuffingtonPost.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_testHuffingtonPost.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_testHuffingtonPost.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_time.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_time.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_time.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_time.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_time2.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_time2.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_time2.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_time2.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_usatoday_issue_74.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_usatoday_issue_74.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_usatoday_issue_74.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_usatoday_issue_74.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_yahoo.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_yahoo.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/content/test_yahoo.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/content/test_yahoo.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/images/test_basic_image/50850547cc7310bc53e30e802c6318f1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/images/test_basic_image/50850547cc7310bc53e30e802c6318f1 -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/images/test_basic_image/test_basic_image.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/images/test_basic_image/test_basic_image.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/images/test_basic_image/test_basic_image.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/images/test_basic_image/test_basic_image.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/images/test_known_image_css_class/test_known_image_css_class.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/images/test_known_image_css_class/test_known_image_css_class.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/images/test_known_image_css_class/test_known_image_css_class.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/images/test_known_image_css_class/test_known_image_css_class.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/images/test_known_image_css_id/test_known_image_css_id.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/images/test_known_image_css_id/test_known_image_css_id.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/images/test_known_image_css_id/test_known_image_css_id.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/images/test_known_image_css_id/test_known_image_css_id.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/images/test_known_image_css_parent_class/test_known_image_css_parent_class.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/images/test_known_image_css_parent_class/test_known_image_css_parent_class.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/images/test_known_image_css_parent_class/test_known_image_css_parent_class.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/images/test_known_image_css_parent_class/test_known_image_css_parent_class.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/images/test_known_image_css_parent_id/test_known_image_css_parent_id.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/images/test_known_image_css_parent_id/test_known_image_css_parent_id.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/images/test_known_image_css_parent_id/test_known_image_css_parent_id.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/images/test_known_image_css_parent_id/test_known_image_css_parent_id.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/images/test_known_image_empty_src/test_known_image_empty_src.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/images/test_known_image_empty_src/test_known_image_empty_src.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/images/test_known_image_empty_src/test_known_image_empty_src.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/images/test_known_image_empty_src/test_known_image_empty_src.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/images/test_known_image_name_parent/test_known_image_name_parent.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/images/test_known_image_name_parent/test_known_image_name_parent.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/images/test_known_image_name_parent/test_known_image_name_parent.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/images/test_known_image_name_parent/test_known_image_name_parent.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/images/test_opengraph_tag/test_opengraph_tag.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/images/test_opengraph_tag/test_opengraph_tag.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/images/test_opengraph_tag/test_opengraph_tag.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/images/test_opengraph_tag/test_opengraph_tag.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/links/test_links.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/links/test_links.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/links/test_links.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/links/test_links.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/opengraph/test_opengraph.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/opengraph/test_opengraph.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/opengraph/test_opengraph.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/opengraph/test_opengraph.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/publishdate/test_publish_date.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/publishdate/test_publish_date.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/publishdate/test_publish_date.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/publishdate/test_publish_date.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/publishdate/test_publish_date_article.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/publishdate/test_publish_date_article.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/publishdate/test_publish_date_article.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/publishdate/test_publish_date_article.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/publishdate/test_publish_date_rnews.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/publishdate/test_publish_date_rnews.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/publishdate/test_publish_date_rnews.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/publishdate/test_publish_date_rnews.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/publishdate/test_publish_date_schema.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/publishdate/test_publish_date_schema.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/publishdate/test_publish_date_schema.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/publishdate/test_publish_date_schema.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/tags/test_tags_abcau.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/tags/test_tags_abcau.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/tags/test_tags_abcau.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/tags/test_tags_abcau.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/tags/test_tags_cnet.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/tags/test_tags_cnet.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/tags/test_tags_cnet.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/tags/test_tags_cnet.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/tags/test_tags_deadline.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/tags/test_tags_deadline.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/tags/test_tags_deadline.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/tags/test_tags_deadline.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/tags/test_tags_kexp.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/tags/test_tags_kexp.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/tags/test_tags_kexp.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/tags/test_tags_kexp.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/tags/test_tags_wnyc.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/tags/test_tags_wnyc.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/tags/test_tags_wnyc.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/tags/test_tags_wnyc.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/title/test_title_empty.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/title/test_title_empty.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/title/test_title_empty.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/title/test_title_empty.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/title/test_title_opengraph.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/title/test_title_opengraph.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/title/test_title_opengraph.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/title/test_title_opengraph.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/tweets/test_tweet.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/tweets/test_tweet.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/tweets/test_tweet.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/tweets/test_tweet.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/videos/test_embed.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/videos/test_embed.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/videos/test_embed.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/videos/test_embed.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/videos/test_iframe.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/videos/test_iframe.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/videos/test_iframe.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/videos/test_iframe.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/videos/test_object.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/videos/test_object.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/extractors/videos/test_object.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/extractors/videos/test_object.json -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/data/parser/test1.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/data/parser/test1.html -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/extractors/authors.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/extractors/authors.py -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/extractors/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/extractors/base.py -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/extractors/content.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/extractors/content.py -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/extractors/images.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/extractors/images.py -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/extractors/links.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/extractors/links.py -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/extractors/metas.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/extractors/metas.py -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/extractors/opengraph.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/extractors/opengraph.py -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/extractors/publishdate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/extractors/publishdate.py -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/extractors/tags.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/extractors/tags.py -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/extractors/title.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/extractors/title.py -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/extractors/tweets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/extractors/tweets.py -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/extractors/videos.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/extractors/videos.py -------------------------------------------------------------------------------- /text_extraction/python-goose/tests/parsers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/python-goose/tests/parsers.py -------------------------------------------------------------------------------- /text_extraction/pythongoose.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/pythongoose.txt -------------------------------------------------------------------------------- /text_extraction/te_goose.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/te_goose.py -------------------------------------------------------------------------------- /text_extraction/template.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/template.txt -------------------------------------------------------------------------------- /text_extraction/test.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/test.txt -------------------------------------------------------------------------------- /text_extraction/word_tag_ratio.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/text_extraction/word_tag_ratio.py -------------------------------------------------------------------------------- /weibo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /weibo/feeds_crawler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/weibo/feeds_crawler.py -------------------------------------------------------------------------------- /weibo/mongo_db_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/weibo/mongo_db_manager.py -------------------------------------------------------------------------------- /weibo/mysql_db_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/weibo/mysql_db_manager.py -------------------------------------------------------------------------------- /weibo/single_demo/feeds_test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/weibo/single_demo/feeds_test.py -------------------------------------------------------------------------------- /weibo/single_demo/user_test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/weibo/single_demo/user_test.py -------------------------------------------------------------------------------- /weibo/user_crawler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/weibo/user_crawler.py -------------------------------------------------------------------------------- /wikipedia/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /wikipedia/articles.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/wikipedia/articles.py -------------------------------------------------------------------------------- /wikipedia/categories.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/wikipedia/categories.py -------------------------------------------------------------------------------- /wikipedia/infobox_parse_city.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/wikipedia/infobox_parse_city.py -------------------------------------------------------------------------------- /wikipedia/redis_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junglelord/spider-course-2/HEAD/wikipedia/redis_manager.py --------------------------------------------------------------------------------