├── .gitignore ├── README.md ├── baiduTranslate └── translater.py ├── bdbaike ├── baike_spider │ ├── __init__.py │ ├── html_downloader.py │ ├── html_outputer.py │ ├── html_parser.py │ └── url_manager.py └── spider_main.py ├── doubanBook ├── README.md ├── bookCrawler2 │ ├── books.txt │ ├── crawler.py │ └── tagList.txt ├── bookCrawler3 │ ├── README.md │ ├── bookSearch │ │ ├── bookSearch.py │ │ └── config.ini │ ├── crawler.py │ ├── initTable.sql │ └── test │ │ ├── bookinfotest.py │ │ └── multiThreading.py ├── bookSpiderXPath.py ├── book_list.txt └── book_list_spider.py ├── getWebpage ├── c1.py └── test.html ├── jenkinsJob ├── run.py └── utils.py ├── poem ├── README.md ├── config.py ├── crawler.py ├── models │ ├── __init__.py │ ├── author.py │ ├── poem.py │ └── poem_list.py └── utils │ └── http_util.py ├── proxyInfo └── proxyInfo2.py ├── qiushibaike ├── log.txt └── main.py ├── srtpInfo └── srtpSpider.py └── tieba ├── content.txt └── tiebaspider.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.pyc 3 | **/output* 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/README.md -------------------------------------------------------------------------------- /baiduTranslate/translater.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/baiduTranslate/translater.py -------------------------------------------------------------------------------- /bdbaike/baike_spider/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bdbaike/baike_spider/html_downloader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/bdbaike/baike_spider/html_downloader.py -------------------------------------------------------------------------------- /bdbaike/baike_spider/html_outputer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/bdbaike/baike_spider/html_outputer.py -------------------------------------------------------------------------------- /bdbaike/baike_spider/html_parser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/bdbaike/baike_spider/html_parser.py -------------------------------------------------------------------------------- /bdbaike/baike_spider/url_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/bdbaike/baike_spider/url_manager.py -------------------------------------------------------------------------------- /bdbaike/spider_main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/bdbaike/spider_main.py -------------------------------------------------------------------------------- /doubanBook/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/doubanBook/README.md -------------------------------------------------------------------------------- /doubanBook/bookCrawler2/books.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/doubanBook/bookCrawler2/books.txt -------------------------------------------------------------------------------- /doubanBook/bookCrawler2/crawler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/doubanBook/bookCrawler2/crawler.py -------------------------------------------------------------------------------- /doubanBook/bookCrawler2/tagList.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/doubanBook/bookCrawler2/tagList.txt -------------------------------------------------------------------------------- /doubanBook/bookCrawler3/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/doubanBook/bookCrawler3/README.md -------------------------------------------------------------------------------- /doubanBook/bookCrawler3/bookSearch/bookSearch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/doubanBook/bookCrawler3/bookSearch/bookSearch.py -------------------------------------------------------------------------------- /doubanBook/bookCrawler3/bookSearch/config.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/doubanBook/bookCrawler3/bookSearch/config.ini -------------------------------------------------------------------------------- /doubanBook/bookCrawler3/crawler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/doubanBook/bookCrawler3/crawler.py -------------------------------------------------------------------------------- /doubanBook/bookCrawler3/initTable.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/doubanBook/bookCrawler3/initTable.sql -------------------------------------------------------------------------------- /doubanBook/bookCrawler3/test/bookinfotest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/doubanBook/bookCrawler3/test/bookinfotest.py -------------------------------------------------------------------------------- /doubanBook/bookCrawler3/test/multiThreading.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/doubanBook/bookCrawler3/test/multiThreading.py -------------------------------------------------------------------------------- /doubanBook/bookSpiderXPath.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/doubanBook/bookSpiderXPath.py -------------------------------------------------------------------------------- /doubanBook/book_list.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/doubanBook/book_list.txt -------------------------------------------------------------------------------- /doubanBook/book_list_spider.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/doubanBook/book_list_spider.py -------------------------------------------------------------------------------- /getWebpage/c1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/getWebpage/c1.py -------------------------------------------------------------------------------- /getWebpage/test.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/getWebpage/test.html -------------------------------------------------------------------------------- /jenkinsJob/run.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/jenkinsJob/run.py -------------------------------------------------------------------------------- /jenkinsJob/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/jenkinsJob/utils.py -------------------------------------------------------------------------------- /poem/README.md: -------------------------------------------------------------------------------- 1 | 从百度汉语中,爬取某个作者的所有诗。 2 | 3 | 使用方法: 4 | - 修改 config.py 中的用户配置部分 5 | - 运行 crawler.py 6 | -------------------------------------------------------------------------------- /poem/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/poem/config.py -------------------------------------------------------------------------------- /poem/crawler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/poem/crawler.py -------------------------------------------------------------------------------- /poem/models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 模型类都放这里 3 | """ 4 | 5 | from models.poem_list import PoemList 6 | -------------------------------------------------------------------------------- /poem/models/author.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/poem/models/author.py -------------------------------------------------------------------------------- /poem/models/poem.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/poem/models/poem.py -------------------------------------------------------------------------------- /poem/models/poem_list.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/poem/models/poem_list.py -------------------------------------------------------------------------------- /poem/utils/http_util.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/poem/utils/http_util.py -------------------------------------------------------------------------------- /proxyInfo/proxyInfo2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/proxyInfo/proxyInfo2.py -------------------------------------------------------------------------------- /qiushibaike/log.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/qiushibaike/log.txt -------------------------------------------------------------------------------- /qiushibaike/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/qiushibaike/main.py -------------------------------------------------------------------------------- /srtpInfo/srtpSpider.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/srtpInfo/srtpSpider.py -------------------------------------------------------------------------------- /tieba/content.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/tieba/content.txt -------------------------------------------------------------------------------- /tieba/tiebaspider.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plough/myCrawler/HEAD/tieba/tiebaspider.py --------------------------------------------------------------------------------