├── requirements.txt ├── README.md ├── src ├── main.py └── du8.py └── test └── test_du8.py /requirements.txt: -------------------------------------------------------------------------------- 1 | PyYAML==3.10 2 | beautifulsoup4==4.1.1 3 | distribute==0.6.27 4 | html5lib==0.95 5 | mechanize==0.2.5 6 | nose==1.1.2 7 | virtualenv==1.7.2 8 | virtualenv-clone==0.2.4 9 | virtualenvwrapper==3.5 10 | wsgiref==0.1.2 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | du8_utils 2 | =========== 3 | 4 | pip install virtualenv 5 | pip install virtualenvwrapper 6 | pip freeze > requirements.txt 7 | pip install -r requirements.txt 8 | workon du8 9 | 10 | Test 11 | ==== 12 | 13 | nosetests -w . ./test 14 | nosetests -w . ./test/test_du8.py 15 | python test/test_du8.py 16 | 17 | Thanks 18 | ==== 19 | 20 | + [nose](https://github.com/nose-devs/nose) 21 | + [PyYAML](http://pyyaml.org/wiki/PyYAMLDocumentation) 22 | + [Mechanize](http://wwwsearch.sourceforge.net/mechanize/) 23 | + [BeautifulSoup4](http://www.crummy.com/software/BeautifulSoup/bs4/doc/) 24 | + [html5lib](https://code.google.com/p/html5lib/wiki/UserDocumentation) 25 | -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | import sys 5 | import time 6 | from du8 import Du8Doc 7 | 8 | def usage(): 9 | print("./src/%s " %sys.argv[0]); 10 | print("\t is the book url in du8du8"); 11 | print 12 | print("\texample:") 13 | print("\t\thttp://www.du8du8.net/book/8/8592/") 14 | print 15 | 16 | if "__main__" == __name__: 17 | if len(sys.argv) != 2: 18 | usage() 19 | sys.exit(1) 20 | 21 | reload(sys).setdefaultencoding('utf8') 22 | url = sys.argv[1] 23 | doc = Du8Doc() 24 | links = doc.get_links(url) 25 | for link in links: 26 | print link 27 | title, chapter, content = doc.get_content(url + link) 28 | print title, chapter, content 29 | -------------------------------------------------------------------------------- /src/du8.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | import string,re 5 | from mechanize import Browser 6 | from bs4 import BeautifulSoup 7 | 8 | class Du8Doc: 9 | 10 | def __init__(self): 11 | self.br = Browser() 12 | 13 | def from_html(self, html): 14 | text = re.sub("<.+>\n", "", html) 15 | text = re.sub("\n", "", text) 16 | text = re.sub('(
\s*)+', '\n', text) 17 | text = re.sub(' ', ' ', text) 18 | return text 19 | 20 | def get_links(self, url): 21 | res = self.br.open(url) 22 | data = res.get_data() 23 | soup = BeautifulSoup(data, "html5lib") 24 | div_content = soup.find('table') 25 | urls = div_content.find_all("a") 26 | return [url.get('href') for url in urls ] 27 | 28 | def get_content(self, link): 29 | res = self.br.open(link) 30 | data = res.get_data() 31 | soup = BeautifulSoup(data, "html5lib") 32 | title, chapter = soup.html.title.string.split("-")[0:2] 33 | div_content = soup.find(id="content").prettify() 34 | content = self.from_html(div_content) 35 | return title, chapter, content 36 | -------------------------------------------------------------------------------- /test/test_du8.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | import unittest 5 | import sys 6 | sys.path.append('./src') 7 | 8 | from du8 import Du8Doc 9 | 10 | class TestDu8(unittest.TestCase): 11 | def setUp(self): 12 | self.doc = Du8Doc() 13 | self.url = "http://www.du8du8.net/book/8/8592/" 14 | 15 | def test_from_html_with_br(self): 16 | txt = self.doc.from_html("

hello
word

") 17 | assert txt != None, "txt is None." 18 | 19 | def test_from_html_with_comments(self): 20 | txt = self.doc.from_html("

hello word

") 21 | assert txt != None, "txt is None." 22 | 23 | def test_get_links(self): 24 | links = self.doc.get_links(self.url) 25 | assert links != None, "links is None." 26 | assert links[0] == u"949276.html", "first link is incorrect." 27 | assert links[1] == u"954639.html", "second link is incorrect." 28 | assert links[2] == u"958107.html", "third link is incorrect." 29 | 30 | def test_get_content(self): 31 | title, chapter, content = self.doc.get_content(self.url + "949276.html") 32 | assert content != None, "content is None." 33 | assert content.startswith(u"“唔。”"), "content is incorrect." 34 | 35 | def test_get_title(self): 36 | title, chapter, content = self.doc.get_content(self.url + "954639.html") 37 | assert title != None, "title is None." 38 | assert title.startswith(u"武动乾坤"), "title is incorrect." 39 | 40 | def test_get_chapter(self): 41 | title, chapter, content = self.doc.get_content(self.url + "958107.html") 42 | assert chapter != None, "chapter is None." 43 | assert chapter.startswith(u"正文 第三章 古怪的石池"), "chapter is incorrect." 44 | 45 | if __name__=="__main__": 46 | reload(sys).setdefaultencoding('utf8') 47 | unittest.main() --------------------------------------------------------------------------------