├── requirements.txt
├── README.md
├── src
    ├── main.py
    └── du8.py
└── test
    └── test_du8.py


/requirements.txt:
--------------------------------------------------------------------------------
 1 | PyYAML==3.10
 2 | beautifulsoup4==4.1.1
 3 | distribute==0.6.27
 4 | html5lib==0.95
 5 | mechanize==0.2.5
 6 | nose==1.1.2
 7 | virtualenv==1.7.2
 8 | virtualenv-clone==0.2.4
 9 | virtualenvwrapper==3.5
10 | wsgiref==0.1.2
11 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | du8_utils
 2 | ===========
 3 | 
 4 |     pip install virtualenv
 5 |     pip install virtualenvwrapper
 6 |     pip freeze > requirements.txt
 7 |     pip install -r requirements.txt
 8 |     workon du8
 9 | 
10 | Test
11 | ====
12 | 
13 | 	nosetests -w . ./test
14 | 	nosetests -w . ./test/test_du8.py
15 | 	python test/test_du8.py
16 | 
17 | Thanks
18 | ====
19 | 
20 | + [nose](https://github.com/nose-devs/nose)
21 | + [PyYAML](http://pyyaml.org/wiki/PyYAMLDocumentation)
22 | + [Mechanize](http://wwwsearch.sourceforge.net/mechanize/)
23 | + [BeautifulSoup4](http://www.crummy.com/software/BeautifulSoup/bs4/doc/)
24 | + [html5lib](https://code.google.com/p/html5lib/wiki/UserDocumentation)
25 | 


--------------------------------------------------------------------------------
/src/main.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | 
 4 | import sys
 5 | import time
 6 | from du8 import Du8Doc
 7 | 
 8 | def usage():
 9 |     print("./src/%s <book_url>" %sys.argv[0]); 
10 |     print("\t<book_url> is the book url in du8du8"); 
11 |     print
12 |     print("\texample:")
13 |     print("\t\thttp://www.du8du8.net/book/8/8592/")
14 |     print
15 | 
if "__main__" == __name__:
    # Exactly one argument (the book URL) is required.
    if len(sys.argv) != 2:
        usage()
        sys.exit(1)

    # Python 2 only: make utf8 the default codec so printing unicode chapter
    # text works. ``reload`` is not a builtin on Python 3, where this step is
    # skipped.
    try:
        reload(sys).setdefaultencoding('utf8')
    except NameError:
        pass

    url = sys.argv[1]
    # Chapter links are appended directly to the book URL below, so the URL
    # must end with "/" or the concatenation yields a wrong address.
    if not url.endswith("/"):
        url += "/"

    doc = Du8Doc()
    for link in doc.get_links(url):
        print(link)
        title, chapter, content = doc.get_content(url + link)
        # Single formatted argument keeps the output identical on Python 2
        # and valid syntax on Python 3.
        print("%s %s %s" % (title, chapter, content))
29 | 		


--------------------------------------------------------------------------------
/src/du8.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | 
 4 | import string,re
 5 | from mechanize import Browser
 6 | from bs4 import BeautifulSoup
 7 | 
 8 | class Du8Doc:
 9 | 
10 |     def __init__(self):
11 |         self.br = Browser()
12 |         
13 |     def from_html(self, html):
14 |         text = re.sub("<.+>\n", "", html)
15 |         text = re.sub("</.+>\n", "", text)
16 |         text = re.sub('(<br/?>\s*)+', '\n', text)
17 |         text = re.sub('&nbsp;', ' ', text)
18 |         return text
19 | 
20 |     def get_links(self, url):
21 |         res = self.br.open(url)
22 |         data = res.get_data() 
23 |         soup = BeautifulSoup(data, "html5lib")
24 |         div_content = soup.find('table')
25 |         urls = div_content.find_all("a")
26 |         return [url.get('href') for url in urls ]        
27 |         
28 |     def get_content(self, link):
29 |         res = self.br.open(link)
30 |         data = res.get_data() 
31 |         soup = BeautifulSoup(data, "html5lib")
32 |         title, chapter = soup.html.title.string.split("-")[0:2]
33 |         div_content = soup.find(id="content").prettify()
34 |         content = self.from_html(div_content)
35 |         return title, chapter, content
36 | 


--------------------------------------------------------------------------------
/test/test_du8.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | 
 4 | import unittest
 5 | import sys
 6 | sys.path.append('./src')
 7 | 
 8 | from du8 import Du8Doc
 9 | 
class TestDu8(unittest.TestCase):
    """Tests for ``Du8Doc``.

    The ``from_html`` tests are pure string checks. The remaining tests call
    ``get_links`` / ``get_content`` against ``self.url`` and therefore fetch
    live pages over the network.
    """

    def setUp(self):
        self.doc = Du8Doc()
        self.url = "http://www.du8du8.net/book/8/8592/"

    def test_from_html_with_br(self):
        txt = self.doc.from_html("<p>hello<br/>word</p>")
        self.assertIsNotNone(txt, "txt is None.")
        # The <br/> between the two words must come out as a newline.
        self.assertIn("hello\nword", txt, "<br/> was not converted to a newline.")

    def test_from_html_with_comments(self):
        txt = self.doc.from_html("<p>hello <!--this is comments--> word</p>")
        self.assertIsNotNone(txt, "txt is None.")

    def test_get_links(self):
        links = self.doc.get_links(self.url)
        self.assertIsNotNone(links, "links is None.")
        self.assertEqual(links[0], u"949276.html", "first link is incorrect.")
        self.assertEqual(links[1], u"954639.html", "second link is incorrect.")
        self.assertEqual(links[2], u"958107.html", "third link is incorrect.")

    def test_get_content(self):
        title, chapter, content = self.doc.get_content(self.url + "949276.html")
        self.assertIsNotNone(content, "content is None.")
        self.assertTrue(content.startswith(u"“唔。”"), "content is incorrect.")

    def test_get_title(self):
        title, chapter, content = self.doc.get_content(self.url + "954639.html")
        self.assertIsNotNone(title, "title is None.")
        self.assertTrue(title.startswith(u"武动乾坤"), "title is incorrect.")

    def test_get_chapter(self):
        title, chapter, content = self.doc.get_content(self.url + "958107.html")
        self.assertIsNotNone(chapter, "chapter is None.")
        self.assertTrue(chapter.startswith(u"正文 第三章 古怪的石池"), "chapter is incorrect.")
44 |         
if __name__ == "__main__":
    # Python 2 only: make utf8 the default codec so the unicode assertions and
    # failure messages can be printed. ``reload`` is not a builtin on Python 3,
    # where this step is skipped instead of raising NameError.
    try:
        reload(sys).setdefaultencoding('utf8')
    except NameError:
        pass
    unittest.main()


--------------------------------------------------------------------------------