├── Ch2Data
    ├── DataIO
    │   ├── Mcsv
    │   │   ├── mycsv.csv
    │   │   ├── csvdemo.csv
    │   │   ├── csvdemo.py
    │   │   └── result.csv
    │   ├── Mxlsx
    │   │   ├── Myxlsxdata
    │   │   │   ├── 公牛山.jpg
    │   │   │   ├── 原谅石.jpg
    │   │   │   ├── 散步去.jpg
    │   │   │   ├── 生火.jpg
    │   │   │   ├── 致薇拉.jpg
    │   │   │   ├── 何故为敌.jpg
    │   │   │   ├── 信仰与观看.jpg
    │   │   │   ├── 全栈市场人.jpg
    │   │   │   ├── 地下铁道.jpg
    │   │   │   ├── 妹妹的坟墓.jpg
    │   │   │   ├── 宛如昨日.jpg
    │   │   │   ├── 托克维尔.jpg
    │   │   │   ├── 极简进步史.jpg
    │   │   │   ├── 消失的星国.jpg
    │   │   │   ├── 眼泪的化学.jpg
    │   │   │   ├── 草原动物园.jpg
    │   │   │   ├── 被占的宅子.jpg
    │   │   │   ├── 贩卖音乐.jpg
    │   │   │   ├── 达芬奇幽灵.jpg
    │   │   │   ├── 青年斯大林.jpg
    │   │   │   ├── 风雪追击.jpg
    │   │   │   ├── Books.xlsx
    │   │   │   ├── 中国1945.jpg
    │   │   │   ├── 午夜起来听寂静.jpg
    │   │   │   ├── 寻找时间的人.jpg
    │   │   │   ├── 希腊棺材之谜.jpg
    │   │   │   ├── 带艾伯特回家.jpg
    │   │   │   ├── 庇护二世闻见录.jpg
    │   │   │   ├── 有匪2：离恨楼.jpg
    │   │   │   ├── 私人生活的变革.jpg
    │   │   │   ├── 终极X战警2.jpg
    │   │   │   ├── 蝙蝠侠：黑与白1.jpg
    │   │   │   ├── 遇见野兔的那一年.jpg
    │   │   │   ├── 青苔不会消失.jpg
    │   │   │   ├── 驻马店伤心故事集.jpg
    │   │   │   ├── 几乎消失的偷闲艺术.jpg
    │   │   │   ├── 文明之光（第四册）.jpg
    │   │   │   ├── 食帖15：便当灵感集.jpg
    │   │   │   ├── 鲍勃·迪伦：诗人之歌.jpg
    │   │   │   ├── Hello_World.xlsx
    │   │   │   ├── Mxlsxclass.xlsx
    │   │   │   ├── 共享经济没有告诉你的事.jpg
    │   │   │   ├── 石挥谈艺录：把生命交给舞台.jpg
    │   │   │   ├── pandas_simple.xlsx
    │   │   │   ├── pandas_moresheet.xlsx
    │   │   │   ├── pandas_openpyxl.xlsx
    │   │   │   └── result.csv
    │   │   ├── Mpd_openpyxl.py
    │   │   ├── getbookpics.py
    │   │   ├── Mpd_xlsxwriter.py
    │   │   ├── Mopenpyxl.py
    │   │   ├── Mxlsxwriter.py
    │   │   └── MxlsxClass.py
    │   └── Mpymysql
    │   │   ├── csv_database.py
    │   │   ├── Mbase.py
    │   │   └── result.csv
    └── Clean
    │   ├── RealClean
    │       ├── Report.txt
    │       ├── clean_report.py
    │       └── result.csv
    │   └── Mnumpy.py
├── Ch0Grammar
    ├── name_main1.py
    ├── name_main.py
    ├── Decorator.py
    ├── OOP.py
    └── Recursion.py
├── Pics
    ├── UA.png
    ├── pi.png
    ├── Mxlsx.png
    ├── QP1.png
    ├── QQ_DA.png
    ├── Tom.png
    ├── V4_wc.png
    ├── chars.png
    ├── json1.png
    ├── json2.png
    ├── json3.png
    ├── numpy.png
    ├── nums.png
    ├── pca1.png
    ├── tree.png
    ├── 多线程1.png
    ├── 多线程2.png
    ├── 解析1.png
    ├── 解析2.png
    ├── 解析3.png
    ├── 解析4.png
    ├── Email1.png
    ├── Spider1.png
    ├── V4_Chi.png
    ├── V4_Cos.png
    ├── V4_fill.png
    ├── boxplot.png
    ├── chardet.png
    ├── cookie.png
    ├── Corr_Mat.png
    ├── V4_cos_sin.png
    ├── V4_snskde.png
    ├── V4_subplot.png
    ├── all_plot.png
    ├── charset2.png
    ├── movie_data.png
    ├── outliers.png
    ├── pair_plot.png
    ├── sepal_plot.png
    ├── V4_snsfactor.png
    ├── V4_snslmplot.png
    ├── V4_snsviolin.png
    ├── V4_pandas_bar.png
    ├── V4_pandas_bar1.png
    ├── V4_pandas_hist.png
    ├── V4_pandas_plot.png
    ├── V4_snscountplot.png
    ├── V4_pandas_boxplot.png
    ├── V4_pandas_scatter.png
    └── Data_visualization_process_v1.png
├── .gitignore
├── Asset
    └── cover.jpeg
├── Report
    ├── data
    │   ├── heart.png
    │   ├── rank.png
    │   ├── positive.png
    │   ├── wordcloud.png
    │   └── SpiderTest.txt
    └── Source
    │   ├── process_data.py
    │   ├── analy_vis_data.py
    │   ├── spider.py
    │   ├── get_data.py
    │   └── data_process.ipynb
├── Ch4Data-Life
    ├── QQ
    │   ├── QQ.jpg
    │   └── DataExtr.py
    ├── Math
    │   ├── LP.py
    │   ├── QP.py
    │   ├── ThrDoor.py
    │   └── pi.py
    └── News
    │   ├── MEmail.py
    │   ├── NewsReport.py
    │   └── NewsReportLog.txt
├── Ch1Spider
    ├── captures
    │   ├── chars.png
    │   ├── douban.jpg
    │   ├── nums.png
    │   ├── yundama.py
    │   └── yundamadoc.py
    ├── JsonandSelenium
    │   ├── jd.png
    │   ├── after-login.png
    │   ├── after-insert.png
    │   ├── before-login.png
    │   ├── jsondemo.py
    │   ├── ghostdriver.log
    │   └── seleniumdemo.py
    ├── regular expression
    │   └── demo.py
    ├── cookie
    │   ├── direct-cookies.py
    │   ├── yundama.py
    │   ├── douban_login_new.py
    │   ├── douban_login.py
    │   └── verifcode.py
    ├── muti-threads
    │   ├── getTestUrls.py
    │   └── mutithreadspool.py
    ├── exception
    │   └── try_and_exception.py
    └── first-demo
    │   ├── spider.py
    │   └── result.csv
├── Ch3Analysis-Visualization
    ├── ML
    │   ├── iris.pdf
    │   ├── tree.png
    │   ├── MTree_Demo.py
    │   ├── PipLine.py
    │   ├── MTree_Iris.py
    │   ├── MKnn.py
    │   └── MKnn2.py
    ├── Visualization
    │   ├── mask.jpg
    │   ├── 完美陌生人-短评.xlsx
    │   ├── Mseaborn.py
    │   ├── MwordClound.py
    │   ├── Mmatplotlib.py
    │   ├── Iris.py
    │   └── Iris.csv
    └── EDA
    │   └── DataCamp.py
├── README.md
└── errata.md


/Ch2Data/DataIO/Mcsv/mycsv.csv:
--------------------------------------------------------------------------------
1 | A,B
2 | 2,4
3 | 3,5
4 | 4,6
5 | 


--------------------------------------------------------------------------------
/Ch0Grammar/name_main1.py:
--------------------------------------------------------------------------------
1 | from name_main import printHello
2 | 
3 | printHello()
4 | 


--------------------------------------------------------------------------------
/Pics/UA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/UA.png


--------------------------------------------------------------------------------
/Pics/pi.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/pi.png


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mcsv/csvdemo.csv:
--------------------------------------------------------------------------------
1 | 代号,体重,身高
2 | A,65,178
3 | B,70,177
4 | C,64,180
5 | D,67,175
6 | 


--------------------------------------------------------------------------------
/Pics/Mxlsx.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/Mxlsx.png


--------------------------------------------------------------------------------
/Pics/QP1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/QP1.png


--------------------------------------------------------------------------------
/Pics/QQ_DA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/QQ_DA.png


--------------------------------------------------------------------------------
/Pics/Tom.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/Tom.png


--------------------------------------------------------------------------------
/Pics/V4_wc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/V4_wc.png


--------------------------------------------------------------------------------
/Pics/chars.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/chars.png


--------------------------------------------------------------------------------
/Pics/json1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/json1.png


--------------------------------------------------------------------------------
/Pics/json2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/json2.png


--------------------------------------------------------------------------------
/Pics/json3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/json3.png


--------------------------------------------------------------------------------
/Pics/numpy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/numpy.png


--------------------------------------------------------------------------------
/Pics/nums.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/nums.png


--------------------------------------------------------------------------------
/Pics/pca1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/pca1.png


--------------------------------------------------------------------------------
/Pics/tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/tree.png


--------------------------------------------------------------------------------
/Pics/多线程1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/多线程1.png


--------------------------------------------------------------------------------
/Pics/多线程2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/多线程2.png


--------------------------------------------------------------------------------
/Pics/解析1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/解析1.png


--------------------------------------------------------------------------------
/Pics/解析2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/解析2.png


--------------------------------------------------------------------------------
/Pics/解析3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/解析3.png


--------------------------------------------------------------------------------
/Pics/解析4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/解析4.png


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # idea
2 | .idea/
3 | 
4 | # cache
5 | **/__pycache__
6 | 
7 | # cookies
8 | **/cookies**


--------------------------------------------------------------------------------
/Asset/cover.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Asset/cover.jpeg


--------------------------------------------------------------------------------
/Pics/Email1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/Email1.png


--------------------------------------------------------------------------------
/Pics/Spider1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/Spider1.png


--------------------------------------------------------------------------------
/Pics/V4_Chi.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/V4_Chi.png


--------------------------------------------------------------------------------
/Pics/V4_Cos.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/V4_Cos.png


--------------------------------------------------------------------------------
/Pics/V4_fill.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/V4_fill.png


--------------------------------------------------------------------------------
/Pics/boxplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/boxplot.png


--------------------------------------------------------------------------------
/Pics/chardet.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/chardet.png


--------------------------------------------------------------------------------
/Pics/cookie.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/cookie.png


--------------------------------------------------------------------------------
/Pics/Corr_Mat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/Corr_Mat.png


--------------------------------------------------------------------------------
/Pics/V4_cos_sin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/V4_cos_sin.png


--------------------------------------------------------------------------------
/Pics/V4_snskde.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/V4_snskde.png


--------------------------------------------------------------------------------
/Pics/V4_subplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/V4_subplot.png


--------------------------------------------------------------------------------
/Pics/all_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/all_plot.png


--------------------------------------------------------------------------------
/Pics/charset2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/charset2.png


--------------------------------------------------------------------------------
/Pics/movie_data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/movie_data.png


--------------------------------------------------------------------------------
/Pics/outliers.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/outliers.png


--------------------------------------------------------------------------------
/Pics/pair_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/pair_plot.png


--------------------------------------------------------------------------------
/Pics/sepal_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/sepal_plot.png


--------------------------------------------------------------------------------
/Pics/V4_snsfactor.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/V4_snsfactor.png


--------------------------------------------------------------------------------
/Pics/V4_snslmplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/V4_snslmplot.png


--------------------------------------------------------------------------------
/Pics/V4_snsviolin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/V4_snsviolin.png


--------------------------------------------------------------------------------
/Report/data/heart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Report/data/heart.png


--------------------------------------------------------------------------------
/Report/data/rank.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Report/data/rank.png


--------------------------------------------------------------------------------
/Ch4Data-Life/QQ/QQ.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch4Data-Life/QQ/QQ.jpg


--------------------------------------------------------------------------------
/Pics/V4_pandas_bar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/V4_pandas_bar.png


--------------------------------------------------------------------------------
/Pics/V4_pandas_bar1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/V4_pandas_bar1.png


--------------------------------------------------------------------------------
/Pics/V4_pandas_hist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/V4_pandas_hist.png


--------------------------------------------------------------------------------
/Pics/V4_pandas_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/V4_pandas_plot.png


--------------------------------------------------------------------------------
/Pics/V4_snscountplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/V4_snscountplot.png


--------------------------------------------------------------------------------
/Report/data/positive.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Report/data/positive.png


--------------------------------------------------------------------------------
/Pics/V4_pandas_boxplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/V4_pandas_boxplot.png


--------------------------------------------------------------------------------
/Pics/V4_pandas_scatter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/V4_pandas_scatter.png


--------------------------------------------------------------------------------
/Report/data/wordcloud.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Report/data/wordcloud.png


--------------------------------------------------------------------------------
/Ch1Spider/captures/chars.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch1Spider/captures/chars.png


--------------------------------------------------------------------------------
/Ch1Spider/captures/douban.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch1Spider/captures/douban.jpg


--------------------------------------------------------------------------------
/Ch1Spider/captures/nums.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch1Spider/captures/nums.png


--------------------------------------------------------------------------------
/Ch1Spider/JsonandSelenium/jd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch1Spider/JsonandSelenium/jd.png


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/公牛山.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/公牛山.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/原谅石.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/原谅石.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/散步去.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/散步去.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/生火.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/生火.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/致薇拉.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/致薇拉.jpg


--------------------------------------------------------------------------------
/Ch3Analysis-Visualization/ML/iris.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch3Analysis-Visualization/ML/iris.pdf


--------------------------------------------------------------------------------
/Ch3Analysis-Visualization/ML/tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch3Analysis-Visualization/ML/tree.png


--------------------------------------------------------------------------------
/Pics/Data_visualization_process_v1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Pics/Data_visualization_process_v1.png


--------------------------------------------------------------------------------
/Ch1Spider/JsonandSelenium/after-login.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch1Spider/JsonandSelenium/after-login.png


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/何故为敌.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/何故为敌.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/信仰与观看.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/信仰与观看.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/全栈市场人.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/全栈市场人.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/地下铁道.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/地下铁道.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/妹妹的坟墓.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/妹妹的坟墓.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/宛如昨日.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/宛如昨日.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/托克维尔.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/托克维尔.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/极简进步史.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/极简进步史.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/消失的星国.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/消失的星国.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/眼泪的化学.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/眼泪的化学.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/草原动物园.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/草原动物园.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/被占的宅子.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/被占的宅子.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/贩卖音乐.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/贩卖音乐.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/达芬奇幽灵.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/达芬奇幽灵.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/青年斯大林.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/青年斯大林.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/风雪追击.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/风雪追击.jpg


--------------------------------------------------------------------------------
/Ch1Spider/JsonandSelenium/after-insert.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch1Spider/JsonandSelenium/after-insert.png


--------------------------------------------------------------------------------
/Ch1Spider/JsonandSelenium/before-login.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch1Spider/JsonandSelenium/before-login.png


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/Books.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/Books.xlsx


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/中国1945.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/中国1945.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/午夜起来听寂静.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/午夜起来听寂静.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/寻找时间的人.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/寻找时间的人.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/希腊棺材之谜.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/希腊棺材之谜.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/带艾伯特回家.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/带艾伯特回家.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/庇护二世闻见录.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/庇护二世闻见录.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/有匪2：离恨楼.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/有匪2：离恨楼.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/私人生活的变革.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/私人生活的变革.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/终极X战警2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/终极X战警2.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/蝙蝠侠：黑与白1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/蝙蝠侠：黑与白1.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/遇见野兔的那一年.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/遇见野兔的那一年.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/青苔不会消失.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/青苔不会消失.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/驻马店伤心故事集.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/驻马店伤心故事集.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/几乎消失的偷闲艺术.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/几乎消失的偷闲艺术.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/文明之光（第四册）.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/文明之光（第四册）.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/食帖15：便当灵感集.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/食帖15：便当灵感集.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/鲍勃·迪伦：诗人之歌.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/鲍勃·迪伦：诗人之歌.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/Hello_World.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/Hello_World.xlsx


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/Mxlsxclass.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/Mxlsxclass.xlsx


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/共享经济没有告诉你的事.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/共享经济没有告诉你的事.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/石挥谈艺录：把生命交给舞台.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/石挥谈艺录：把生命交给舞台.jpg


--------------------------------------------------------------------------------
/Ch3Analysis-Visualization/Visualization/mask.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch3Analysis-Visualization/Visualization/mask.jpg


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mcsv/csvdemo.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | 
 3 | df = pd.read_csv('csvdemo.csv')
 4 | print('DataFrame:\n', df)
 5 | 
 6 | 
 7 | 
 8 | 
 9 | 
10 | 
11 | 
12 | 


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/pandas_simple.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/pandas_simple.xlsx


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/pandas_moresheet.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/pandas_moresheet.xlsx


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/pandas_openpyxl.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch2Data/DataIO/Mxlsx/Myxlsxdata/pandas_openpyxl.xlsx


--------------------------------------------------------------------------------
/Ch3Analysis-Visualization/Visualization/完美陌生人-短评.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenxiangzhuang/PythonDataAnalysis/HEAD/Ch3Analysis-Visualization/Visualization/完美陌生人-短评.xlsx


--------------------------------------------------------------------------------
/Ch0Grammar/name_main.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | 
 3 | if __name__ == '__main__' 总结
 4 | 
 5 | '''
 6 | 
 7 | def printHello():
 8 |     print("Hello World!")
 9 |     print(__name__)
10 | 
11 | 
12 | if __name__ ==  '__main__':
13 |     printHello()
14 | 


--------------------------------------------------------------------------------
/Ch3Analysis-Visualization/ML/MTree_Demo.py:
--------------------------------------------------------------------------------
1 | from sklearn import tree
2 | 
3 | features = [[140, 1], [130, 1], [150, 0], [170, 0]]
4 | labels = [0, 0, 1, 1]
5 | clf = tree.DecisionTreeClassifier()
6 | clf = clf.fit(features, labels)
7 | print(clf.predict([[150, 0]]))
8 | 


--------------------------------------------------------------------------------
/Ch0Grammar/Decorator.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | 
 3 | 
 4 | def printtime(func):
 5 |     def wrapper(*args, **kwargs):
 6 |         print(time.ctime())
 7 |         return func(*args, **kwargs)
 8 | 
 9 |     return wrapper
10 | 
11 | 
12 | @printtime
13 | def printhello(name):
14 |     print('Hello', name)
15 | 
16 | 
17 | if __name__ == '__main__':
18 |     printhello('Sam')
19 | 


--------------------------------------------------------------------------------
/Ch2Data/Clean/RealClean/Report.txt:
--------------------------------------------------------------------------------
1 | My Report  --Created by Shen on 2017-05-30 17:11:48
2 | 2017-05-30 17:11:48: Read data from result.csv
3 | 2017-05-30 17:11:48: drop the duplicate data
4 | 2017-05-30 18:24:49: Read data from result.csv
5 | 2017-05-30 18:24:50: drop the duplicate data
6 | 2017-07-02 01:07:58: Read data from result.csv
7 | 2017-07-02 01:07:59: drop the duplicate data
8 | 


--------------------------------------------------------------------------------
/Report/data/SpiderTest.txt:
--------------------------------------------------------------------------------
1 | https://movie.douban.com/subject/26787574/comments?status=P
2 | https://movie.douban.com/subject/26787574/comments?start=0&limit=20&sort=new_score&status=P&percent_type=
3 | https://movie.douban.com/subject/26787574/comments?start=20&limit=20&sort=new_score&status=P&percent_type=
4 | https://movie.douban.com/subject/26787574/comments?start=40&limit=20&sort=new_score&status=P&percent_type=


--------------------------------------------------------------------------------
/Ch1Spider/regular expression/demo.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import requests
 3 | from fake_useragent import UserAgent
 4 | 
 5 | ua = UserAgent()
 6 | headers = {'User-Agent': ua.random}
 7 | # headers = {}
 8 | html = requests.get('https://www.baidu.com/', headers=headers)
 9 | html.encoding = 'utf-8'
10 | html = html.text
11 | # print(html)
12 | titles = re.findall(r'<a href="(http://.*?.com)" name="tj_tr.*?" class="mnav">(\w{2})</a>', html)
13 | print(titles)
14 | 


--------------------------------------------------------------------------------
/Ch0Grammar/OOP.py:
--------------------------------------------------------------------------------
 1 | class Person:
 2 |     has_hair = True
 3 | 
 4 |     def __init__(self, name, age):
 5 |         self.name = name
 6 |         self.age = age
 7 | 
 8 |     def sayhello(self, words):
 9 |         print("Hello, I'm", self.name)
10 |         print(words)
11 | 
12 | 
13 | if __name__ == '__main__':
14 |     Sally = Person('Sally', 20)
15 |     Sally.sayhello("Nice to meet you")
16 | 
17 |     Tom = Person('Tom', 19)
18 |     Tom.sayhello("Nice to meet you too")
19 | 


--------------------------------------------------------------------------------
/Ch1Spider/cookie/direct-cookies.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | from fake_useragent import UserAgent
 3 | 
 4 | mycookie_fromcopy = ''  # 这里填上从浏览器复制而来的cookie信息
 5 | 
 6 | ua = UserAgent()
 7 | headers = {'User-Agent': ua.random,
 8 |            'Cookie': mycookie_fromcopy}
 9 | url = "https://www.douban.com/people/146448257/"  # 这里是登录之前访问不到的个人信息页面
10 | data = requests.get(url, headers=headers)
11 | 
12 | print(data.status_code)
13 | print(data.request.headers)
14 | print(data.text)
15 | 


--------------------------------------------------------------------------------
/Ch1Spider/muti-threads/getTestUrls.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import requests
 3 | import pandas as pd
 4 | from fake_useragent import UserAgent
 5 | 
 6 | url = 'https://www.hao123.com/'
 7 | ua = UserAgent()
 8 | headers = {'User-Agent': ua.random}
 9 | 
10 | resp = requests.get(url, headers)
11 | data = resp.text
12 | urls = re.findall(r'href="(http.*?)"', data)
13 | 
14 | df = pd.DataFrame()
15 | 
16 | # 我们取前1000个
17 | df['url'] = urls[:1000]
18 | df.to_csv('TestUrls.csv', index=None)
19 | 


--------------------------------------------------------------------------------
/Ch0Grammar/Recursion.py:
--------------------------------------------------------------------------------
 1 | def factorial_normal(n):
 2 |     result = 1
 3 |     for i in range(n):
 4 |         result = result * n
 5 |         n = n - 1
 6 |     return result
 7 | 
 8 | 
 9 | def factorial_recursion(n):
10 |     if n == 1:
11 |         return 1
12 |     return n * factorial_recursion(n - 1)
13 | 
14 | 
15 | if __name__ == '__main__':
16 |     f10_normal = factorial_normal(5)
17 |     f10_recursion = factorial_recursion(10)
18 |     print(f10_normal)
19 |     print(f10_recursion)
20 | 
21 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # 《Python数据分析入门——从数据获取到可视化》
 2 | 
 3 | <p align="center">
 4 | <img src="./Asset/cover.jpeg" width="300" height="400" />
 5 | </p>
 6 | 
 7 | ## News
 8 | - Coming: 《写于出版6周年之后》
 9 | 
10 | ## 概览
11 | 
12 | 书籍[《Python数据分析入门——从数据获取到可视化》](http://www.broadview.com.cn/book/5010)
13 | 中使用的所有源代码，数据等文件。
14 | 关于本书的一些最新的进展的也会第一时间在这里公布。 
15 | 希望本书能对大家有所帮助。
16 | 
17 | 
18 | ## 反馈建议
19 | 
20 | - Issue/Discussion(推荐): 对于代码的问题可以提交Issue，对于其他问题可以在Discussion中讨论。
21 | - Email: 也可以发邮件给我(datahonor@gmail.com)，我会定期查看并尽快回复。
22 | 
23 | ## 勘误
24 | 详见[勘误表](./errata.md)。
25 | 


--------------------------------------------------------------------------------
/Ch4Data-Life/Math/LP.py:
--------------------------------------------------------------------------------
 1 | # # 无等式约束
 2 | import numpy as np
 3 | from cvxopt import matrix, solvers
 4 | 
 5 | c = matrix([-4., -5.])
 6 | G = matrix([[2., 1., -1., 0.], [1., 2., 0., -1.]])
 7 | h = matrix([3., 3., 0., 0.])
 8 | sol = solvers.lp(c, G, h)
 9 | print(sol['x'])
10 | 
11 | 
12 | # 有等式约束
13 | G = matrix([[1.0, 4.0, -2.0, -1.0, 0.0, 0.0], [-2.0, -1.0, 0.0, 0.0, -1.0, 0.0], [1.0, -2.0, 1.0, 0.0, 0.0, -1.0]])
14 | h = matrix([11.0, -3.0, 1.0, 0.0, 0.0, 0.0])
15 | A = matrix([-2.0, 0.0, 1.0])
16 | A = A.trans()  # 这里不转置会报错
17 | b = matrix([1.0])
18 | c = matrix([-3.0, 1.0, 1.0])
19 | sol = solvers.lp(c, G, h, A=A, b=b)
20 | print(sol['x'])
21 | 


--------------------------------------------------------------------------------
/errata.md:
--------------------------------------------------------------------------------
 1 | # 勘误
 2 | 
 3 | | 页码      | 错误                                          | 改正       |
 4 | |---------|---------------------------------------------|----------|
 5 | | 201     | 上方第一个阴影框（训练集数据）“种类”列最后两行将“bumpy”全改为“orange” | 第二次印刷时更正 |
 6 | | 202     | 第三行，“是橙子还是水果”改为“是橙子还是苹果”                    | 第二次印刷时更正 |
 7 | | 99      | 代码框最后两行交换位置（因为多线程会把`urls`清空）                | 第六次印刷时更正 |
 8 | | 115     | 正文第三行“运行输出如下。”下面的输出有误，下面的数据需要我们自己手动创建       | 第六次印刷时更正 |
 9 | | 245     | 代码框，最上面应加上`import random as rnd`            | 第六次印刷时更正 |
10 | | 247,248 | 两个LP问题的目标函数漏掉                               | 第六次印刷时更正 |
11 | | 71-73   | 豆瓣模拟登录报错                                    | 第六次印刷时更正 |
12 | 


--------------------------------------------------------------------------------
/Ch3Analysis-Visualization/ML/PipLine.py:
--------------------------------------------------------------------------------
 1 | from sklearn.datasets import load_iris
 2 | 
 3 | iris = load_iris()
 4 | 
 5 | X = iris.data
 6 | y = iris.target
 7 | 
 8 | from sklearn.model_selection import train_test_split
 9 | 
10 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
11 | 
12 | # from sklearn import tree
13 | # my_classifier = tree.DecisionTreeClassifier()
14 | 
15 | from sklearn.neighbors import KNeighborsClassifier
16 | 
17 | my_classifier = KNeighborsClassifier()
18 | my_classifier.fit(X_train, y_train)
19 | predictions = my_classifier.predict(X_test)
20 | 
21 | from sklearn.metrics import accuracy_score
22 | 
23 | print("测试准确率: ", accuracy_score(y_test, predictions))
24 | 
25 | # 0.947368421053
26 | 


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Mpd_openpyxl.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pandas as pd
 3 | from openpyxl import Workbook
 4 | from openpyxl.utils.dataframe import dataframe_to_rows
 5 | 
 6 | # 　修改工作目录
 7 | if 'Myxlsxdata' not in os.listdir():
 8 |     os.mkdir('Myxlsxdata')
 9 | os.chdir('Myxlsxdata')
10 | 
11 | # 创建数据
12 | data = {'代号': ['A', 'B', 'C', 'D'], '身高': [178, 177, 180, 175], '体重': [65, 70, 64, 67]}
13 | df = pd.DataFrame(data)
14 | 
15 | # 创建工作簿
16 | wb = Workbook()
17 | # 　插入表
18 | ws = wb.create_sheet("体测数据", 0)  # 0代表在开头插入，默认在末尾插入
19 | # 插入数据
20 | for r in dataframe_to_rows(df, index=True, header=True):
21 |     ws.append(r)
22 | 
23 | wb.save("pandas_openpyxl.xlsx")
24 | 
25 | # 读取
26 | df = pd.read_excel('pandas_openpyxl.xlsx')
27 | print(df)
28 | 


--------------------------------------------------------------------------------
/Ch4Data-Life/Math/QP.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | https://uqer.io/community/share/55c9a55df9f06c91f818c675
 3 | '''
 4 | 
 5 | # from cvxopt import solvers, matrix
 6 | # import numpy as np
 7 | #
 8 | # P = matrix(np.diag([1.0, 0]))  # 对于一些特殊矩阵，用numpy创建会方便很多（在本例中可能感受不大）
 9 | # q = matrix(np.array([3.0, 4]))
10 | # G = matrix(np.array([[-1.0, 0], [0, -1], [-1, -3], [2, 5], [3, 4]]))
11 | # h = matrix(np.array([0.0, 0, -15, 100, 80]))
12 | # sol = solvers.qp(P, q, G, h)
13 | # print(sol['x'])
14 | 
15 | import numpy as np
16 | from cvxopt import solvers, matrix
17 | 
18 | P = matrix([[1.0, 0.0], [0.0, 0.0]])
19 | q = matrix([3.0, 4.0])
20 | G = matrix([[-1.0, 0.0, -1.0, 2.0, 3.0], [0.0, -1.0, -3.0, 5.0, 4.0]])
21 | h = matrix([0.0, 0.0, -15.0, 100.0, 80.0])
22 | sol = solvers.qp(P, q, G, h)
23 | print(sol['x'])
24 | 


--------------------------------------------------------------------------------
/Ch3Analysis-Visualization/Visualization/Mseaborn.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | import numpy as np
 3 | import pandas as pd
 4 | import seaborn as sns
 5 | import matplotlib.pyplot as plt
 6 | 
 7 | # 核密度估计
 8 | 
 9 | x = np.random.normal(0, 1, 100)
10 | y = np.random.normal(1, 2, 100)
11 | 
12 | sns.kdeplot(x)
13 | sns.kdeplot(y)
14 | plt.show()
15 | 
16 | # 频数图
17 | tips = sns.load_dataset("tips")
18 | plt.subplot(121)
19 | sns.countplot('day', data=tips)
20 | plt.subplot(122)
21 | sns.countplot('sex', data=tips)
22 | plt.show()
23 | 
24 | # 线性回归
25 | sns.lmplot(x='total_bill', y='tip', hue='day', data=tips, fit_reg=True)
26 | plt.show()
27 | 
28 | # 小提琴图
29 | sns.violinplot(x='day', y='tip', data=tips)
30 | plt.show()
31 | 
32 | # 多因素分析
33 | sns.factorplot('day', 'total_bill', 'sex', data=tips, kind='violin')
34 | plt.show()
35 | 


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/getbookpics.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import requests
 3 | import pandas as pd
 4 | 
 5 | 
 6 | # 获取所有图书的封面图片,以书名为文件名
 7 | def savepics(img_urls, titles):
 8 |     for i in range(len(img_urls)):
 9 |         img_url = img_urls[i]
10 |         title = titles[i]
11 |         img_data = requests.get(img_url).content  # 二进制内容
12 |         # 存储图片
13 |         with open(str(title) + '.jpg', 'wb') as f:
14 |             f.write(img_data)
15 | 
16 | 
17 | if __name__ == '__main__':
18 |     # 为了数据文件和程序文件的分离,我们可以选择新建文件夹,并在此文件夹下进行文件的读写
19 |     if 'Myxlsxdata' not in os.listdir():
20 |         os.mkdir('Myxlsxdata')
21 |     os.chdir('Myxlsxdata')
22 | 
23 |     books_data = pd.read_csv('result.csv')  # 读入爬取的数据
24 |     img_urls = books_data['img_urls']  # 图片地址
25 |     titles = books_data['titles']  # 图书名,为图片命名
26 |     savepics(img_urls, titles)
27 | 


--------------------------------------------------------------------------------
/Ch3Analysis-Visualization/Visualization/MwordClound.py:
--------------------------------------------------------------------------------
 1 | import jieba
 2 | import pandas as pd
 3 | import matplotlib.pyplot as plt
 4 | from wordcloud import WordCloud, STOPWORDS
 5 | from scipy.misc import imread
 6 | 
 7 | 
 8 | def get_wordList():
 9 |     df = pd.read_excel('完美陌生人-短评.xlsx')
10 |     wordList = df['评论内容'].tolist()
11 |     return wordList
12 | 
13 | 
14 | def get_wordClound(mylist):
15 |     word_list = [" ".join(jieba.cut(sentence)) for sentence in mylist]
16 |     new_text = ' '.join(word_list)
17 |     pic_path = 'mask.jpg'
18 |     img_mask = imread(pic_path)
19 | 
20 |     wordcloud = WordCloud(background_color="white", font_path='/home/shen/Downloads/font/msyh.ttc',
21 |                           mask=img_mask, stopwords=STOPWORDS, ).generate(new_text)
22 |     plt.imshow(wordcloud)
23 |     plt.axis("off")
24 |     plt.show()
25 | 
26 | 
27 | if __name__ == '__main__':
28 |     wordList = get_wordList()
29 |     get_wordClound(wordList)
30 | 


--------------------------------------------------------------------------------
/Ch4Data-Life/News/MEmail.py:
--------------------------------------------------------------------------------
 1 | # coding:utf-8
 2 | from email.header import Header
 3 | from email.mime.text import MIMEText
 4 | from email.utils import parseaddr, formataddr
 5 | import smtplib
 6 | 
 7 | 
 8 | def _format_addr(s):
 9 |     name, addr = parseaddr(s)
10 |     return formataddr((Header(name, 'utf-8').encode(), addr))
11 | 
12 | 
13 | def send_ms(text_data):
14 |     from_addr = "你的邮箱"
15 |     password = '你的KEY'
16 |     to_addr = '你的邮箱'
17 |     smtp_server = 'smtp.qq.com'
18 |     msg = MIMEText(text_data, 'plain', 'utf-8')
19 |     msg['From'] = _format_addr('MySpider')
20 |     msg['To'] = _format_addr('MyPhone')
21 |     msg['Subject'] = Header('The News Report', 'utf-8').encode()
22 |     server = smtplib.SMTP_SSL(smtp_server, 465, timeout=10)
23 |     server.set_debuglevel(0)
24 |     server.login(from_addr, password)
25 |     server.sendmail(from_addr, [to_addr], msg.as_string())
26 |     server.quit()
27 | 
28 | 
29 | if __name__ == '__main__':
30 |     send_ms('Test')
31 | 


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Mpd_xlsxwriter.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | 
 3 | pandas 与 xlsxwriter交互[需要安装xlsxwriter]
 4 | https://xlsxwriter.readthedocs.io/working_with_pandas.html
 5 | '''
 6 | 
 7 | import os
 8 | import pandas as pd
 9 | 
10 | # 文件夹切换
11 | if 'Myxlsxdata' not in os.listdir():
12 |     os.mkdir('Myxlsxdata')
13 | os.chdir('Myxlsxdata')
14 | 
15 | # 数据1
16 | books_data = pd.read_csv('result.csv', usecols=['titles', 'authors', 'ratings', 'details'], na_values='NULL')
17 | df1 = pd.DataFrame(books_data)
18 | # 数据2
19 | data = {'代号': ['A', 'B', 'C', 'D'], '身高': [178, 177, 180, 175], '体重': [65, 70, 64, 67]}
20 | df2 = pd.DataFrame(data)
21 | 
22 | # 以xlsxwriter为引擎，创建writer对象，并初始化文件名为pandas_simple.xlsx
23 | writer = pd.ExcelWriter('pandas_moresheet.xlsx', engine='xlsxwriter')
24 | 
25 | # 将DataFrame存储到writer里面
26 | df1.to_excel(writer, sheet_name='豆瓣图书')
27 | df2.to_excel(writer, sheet_name='体测数据')
28 | 
29 | # 关闭writer对象，并保存写入的数据
30 | writer.save()
31 | 
32 | # 读取xlsx文件
33 | df = pd.read_excel('pandas_moresheet.xlsx', sheetname='体测数据')
34 | print(df)
35 | 


--------------------------------------------------------------------------------
/Ch1Spider/JsonandSelenium/jsondemo.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | JD 秒杀
 3 | https://miaosha.jd.com/
 4 | '''
 5 | 
 6 | 
 7 | import re
 8 | import json
 9 | import requests
10 | from fake_useragent import UserAgent
11 | 
12 | 
13 | # 为了更好地输出显示json文件
14 | def printjson(data):
15 |     json_str = json.dumps(data, indent=4, ensure_ascii=False)
16 |     print(json_str)
17 | 
18 | # 获取网页数据
19 | def getdata(json_url):
20 |     ua = UserAgent()
21 |     headers = {'User-Agent': ua.random}
22 |     data = requests.get(json_url, headers=headers)
23 |     # print(data.text)
24 | 
25 |     # 正则表达式开启贪婪匹配模式，匹配到最外层的{}以包含所有内容
26 |     re_data = re.findall('pcMiaoShaAreaList\(({.*})\)', data.text)[0]
27 |     # 转化为json格式，方便处理
28 |     json_data = json.loads(re_data)
29 |     # printjson(json_data)
30 | 
31 |     # 观察到分为brandList和miaoshaList,我们以miaoshaList为例
32 |     miaoShaList = json_data['miaoShaList']
33 |     print(miaoShaList)
34 |     print(len(miaoShaList))
35 |     printjson(miaoShaList)
36 | 
37 | 
38 | 
39 | if __name__=='__main__':
40 |     json_url1 = 'https://ai.jd.com/index_new?app=Seckill&action=pcMiaoShaAreaList&callback=pcMiaoShaAreaList&_=1493626377063'
41 |     getdata(json_url1)
42 | 
43 | 
44 | 
45 | 
46 | 
47 | 
48 | 
49 | 
50 | 


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mpymysql/csv_database.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import pymysql
 3 | 
 4 | data = pd.read_csv('result.csv')
 5 | rows_num = data.shape[0]
 6 | 
 7 | # 创建连接
 8 | db = pymysql.connect(host="localhost", user="root", password="zhengfu5zhengfu", db="PyDataBase", charset='utf8')
 9 | # 获取游标，我们用它来执行数据库的操作
10 | cursor = db.cursor()
11 | 
12 | # 执行sql语句
13 | try:
14 |     # 删除表
15 |     # 在创建新表之前检查是否已经存在此表，若存在则先删除
16 |     cursor.execute("DROP TABLE IF EXISTS DOUBAN_BOOK;")
17 |     # 创建表
18 |     cursor.execute("CREATE TABLE DOUBAN_BOOK("
19 |                    "img_urls VARCHAR (100), "
20 |                    "titles VARCHAR (100),"
21 |                    "ratings VARCHAR (20),"
22 |                    "authors VARCHAR (100),"
23 |                    "details VARCHAR (200));")
24 | 
25 |     for i in range(rows_num):
26 |         sql = "INSERT INTO DOUBAN_BOOK (img_urls, titles, " \
27 |               "ratings, authors, details)VALUES (%s,%s,%s,%s,%s)"
28 |         cursor.execute(sql, (data.ix[i, :][0], data.ix[i, :][1],
29 |                              data.ix[i, :][2], data.ix[i, :][3], data.ix[i, :][4]))
30 |         db.commit()
31 | 
32 |     cursor.close()
33 | 
34 | except:
35 |     print("ERROR!")
36 |     db.rollback()
37 | 
38 | finally:
39 |     db.close()
40 | 


--------------------------------------------------------------------------------
/Ch3Analysis-Visualization/Visualization/Mmatplotlib.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import matplotlib.pyplot as plt
 4 | 
 5 | # 基本使用
 6 | '''
 7 | x = np.linspace(-2, 2, 100)
 8 | y1 = np.cos(np.pi * x)
 9 | y2 = np.sin(np.pi * x)
10 | 
11 | plt.plot(x, y1, 'go', label=r"$y1=\cos(\pi \times x)$", alpha=0.8, linewidth=0.7)
12 | plt.plot(x, y2, 'r-', label=r"$y2=\sin(\pi \times x)$", alpha=0.8, linewidth=0.7)
13 | 
14 | plt.annotate("Important Point", (0, 1), xytext=(-1.5, 1.1),
15 |              arrowprops=dict(arrowstyle='->'))
16 | 
17 | plt.xlabel('x-axis')
18 | plt.ylabel('y-axis')
19 | 
20 | # 设置座标范围[xmin, xmax, ymin, ymax]
21 | plt.axis([-2.1, 2.1, -1.2, 1.2])
22 | 
23 | # 显示标签
24 | plt.legend()
25 | # 显示网格
26 | plt.grid(alpha=0.4)
27 | 
28 | plt.title("Two plots", color=(0.1, 0.3, 0.5))
29 | plt.show()
30 | 
31 | '''
32 | 
33 | # 进阶使用
34 | '''
35 | # 绘制子图[subplot]
36 | plt.style.use('ggplot')  # 设置绘图风格
37 | x = np.linspace(-2, 2, 100)
38 | y1 = np.sin(np.pi * x)
39 | y2 = np.cos(np.pi * x)
40 | y3 = np.tan(np.pi * x)
41 | y4 = x
42 | 
43 | plt.subplot(221)
44 | plt.plot(x, y1)
45 | 
46 | plt.subplot(222)
47 | plt.plot(x, y2)
48 | 
49 | plt.subplot(223)
50 | plt.plot(x, y3)
51 | 
52 | plt.subplot(224)
53 | plt.plot(x, y4)
54 | 
55 | plt.show()
56 | '''
57 | 


--------------------------------------------------------------------------------
/Ch1Spider/cookie/yundama.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | 参考：
 3 | http://www.bigdataway.net/index.php/node/4324
 4 | http://www.yundama.com/download/YDMHttp.html
 5 | '''
 6 | import json
 7 | import time
 8 | import requests
 9 | 
10 | def getcode_from_yundama():
11 | 
12 |     captcha_username = '你的用户名'
13 |     captcha_password = '你的密码'
14 |     captcha_id = 1
15 |     captcha_appkey = '你的KEY'
16 |     captcha_codetype = '3000'
17 |     captcha_url = 'http://api.yundama.com/api.php?method=upload'
18 |     captcha_result_url = 'http://api.yundama.com/api.php?cid{}&method=result'
19 |     filename = 'douban.jpg'
20 |     timeout = 30
21 | 
22 |     postdata = {'method': 'upload', 'username': captcha_username,
23 |                 'password': captcha_password, 'appid': captcha_id,
24 |                 'appkey': captcha_appkey, 'codetype': captcha_codetype,
25 |                 'timeout': timeout}
26 | 
27 |     fo = open(filename, 'rb')
28 |     file = {'file': fo.read()}
29 |     response = requests.post(captcha_url, postdata, files=file).text
30 |     print(response)
31 |     fo.close()
32 | 
33 |     response = json.loads(response)
34 |     code = response['text']
35 |     status = response['ret']
36 |     if status == 0:
37 |         print("识别成功！")
38 |         print('验证码为：', code)
39 | 
40 |     return code
41 | 


--------------------------------------------------------------------------------
/Ch1Spider/captures/yundama.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | 参考：
 3 | http://www.bigdataway.net/index.php/node/4324
 4 | http://www.yundama.com/download/YDMHttp.html
 5 | '''
 6 | 
 7 | import json
 8 | import time
 9 | import requests
10 | 
11 | def getcode_from_yundama():
12 | 
13 |     captcha_username = 'shensir'
14 |     captcha_password = 'zhengfu5zhengfu'
15 |     captcha_id = 1
16 |     captcha_appkey = '22cc5376925e9387a23cf797cb9ba745'
17 |     captcha_codetype = '3000'
18 |     captcha_url = 'http://api.yundama.com/api.php?method=upload'
19 |     captcha_result_url = 'http://api.yundama.com/api.php?cid{}&method=result'
20 |     filename = 'douban.jpg'
21 |     timeout = 30
22 | 
23 |     postdata = {'method': 'upload', 'username': captcha_username,
24 |                 'password': captcha_password, 'appid': captcha_id,
25 |                 'appkey': captcha_appkey, 'codetype': captcha_codetype,
26 |                 'timeout': timeout}
27 | 
28 |     fo = open(filename, 'rb')
29 |     file = {'file': fo.read()}
30 |     response = requests.post(captcha_url, postdata, files=file).text
31 |     # print(response)
32 |     fo.close()
33 | 
34 |     response = json.loads(response)
35 |     code = response['text']
36 |     status = response['ret']
37 |     if status == 0 and code:
38 |         print("识别成功！")
39 |         print('验证码为：', code)
40 | 
41 |     return code
42 | 
43 | # getcode_from_yundama()


--------------------------------------------------------------------------------
/Ch4Data-Life/Math/ThrDoor.py:
--------------------------------------------------------------------------------
 1 | import random as rnd
 2 | 
 3 | 
 4 | # 计算在第二次采取不同策略时,是否在游戏中获胜[选中汽车]
 5 | def game(strategy):
 6 |     win = 0
 7 |     # 假定汽车在0号门（参赛者并不了解这一事实）
 8 |     doors = [0, 1, 2]
 9 |     # 因为事先我们并不知道任何信息,所以第一次随机选取一扇门
10 |     first_choice = rnd.choice(doors)
11 |     # 根据第一次的选择情况的不同，第二次决策面临两种不同的备选组合
12 | 
13 |     # 如果第一次选择了0号门，那么在主持人打开另外两个门中的其中一个门后
14 |     # 第二次将要在0号门和未打开的空门（1 or 2）中作出选择
15 |     if first_choice == 0:
16 |         doors = [0, rnd.choice([1, 2])]
17 | 
18 |     # 如果第一次没有选中0，那么此时被打开的必然是另一个有山羊的门，那么
19 |     # 在第二次选择时，将在0和自己现在所处的门（first_choice）作出选择
20 |     else:
21 |         doors = [0, first_choice]
22 | 
23 |     # 采取不同的策略进行第二次选择
24 | 
25 |     # 保持原来位置不变
26 |     if strategy == 'stick':
27 |         second_choice = first_choice
28 | 
29 |     # 排除一扇空门后，放弃原来的选择，直接选择另一扇门
30 |     else:
31 |         doors.remove(first_choice)
32 |         second_choice = doors[0]
33 | 
34 |     # 记得，奖品在0号门
35 |     if second_choice == 0:
36 |         win = 1
37 | 
38 |     return win
39 | 
40 | 
41 | # 对特定策略进行的一定次数的模拟
42 | def MC(strategy, times):
43 |     wins = 0
44 |     for i in range(times):
45 |         wins += game(strategy)
46 |     # 计算获奖的概率值
47 |     p = wins / times
48 |     print('第二次选择采用' + strategy + '方法，获奖的概率为：' + str(p) + '(模拟次数为' + str(times) + ')')
49 | 
50 | 
51 | if __name__ == '__main__':
52 |     MC('stick', 10000)
53 |     MC('switch', 10000)
54 | 


--------------------------------------------------------------------------------
/Ch2Data/Clean/RealClean/clean_report.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import time
 3 | import datetime
 4 | import pandas as pd
 5 | 
 6 | 
 7 | # 获取日期和时间
 8 | def get_date_and_time():
 9 |     # 获取时间戳
10 |     timestamp = time.time()
11 |     # 将时间戳转化为指定格式的时间
12 |     value = datetime.datetime.fromtimestamp(timestamp)
13 |     date_and_time = value.strftime('%Y-%m-%d %H:%M:%S')
14 | 
15 |     return date_and_time
16 | 
17 | 
18 | # 日志文件操作
19 | def write_to_log(logname='Report.txt', operations=None):
20 |     # 检查是否创建了日志文件
21 |     if logname not in os.listdir():
22 |         with open(logname, 'w') as f:
23 |             # 创建文件
24 |             f.writelines(["My Report  --Created by Shen on ", get_date_and_time()])
25 |             f.write("\n")
26 |             # 写入数据
27 |             f.writelines([get_date_and_time(), ': '])
28 |             f.write(operations)
29 |             f.write("\n")
30 |     else:
31 |         # 已有日志文件的话，就以追加的模式写入记录
32 |         with open(logname, 'a') as f:
33 |             # 追加模式写入数据
34 |             f.writelines([get_date_and_time(), ': '])
35 |             f.write(operations)
36 |             f.write("\n")
37 | 
38 | 
39 | if __name__ == '__main__':
40 |     write_to_log(operations="Read data from result.csv")
41 |     df = pd.read_csv('result.csv')
42 | 
43 |     write_to_log(operations="drop the duplicate data")
44 |     df = df.drop_duplicates()
45 | 
46 |     '''
47 |     Other operations
48 |     '''
49 | 


--------------------------------------------------------------------------------
/Ch1Spider/JsonandSelenium/ghostdriver.log:
--------------------------------------------------------------------------------
1 | [INFO  - 2017-07-01T17:01:55.668Z] GhostDriver - Main - running on port 50069
2 | [INFO  - 2017-07-01T17:01:55.985Z] Session [fce473d0-5e7e-11e7-9c73-4b438a9347c1] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":true,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:50.0) Gecko/20100101 Firefox/50.0","webSecurityEnabled":true}
3 | [INFO  - 2017-07-01T17:01:55.985Z] Session [fce473d0-5e7e-11e7-9c73-4b438a9347c1] - page.customHeaders:  - {}
4 | [INFO  - 2017-07-01T17:01:55.985Z] Session [fce473d0-5e7e-11e7-9c73-4b438a9347c1] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"2.1.1","driverName":"ghostdriver","driverVersion":"1.2.0","platform":"linux-unknown-64bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"},"phantomjs.page.settings.userAgent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:50.0) Gecko/20100101 Firefox/50.0"}
5 | [INFO  - 2017-07-01T17:01:55.985Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: fce473d0-5e7e-11e7-9c73-4b438a9347c1
6 | 


--------------------------------------------------------------------------------
/Ch3Analysis-Visualization/ML/MTree_Iris.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | pip install pydotplus
 3 | pip install graphviz
 4 | sudo apt-get install graphviz
 5 | '''
 6 | 
 7 | import numpy as np
 8 | from sklearn.datasets import load_iris
 9 | from sklearn import tree
10 | import pydotplus
11 | from io import StringIO
12 | 
13 | # 载入数据集
14 | iris = load_iris()
15 | '''
16 | do somethings to explore the dataset
17 | '''
18 | test_idx = [0, 50, 100]
19 | 
20 | # training data
21 | train_data = np.delete(iris.data, test_idx, axis=0)
22 | train_target = np.delete(iris.target, test_idx)
23 | print(train_target)
24 | # testing data
25 | test_data = iris.data[test_idx]
26 | test_target = iris.target[test_idx]
27 | 
28 | clf = tree.DecisionTreeClassifier()
29 | clf.fit(train_data, train_target)
30 | print("正确类别：", test_target)
31 | print("预测类别：", clf.predict(test_data))
32 | 
33 | # Displaying the decision tree
34 | out = StringIO()
35 | tree.export_graphviz(clf, out_file=out,
36 |                      feature_names=iris.feature_names,
37 |                      class_names=iris.target_names,
38 |                      filled=True, rounded=True,
39 |                      impurity=False)
40 | graph = pydotplus.graph_from_dot_data(out.getvalue())
41 | # graph.write_pdf('iris.pdf')
42 | data = graph.create_png()  # 图片的二进制数据
43 | with open('tree.png', 'wb') as f:
44 |     f.write(data)
45 | 
46 | print("测试集其一数据：", test_data[0], test_target[0])
47 | print("特征：", iris.feature_names)
48 | print("标签", iris.target_names)
49 | 


--------------------------------------------------------------------------------
/Ch3Analysis-Visualization/Visualization/Iris.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import seaborn as sns
 3 | import matplotlib.pyplot as plt
 4 | 
 5 | sns.set(style="white", color_codes=True)
 6 | 
 7 | iris = pd.read_csv("Iris.csv")
 8 | print(iris.head())
 9 | 
10 | # How many examples we have of each species
11 | print(iris["Species"].value_counts())
12 | 
13 | # scatter
14 | iris.plot(kind="scatter", x="SepalLengthCm", y="SepalWidthCm")
15 | 
16 | sns.jointplot(x="SepalLengthCm", y="SepalWidthCm", data=iris, size=5)
17 | sns.FacetGrid(iris, hue="Species", size=5).map(plt.scatter, "SepalLengthCm", "SepalWidthCm").add_legend()
18 | sns.boxplot(x="Species", y="PetalLengthCm", data=iris)
19 | 
20 | ax = sns.boxplot(x="Species", y="PetalLengthCm", data=iris)
21 | ax = sns.stripplot(x="Species", y="PetalLengthCm", data=iris, jitter=True, edgecolor="gray")
22 | 
23 | sns.violinplot(x="Species", y="PetalLengthCm", data=iris)
24 | 
25 | sns.FacetGrid(iris, hue="Species", size=6) \
26 |     .map(sns.kdeplot, "PetalLengthCm") \
27 |     .add_legend()
28 | 
29 | sns.pairplot(iris.drop("Id", axis=1), hue="Species", size=3)
30 | 
31 | sns.pairplot(iris.drop("Id", axis=1), hue="Species", size=3, diag_kind="kde")
32 | iris.drop("Id", axis=1).boxplot(by="Species", figsize=(12, 6))
33 | 
34 | from pandas.tools.plotting import andrews_curves
35 | 
36 | andrews_curves(iris.drop("Id", axis=1), "Species")
37 | 
38 | from pandas.tools.plotting import radviz
39 | 
40 | radviz(iris.drop("Id", axis=1), "Species")
41 | 


--------------------------------------------------------------------------------
/Ch3Analysis-Visualization/ML/MKnn.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | 参考YouTube上Google developers的系列视频
 3 | https://www.youtube.com/watch?v=cKxRvEZd3Mw
 4 | '''
 5 | 
 6 | from scipy.spatial import distance
 7 | 
 8 | 
 9 | class ScrappyKNN():
10 |     def fit(self, X_train, y_train):
11 |         self.X_train = X_train
12 |         self.y_train = y_train
13 | 
14 |     def predict(self, X_test):
15 |         predictions = []
16 |         for row in X_test:
17 |             label = self.closest(row)
18 |             predictions.append(label)
19 | 
20 |         return predictions
21 | 
22 |     def closest(self, row):
23 |         best_dist = self.euc(row, self.X_train[0])
24 |         best_index = 0
25 |         for i in range(len(X_train)):
26 |             dist = self.euc(row, self.X_train[i])
27 |             if dist < best_dist:
28 |                 best_dist = dist
29 |                 best_index = i
30 |         return self.y_train[best_index]
31 | 
32 |     def euc(self, a, b):
33 |         return distance.euclidean(a, b)
34 | 
35 | 
36 | from sklearn.datasets import load_iris
37 | 
38 | iris = load_iris()
39 | 
40 | X = iris.data
41 | y = iris.target
42 | 
43 | from sklearn.model_selection import train_test_split
44 | 
45 | X_train, X_test, y_train, y_test = train_test_split(X, y)
46 | 
47 | my_classifier = ScrappyKNN()
48 | my_classifier.fit(X_train, y_train)
49 | predictions = my_classifier.predict(X_test)
50 | 
51 | from sklearn.metrics import accuracy_score
52 | 
53 | print(accuracy_score(y_test, predictions))
54 | 
55 | 
56 | # 0.973684210526
57 | 


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Mopenpyxl.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import numpy as np
 3 | from openpyxl import load_workbook
 4 | 
 5 | # 文件夹切换
 6 | if 'Myxlsxdata' not in os.listdir():
 7 |     os.mkdir('Myxlsxdata')
 8 | os.chdir('Myxlsxdata')
 9 | 
10 | # 读取当前工作目录下的xlsx文件
11 | wb = load_workbook('pandas_moresheet.xlsx')
12 | 
13 | # 　查看所有表名
14 | print("表名：", wb.get_sheet_names())
15 | 
16 | # 通过表名称来选择工作表
17 | ws = wb['豆瓣图书']
18 | print("行数：", len(list(ws.rows)))
19 | print("列数：", len(list(ws.columns)))
20 | 
21 | # 获取一行的数据
22 | # 这里，我们通过篇设置min_row和max_row相同来实现只取一行
23 | # 一般第一行为列名，我们打印出来
24 | row_data = []
25 | for row in ws.iter_rows(min_row=1, max_row=1, max_col=5):
26 |     for cell in row:
27 |         row_data.append(cell.value)
28 | print("第一行（列名）：", row_data)
29 | 
30 | # 获取某一列的数据,假设为第二列
31 | row_data = []
32 | for col in ws.iter_cols(min_col=2, max_col=2, max_row=41):
33 |     for cell in col:
34 |         row_data.append(cell.value)
35 | print("第二列：", row_data)
36 | 
37 | # 获取某区块的数据
38 | # 通过上面的程序也能看出，只要设置好row和col的阀值就行了
39 | # 假设获取2-3列，1-5行的数据
40 | print("区域数据(1-5，2-3):")
41 | min_col = 2
42 | max_col = 3
43 | min_row = 1
44 | max_row = 5
45 | 
46 | areadata = np.matrix(np.zeros((max_row - min_row + 1, max_col - min_col + 1)), dtype=str)
47 | for col in ws.iter_cols(min_col=min_col, max_col=max_col, min_row=min_row, max_row=max_row):
48 |     for cell in col:
49 |         col_index = cell.col_idx  # 获取所在列数
50 |         row_index = cell.row  # 获取所在行数
51 |         areadata[row_index - min_row, col_index - min_col] = cell.value
52 | 
53 | print(areadata)
54 | 


--------------------------------------------------------------------------------
/Ch2Data/Clean/Mnumpy.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env/python3
 2 | 
 3 | import sys
 4 | from datetime import datetime
 5 | import numpy as np
 6 | import matplotlib.pyplot as plt
 7 | 
 8 | 
 9 | # 使用numpy计算
10 | def numpysum(n):
11 |     a = np.arange(n) ** 2
12 |     b = np.arange(n) ** 3
13 |     c = a + b
14 |     return c
15 | 
16 | 
17 | # 使用python计算
18 | def pythonsum(n):
19 |     # 这里由于源码为Python2的，python3中range的用法有变,不再直接返回列表
20 |     # 所以强制转化列表
21 |     a = list(range(n))
22 |     b = list(range(n))
23 |     c = []
24 | 
25 |     for i in range(len(a)):
26 |         a[i] = i ** 2
27 |         b[i] = i ** 3
28 |         c.append(a[i] + b[i])
29 | 
30 |     return c
31 | 
32 | 
33 | # prt表示是否打印结果
34 | def printest(func, size, prt=True):
35 |     start = datetime.now()
36 |     c = func(size)
37 |     delta = datetime.now() - start
38 |     if prt == True:
39 |         print("The last 2 elements of the sum ", c[-2:])
40 |         print('Elapsed time in microsecondas ', delta.microseconds)
41 |     return delta.microseconds
42 | 
43 | 
44 | # 用于作n-time图
45 | def timeplot():
46 |     pts = []
47 |     for i in range(100, 10000, 100):
48 |         t_numpy = printest(numpysum, i, prt=False)
49 |         t_python = printest(pythonsum, i, prt=False)
50 |         pts.append([t_numpy, t_python])
51 |     plt.plot(pts)
52 |     plt.legend(['Numpy', 'Python'])
53 |     plt.show()
54 | 
55 | 
56 | if __name__ == '__main__':
57 |     #size = int(sys.argv[1])
58 |     size = 1000
59 |     print('Numpysum...')
60 |     printest(numpysum, size)
61 |     print('Pythonsum...')
62 |     printest(pythonsum, size)
63 |     timeplot()
64 | 


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Mxlsxwriter.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | 
 3 | xlsxwriter基本用法
 4 | http://www.python-excel.org/
 5 | 
 6 | '''
 7 | 
 8 | import os
 9 | 
10 | import pandas as pd
11 | import xlsxwriter
12 | 
13 | # 为了数据文件和程序文件的分离,我们可以选择新建文件夹,并在此文件夹下进行文件的读写
14 | if 'Myxlsxdata' not in os.listdir():
15 |     os.mkdir('Myxlsxdata')
16 | 
17 | # 切换到此文件夹下
18 | os.chdir('Myxlsxdata')
19 | 
20 | # 导入数据,只导入需要的列.若有缺失值,显示为NULL
21 | books_data = pd.read_csv('result.csv', usecols=['titles', 'authors', 'ratings', 'details'], na_values='NULL')
22 | titles = books_data['titles']
23 | authors = books_data['authors']
24 | ratings = books_data['ratings']
25 | details = books_data['details']
26 | 
27 | # 新建文件名为Books.xlsx的电子表格工作薄
28 | workbook = xlsxwriter.Workbook('Books.xlsx')
29 | 
30 | # 为创建的电子表格增加一个名为表1的表格,默认表名为sheet1, sheet2...
31 | worksheet = workbook.add_worksheet('豆瓣新书')
32 | 
33 | # 写入数据
34 | nums = len(titles)  # 数据量
35 | 
36 | # 第一行写入列名
37 | worksheet.write(0, 0, '图书封面')
38 | worksheet.write(0, 1, '图书标题')
39 | worksheet.write(0, 2, '图书作者')
40 | worksheet.write(0, 3, '图书评价')
41 | worksheet.write(0, 4, '图书细节')
42 | 
43 | # 根据内容设置列宽
44 | worksheet.set_column('A:A', 20)
45 | worksheet.set_column('B:B', 20)
46 | worksheet.set_column('C:C', 20)
47 | worksheet.set_column('D:D', 10)
48 | worksheet.set_column('E:E', 150)
49 | 
50 | # 插入图片和文本数据
51 | for i in range(1, nums):
52 |     worksheet.insert_image(i, 0, titles[i] + '.jpg')
53 |     worksheet.write(i, 1, titles[i])
54 |     worksheet.write(i, 2, authors[i])
55 |     worksheet.write(i, 3, ratings[i])
56 |     worksheet.write(i, 4, details[i])
57 | 
58 | # 存储数据,关闭工作簿
59 | workbook.close()
60 | 


--------------------------------------------------------------------------------
/Ch1Spider/JsonandSelenium/seleniumdemo.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | from selenium import webdriver
 3 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
 4 | 
 5 | 
 6 | def getdata(html):
 7 |     pass
 8 | 
 9 | 
10 | def run():
11 |     login_url = 'https://accounts.douban.com/login'  # 要打开的页面
12 |     dcap = dict(DesiredCapabilities.PHANTOMJS)
13 |     dcap["phantomjs.page.settings.userAgent"] = (
14 |         "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:50.0) Gecko/20100101 Firefox/50.0")
15 |     driver = webdriver.PhantomJS('/home/shensir/phantomjs-2.1.1-linux-x86_64/bin/phantomjs',
16 |                                  desired_capabilities=dcap)
17 |     driver.get(login_url)  # 打开网页
18 |     time.sleep(5)  # 等待5s，使得网页加载完全
19 | 
20 |     # 获取登录页面的初始图片
21 |     driver.get_screenshot_as_file('before-login.png')
22 | 
23 |     # html = driver.page_source  # 获取当前网页源码
24 |     # print(html)
25 | 
26 |     # 填写帐号密码登录
27 |     driver.find_element_by_xpath('//*[@id="email"]').send_keys('你的帐号')
28 |     driver.find_element_by_xpath('//*[@id="password"]').send_keys('你的密码')
29 | 
30 |     time.sleep(3)
31 |     # 获取填写信息后的页面
32 |     driver.get_screenshot_as_file('after-insert.png')
33 | 
34 |     # 点击登录
35 |     driver.find_element_by_xpath('//*[@id="lzform"]/div[6]/input').click()
36 |     # 查看登陆后的界面
37 |     time.sleep(3)
38 |     driver.get_screenshot_as_file('after-login.png')
39 | 
40 |     '''
41 |     进行一些登录后的操作
42 |     html = driver.get('http://...')
43 |     getdata(html)
44 |     '''
45 | 
46 |     # 若程序异常中断,driver不会自动释放
47 |     # 所以实际使用时最好就上异常处理，保证driver的释放
48 |     driver.quit()
49 | 
50 | 
51 | if __name__ == '__main__':
52 |     run()
53 | 


--------------------------------------------------------------------------------
/Ch4Data-Life/Math/pi.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import matplotlib.pyplot as plt
 3 | import matplotlib.patches as patches
 4 | 
 5 | 
 6 | def get_random_points(N):
 7 |     np.random.seed(42)
 8 |     random_points = np.random.rand(N, 2)
 9 |     return random_points
10 | 
11 | 
12 | # 计算pi的值，并将圆内外的点分开，方便做图
13 | def cal_pi(random_points):
14 |     inCircle_points = []  # 圆内部点
15 |     outCircle_points = []  # 外部点（以及边上的点）
16 | 
17 |     for point in random_points:
18 |         x = point[0]
19 |         y = point[1]
20 |         if (x - 0.5) ** 2 + (y - 0.5) ** 2 < 0.25:
21 |             inCircle_points.append([x, y])
22 |         else:
23 |             outCircle_points.append([x, y])
24 | 
25 |     ratio = len(inCircle_points) / len(random_points)
26 |     pi = 4 * ratio
27 | 
28 |     return pi, inCircle_points, outCircle_points
29 | 
30 | 
31 | def plot_data(random_points):
32 |     pi_estimation, inCircle_points, outCircle_points = cal_pi(random_points)
33 |     print('估计的pi值为:', pi_estimation)
34 | 
35 |     fig1 = plt.figure()
36 |     # 绘制圆的轮廓
37 |     ax1 = fig1.add_subplot(111, aspect='equal')
38 |     ax1.add_patch(
39 |         patches.Circle((0.5, 0.5), 0.5, fill=False, lw=2))
40 | 
41 |     # 绘制圆内外的点
42 |     ax1.plot(np.array(inCircle_points)[:, 0], np.array(inCircle_points)[:, 1],
43 |              'go', alpha=0.3, markersize=0.5)
44 |     ax1.plot(np.array(outCircle_points)[:, 0], np.array(outCircle_points)[:, 1], 'ro', alpha=0.3, markersize=0.5)
45 | 
46 |     plt.axis([0, 1, 0, 1])  # 座标轴范围约束
47 |     plt.title('$\pi\\approx' + str(pi_estimation) + '$')
48 |     plt.show()
49 | 
50 | 
51 | if __name__ == '__main__':
52 |     N = 30000
53 |     random_points = get_random_points(N)
54 |     plot_data(random_points)
55 | 


--------------------------------------------------------------------------------
/Report/Source/process_data.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import pandas as pd
 3 | 
 4 | # 读入数据，给定各字段的名字
 5 | df = pd.read_csv('../data/douban.csv', header=None, skip_blank_lines=True,
 6 |                  names=['p_name', 'p_url', 'c_date_time', 'c_data', 'c_rank', 'c_recom'])
 7 | 
 8 | # 预览数据
 9 | # print(df.head(5))
10 | 
11 | # 缺失值检测与去除
12 | print(df.isnull().sum())
13 | df.dropna(inplace=True)
14 | 
15 | 
16 | # 拆分原c_date_time为c_date和c_time
17 | def get_date(date_time):
18 |     # 有时会格式不对
19 |     if len(date_time) < 10:
20 |         return None
21 |     return re.findall(r'(\d+-\d+-\d+) \d+.*?', date_time)[0]
22 | 
23 | 
24 | def get_time(date_time):
25 |     if len(date_time) < 10:
26 |         return None
27 |     return re.findall(r'.*? (\d+:\d+:\d+)', date_time)[0]
28 | 
29 | 
30 | df['c_date'] = df['c_date_time'].apply(get_date)
31 | df['c_time'] = df['c_date_time'].apply(get_time)
32 | 
33 | # 如果需要，也可以进行数据类型的转换
34 | print(df.dtypes)
35 | df['c_date_time'] = df['c_date_time'].astype('datetime64[ns]')
36 | print(df.dtypes)
37 | 
38 | 
39 | # 也可方便地进行数据转换[Encoding Categorical Values]
40 | # 将汉字对应编码为数字
41 | def trans(words):
42 |     if words == '力荐':
43 |         return 5
44 |     elif words == '推荐':
45 |         return 4
46 |     elif words == '还行':
47 |         return 3
48 |     elif words == '较差':
49 |         return 2
50 |     elif words == '很差':
51 |         return 1
52 |     else:
53 |         return None
54 | 
55 | 
56 | df['c_rank_num'] = df['c_rank'].apply(trans)
57 | 
58 | # 设置索引列为c_date_time
59 | df.index = df['c_date_time']
60 | 
61 | # 去除多余的c_date_time列
62 | df = df.drop(['c_date_time'], axis=1)
63 | 
64 | # 其他的一些操作...
65 | 
66 | # 去除操作产生的缺失值
67 | df.dropna(inplace=True)
68 | # 保存预处理后的文件
69 | df.to_csv('../data/douban_processed.csv')
70 | 


--------------------------------------------------------------------------------
/Report/Source/analy_vis_data.py:
--------------------------------------------------------------------------------
 1 | import jieba
 2 | import pandas as pd
 3 | from snownlp import SnowNLP
 4 | import matplotlib.pyplot as plt
 5 | import seaborn as sns
 6 | from wordcloud import WordCloud, STOPWORDS
 7 | from scipy.misc import imread
 8 | 
 9 | # 读入处理好的数据
10 | df = pd.read_csv('../data/douban_processed.csv')
11 | 
12 | # 预览数据
13 | print(df.head(5))
14 | 
15 | 
16 | # 分析评论文本
17 | 
18 | # 情感分析
19 | def get_sentiments(origin_s):
20 |     s = SnowNLP(origin_s)
21 |     return s.sentiments
22 | 
23 | 
24 | df['c_sentiments'] = df['c_data'].apply(get_sentiments)
25 | df['c_sentiments'].plot.hist()
26 | plt.savefig('../data/positive.png')
27 | plt.show()
28 | 
29 | # 全部评论的关键字
30 | all_comments = ''.join(df['c_data'])
31 | all_snow = SnowNLP(all_comments)
32 | keywords = all_snow.keywords(30)
33 | print(keywords)
34 | 
35 | # 摘要
36 | # ['故事里每个人的结局都很好', '大家都不是一个人', '每个人都有故事每个人都有视角每个人都有选择']
37 | summary = all_snow.summary(3)
38 | print(summary)
39 | 
40 | # 其他应用机器学习进行探索...
41 | 
42 | # 简单的可视化
43 | sns.countplot('c_rank_num', data=df)
44 | plt.savefig('../data/rank.png')
45 | plt.show()
46 | 
47 | 
48 | # 词云
49 | def get_wordCloud(mylist):
50 |     word_list = [" ".join(jieba.cut(sentence)) for sentence in mylist]
51 |     new_text = ' '.join(word_list)
52 |     pic_path = '../data/heart.png'
53 |     img_mask = imread(pic_path)
54 | 
55 |     stopwords = set(STOPWORDS)
56 |     stopwords.add("电影")
57 |     wordcloud = WordCloud(background_color="white", max_words=2000, font_path='/home/shensir/Downloads/msyh.ttc',
58 |                           mask=img_mask, stopwords=stopwords).generate(new_text)
59 |     plt.imshow(wordcloud, interpolation='bilinear')
60 |     plt.axis("off")
61 |     plt.savefig('../data/wordcloud.png')
62 |     plt.show()
63 | 
64 | 
65 | get_wordCloud(df['c_data'])
66 | 


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mpymysql/Mbase.py:
--------------------------------------------------------------------------------
 1 | import pymysql
 2 | 
 3 | # 创建连接
 4 | db = pymysql.connect(host="localhost", user="root", password="密码", db="PyDataBase", charset='utf8')
 5 | # 获取游标，我们用它来执行数据库的操作
 6 | cursor = db.cursor()
 7 | 
 8 | 
 9 | # 　打印列名与列定义
10 | def print_colnames():
11 |     cursor.execute("SHOW COLUMNS FROM Py_Create;")
12 |     col_names = cursor.fetchall()
13 |     print(col_names)
14 |     return col_names
15 | 
16 | 
17 | # 查询数据
18 | def pritn_alldata():
19 |     cursor.execute("SELECT * FROM Py_Create;")
20 |     data = cursor.fetchall()  # 获取全部数据
21 |     print("All data: ", data)
22 |     return data
23 | 
24 | 
25 | # 执行sql语句
26 | try:
27 |     # 删除表
28 |     # 在创建新表之前检查是否已经存在此表，若存在则先删除
29 |     cursor.execute("DROP TABLE IF EXISTS Py_Create;")
30 |     # 创建表
31 |     cursor.execute("CREATE TABLE Py_Create(username VARCHAR (10), useraddr VARCHAR (20));")
32 |     # 插入数据
33 |     cursor.execute("INSERT INTO Py_Create (username,useraddr) VALUES ('员工一', '中国');")
34 |     cursor.execute("INSERT INTO Py_Create (username,useraddr) VALUES ('员工二', '美国');")
35 | 
36 |     # 打印数据
37 |     pritn_alldata()
38 | 
39 |     # 字段与记录的操作
40 | 
41 |     # 记录操作
42 |     # 插入就是INSERT语句
43 |     # 删除使用where
44 |     cursor.execute("DELETE FROM Py_Create WHERE useraddr='美国'")
45 | 
46 |     # 打印数据
47 |     pritn_alldata()
48 | 
49 |     # 字段操作
50 |     # 打印修改前的列
51 |     print_colnames()
52 | 
53 |     # 删除列
54 |     cursor.execute("ALTER TABLE Py_Create DROP username;")
55 |     # 添加列
56 |     cursor.execute("ALTER TABLE Py_Create ADD COLUMN (age TINYINT UNSIGNED);")
57 | 
58 |     # 打印修改后的列
59 |     print_colnames()
60 |     # 关闭cursor
61 |     cursor.close()
62 | 
63 |     # 提交上面的增删表和插入数据的操作到数据库
64 |     db.commit()
65 | 
66 | 
67 | except:
68 |     db.rollback()
69 |     print("ERROR!")
70 | 
71 | finally:
72 |     # 关闭数据库连接
73 |     db.close()
74 | 


--------------------------------------------------------------------------------
/Report/Source/spider.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import pickle
 3 | from bs4 import BeautifulSoup
 4 | from get_data import get_all_data
 5 | 
 6 | 
 7 | # 提交表单登录并获取cookie
 8 | def get_cookie_from_net():
 9 |     url = 'https://accounts.douban.com/login'
10 |     # 构建表单
11 |     payload = {'source': 'None',
12 |                'redir': 'https://www.douban.com/',
13 |                'form_email': '你的邮箱',
14 |                'form_password': '你的密码',
15 |                'login': '登录'}
16 | 
17 |     data = s.post(url, headers=headers, data=payload, verify=True)  # 绕过了SSL验证
18 |     with open('cookies.douban', 'wb') as f:
19 |         cookiedict = requests.utils.dict_from_cookiejar(s.cookies)
20 |         pickle.dump(cookiedict, f)
21 |     print("提交表单登录，成功获取cookies...")
22 | 
23 |     return s.cookies
24 | 
25 | 
26 | # 从cookie文件获取cookie
27 | def get_cookie_from_file():
28 |     with open('cookies.douban', 'rb') as f:
29 |         cookiedict = pickle.load(f)
30 |         cookies = requests.utils.cookiejar_from_dict(cookiedict)
31 |     print("解析文件，成功提取cookis...")
32 |     return cookies
33 | 
34 | 
35 | # 假设这里我要获取自己的签名数据
36 | def getdata(html):
37 |     soup = BeautifulSoup(html.text, 'lxml')
38 |     mydata = soup.select('#display')[0].get_text()
39 |     '''
40 |     这里进行登录后其他数据的获取及存储，这里仅仅获取了自己的签名数据。
41 |     '''
42 |     return mydata
43 | 
44 | 
45 | def login_and_getdata():
46 |     print('获取cookis...')
47 |     try:
48 |         s.cookies = get_cookie_from_file()
49 |     except:
50 |         print("从文件获取cookies失败...\n正在尝试提交表单登录以获取...")
51 |         s.cookies = get_cookie_from_net()
52 |     # 开始爬取数据
53 |     get_all_data(s, headers)
54 | 
55 | 
56 | if __name__ == '__main__':
57 |     # 一些全局变量
58 |     s = requests.session()
59 |     headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWe'
60 |                              'bKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
61 | 
62 |     # 登录并获取数据
63 |     login_and_getdata()
64 | 


--------------------------------------------------------------------------------
/Ch1Spider/cookie/douban_login_new.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import pickle
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | 
 6 | # 提交表单登录并获取cookie
 7 | def get_cookie_from_net():
 8 |     url = "https://accounts.douban.com/j/mobile/login/basic"
 9 |     # 构建表单
10 |     payload = {
11 |             "ck": "",
12 |             "name": "your email",
13 |             "password": "your password",
14 |             "remember": "true",
15 |             "ticket": ""
16 |         }
17 | 
18 |     data = s.post(url, headers=headers, data=payload).json()
19 |     # 检测登录是否成功
20 |     if data["status"] == "success":
21 |         print("登陆成功!")
22 | 
23 |     with open('cookies.douban', 'wb') as f:
24 |         cookiedict = requests.utils.dict_from_cookiejar(s.cookies)
25 |         pickle.dump(cookiedict, f)
26 |     print("成功获取cookies!")
27 | 
28 |     return s.cookies
29 | 
30 | 
31 | # 从cookie文件获取cookie
32 | def get_cookie_from_file():
33 |     with open('cookies.douban', 'rb') as f:
34 |         cookiedict = pickle.load(f)
35 |         cookies = requests.utils.cookiejar_from_dict(cookiedict)
36 |     print("解析文件，成功提取cookis...")
37 |     return cookies
38 | 
39 | 
40 | # 假设这里我要获取自己的签名数据
41 | def getdata(html):
42 |     soup = BeautifulSoup(html.text, 'lxml')
43 |     mydata = soup.select('#display')[0].get_text()
44 |     '''
45 |     这里进行登录后其他数据的获取及存储，这里仅仅获取了自己的签名数据。
46 |     '''
47 |     return mydata
48 | 
49 | 
50 | def login_and_getdata():
51 |     print('获取cookis...')
52 |     try:
53 |         s.cookies = get_cookie_from_file()
54 |     except:
55 |         print("从文件获取cookies失败...\n正在尝试提交表单登录以获取...")
56 |         s.cookies = get_cookie_from_net()
57 | 
58 |     html = s.get('https://www.douban.com/people/146448257/', headers=headers)
59 |     # print(html.text)
60 |     data = getdata(html)
61 |     print(data)
62 | 
63 | 
64 | if __name__ == '__main__':
65 |     # 一些全局变量
66 |     s = requests.session()
67 |     # 这里务必更换
68 |     headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6"}
69 |     # 登录并获取数据
70 |     login_and_getdata()
71 | 


--------------------------------------------------------------------------------
/Ch1Spider/cookie/douban_login.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import pickle
 3 | from fake_useragent import UserAgent
 4 | from bs4 import BeautifulSoup
 5 | 
 6 | # 提交表单登录并获取cookie
 7 | def get_cookie_from_net():
 8 |     url = 'https://accounts.douban.com/login'
 9 |     # 构建表单
10 |     payload = {'source': 'None',
11 |                'redir': 'https://www.douban.com/',
12 |                'form_email': 'your email',
13 |                'form_password': 'your pwd',
14 |                'login': '登录'}
15 | 
16 |     data = s.post(url, headers=headers, data=payload, verify=True)  # 绕过了SSL验证
17 |     with open('cookies.douban', 'wb') as f:
18 |         cookiedict = requests.utils.dict_from_cookiejar(s.cookies)
19 |         pickle.dump(cookiedict, f)
20 |     print("提交表单登录，成功获取cookies...")
21 | 
22 |     return s.cookies
23 | 
24 | 
25 | # 从cookie文件获取cookie
26 | def get_cookie_from_file():
27 |     with open('cookies.douban', 'rb') as f:
28 |         cookiedict = pickle.load(f)
29 |         cookies = requests.utils.cookiejar_from_dict(cookiedict)
30 |     print("解析文件，成功提取cookis...")
31 |     return cookies
32 | 
33 | 
34 | # 假设这里我要获取自己的签名数据
35 | def getdata(html):
36 |     soup = BeautifulSoup(html.text, 'lxml')
37 |     mydata = soup.select('#display')[0].get_text()
38 |     '''
39 |     这里进行登录后其他数据的获取及存储，这里仅仅获取了自己的签名数据。
40 |     '''
41 |     return mydata
42 | 
43 | 
44 | def login_and_getdata():
45 |     print('获取cookis...')
46 |     try:
47 |         s.cookies = get_cookie_from_file()
48 |     except:
49 |         print("从文件获取cookies失败...\n正在尝试提交表单登录以获取...")
50 |         s.cookies = get_cookie_from_net()
51 | 
52 |     html = s.get('https://www.douban.com/people/146448257/', headers=headers)
53 |     # print(html.text)
54 |     data = getdata(html)
55 |     print(data)
56 | 
57 | 
58 | if __name__ == '__main__':
59 |     # 一些全局变量
60 |     s = requests.session()
61 |     ua = UserAgent()
62 |     headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWe'
63 |                              'bKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
64 | 
65 |     # 登录并获取数据
66 |     login_and_getdata()
67 | 


--------------------------------------------------------------------------------
/Ch4Data-Life/QQ/DataExtr.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import datetime
 3 | import seaborn as sns
 4 | import matplotlib.pyplot as plt
 5 | import jieba
 6 | from wordcloud import WordCloud, STOPWORDS
 7 | from scipy.misc import imread
 8 | 
 9 | # 日期
10 | def get_date(data):
11 |     # 日期
12 |     dates = re.findall(r'\d{4}-\d{2}-\d{2}', data)
13 |     # 天
14 |     days = [date[-2:] for date in dates]
15 |     plt.subplot(221)
16 |     sns.countplot(days)
17 |     plt.title('Days')
18 | 
19 |     # 周几
20 |     weekdays = [datetime.date(int(date[:4]), int(date[5:7]), int(date[-2:])).isocalendar()[-1]
21 |                 for date in dates]
22 |     plt.subplot(222)
23 |     sns.countplot(weekdays)
24 |     plt.title('WeekDays')
25 | 
26 | 
27 | # 时间
28 | def get_time(data):
29 |     times = re.findall(r'\d{2}:\d{2}:\d{2}', data)
30 |     # 小时
31 |     hours = [time[:2] for time in times]
32 |     plt.subplot(223)
33 |     sns.countplot(hours, order=['06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17',
34 |                                 '18', '19', '20', '21', '22', '23', '00', '01', '02', '03', '04', '05'])
35 | 
36 |     plt.title('Hours')
37 | 
38 | 
39 | 
40 | # 词云
41 | def get_wordclound(text_data):
42 |     word_list = [" ".join(jieba.cut(sentence)) for sentence in text_data]
43 |     new_text = ' '.join(word_list)
44 | 
45 |     pic_path = 'QQ.jpg'
46 |     mang_mask = imread(pic_path)
47 |     plt.subplot(224)
48 |     wordcloud = WordCloud(background_color="white", font_path='/home/shen/Downloads/fonts/msyh.ttc',
49 |                           mask=mang_mask, stopwords=STOPWORDS).generate(new_text)
50 |     plt.imshow(wordcloud)
51 |     plt.axis("off")
52 | 
53 | # 内容及词云
54 | def get_content(data):
55 |     pa = re.compile(r'\d{4}-\d{2}-\d{2}.*?\(\d+\)\n(.*?)\n\n', re.DOTALL)
56 |     content = re.findall(pa, data)
57 |     get_wordclound(content)
58 | 
59 | 
60 | def run():
61 |     filename = 'python自学新人交流.txt'
62 |     with open(filename) as f:
63 |         data = f.read()
64 | 
65 |     get_date(data)
66 |     get_time(data)
67 |     get_content(data)
68 |     plt.show()
69 | 
70 | 
71 | if __name__ == '__main__':
72 |     run()
73 | 


--------------------------------------------------------------------------------
/Ch3Analysis-Visualization/ML/MKnn2.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | 参考YouTube上Google developers的系列视频
 3 | https://www.youtube.com/watch?v=cKxRvEZd3Mw
 4 | '''
 5 | 
 6 | import numpy as np
 7 | import operator
 8 | from scipy.spatial import distance
 9 | 
10 | 
11 | class ScrappyKNN():
12 |     def fit(self, X_train, y_train, k):
13 |         self.X_train = X_train
14 |         self.y_train = y_train
15 |         self.k = k
16 | 
17 |     def predict(self, X_test):
18 |         predictions = []
19 |         for row in X_test:
20 |             label = self.closest_k(row)
21 |             predictions.append(label)
22 | 
23 |         return predictions
24 | 
25 |     def closest_k(self, row):
26 |         # distances存储测试点到数据集各个点的距离
27 |         distances = []
28 |         for i in range(len(X_train)):
29 |             dist = self.euc(row, self.X_train[i])
30 |             distances.append(dist)
31 | 
32 |         # 转换成数组，对距离排序（从小到大）,返回位置信息
33 |         distances = np.array(distances)
34 |         sortedDistIndicies = distances.argsort()
35 | 
36 |         classCount = {}
37 |         for i in range(self.k):
38 |             voteIlabel = y_train[sortedDistIndicies[i]]
39 |             # 此处get，原字典有此voteIlabel则返回其对应的值，没有则返回0
40 |             classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
41 | 
42 |         # 根据值（对应“票数”）进行排序，使得获得票数多的类在前（故使用reverse=True）
43 |         sortedClassCount = sorted(classCount.items(),
44 |                                   key=operator.itemgetter(1), reverse=True)
45 |         # 返回该测试点的类别
46 |         return sortedClassCount[0][0]
47 | 
48 |     # 计算欧式距离
49 |     def euc(self, a, b):
50 |         return distance.euclidean(a, b)
51 | 
52 | 
53 | from sklearn.datasets import load_iris
54 | 
55 | iris = load_iris()
56 | 
57 | X = iris.data
58 | y = iris.target
59 | 
60 | from sklearn.model_selection import train_test_split
61 | 
62 | X_train, X_test, y_train, y_test = train_test_split(X, y)
63 | 
64 | my_classifier = ScrappyKNN()
65 | my_classifier.fit(X_train, y_train, k=3)
66 | predictions = my_classifier.predict(X_test)
67 | 
68 | from sklearn.metrics import accuracy_score
69 | 
70 | print(accuracy_score(y_test, predictions))
71 | 
72 | # 0.973684210526
73 | 


--------------------------------------------------------------------------------
/Ch1Spider/exception/try_and_exception.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import time
 3 | import chardet
 4 | import requests
 5 | import urllib.robotparser
 6 | from fake_useragent import UserAgent
 7 | 
 8 | 
 9 | # 获取headers
10 | def get_headers():
11 |     ua = UserAgent()
12 |     user_agent = ua.random
13 |     headers = {'User-Agent': user_agent}
14 | 
15 |     return headers
16 | 
17 | 
18 | # 这里获取代理IP的函数直接给出了proxies,
19 | # 我们也可以用此函数去爬取免费的代理IP，因为不是重点，这里不再赘述
20 | def get_proxies():
21 |     proxies = {
22 |         "http": "125.88.74.122:84",
23 |         "http": "123.84.13.240:8118",
24 |         "https": "94.240.33.242:3128"
25 |     }
26 | 
27 |     return proxies
28 | 
29 | 
30 | # robots.txt检测
31 | def robot_check(robotstxt_url, headers, url):
32 |     rp = urllib.robotparser.RobotFileParser()
33 |     rp.set_url(robotstxt_url)
34 |     rp.read()
35 |     result = rp.can_fetch(headers['User-Agent'], url)
36 | 
37 |     return result
38 | 
39 | 
40 | # 获取网页数据, 这里我们没有返回data.text,
41 | # 因为抓取图片图片时返回的应该是data.content
42 | def get_data(url, num_retries=3, proxies=None):
43 |     try:
44 |         data = requests.get(url, timeout=5, headers=headers)
45 |         print(data.status_code)
46 |     except requests.exceptions.ConnectionError as e:
47 |         print("请求错误, url:", url)
48 |         print("错误详情:", e)
49 |         data = None
50 |     except:  # other error
51 |         print("未知错误, url:", url)
52 |         data = None
53 | 
54 |     if (data != None) and (500 <= data.status_code < 600):
55 |         if (num_retries > 0):
56 |             print("服务器错误，正在重试...")
57 |             time.sleep(1)
58 |             num_retries -= 1
59 |             get_data(url, num_retries, proxies=proxies)
60 | 
61 |     return data
62 | 
63 | 
64 | # 对网页内容的解析，提取和存储等操作
65 | def parse_data(data):
66 |     if data == None:
67 |         return None
68 | 
69 |     charset = chardet.detect(data.content)
70 |     data.encoding = charset['encoding']
71 |     html_text = data.text
72 |     '''
73 |     对网页数据的解析提取等操作,假设这里要获取网页的title
74 |     '''
75 |     interesting_data = re.findall('<title>(.*?)</title>', html_text)
76 | 
77 |     return interesting_data
78 | 
79 | 
80 | if __name__ == '__main__':
81 |     headers = get_headers()
82 |     proxies = get_proxies()
83 |     data = get_data("http://www.baidu.com", num_retries=3, proxies=proxies)
84 |     interesting_data = parse_data(data)
85 |     print(interesting_data)
86 | 


--------------------------------------------------------------------------------
/Ch4Data-Life/News/NewsReport.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import os
 3 | import time
 4 | import chardet
 5 | from bs4 import BeautifulSoup
 6 | from MEmail import send_ms
 7 | 
 8 | # 获取网页数据
 9 | def get_web_data(url):
10 |     headers = {
11 |         'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36"}
12 |     html = requests.get(url, headers=headers)
13 |     Encoding = chardet.detect(html.content)['encoding']
14 |     html.encoding = Encoding
15 |     web_data = html.text
16 | 
17 |     return web_data
18 | 
19 | # 获取标题及对应链接
20 | def get_titles(web_data):
21 |     title_hrefs = {}
22 |     soup = BeautifulSoup(web_data, 'lxml')
23 |     titles_data = soup.find_all({'a': {'target': '_blank'}})
24 | 
25 |     for title in titles_data:
26 |         title_text = title.get_text()
27 | 
28 |         # 过滤一些无关的标签等[长度一般较短]
29 |         if len(title_text) >= 10:
30 |             if title.has_attr('href'):
31 |                 href = title['href']
32 |             else:
33 |                 href = 'Cannot find link...'
34 | 
35 |             title_hrefs[title_text] = href
36 | 
37 |     return title_hrefs
38 | 
39 | 
40 | # 筛选自己想了解的信息
41 | def get_roi(title_hrefs, key_words):
42 |     roi = {}  # 用于存储感兴趣的标题
43 |     for title in title_hrefs:
44 |         if key_words in title:
45 |             roi[title] = title_hrefs[title]
46 | 
47 |     return roi
48 | 
49 | # 生成本地日志记录
50 | def record(roi, key_words):
51 |     if 'NewsReportLog.txt' not in os.listdir():
52 |         with open('NewsReportLog.txt', 'w') as f:  # 写入模式
53 |             f.write(str(key_words)+'相关新闻抓取程序日志'+str(time.ctime())+'\n')
54 | 
55 |     with open('NewsReportLog.txt', 'a') as f:  # 追加模式
56 |         f.write('='*10+str(time.ctime()+'='*10))
57 |         for title in roi:
58 |             f.write(title)
59 |             f.write(roi[title])
60 | 
61 |         f.write('\n')
62 | 
63 | # 发送邮件到邮箱提醒
64 | def send_report(roi):
65 |     length = len(roi)
66 |     s1 = '本次共探测到'+str(length)+'条相关新闻'+'\n'
67 |     s2 = ''
68 |     for title in roi:
69 |         s2 += title
70 |         s2 += roi[title]
71 |         s2 += '\n'
72 |     #send_ms(s1+s2)
73 | 
74 | 
75 | if __name__=='__main__':
76 |     web_data = get_web_data("https://news.baidu.com/tech")
77 |     titles = get_titles(web_data)
78 |     key_words = 'iPhone'
79 |     roi = get_roi(titles, key_words)
80 |     print(roi)
81 |     if len(roi) != 0:
82 |         record(roi, key_words)
83 |         send_report(roi)
84 | 
85 | 


--------------------------------------------------------------------------------
/Report/Source/get_data.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | 豆瓣影评爬取：《奇迹男孩》
 3 | 登录后爬取，也只爬取了点赞数最高的500条评价，（服务器端限制了用户可以查看的条数）
 4 | '''
 5 | 
 6 | import re
 7 | import time
 8 | import csv
 9 | from bs4 import BeautifulSoup
10 | 
11 | 
12 | # 获取所有数据
13 | def get_all_data(login_session, headers):
14 |     # 创建文件，用来存储数据
15 |     file = open('../data/douban.csv', 'w')
16 |     csv_file = csv.writer(file)
17 |     # 开始爬取
18 |     page_urls = get_page_urls()
19 |     for page_url in page_urls:
20 |         try:
21 |             time.sleep(0.05)
22 |             page_data = login_session.get(page_url, headers=headers)
23 |             page_obj = BeautifulSoup(page_data.text, 'lxml')
24 |             comment_blocks = get_page_data(page_obj)
25 |             for comment_block in comment_blocks:
26 |                 get_one_com_data(comment_block, csv_file)
27 |         except Exception as e:
28 |             print(page_url)
29 |             print(e)
30 |     file.close()
31 | 
32 | 
33 | # 获取所有短评的URL（找规律）
34 | def get_page_urls():
35 |     page_urls = ["https://movie.douban.com/subject/26787574/comments?" \
36 |                  "start=%s&limit=20&sort=new_score&status=P&percent_type=" % (start)
37 |                  for start in range(0, 500, 20)]
38 |     return page_urls
39 | 
40 | 
41 | # 获取每页的短评信息
42 | def get_page_data(page_obj):
43 |     comment_blocks = page_obj.find('div', {'id': 'comments'}) \
44 |         .find_all('div', {'class': 'comment-item'})
45 |     return comment_blocks
46 | 
47 | 
48 | # 获取单个短评的信息
49 | def get_one_com_data(comment_block, csv_file):
50 |     try:
51 |         # 评价人数据
52 |         p_data = comment_block.find('a', {'class': ''})
53 |         # 评价人ID
54 |         p_name = p_data.get('title')
55 |         # 评价人主页
56 |         p_url = p_data.get('href')
57 |         # 评价具体数据
58 |         # 评价日期
59 |         c_date_time = comment_block.find('span', {'class': 'comment-time '}).get('title')
60 |         # 评价内容
61 |         c_data = comment_block.find('p', {'class': ''}).get_text()
62 |         # 评级 [在bs4中同样可以使用re，可以解决很多问题]
63 |         # 有些人未评等级
64 |         try:
65 |             c_rank = comment_block.find('span', class_=re.compile('allstar\d+ rating')).get('title')
66 |         except:
67 |             c_rank = None
68 |             pass
69 |         # 推荐（点赞）人数
70 |         c_recom = comment_block.find('span', {'class': 'votes'}).get_text()
71 |         # 将数据写入文件
72 |         if c_rank != None:
73 |             csv_file.writerow([p_name, p_url, c_date_time, c_data, c_rank, c_recom])
74 |         return [p_name, p_url, c_date_time, c_data, c_rank, c_recom]
75 |     except Exception as e:
76 |         print(e)
77 |         # print(comment_block)
78 | 


--------------------------------------------------------------------------------
/Ch1Spider/first-demo/spider.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import pandas as pd
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | 
 6 | # 请求数据
 7 | def get_data():
 8 |     url = 'https://book.douban.com/latest'
 9 |     # headers 里面大小写均可
10 |     headers = {'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0"}
11 |     data = requests.get(url, headers=headers)
12 |     # print(data.text)
13 |     return data
14 | 
15 | 
16 | # 解析数据
17 | def parse_data(data):
18 |     soup = BeautifulSoup(data.text, 'lxml')
19 |     # print(soup)
20 | 
21 |     # 观察到网页上的书籍按左右两边分布,按照标签分别提取
22 |     books_left = soup.find('ul', {'class': 'cover-col-4 clearfix'})
23 |     books_left = books_left.find_all('li')
24 | 
25 |     books_right = soup.find('ul', {'class': 'cover-col-4 pl20 clearfix'})
26 |     books_right = books_right.find_all('li')
27 | 
28 |     books = list(books_left) + list(books_right)
29 | 
30 |     # 对每一个图书区块进行相同的操作，获取图书信息
31 |     img_urls = []
32 |     titles = []
33 |     ratings = []
34 |     authors = []
35 |     details = []
36 |     for book in books:
37 |         # 图书图片url地址
38 |         img_url = book.find_all('a')[0].find('img').get('src')
39 |         img_urls.append(img_url)
40 |         # 图书标题
41 |         title = book.find_all('a')[1].get_text()
42 |         titles.append(title)
43 |         # print(title)
44 | 
45 |         # 评价星级
46 |         rating = book.find('p', {'class': 'rating'}).get_text()
47 |         rating = rating.replace('\n', '').replace(' ', '')
48 |         ratings.append(rating)
49 | 
50 |         # 作者及出版信息
51 |         author = book.find('p', {'class': 'color-gray'}).get_text()
52 |         author = author.replace('\n', '').replace(' ', '')
53 |         authors.append(author)
54 | 
55 |         # 图书简介
56 |         detail = book.find_all('p')[2].get_text()
57 |         detail = detail.replace('\n', '').replace(' ', '')
58 |         details.append(detail)
59 | 
60 |     print("img_urls: ", img_urls)
61 |     print("titles: ", titles)
62 |     print("ratings: ", ratings)
63 |     print("authors: ", authors)
64 |     print("details: ", details)
65 | 
66 |     return img_urls, titles, ratings, authors, details
67 | 
68 | 
69 | # 存储数据
70 | def save_data(img_urls, titles, ratings, authors, details):
71 |     result = pd.DataFrame()
72 |     result['img_urls'] = img_urls
73 |     result['titles'] = titles
74 |     result['ratings'] = ratings
75 |     result['authors'] = authors
76 |     result['details'] = details
77 | 
78 |     result.to_csv('result.csv', index=None)
79 | 
80 | 
81 | # 开始爬取
82 | def run():
83 |     data = get_data()
84 |     img_urls, titles, ratings, authors, details = parse_data(data)
85 |     save_data(img_urls, titles, ratings, authors, details)
86 | 
87 | 
88 | if __name__ == '__main__':
89 |     run()
90 | 


--------------------------------------------------------------------------------
/Ch4Data-Life/News/NewsReportLog.txt:
--------------------------------------------------------------------------------
1 | iPhone相关新闻抓取程序日志Mon Jun 26 08:49:05 2017
2 | ==========Mon Jun 26 08:49:05 2017==========6500元买吗？iPhone 8又有黑科技：3D传..http://tech.ifeng.com/a/20170625/44642859_0.shtml6500元买吗？iPhone 8不仅颜值高还搭载3D传感器http://news.pconline.com.cn/944/9444285.htmliPhone 8最新高清细节图曝光:无后置指纹http://mobile.yesky.com/182/237633182.shtmliPhone有什么录屏软件？iOS10如何不越狱实现录屏？http://news.86wan.com/xinwen/804619.html加拿大将迎来32GB iPhone 6：深空灰色http://iphone.tgbus.com/news/class/201706/20170625100446.shtml又一次iPhone 8爆料：壁纸和贴膜都有了http://iphone.tgbus.com/news/class/201706/20170625100212.shtml爆料大神拿到十多张iPhone8工程机图 快来看别声张http://digi.hsw.cn/system/2017/0625/85413.shtmliPhone到底是怎么诞生的？是乔布斯拿iPad改..http://www.citmt.cn/news/201706/7936.htmliPhone 8无线充电设计背后五大绝招是什么？http://tech.sina.com.cn/roll/2017-06-24/doc-ifyhmpew3268026.shtml
3 | ==========Fri Jun 30 15:24:55 2017==========iPhone8能用WiFi充电吗 iPhone8会..http://baijiahao.baidu.com/s?id=1571557098087634时光倒流十年 回顾初代苹果iPhone发售场景 http://www.cb.com.cn/shishiretu/2017_0630/1001871.htmliPhone10周年之际 设计师分享两款iPhone罕见原型机http://mobile.it168.com/a2017/0630/3138/000003138081.shtml一款电子墨水屏幕兼iPhone 7保护壳正在众筹http://news.pconline.com.cn/947/9474544.html十年：ZEALER 带你回顾历代 iPhonehttp://it.sohu.com/20170629/n499226359.shtml4.7寸经典手机 苹果iPhone 6苏宁售2578元http://mobile.pconline.com.cn/946/9468090.htmliPhone这十年也不易 它可迈过了不少坎儿http://mobile.zol.com.cn/645/6455077.html华强北红色iPhone8曝光：机身正面辣眼睛http://mobile.it168.com/a2017/0630/3138/000003138130.shtml微软或与iPhone对着干：有耳机插孔和可拆电池http://baijiahao.baidu.com/s?id=1571535272162131为实现快充！iPhone 8 有可能附赠10W充电..http://baijiahao.baidu.com/s?id=1571499839340222苹果告别神话十年，不再是身份标签的iPhone逐渐..http://news.sina.com.cn/c/2017-06-30/doc-ifyhrttz1773968.shtml苹果10年总共卖了12亿部iPhone：创收738..http://baijiahao.baidu.com/s?id=1571607401477009
4 | ==========Fri Jun 30 15:49:59 2017==========你的iPhone电量总不够用？这里赶紧关了，让电量..http://baijiahao.baidu.com/s?id=1571610812674132苹果10年总共卖了12亿部iPhone：创收738..http://baijiahao.baidu.com/s?id=1571607401477009安卓是如何击败iPhone成为市占之王？http://baijiahao.baidu.com/s?id=1571607515558253时光倒流十年 回顾初代苹果iPhone发售场景 http://www.cb.com.cn/shishiretu/2017_0630/1001871.html一款电子墨水屏幕兼iPhone 7保护壳正在众筹http://news.pconline.com.cn/947/9474544.html十年：ZEALER 带你回顾历代 iPhonehttp://it.sohu.com/20170629/n499226359.shtml4.7寸经典手机 苹果iPhone 6苏宁售2578元http://mobile.pconline.com.cn/946/9468090.htmlOLED面板缺货 iPhone 8首批备货或短缺http://mobile.zol.com.cn/645/6452259.html长沙买iPhone 7仅4199元支持分期可送货http://mobile.zol.com.cn/645/6455107.html第一代iPhone成收藏界新品 原包装未开封能卖400..http://firm.workercn.cn/497/201706/30/170630102411800.shtml华强北红色iPhone8曝光：机身正面辣眼睛http://mobile.it168.com/a2017/0630/3138/000003138130.shtml微软或与iPhone对着干：有耳机插孔和可拆电池http://baijiahao.baidu.com/s?id=1571535272162131为实现快充！iPhone 8 有可能附赠10W充电..http://baijiahao.baidu.com/s?id=1571499839340222苹果告别神话十年，不再是身份标签的iPhone逐渐..http://news.sina.com.cn/c/2017-06-30/doc-ifyhrttz1773968.shtml
5 | ==========Sun Jun 26 23:27:55 2022==========新增“古铜色”，电池加大！苹果新iPhone又有新..http://baijiahao.baidu.com/s?id=1736418600868515753iPhone 14Pro或弃用刘海屏增古铜配色http://baijiahao.baidu.com/s?id=1736404759591336703iPhone销量霸榜，高端苹果也走“薄利多销”路线..http://baijiahao.baidu.com/s?id=1736520051940361766iPhone 14大爆料 值得果粉期待吗？丨财经科..http://baijiahao.baidu.com/s?id=1736671674884272833
6 | 


--------------------------------------------------------------------------------
/Ch3Analysis-Visualization/EDA/DataCamp.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import seaborn as sns
  3 | import matplotlib.pyplot as plt
  4 | from pandas import DataFrame
  5 | import pandas as pd
  6 | from sklearn.datasets import load_iris
  7 | from sklearn.decomposition import PCA
  8 | from sklearn.model_selection import train_test_split
  9 | from sklearn.svm import SVC
 10 | from sklearn.metrics import accuracy_score
 11 | 
 12 | # 调入数据
 13 | iris = load_iris()
 14 | 
 15 | # sklearn对数据集的介绍
 16 | print(iris.DESCR)
 17 | 
 18 | # 提取数据集内容
 19 | # 这里根据需要可以不进行另外的赋值
 20 | iris_data = iris.data
 21 | feature_names = iris.feature_names
 22 | iris_target = iris.target
 23 | 
 24 | # 格式整理
 25 | iris_target.shape = (150, 1)
 26 | iris_all = np.hstack((iris_data, iris_target))
 27 | # 转化为DataFrame
 28 | iris_data_df = DataFrame(iris_data, columns=feature_names)
 29 | iris_target_df = DataFrame(iris_target, columns=['target'])
 30 | iris_data_all_df = DataFrame(iris_all, columns=feature_names + ['target'])
 31 | 
 32 | '''
 33 | 数据集基础信息的获取[以iris_data_df为例]
 34 | '''
 35 | 
 36 | # 数据预览
 37 | print(iris_data_all_df.head())  # 默认为前5行
 38 | print(iris_data_all_df.tail())  # 默认为后5行
 39 | print(iris_data_all_df.sample(5))  # 随机抽取5行
 40 | 
 41 | # 数据描述
 42 | '''
 43 | 这里是处理好的数据集，所以数据格式比较完整，不用进一步的处理。
 44 | 如有数据乱码或者出现缺失值等情况，我们当按照上一篇的方法进行适当的数据清洗。
 45 | '''
 46 | 
 47 | # print(iris_data_all_df.isnull().sum())  # 缺失值
 48 | print(iris_data_all_df.shape)  # 大小
 49 | print(iris_data_all_df.dtypes)  # 类型
 50 | print(iris_data_all_df.describe())  # 常见统计量的描述
 51 | print(iris_data_all_df.info())  # 多种信息
 52 | 
 53 | '''
 54 | 可视化的方法，来直观了解数据
 55 | '''
 56 | 
 57 | # 数据范围
 58 | sns.boxplot(data=iris_data_df)
 59 | plt.show()
 60 | 
 61 | # 总览
 62 | plt.plot(iris_data_df)
 63 | plt.legend(feature_names)
 64 | plt.show()
 65 | 
 66 | # 为了便于观察，也可以作出部分数据的图
 67 | # sepal
 68 | sepal_data_df = iris_data_df[['sepal length (cm)', 'sepal width (cm)']]
 69 | plt.plot(sepal_data_df)
 70 | plt.legend(['sepal length (cm)', 'sepal width (cm)'])
 71 | plt.title('sepal data')
 72 | plt.show()
 73 | 
 74 | # length
 75 | length_data = iris_data_df[['sepal length (cm)', 'petal length (cm)']]
 76 | plt.plot(length_data)
 77 | plt.legend(['sepal length (cm)', 'petal length (cm)'])
 78 | plt.title('length data')
 79 | 
 80 | # 相关性
 81 | sns.pairplot(iris_data_all_df, vars=iris_data_all_df.columns[:4], hue='target', size=3, kind="reg")
 82 | plt.show()
 83 | 
 84 | '''
 85 | Feature engineering
 86 | '''
 87 | 
 88 | # 变量之间的关系
 89 | Corr_Mat = iris_data_df.corr()
 90 | Mat_img = plt.matshow(Corr_Mat, cmap=plt.cm.winter_r)
 91 | plt.colorbar(Mat_img, ticks=[-1, 0, 1])
 92 | plt.show()
 93 | 
 94 | # 降维[参考Python DataScience Essentials]
 95 | pca = PCA(n_components=2)
 96 | pca_2c = pca.fit_transform(iris_data_df)
 97 | print(pca.explained_variance_ratio_)
 98 | print(pca.explained_variance_ratio_.sum())
 99 | 
100 | plt.scatter(pca_2c[:, 0], pca_2c[:, 1],
101 |             c=np.array(iris_target_df), alpha=0.8,
102 |             cmap=plt.cm.winter)
103 | 
104 | plt.show()
105 | 
106 | # train and test our model
107 | X_train, X_test, y_train, y_test = train_test_split(iris_data_df, iris_target_df, test_size=0.3)
108 | clf = SVC()
109 | clf.fit(X_train, y_train)
110 | predictions = clf.predict(X_test)
111 | print(accuracy_score(y_test, predictions))
112 | 


--------------------------------------------------------------------------------
/Ch1Spider/cookie/verifcode.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import json
  3 | import time
  4 | import pickle
  5 | import requests
  6 | import urllib.request
  7 | from PIL import Image
  8 | from fake_useragent import UserAgent
  9 | from bs4 import BeautifulSoup
 10 | from yundama import getcode_from_yundama
 11 | 
 12 | 
 13 | 
 14 | # 提交表单登录并获取cookie
 15 | 
 16 | def get_cookie_from_net():
 17 | 
 18 |     url = 'https://accounts.douban.com/login'
 19 |     login_html = s.get(url, headers=headers).text
 20 | 
 21 |     try:
 22 |         verif_img_url = re.findall(r'<img id="captcha_image" src="(.*?)" alt="captcha"', login_html)[0]
 23 |         verif_img_data = s.get(verif_img_url, headers=headers).content
 24 | 
 25 |         with open('douban.jpg', 'wb') as f:
 26 |             f.write(verif_img_data)
 27 | 
 28 |     except:
 29 |         captha_id = captha_code = None
 30 |     else:
 31 |         # 获取captcha-id
 32 |         captha_id = re.findall(r'name="captcha-id" value="(.*?)"/>', login_html)[0]
 33 |         print('captcha_id: ', captha_id)
 34 | 
 35 |         # 云打码自动获取
 36 |         print("利用云打码获取识别验证码...")
 37 |         captha_code = getcode_from_yundama()
 38 |         if not captha_code:
 39 |             print('sleeping...')
 40 |             time.sleep(10)
 41 |             captha_code = getcode_from_yundama()
 42 | 
 43 |         # 手动输入验证码
 44 |         # img = Image.open('douban.jpg')
 45 |         # Image._show(img)
 46 |         # captha_img = str(input("输入验证码："))
 47 | 
 48 |     # 构建表单
 49 |     if captha_id==None:
 50 |         payload = {'source': 'None',
 51 |                    'redir': 'https://www.douban.com/',
 52 |                    'form_email': '你的邮箱',
 53 |                    'form_password': '你的密码',
 54 |                    'login': '登录'}
 55 | 
 56 |     else:
 57 |         payload = {'source': 'None',
 58 |                    'redir': 'https://www.douban.com/',
 59 |                    'form_email': '你的邮箱',
 60 |                    'form_password': '你的邮箱',
 61 |                    'captcha-solution': captha_code,
 62 |                    'captcha-id': str(captha_id),
 63 |                    'login': '登录'}
 64 |     print(payload)
 65 | 
 66 |     url = 'https://accounts.douban.com/login'
 67 |     data = s.post(url, headers=headers, data=payload, verify=True)  # 绕过了SSL验证
 68 |     with open('cookies.douban', 'wb') as f:
 69 |         cookiedict = requests.utils.dict_from_cookiejar(s.cookies)
 70 |         pickle.dump(cookiedict, f)
 71 |     print("提交表单登录，成功获取cookies...")
 72 |     '''
 73 |     这里可以用用户名进一步的验证是否登录成功
 74 |     '''
 75 |     if '不秩稚童' in data.text:
 76 |         print("登录成功！")
 77 | 
 78 |     return s.cookies
 79 | 
 80 | # 从cookie文件获取cookie
 81 | def get_cookie_from_file():
 82 |     with open('cookies.douban', 'rb') as f:
 83 |         cookiedict = pickle.load(f)
 84 |         cookies = requests.utils.cookiejar_from_dict(cookiedict)
 85 |     print("解析文件，成功提取cookis...")
 86 |     return cookies
 87 | 
 88 | # 假设这里我要获取自己的签名数据
 89 | def getdata(html):
 90 |     soup = BeautifulSoup(html.text, 'lxml')
 91 |     mydata = soup.select('#display')[0].get_text()
 92 |     '''
 93 |     这里进行登录后其他数据的获取及存储，这里仅仅获取了自己的签名数据。
 94 |     '''
 95 |     return mydata
 96 | 
 97 | 
 98 | def login_and_getdata():
 99 |     print('获取cookis...')
100 |     try:
101 |         s.cookies = get_cookie_from_file()
102 |     except:
103 |         print("从文件获取cookies失败...\n正在尝试提交表单登录以获取...")
104 |         s.cookies = get_cookie_from_net()
105 | 
106 |     html = s.get('https://www.douban.com/people/146448257/', headers=headers)
107 |     data = getdata(html)
108 |     print(data)
109 | 
110 | 
111 | if __name__=='__main__':
112 |     # 一些全局变量
113 |     s = requests.session()
114 |     ua = UserAgent()
115 |     headers = {'User-Agent': ua.random}
116 | 
117 |     # 登录并获取数据
118 |     login_and_getdata()
119 | 
120 | 
121 | 


--------------------------------------------------------------------------------
/Ch1Spider/muti-threads/mutithreadspool.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | 多线程，多进程测试
  3 | 参考：
  4 | http://stackoverflow.com/questions/2846653/how-to-use-threading-in-python
  5 | https://docs.python.org/3.6/library/multiprocessing.html#module-multiprocessing.dummy
  6 | http://cuiqingcai.com/3325.html
  7 | '''
  8 | 
  9 | import time
 10 | import requests
 11 | import concurrent
 12 | from concurrent import futures
 13 | import pandas as pd
 14 | import threading
 15 | from multiprocessing import Pool
 16 | 
 17 | 
 18 | # 装饰器，打印函数的执行时间
 19 | def gettime(func):
 20 |     def warapper(*args, **kwargs):
 21 |         print("=" * 50)
 22 |         print(func.__name__, 'Start...')
 23 |         starttime = time.time()
 24 |         func(*args)
 25 |         endtime = time.time()
 26 |         spendtime = endtime - starttime
 27 |         print(func.__name__, "End...")
 28 |         print("Spend", spendtime, "s totally")
 29 |         print("=" * 50)
 30 | 
 31 |     return warapper
 32 | 
 33 | 
 34 | # 从文件取n个网址测试
 35 | def get_urls_from_file(n):
 36 |     df = pd.read_csv('TestUrls.csv')  # 共1000个网址
 37 |     urls = list(df['url'][:n])
 38 | 
 39 |     return urls
 40 | 
 41 | 
 42 | # 请求并解析网页获取数据（这里简单把要获取的数据设为网页源码）
 43 | def getdata(url, retries=3):
 44 |     # print("正在下载:", url)
 45 |     headers = {}
 46 |     try:
 47 |         html = requests.get(url, headers=headers)
 48 |         # print(html)
 49 | 
 50 |     except requests.exceptions.ConnectionError as e:
 51 |         # print('下载出错[ConnectionError]:', e)
 52 |         html = None
 53 | 
 54 |         # 5xx 错误为服务器错误,我们可以进行重新请求
 55 |     if (html != None and 500 <= html.status_code < 600 and retries):
 56 |         retries -= 1
 57 |         # print('服务器错误正在重试...')
 58 |         getdata(url, retries)
 59 |         data = html.text
 60 |     else:
 61 |         data = None
 62 | 
 63 |     return data
 64 | 
 65 | 
 66 | # 串行
 67 | @gettime
 68 | def Mynormal():
 69 |     for url in urls:
 70 |         getdata(url)
 71 | 
 72 | 
 73 | # 进程池
 74 | @gettime
 75 | def MyprocessPool(num=10):
 76 |     pool = Pool(num)
 77 |     results = pool.map(getdata, urls)
 78 | 
 79 |     pool.close()
 80 |     pool.join()
 81 |     return results
 82 | 
 83 | 
 84 | # 多线程
 85 | @gettime
 86 | def Mymultithread(max_threads=10):
 87 |     # 对urls的处理
 88 |     def urls_process():
 89 |         while True:
 90 |             try:
 91 |                 # 从urls末尾抽出一个url
 92 |                 url = urls.pop()
 93 |             except IndexError:
 94 |                 # urls爬取完毕，为空时，结束
 95 |                 break
 96 |             data = getdata(url, retries=3)
 97 |             '''
 98 |             这里是对网页数据的提取与存储操作
 99 |             '''
100 | 
101 |     threads = []
102 | 
103 |     # 未达到最大线程限制且仍然存在带爬取的url时，可以创建新的线程进行加速
104 |     while int(len(threads) < max_threads) and len(urls):
105 |         thread = threading.Thread(target=urls_process)
106 |         # print('创建线程', thread.getName())
107 |         thread.start()
108 |         threads.append(thread)
109 | 
110 |     for thread in threads:
111 |         thread.join()
112 | 
113 | 
114 | # 线程池
115 | @gettime
116 | def Myfutures(num_of_max_works=10):
117 |     with concurrent.futures.ThreadPoolExecutor(max_workers=num_of_max_works) as executor:
118 |         executor.map(getdata, urls)
119 | 
120 | 
121 | if __name__ == '__main__':
122 |     # 　取100个网页做测试
123 |     urls = get_urls_from_file(100)
124 |     Mynormal()  # 串行
125 |     MyprocessPool(10)  # 进程池
126 |     Myfutures(10)  # 线程池
127 |     Mymultithread(10)  # 多线程
128 | 
129 | '''
130 | 
131 | 100个网页
132 | 
133 | ==================================================
134 | Mynormal Start...
135 | Mynormal End...
136 | Spend 20.605727672576904 s totally
137 | ==================================================
138 | ==================================================
139 | MyprocessPool Start...
140 | MyprocessPool End...
141 | Spend 2.4525890350341797 s totally
142 | ==================================================
143 | ==================================================
144 | Mymutithread Start...
145 | Mymutithread End...
146 | Spend 2.1947641372680664 s totally
147 | ==================================================
148 | ==================================================
149 | Myfutures Start...
150 | Myfutures End...
151 | Spend 2.1515889167785645 s totally
152 | ==================================================
153 | 
154 | '''
155 | 


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/MxlsxClass.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import xlsxwriter
  3 | import numpy as np
  4 | import pandas as pd
  5 | from openpyxl import load_workbook
  6 | 
  7 | 
  8 | class MxlsxWB():
  9 |     def __init__(self, workpath=os.getcwd(), filename=None):
 10 |         self.workpath = workpath  # 默认在当前目录
 11 |         self.filename = filename
 12 | 
 13 |     # 设置工作目录
 14 |     def set_path(self, workpath):
 15 |         self.workpath = workpath
 16 |         os.chdir(self.workpath)
 17 | 
 18 |     # 获取文件基本信息
 19 |     def get_fileinfo(self):
 20 |         # print(self.filename)
 21 |         print("=" * 30, "FILE INFO", "=" * 30)  # 分割线
 22 |         self.wb = load_workbook(filename=self.filename)
 23 |         self.sheetnames = self.wb.get_sheet_names()
 24 | 
 25 |         print("文件" + self.filename + "共包含", len(self.sheetnames), "个工作表")
 26 |         print("表名为：", end=" ")
 27 |         for name in self.sheetnames:
 28 |             print(name, end=" ")
 29 |         print("\n")
 30 |         print("=" * 30, "END FILE INFO", "=" * 30)  # 分割线
 31 | 
 32 |     # 选择工作表
 33 |     def choose_sheet(self, sheetname=None):
 34 |         if sheetname == None:
 35 |             self.sheetname = self.sheetnames[0]
 36 | 
 37 |         self.sheetname = sheetname
 38 |         self.worksheet = self.wb[self.sheetname]
 39 | 
 40 |     # 获取工作表基本信息
 41 |     def get_sheetinfo(self):
 42 | 
 43 |         print("=" * 30, self.sheetname, "=" * 30)  # 分割线
 44 | 
 45 |         self.num_of_rows = len(list(self.worksheet.rows))
 46 |         self.num_of_cols = len(list(self.worksheet.columns))
 47 | 
 48 |         print("行数：", self.num_of_rows)
 49 |         print("列数：", self.num_of_cols)
 50 |         print("列名：", MxlsxWB.get_rowdata(self, rownum=1))
 51 | 
 52 |         print("=" * 30, self.sheetname, "=" * 30)  # 分割线
 53 | 
 54 |     '''
 55 |     基于openpyxl——数据的查询与获取
 56 |     '''
 57 | 
 58 |     # 获取单行数据
 59 |     def get_rowdata(self, rownum):
 60 |         rowdata = []
 61 |         for row in self.worksheet.iter_rows(min_row=rownum, max_row=rownum, max_col=self.num_of_cols):
 62 |             for cell in row:
 63 |                 rowdata.append(cell.value)
 64 |         # print(rowdata)
 65 |         return rowdata
 66 | 
 67 |     # 获取单列数据
 68 |     def get_coldata(self, colnum):
 69 |         coldata = []
 70 |         for col in self.worksheet.iter_cols(min_row=colnum, max_row=colnum, max_col=self.num_of_rows):
 71 |             for cell in col:
 72 |                 coldata.append(cell.value)
 73 |         # print(coldata)
 74 |         return col
 75 | 
 76 |     # 获取特定区域数据
 77 |     def get_areadata(self, min_row, max_row, min_col, max_col):
 78 |         print("=" * 30, "区域数据", "=" * 30)  # 分割线
 79 | 
 80 |         # 创建空的（全为0）矩阵，数据类型指定为str
 81 |         areadata = np.matrix(np.zeros((max_row - min_row + 1, max_col - min_col + 1)), dtype=str)
 82 |         for col in self.worksheet.iter_cols(min_row=min_row, max_row=max_row, min_col=min_col, max_col=max_col):
 83 |             for cell in col:
 84 |                 col_index = cell.col_idx
 85 |                 row_index = cell.row
 86 |                 areadata[row_index - min_row, col_index - min_col] = cell.value
 87 |         print(areadata)
 88 | 
 89 |         print("=" * 30, "区域数据", "=" * 30)  # 分割线
 90 | 
 91 |         return areadata
 92 | 
 93 |     '''基于xlsxwriter——数据的写入'''
 94 | 
 95 |     def create_workbook(self, wb_name):
 96 |         if not '.xlsx' in wb_name:  # 如果忘记加后缀，自动补全
 97 |             self.wb = xlsxwriter.Workbook(wb_name + '.xlsx')
 98 |         self.wb = xlsxwriter.Workbook(wb_name)
 99 | 
100 |     def create_worksheet(self, ws_name):
101 |         self.worksheet = self.wb.add_worksheet(ws_name)
102 | 
103 |     # 写入列名,col_names为列表
104 |     def add_col_names(self, col_names):
105 |         self.num_of_cols = len(col_names)
106 |         for i in range(self.num_of_cols):
107 |             self.worksheet.write(0, i, col_names[i])
108 | 
109 |     # 在第colx列，写入一列数据，如之前的所有图书的标题列
110 |     def add_coldata(self, data, colx):
111 |         self.num_of_rows = len(data)
112 |         for row in range(len(data)):  # 记得不要覆盖标题列，所以下面row+1
113 |             self.worksheet.write(row + 1, colx - 1, data[row])
114 | 
115 |     # 在第rowx行，写入一行数据
116 |     def add_rowdata(self, data, rowx):
117 |         for col in range(self.num_of_cols):
118 |             self.worksheet.write(rowx - 1, col, data[col])
119 | 
120 |     def save(self):
121 |         self.wb.close()
122 | 
123 |     '''基于pandas的文件读写'''
124 | 
125 |     def read_by_pandas(self, filename=None):
126 |         if filename == None:
127 |             filename = self.filename
128 |         df = pd.read_excel(filename)
129 | 
130 |         print("=" * 10, "DataFrame From " + filename + ":", "=" * 10)
131 |         print(df)
132 |         print("=" * 10, "DataFrame From " + filename + ":", "=" * 10)
133 | 
134 |         return df
135 | 
136 |     def write_by_pandas(self, df, new_filename, new_sheetname):
137 |         df.to_excel(new_filename, sheetname=new_sheetname)
138 | 
139 | 
140 | if __name__ == '__main__':
141 |     Demo = MxlsxWB(filename='pandas_simple.xlsx')
142 |     Demo.set_path("Myxlsxdata")
143 | 
144 |     Demo.get_fileinfo()
145 |     Demo.choose_sheet('豆瓣图书')
146 |     Demo.get_sheetinfo()
147 |     Demo.get_areadata(2, 3, 2, 3)
148 | 
149 |     Demo.create_workbook('Mxlsxclass.xlsx')
150 |     Demo.create_worksheet('s1')
151 |     Demo.add_col_names(['col1', 'col2'])
152 |     Demo.add_coldata([1, 2, 3, 4, 5], 1)
153 |     Demo.add_coldata([2, 3, 4, 5, 6], 2)
154 |     Demo.save()
155 | 
156 |     Demo.read_by_pandas('Mxlsxclass.xlsx')
157 | 


--------------------------------------------------------------------------------
/Ch1Spider/first-demo/result.csv:
--------------------------------------------------------------------------------
 1 | img_urls,titles,ratings,authors,details
 2 | https://img1.doubanio.com/mpic/s29343377.jpg,散步去,8.9,[日]谷口治郎/后浪丨北京联合出版公司/2017-3,享誉欧美的日本漫画大师谷口治郎的代表作，汇集了18篇散步的故事，以电影镜头般的画笔描绘了日常的散步之乐和生活之美。
 3 | https://img1.doubanio.com/mpic/s29385647.jpg,地下铁道,9.1,[美]科尔森·怀特黑德（ColsonWhitehead）/世纪文景/上海人民出版社/2017-3,美国国家图书奖获奖作品，讲述少女科拉无家可归，受到欺辱和强暴，搭乘秘密的地下铁道一路向北、投奔自由的传奇故事。
 4 | https://img3.doubanio.com/mpic/s29357091.jpg,驻马店伤心故事集,8.9,郑在欢/上海文艺出版社/2017-2,郑在欢的中国版《小城畸人》，分为「病人列传」和「cult家族」两部分，用魔幻与残酷印刻出这个世界存在的现象和本质。
 5 | https://img3.doubanio.com/mpic/s29390802.jpg,草原动物园,9.2,马伯庸/中信出版社/2017-3,一位传教士带着「半个动物园」勇闯蒙古草原的奇幻故事，马伯庸历史奇幻新作，「马亲王」新历史主义写作的进阶之路。
 6 | https://img3.doubanio.com/mpic/s29365601.jpg,遇见野兔的那一年,8.3,[芬]阿托·帕西林纳/中信出版集团/2017-3-1,芬兰国宝级作家阿托·帕西林纳里程碑式作品，一个关于追求自由、逃离都市的生命转折故事。
 7 | https://img3.doubanio.com/mpic/s29247833.jpg,被占的宅子,9.2,[阿根廷]胡利奥·科塔萨尔/南海出版公司/2017-3,阿根廷小说大师胡利奥·科塔萨尔短篇小说全集第一部，收录《彼岸》《动物寓言集》《游戏的终结》三部短篇集。
 8 | https://img3.doubanio.com/mpic/s29261915.jpg,原谅石,8.7,[美]洛里·斯皮尔曼/九州出版社/2017-2,一个女人回望过去，找到二十年前的自己。德国年度十大畅销书排行榜作品，一个关于勇气与和解的故事。
 9 | https://img1.doubanio.com/mpic/s29335378.jpg,公牛山,8.8,[美]布赖恩·帕诺威奇/上海译文出版社/2017-2,一部绵延三代人、横跨数十年的黑道世家史诗，国际惊险小说家协会2016年度首作奖获奖作品。
10 | https://img1.doubanio.com/mpic/s29360928.jpg,寻找时间的人,8.7,[爱尔兰]凯特·汤普森/江苏凤凰文艺出版社/2017-3,爱尔兰国家文学奖得主凯特·汤普森作品，一部关于成长、亲情，以及不断消失的传统的颂歌。
11 | https://img5.doubanio.com/mpic/s29245396.jpg,带艾伯特回家,8.8,[美]霍默·希卡姆/未读·北京联合出版公司/2017-3,美国畅销书作家霍默·希卡姆的治愈系小说，一只鳄鱼充满惊喜的公路之旅，一个关于爱与选择的真实故事。
12 | https://img3.doubanio.com/mpic/s29369960.jpg,眼泪的化学,评价人数不足,[澳]彼得·凯里/上海译文出版社/2017-2,当代澳大利亚文学领军人物彼得·凯里作品，一部以博物馆为背景，兼具寓言与科幻特征的小说。
13 | https://img1.doubanio.com/mpic/s29362779.jpg,风雪追击,8.8,东野圭吾/现代出版社/2017-4,日本推理小说大师东野圭吾最新长篇小说，讲述一个突然被指认为杀人凶手的男孩，为了洗清嫌疑而进行自救的故事。
14 | https://img1.doubanio.com/mpic/s29361219.jpg,生火,8.7,[法]克里斯多夫·夏布特（ChristopheChabouté）编绘/后浪丨北京联合出版公司/2017-3,改编自杰克·伦敦描写极北严寒生活的同名小说，《灯塔》作者克里斯托夫·夏布特以漫画形式，再现自然主义文学经典。
15 | https://img3.doubanio.com/mpic/s29381992.jpg,有匪2：离恨楼,9.5,Priest/湖南文艺出版社/2017-3,晋江网大神级作家Priest古言小说扛鼎之作，《有匪》系列第二部，新增独家番外内容。
16 | https://img3.doubanio.com/mpic/s29348545.jpg,妹妹的坟墓,7.9,[美]罗伯特·杜格尼(RobertDugoni)/悦读名品|化学工业出版社/2017-3-1,《纽约时报》畅销作家罗伯特·杜格尼力作，妹妹的失踪之谜，牵引出一场饱含亲情的追凶之旅。
17 | https://img1.doubanio.com/mpic/s29372029.jpg,消失的星国,评价人数不足,墨熊/浦睿文化·湖南科学技术出版社/2017-3,华语科幻星云奖得主墨熊最新科幻力作，一个发生在时空边境线的故事，一场生死未卜的探险。
18 | https://img5.doubanio.com/mpic/s29383586.jpg,宛如昨日,7.2,蔡骏/湖南文艺出版社/2017-4,悬疑畅销作家蔡骏长篇游戏幻想推理小说，在数字游戏中触碰记忆深处的罪恶，揭开尘封已久的真相。
19 | https://img3.doubanio.com/mpic/s29379993.jpg,蝙蝠侠：黑与白1,8.9,[美]丹尼斯·奥尼尔等/世界图书出版公司/2017-3,黑暗骑士侦探的硬汉冒险短篇集，以完全的黑白基调呈现蝙蝠侠世界观。IGN评选为25部史上最佳蝙蝠侠漫画之一。
20 | https://img1.doubanio.com/mpic/s29376117.jpg,终极X战警2,评价人数不足,[英]马克·米勒/[美]亚当·库伯特/世界图书出版公司/2017-3-15,击败万磁王后，X战警即将面临新的挑战，伴随着来自过去的阴影，X战警能否化险为夷？
21 | https://img3.doubanio.com/mpic/s29407650.jpg,希腊棺材之谜,9.9,[美]埃勒里·奎因/新星出版社/2017-3,古董商留下了巨额遗产，葬礼后遗嘱却不翼而飞……推理小说大师埃勒里·奎因经典之作。
22 | https://img3.doubanio.com/mpic/s29372251.jpg,午夜起来听寂静,9.1,周云蓬/北京十月文艺出版社/2017-3,著名民谣歌手、诗人周云蓬新作，精选收入1999年至2016年间的诗作，完整呈现周云蓬的心灵成长轨迹。
23 | https://img3.doubanio.com/mpic/s29339735.jpg,全栈市场人,8.0,Lydia/人民邮电出版社/2017-2-1,知乎专栏作家Lydia倾心分享互联网运营精华，教你如何做一名互联网产品宣传项目操盘手。
24 | https://img1.doubanio.com/mpic/s29402558.jpg,致薇拉,评价人数不足,[美]弗拉基米尔·纳博科夫/人民文学出版社/2017-3,纳博科夫写给妻子薇拉的书信集，持续半个多世纪的婚姻中，纳博科夫写给薇拉的书信也从1932年的相识伴随到了最后。
25 | https://img3.doubanio.com/mpic/s29234990.jpg,达芬奇幽灵,9.3,[美]托比·莱斯特（TobyLester）/中信出版集团/楚尘文化/2017-3-10,作者托比·莱斯特以达芬奇名画《维特鲁威人》为钥匙，用侦探小说般的笔法打开文艺复兴之门，捕捉西方思想史上的关键时刻。
26 | https://img3.doubanio.com/mpic/s29385511.jpg,文明之光（第四册）,8.6,吴军/人民邮电出版社/2017-3-1,计算机科学家吴军博士《文明之光》系列的第四卷，以崭新视角全面展现人类文明史中那些绚烂多彩的璀璨文明。
27 | https://img3.doubanio.com/mpic/s29385675.jpg,青年斯大林,评价人数不足,[英]西蒙·蒙蒂菲奥里/浦睿文化·民主建设出版社/2017-3,《耶路撒冷三千年》的作者蒙蒂菲奥里历时10年，重述斯大林如何从鞋匠的儿子，最终成为列宁的左右手这一鲜为人知的成长历程。
28 | https://img5.doubanio.com/mpic/s29359446.jpg,食帖15：便当灵感集,9.1,林江/中信出版集团股份有限公司/2017-2,《食帖》系列第15辑《便当灵感集》，20位便当生活家经验分享，一本每日便当生活必备灵感全书。
29 | https://img3.doubanio.com/mpic/s29376053.jpg,中国1945,评价人数不足,[美]理查德·伯恩斯坦(RichardBernstein)/社会科学文献出版社/2017-3-1,作者理查德·伯恩斯坦用引人入胜的笔调，描述了1945年这个美国与中国关系的转折之年，挑战了现代中美关系起源的传统观点。
30 | https://img3.doubanio.com/mpic/s29370575.jpg,极简进步史,评价人数不足,[英]罗纳德·赖特/北京时代华文书局/2017-4-1,英国非虚构类创作大师罗纳德·赖特经典作品，对人类社会、文明、历史、科技、环境等多方面展开反思。
31 | https://img3.doubanio.com/mpic/s29287315.jpg,贩卖音乐,9.4,[美]大卫·伊斯曼/世界图书出版公司/2017-3-1,作者大卫·伊斯曼用通俗易懂的文字，讲述了美国镀金时代与进步时代音乐产业的起源、发展和变革等历史。
32 | https://img3.doubanio.com/mpic/s29398110.jpg,信仰与观看,评价人数不足,[法]罗兰·雷希特(RolandRecht)/北京大学出版社/2017-2-17,法国著名艺术史家罗兰·雷希特就哥特式大教堂艺术提出全新定义，为理解大教堂空间打开了崭新的维度。
33 | https://img3.doubanio.com/mpic/s29343324.jpg,庇护二世闻见录,评价人数不足,[意]皮科洛米尼/浙江大学出版社·启真馆/2017-2,《闻见录》是文艺复兴时期最伟大的教皇之一庇护二世（皮科洛米尼）的杰作，生动描写了一段典型的文艺复兴时期的经历。
34 | https://img3.doubanio.com/mpic/s29383791.jpg,何故为敌,评价人数不足,[德]李峻石（GüntherSchlee）/社会科学文献出版社/2017-3,德国著名民族学家李峻石从民族学研究中的身份认同问题出发，以欧洲、非洲的个案作为切入点，建构新的冲突理论。
35 | https://img3.doubanio.com/mpic/s29331664.jpg,共享经济没有告诉你的事,8.2,[加]汤姆·斯利（TomSlee）/后浪丨江西人民出版社/2017-3,作者汤姆·斯利从与众不同的角度，揭开共享经济鲜为人知的阴暗面，起底共享经济不为人知的行业内幕。
36 | https://img3.doubanio.com/mpic/s29404793.jpg,私人生活的变革,评价人数不足,阎云翔/世纪文景/上海人民出版社/2017-3,美国亚洲学会中国研究列文森图书大奖获奖作品，探讨了农民家庭生活中的个体性与情感生活这一从未被讨论过的议题。
37 | https://img1.doubanio.com/mpic/s29244537.jpg,石挥谈艺录：把生命交给舞台,8.4,石挥/后浪丨北京联合出版公司/2017-2,「话剧皇帝」石挥迄今为止最完整的著述辑录，从侧面完整呈现了中国话剧的历史演进。
38 | https://img1.doubanio.com/mpic/s29189179.jpg,托克维尔,9.1,[法]吕西安·若姆（LucienJaume）/三辉图书/漓江出版社/2017-2,围绕「托克维尔为什么写作《论美国的民主》」这一问题，剖析了托克维尔作为社会学家、道德家以及文学家的不同侧面。
39 | https://img3.doubanio.com/mpic/s29402102.jpg,几乎消失的偷闲艺术,评价人数不足,[加拿大]达尼·拉费里埃/海天出版社/2017-4,加拿大作家达尼·拉费里埃关于慢生活的一本美文集。几乎被人忘却的偷闲艺术，就是生活的艺术。
40 | https://img3.doubanio.com/mpic/s29373842.jpg,青苔不会消失,9.4,袁凌/中信出版集团/中信大方/2017-4,精选了袁凌十多年来记者生涯中最为杰出的十二篇非虚构作品，写出了一百位中国社会底层的人物故事。
41 | https://img1.doubanio.com/mpic/s29364988.jpg,鲍勃·迪伦：诗人之歌,9.2,[法]让-多米尼克·布里埃/湖南文艺出版社·读行者品牌/2017-4,基于鲍勃·迪伦从1962年到2015年的作品及访谈，作者让-多米尼克·布里埃重新勾勒了鲍勃·迪伦的传奇一生。
42 | 


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mcsv/result.csv:
--------------------------------------------------------------------------------
 1 | img_urls,titles,ratings,authors,details
 2 | https://img1.doubanio.com/mpic/s29343377.jpg,散步去,8.9,[日]谷口治郎/后浪丨北京联合出版公司/2017-3,享誉欧美的日本漫画大师谷口治郎的代表作，汇集了18篇散步的故事，以电影镜头般的画笔描绘了日常的散步之乐和生活之美。
 3 | https://img1.doubanio.com/mpic/s29385647.jpg,地下铁道,9.1,[美]科尔森·怀特黑德（ColsonWhitehead）/世纪文景/上海人民出版社/2017-3,美国国家图书奖获奖作品，讲述少女科拉无家可归，受到欺辱和强暴，搭乘秘密的地下铁道一路向北、投奔自由的传奇故事。
 4 | https://img3.doubanio.com/mpic/s29357091.jpg,驻马店伤心故事集,8.9,郑在欢/上海文艺出版社/2017-2,郑在欢的中国版《小城畸人》，分为「病人列传」和「cult家族」两部分，用魔幻与残酷印刻出这个世界存在的现象和本质。
 5 | https://img3.doubanio.com/mpic/s29390802.jpg,草原动物园,9.2,马伯庸/中信出版社/2017-3,一位传教士带着「半个动物园」勇闯蒙古草原的奇幻故事，马伯庸历史奇幻新作，「马亲王」新历史主义写作的进阶之路。
 6 | https://img3.doubanio.com/mpic/s29365601.jpg,遇见野兔的那一年,8.3,[芬]阿托·帕西林纳/中信出版集团/2017-3-1,芬兰国宝级作家阿托·帕西林纳里程碑式作品，一个关于追求自由、逃离都市的生命转折故事。
 7 | https://img3.doubanio.com/mpic/s29247833.jpg,被占的宅子,9.2,[阿根廷]胡利奥·科塔萨尔/南海出版公司/2017-3,阿根廷小说大师胡利奥·科塔萨尔短篇小说全集第一部，收录《彼岸》《动物寓言集》《游戏的终结》三部短篇集。
 8 | https://img3.doubanio.com/mpic/s29261915.jpg,原谅石,8.7,[美]洛里·斯皮尔曼/九州出版社/2017-2,一个女人回望过去，找到二十年前的自己。德国年度十大畅销书排行榜作品，一个关于勇气与和解的故事。
 9 | https://img1.doubanio.com/mpic/s29335378.jpg,公牛山,8.8,[美]布赖恩·帕诺威奇/上海译文出版社/2017-2,一部绵延三代人、横跨数十年的黑道世家史诗，国际惊险小说家协会2016年度首作奖获奖作品。
10 | https://img1.doubanio.com/mpic/s29360928.jpg,寻找时间的人,8.7,[爱尔兰]凯特·汤普森/江苏凤凰文艺出版社/2017-3,爱尔兰国家文学奖得主凯特·汤普森作品，一部关于成长、亲情，以及不断消失的传统的颂歌。
11 | https://img5.doubanio.com/mpic/s29245396.jpg,带艾伯特回家,8.8,[美]霍默·希卡姆/未读·北京联合出版公司/2017-3,美国畅销书作家霍默·希卡姆的治愈系小说，一只鳄鱼充满惊喜的公路之旅，一个关于爱与选择的真实故事。
12 | https://img3.doubanio.com/mpic/s29369960.jpg,眼泪的化学,评价人数不足,[澳]彼得·凯里/上海译文出版社/2017-2,当代澳大利亚文学领军人物彼得·凯里作品，一部以博物馆为背景，兼具寓言与科幻特征的小说。
13 | https://img1.doubanio.com/mpic/s29362779.jpg,风雪追击,8.8,东野圭吾/现代出版社/2017-4,日本推理小说大师东野圭吾最新长篇小说，讲述一个突然被指认为杀人凶手的男孩，为了洗清嫌疑而进行自救的故事。
14 | https://img1.doubanio.com/mpic/s29361219.jpg,生火,8.7,[法]克里斯多夫·夏布特（ChristopheChabouté）编绘/后浪丨北京联合出版公司/2017-3,改编自杰克·伦敦描写极北严寒生活的同名小说，《灯塔》作者克里斯托夫·夏布特以漫画形式，再现自然主义文学经典。
15 | https://img3.doubanio.com/mpic/s29381992.jpg,有匪2：离恨楼,9.5,Priest/湖南文艺出版社/2017-3,晋江网大神级作家Priest古言小说扛鼎之作，《有匪》系列第二部，新增独家番外内容。
16 | https://img3.doubanio.com/mpic/s29348545.jpg,妹妹的坟墓,7.9,[美]罗伯特·杜格尼(RobertDugoni)/悦读名品|化学工业出版社/2017-3-1,《纽约时报》畅销作家罗伯特·杜格尼力作，妹妹的失踪之谜，牵引出一场饱含亲情的追凶之旅。
17 | https://img1.doubanio.com/mpic/s29372029.jpg,消失的星国,评价人数不足,墨熊/浦睿文化·湖南科学技术出版社/2017-3,华语科幻星云奖得主墨熊最新科幻力作，一个发生在时空边境线的故事，一场生死未卜的探险。
18 | https://img5.doubanio.com/mpic/s29383586.jpg,宛如昨日,7.2,蔡骏/湖南文艺出版社/2017-4,悬疑畅销作家蔡骏长篇游戏幻想推理小说，在数字游戏中触碰记忆深处的罪恶，揭开尘封已久的真相。
19 | https://img3.doubanio.com/mpic/s29379993.jpg,蝙蝠侠：黑与白1,8.9,[美]丹尼斯·奥尼尔等/世界图书出版公司/2017-3,黑暗骑士侦探的硬汉冒险短篇集，以完全的黑白基调呈现蝙蝠侠世界观。IGN评选为25部史上最佳蝙蝠侠漫画之一。
20 | https://img1.doubanio.com/mpic/s29376117.jpg,终极X战警2,评价人数不足,[英]马克·米勒/[美]亚当·库伯特/世界图书出版公司/2017-3-15,击败万磁王后，X战警即将面临新的挑战，伴随着来自过去的阴影，X战警能否化险为夷？
21 | https://img3.doubanio.com/mpic/s29407650.jpg,希腊棺材之谜,9.9,[美]埃勒里·奎因/新星出版社/2017-3,古董商留下了巨额遗产，葬礼后遗嘱却不翼而飞……推理小说大师埃勒里·奎因经典之作。
22 | https://img3.doubanio.com/mpic/s29372251.jpg,午夜起来听寂静,9.1,周云蓬/北京十月文艺出版社/2017-3,著名民谣歌手、诗人周云蓬新作，精选收入1999年至2016年间的诗作，完整呈现周云蓬的心灵成长轨迹。
23 | https://img3.doubanio.com/mpic/s29339735.jpg,全栈市场人,8.0,Lydia/人民邮电出版社/2017-2-1,知乎专栏作家Lydia倾心分享互联网运营精华，教你如何做一名互联网产品宣传项目操盘手。
24 | https://img1.doubanio.com/mpic/s29402558.jpg,致薇拉,评价人数不足,[美]弗拉基米尔·纳博科夫/人民文学出版社/2017-3,纳博科夫写给妻子薇拉的书信集，持续半个多世纪的婚姻中，纳博科夫写给薇拉的书信也从1932年的相识伴随到了最后。
25 | https://img3.doubanio.com/mpic/s29234990.jpg,达芬奇幽灵,9.3,[美]托比·莱斯特（TobyLester）/中信出版集团/楚尘文化/2017-3-10,作者托比·莱斯特以达芬奇名画《维特鲁威人》为钥匙，用侦探小说般的笔法打开文艺复兴之门，捕捉西方思想史上的关键时刻。
26 | https://img3.doubanio.com/mpic/s29385511.jpg,文明之光（第四册）,8.6,吴军/人民邮电出版社/2017-3-1,计算机科学家吴军博士《文明之光》系列的第四卷，以崭新视角全面展现人类文明史中那些绚烂多彩的璀璨文明。
27 | https://img3.doubanio.com/mpic/s29385675.jpg,青年斯大林,评价人数不足,[英]西蒙·蒙蒂菲奥里/浦睿文化·民主建设出版社/2017-3,《耶路撒冷三千年》的作者蒙蒂菲奥里历时10年，重述斯大林如何从鞋匠的儿子，最终成为列宁的左右手这一鲜为人知的成长历程。
28 | https://img5.doubanio.com/mpic/s29359446.jpg,食帖15：便当灵感集,9.1,林江/中信出版集团股份有限公司/2017-2,《食帖》系列第15辑《便当灵感集》，20位便当生活家经验分享，一本每日便当生活必备灵感全书。
29 | https://img3.doubanio.com/mpic/s29376053.jpg,中国1945,评价人数不足,[美]理查德·伯恩斯坦(RichardBernstein)/社会科学文献出版社/2017-3-1,作者理查德·伯恩斯坦用引人入胜的笔调，描述了1945年这个美国与中国关系的转折之年，挑战了现代中美关系起源的传统观点。
30 | https://img3.doubanio.com/mpic/s29370575.jpg,极简进步史,评价人数不足,[英]罗纳德·赖特/北京时代华文书局/2017-4-1,英国非虚构类创作大师罗纳德·赖特经典作品，对人类社会、文明、历史、科技、环境等多方面展开反思。
31 | https://img3.doubanio.com/mpic/s29287315.jpg,贩卖音乐,9.4,[美]大卫·伊斯曼/世界图书出版公司/2017-3-1,作者大卫·伊斯曼用通俗易懂的文字，讲述了美国镀金时代与进步时代音乐产业的起源、发展和变革等历史。
32 | https://img3.doubanio.com/mpic/s29398110.jpg,信仰与观看,评价人数不足,[法]罗兰·雷希特(RolandRecht)/北京大学出版社/2017-2-17,法国著名艺术史家罗兰·雷希特就哥特式大教堂艺术提出全新定义，为理解大教堂空间打开了崭新的维度。
33 | https://img3.doubanio.com/mpic/s29343324.jpg,庇护二世闻见录,评价人数不足,[意]皮科洛米尼/浙江大学出版社·启真馆/2017-2,《闻见录》是文艺复兴时期最伟大的教皇之一庇护二世（皮科洛米尼）的杰作，生动描写了一段典型的文艺复兴时期的经历。
34 | https://img3.doubanio.com/mpic/s29383791.jpg,何故为敌,评价人数不足,[德]李峻石（GüntherSchlee）/社会科学文献出版社/2017-3,德国著名民族学家李峻石从民族学研究中的身份认同问题出发，以欧洲、非洲的个案作为切入点，建构新的冲突理论。
35 | https://img3.doubanio.com/mpic/s29331664.jpg,共享经济没有告诉你的事,8.2,[加]汤姆·斯利（TomSlee）/后浪丨江西人民出版社/2017-3,作者汤姆·斯利从与众不同的角度，揭开共享经济鲜为人知的阴暗面，起底共享经济不为人知的行业内幕。
36 | https://img3.doubanio.com/mpic/s29404793.jpg,私人生活的变革,评价人数不足,阎云翔/世纪文景/上海人民出版社/2017-3,美国亚洲学会中国研究列文森图书大奖获奖作品，探讨了农民家庭生活中的个体性与情感生活这一从未被讨论过的议题。
37 | https://img1.doubanio.com/mpic/s29244537.jpg,石挥谈艺录：把生命交给舞台,8.4,石挥/后浪丨北京联合出版公司/2017-2,「话剧皇帝」石挥迄今为止最完整的著述辑录，从侧面完整呈现了中国话剧的历史演进。
38 | https://img1.doubanio.com/mpic/s29189179.jpg,托克维尔,9.1,[法]吕西安·若姆（LucienJaume）/三辉图书/漓江出版社/2017-2,围绕「托克维尔为什么写作《论美国的民主》」这一问题，剖析了托克维尔作为社会学家、道德家以及文学家的不同侧面。
39 | https://img3.doubanio.com/mpic/s29402102.jpg,几乎消失的偷闲艺术,评价人数不足,[加拿大]达尼·拉费里埃/海天出版社/2017-4,加拿大作家达尼·拉费里埃关于慢生活的一本美文集。几乎被人忘却的偷闲艺术，就是生活的艺术。
40 | https://img3.doubanio.com/mpic/s29373842.jpg,青苔不会消失,9.4,袁凌/中信出版集团/中信大方/2017-4,精选了袁凌十多年来记者生涯中最为杰出的十二篇非虚构作品，写出了一百位中国社会底层的人物故事。
41 | https://img1.doubanio.com/mpic/s29364988.jpg,鲍勃·迪伦：诗人之歌,9.2,[法]让-多米尼克·布里埃/湖南文艺出版社·读行者品牌/2017-4,基于鲍勃·迪伦从1962年到2015年的作品及访谈，作者让-多米尼克·布里埃重新勾勒了鲍勃·迪伦的传奇一生。
42 | 


--------------------------------------------------------------------------------
/Ch2Data/Clean/RealClean/result.csv:
--------------------------------------------------------------------------------
 1 | img_urls,titles,ratings,authors,details
 2 | https://img1.doubanio.com/mpic/s29343377.jpg,散步去,8.9,[日]谷口治郎/后浪丨北京联合出版公司/2017-3,享誉欧美的日本漫画大师谷口治郎的代表作，汇集了18篇散步的故事，以电影镜头般的画笔描绘了日常的散步之乐和生活之美。
 3 | https://img1.doubanio.com/mpic/s29385647.jpg,地下铁道,9.1,[美]科尔森·怀特黑德（ColsonWhitehead）/世纪文景/上海人民出版社/2017-3,美国国家图书奖获奖作品，讲述少女科拉无家可归，受到欺辱和强暴，搭乘秘密的地下铁道一路向北、投奔自由的传奇故事。
 4 | https://img3.doubanio.com/mpic/s29357091.jpg,驻马店伤心故事集,8.9,郑在欢/上海文艺出版社/2017-2,郑在欢的中国版《小城畸人》，分为「病人列传」和「cult家族」两部分，用魔幻与残酷印刻出这个世界存在的现象和本质。
 5 | https://img3.doubanio.com/mpic/s29390802.jpg,草原动物园,9.2,马伯庸/中信出版社/2017-3,一位传教士带着「半个动物园」勇闯蒙古草原的奇幻故事，马伯庸历史奇幻新作，「马亲王」新历史主义写作的进阶之路。
 6 | https://img3.doubanio.com/mpic/s29365601.jpg,遇见野兔的那一年,8.3,[芬]阿托·帕西林纳/中信出版集团/2017-3-1,芬兰国宝级作家阿托·帕西林纳里程碑式作品，一个关于追求自由、逃离都市的生命转折故事。
 7 | https://img3.doubanio.com/mpic/s29247833.jpg,被占的宅子,9.2,[阿根廷]胡利奥·科塔萨尔/南海出版公司/2017-3,阿根廷小说大师胡利奥·科塔萨尔短篇小说全集第一部，收录《彼岸》《动物寓言集》《游戏的终结》三部短篇集。
 8 | https://img3.doubanio.com/mpic/s29261915.jpg,原谅石,8.7,[美]洛里·斯皮尔曼/九州出版社/2017-2,一个女人回望过去，找到二十年前的自己。德国年度十大畅销书排行榜作品，一个关于勇气与和解的故事。
 9 | https://img1.doubanio.com/mpic/s29335378.jpg,公牛山,8.8,[美]布赖恩·帕诺威奇/上海译文出版社/2017-2,一部绵延三代人、横跨数十年的黑道世家史诗，国际惊险小说家协会2016年度首作奖获奖作品。
10 | https://img1.doubanio.com/mpic/s29360928.jpg,寻找时间的人,8.7,[爱尔兰]凯特·汤普森/江苏凤凰文艺出版社/2017-3,爱尔兰国家文学奖得主凯特·汤普森作品，一部关于成长、亲情，以及不断消失的传统的颂歌。
11 | https://img5.doubanio.com/mpic/s29245396.jpg,带艾伯特回家,8.8,[美]霍默·希卡姆/未读·北京联合出版公司/2017-3,美国畅销书作家霍默·希卡姆的治愈系小说，一只鳄鱼充满惊喜的公路之旅，一个关于爱与选择的真实故事。
12 | https://img3.doubanio.com/mpic/s29369960.jpg,眼泪的化学,评价人数不足,[澳]彼得·凯里/上海译文出版社/2017-2,当代澳大利亚文学领军人物彼得·凯里作品，一部以博物馆为背景，兼具寓言与科幻特征的小说。
13 | https://img1.doubanio.com/mpic/s29362779.jpg,风雪追击,8.8,东野圭吾/现代出版社/2017-4,日本推理小说大师东野圭吾最新长篇小说，讲述一个突然被指认为杀人凶手的男孩，为了洗清嫌疑而进行自救的故事。
14 | https://img1.doubanio.com/mpic/s29361219.jpg,生火,8.7,[法]克里斯多夫·夏布特（ChristopheChabouté）编绘/后浪丨北京联合出版公司/2017-3,改编自杰克·伦敦描写极北严寒生活的同名小说，《灯塔》作者克里斯托夫·夏布特以漫画形式，再现自然主义文学经典。
15 | https://img3.doubanio.com/mpic/s29381992.jpg,有匪2：离恨楼,9.5,Priest/湖南文艺出版社/2017-3,晋江网大神级作家Priest古言小说扛鼎之作，《有匪》系列第二部，新增独家番外内容。
16 | https://img3.doubanio.com/mpic/s29348545.jpg,妹妹的坟墓,7.9,[美]罗伯特·杜格尼(RobertDugoni)/悦读名品|化学工业出版社/2017-3-1,《纽约时报》畅销作家罗伯特·杜格尼力作，妹妹的失踪之谜，牵引出一场饱含亲情的追凶之旅。
17 | https://img1.doubanio.com/mpic/s29372029.jpg,消失的星国,评价人数不足,墨熊/浦睿文化·湖南科学技术出版社/2017-3,华语科幻星云奖得主墨熊最新科幻力作，一个发生在时空边境线的故事，一场生死未卜的探险。
18 | https://img5.doubanio.com/mpic/s29383586.jpg,宛如昨日,7.2,蔡骏/湖南文艺出版社/2017-4,悬疑畅销作家蔡骏长篇游戏幻想推理小说，在数字游戏中触碰记忆深处的罪恶，揭开尘封已久的真相。
19 | https://img3.doubanio.com/mpic/s29379993.jpg,蝙蝠侠：黑与白1,8.9,[美]丹尼斯·奥尼尔等/世界图书出版公司/2017-3,黑暗骑士侦探的硬汉冒险短篇集，以完全的黑白基调呈现蝙蝠侠世界观。IGN评选为25部史上最佳蝙蝠侠漫画之一。
20 | https://img1.doubanio.com/mpic/s29376117.jpg,终极X战警2,评价人数不足,[英]马克·米勒/[美]亚当·库伯特/世界图书出版公司/2017-3-15,击败万磁王后，X战警即将面临新的挑战，伴随着来自过去的阴影，X战警能否化险为夷？
21 | https://img3.doubanio.com/mpic/s29407650.jpg,希腊棺材之谜,9.9,[美]埃勒里·奎因/新星出版社/2017-3,古董商留下了巨额遗产，葬礼后遗嘱却不翼而飞……推理小说大师埃勒里·奎因经典之作。
22 | https://img3.doubanio.com/mpic/s29372251.jpg,午夜起来听寂静,9.1,周云蓬/北京十月文艺出版社/2017-3,著名民谣歌手、诗人周云蓬新作，精选收入1999年至2016年间的诗作，完整呈现周云蓬的心灵成长轨迹。
23 | https://img3.doubanio.com/mpic/s29339735.jpg,全栈市场人,8.0,Lydia/人民邮电出版社/2017-2-1,知乎专栏作家Lydia倾心分享互联网运营精华，教你如何做一名互联网产品宣传项目操盘手。
24 | https://img1.doubanio.com/mpic/s29402558.jpg,致薇拉,评价人数不足,[美]弗拉基米尔·纳博科夫/人民文学出版社/2017-3,纳博科夫写给妻子薇拉的书信集，持续半个多世纪的婚姻中，纳博科夫写给薇拉的书信也从1932年的相识伴随到了最后。
25 | https://img3.doubanio.com/mpic/s29234990.jpg,达芬奇幽灵,9.3,[美]托比·莱斯特（TobyLester）/中信出版集团/楚尘文化/2017-3-10,作者托比·莱斯特以达芬奇名画《维特鲁威人》为钥匙，用侦探小说般的笔法打开文艺复兴之门，捕捉西方思想史上的关键时刻。
26 | https://img3.doubanio.com/mpic/s29385511.jpg,文明之光（第四册）,8.6,吴军/人民邮电出版社/2017-3-1,计算机科学家吴军博士《文明之光》系列的第四卷，以崭新视角全面展现人类文明史中那些绚烂多彩的璀璨文明。
27 | https://img3.doubanio.com/mpic/s29385675.jpg,青年斯大林,评价人数不足,[英]西蒙·蒙蒂菲奥里/浦睿文化·民主建设出版社/2017-3,《耶路撒冷三千年》的作者蒙蒂菲奥里历时10年，重述斯大林如何从鞋匠的儿子，最终成为列宁的左右手这一鲜为人知的成长历程。
28 | https://img5.doubanio.com/mpic/s29359446.jpg,食帖15：便当灵感集,9.1,林江/中信出版集团股份有限公司/2017-2,《食帖》系列第15辑《便当灵感集》，20位便当生活家经验分享，一本每日便当生活必备灵感全书。
29 | https://img3.doubanio.com/mpic/s29376053.jpg,中国1945,评价人数不足,[美]理查德·伯恩斯坦(RichardBernstein)/社会科学文献出版社/2017-3-1,作者理查德·伯恩斯坦用引人入胜的笔调，描述了1945年这个美国与中国关系的转折之年，挑战了现代中美关系起源的传统观点。
30 | https://img3.doubanio.com/mpic/s29370575.jpg,极简进步史,评价人数不足,[英]罗纳德·赖特/北京时代华文书局/2017-4-1,英国非虚构类创作大师罗纳德·赖特经典作品，对人类社会、文明、历史、科技、环境等多方面展开反思。
31 | https://img3.doubanio.com/mpic/s29287315.jpg,贩卖音乐,9.4,[美]大卫·伊斯曼/世界图书出版公司/2017-3-1,作者大卫·伊斯曼用通俗易懂的文字，讲述了美国镀金时代与进步时代音乐产业的起源、发展和变革等历史。
32 | https://img3.doubanio.com/mpic/s29398110.jpg,信仰与观看,评价人数不足,[法]罗兰·雷希特(RolandRecht)/北京大学出版社/2017-2-17,法国著名艺术史家罗兰·雷希特就哥特式大教堂艺术提出全新定义，为理解大教堂空间打开了崭新的维度。
33 | https://img3.doubanio.com/mpic/s29343324.jpg,庇护二世闻见录,评价人数不足,[意]皮科洛米尼/浙江大学出版社·启真馆/2017-2,《闻见录》是文艺复兴时期最伟大的教皇之一庇护二世（皮科洛米尼）的杰作，生动描写了一段典型的文艺复兴时期的经历。
34 | https://img3.doubanio.com/mpic/s29383791.jpg,何故为敌,评价人数不足,[德]李峻石（GüntherSchlee）/社会科学文献出版社/2017-3,德国著名民族学家李峻石从民族学研究中的身份认同问题出发，以欧洲、非洲的个案作为切入点，建构新的冲突理论。
35 | https://img3.doubanio.com/mpic/s29331664.jpg,共享经济没有告诉你的事,8.2,[加]汤姆·斯利（TomSlee）/后浪丨江西人民出版社/2017-3,作者汤姆·斯利从与众不同的角度，揭开共享经济鲜为人知的阴暗面，起底共享经济不为人知的行业内幕。
36 | https://img3.doubanio.com/mpic/s29404793.jpg,私人生活的变革,评价人数不足,阎云翔/世纪文景/上海人民出版社/2017-3,美国亚洲学会中国研究列文森图书大奖获奖作品，探讨了农民家庭生活中的个体性与情感生活这一从未被讨论过的议题。
37 | https://img1.doubanio.com/mpic/s29244537.jpg,石挥谈艺录：把生命交给舞台,8.4,石挥/后浪丨北京联合出版公司/2017-2,「话剧皇帝」石挥迄今为止最完整的著述辑录，从侧面完整呈现了中国话剧的历史演进。
38 | https://img1.doubanio.com/mpic/s29189179.jpg,托克维尔,9.1,[法]吕西安·若姆（LucienJaume）/三辉图书/漓江出版社/2017-2,围绕「托克维尔为什么写作《论美国的民主》」这一问题，剖析了托克维尔作为社会学家、道德家以及文学家的不同侧面。
39 | https://img3.doubanio.com/mpic/s29402102.jpg,几乎消失的偷闲艺术,评价人数不足,[加拿大]达尼·拉费里埃/海天出版社/2017-4,加拿大作家达尼·拉费里埃关于慢生活的一本美文集。几乎被人忘却的偷闲艺术，就是生活的艺术。
40 | https://img3.doubanio.com/mpic/s29373842.jpg,青苔不会消失,9.4,袁凌/中信出版集团/中信大方/2017-4,精选了袁凌十多年来记者生涯中最为杰出的十二篇非虚构作品，写出了一百位中国社会底层的人物故事。
41 | https://img1.doubanio.com/mpic/s29364988.jpg,鲍勃·迪伦：诗人之歌,9.2,[法]让-多米尼克·布里埃/湖南文艺出版社·读行者品牌/2017-4,基于鲍勃·迪伦从1962年到2015年的作品及访谈，作者让-多米尼克·布里埃重新勾勒了鲍勃·迪伦的传奇一生。
42 | 


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mpymysql/result.csv:
--------------------------------------------------------------------------------
 1 | img_urls,titles,ratings,authors,details
 2 | https://img1.doubanio.com/mpic/s29343377.jpg,散步去,8.9,[日]谷口治郎/后浪丨北京联合出版公司/2017-3,享誉欧美的日本漫画大师谷口治郎的代表作，汇集了18篇散步的故事，以电影镜头般的画笔描绘了日常的散步之乐和生活之美。
 3 | https://img1.doubanio.com/mpic/s29385647.jpg,地下铁道,9.1,[美]科尔森·怀特黑德（ColsonWhitehead）/世纪文景/上海人民出版社/2017-3,美国国家图书奖获奖作品，讲述少女科拉无家可归，受到欺辱和强暴，搭乘秘密的地下铁道一路向北、投奔自由的传奇故事。
 4 | https://img3.doubanio.com/mpic/s29357091.jpg,驻马店伤心故事集,8.9,郑在欢/上海文艺出版社/2017-2,郑在欢的中国版《小城畸人》，分为「病人列传」和「cult家族」两部分，用魔幻与残酷印刻出这个世界存在的现象和本质。
 5 | https://img3.doubanio.com/mpic/s29390802.jpg,草原动物园,9.2,马伯庸/中信出版社/2017-3,一位传教士带着「半个动物园」勇闯蒙古草原的奇幻故事，马伯庸历史奇幻新作，「马亲王」新历史主义写作的进阶之路。
 6 | https://img3.doubanio.com/mpic/s29365601.jpg,遇见野兔的那一年,8.3,[芬]阿托·帕西林纳/中信出版集团/2017-3-1,芬兰国宝级作家阿托·帕西林纳里程碑式作品，一个关于追求自由、逃离都市的生命转折故事。
 7 | https://img3.doubanio.com/mpic/s29247833.jpg,被占的宅子,9.2,[阿根廷]胡利奥·科塔萨尔/南海出版公司/2017-3,阿根廷小说大师胡利奥·科塔萨尔短篇小说全集第一部，收录《彼岸》《动物寓言集》《游戏的终结》三部短篇集。
 8 | https://img3.doubanio.com/mpic/s29261915.jpg,原谅石,8.7,[美]洛里·斯皮尔曼/九州出版社/2017-2,一个女人回望过去，找到二十年前的自己。德国年度十大畅销书排行榜作品，一个关于勇气与和解的故事。
 9 | https://img1.doubanio.com/mpic/s29335378.jpg,公牛山,8.8,[美]布赖恩·帕诺威奇/上海译文出版社/2017-2,一部绵延三代人、横跨数十年的黑道世家史诗，国际惊险小说家协会2016年度首作奖获奖作品。
10 | https://img1.doubanio.com/mpic/s29360928.jpg,寻找时间的人,8.7,[爱尔兰]凯特·汤普森/江苏凤凰文艺出版社/2017-3,爱尔兰国家文学奖得主凯特·汤普森作品，一部关于成长、亲情，以及不断消失的传统的颂歌。
11 | https://img5.doubanio.com/mpic/s29245396.jpg,带艾伯特回家,8.8,[美]霍默·希卡姆/未读·北京联合出版公司/2017-3,美国畅销书作家霍默·希卡姆的治愈系小说，一只鳄鱼充满惊喜的公路之旅，一个关于爱与选择的真实故事。
12 | https://img3.doubanio.com/mpic/s29369960.jpg,眼泪的化学,评价人数不足,[澳]彼得·凯里/上海译文出版社/2017-2,当代澳大利亚文学领军人物彼得·凯里作品，一部以博物馆为背景，兼具寓言与科幻特征的小说。
13 | https://img1.doubanio.com/mpic/s29362779.jpg,风雪追击,8.8,东野圭吾/现代出版社/2017-4,日本推理小说大师东野圭吾最新长篇小说，讲述一个突然被指认为杀人凶手的男孩，为了洗清嫌疑而进行自救的故事。
14 | https://img1.doubanio.com/mpic/s29361219.jpg,生火,8.7,[法]克里斯多夫·夏布特（ChristopheChabouté）编绘/后浪丨北京联合出版公司/2017-3,改编自杰克·伦敦描写极北严寒生活的同名小说，《灯塔》作者克里斯托夫·夏布特以漫画形式，再现自然主义文学经典。
15 | https://img3.doubanio.com/mpic/s29381992.jpg,有匪2：离恨楼,9.5,Priest/湖南文艺出版社/2017-3,晋江网大神级作家Priest古言小说扛鼎之作，《有匪》系列第二部，新增独家番外内容。
16 | https://img3.doubanio.com/mpic/s29348545.jpg,妹妹的坟墓,7.9,[美]罗伯特·杜格尼(RobertDugoni)/悦读名品|化学工业出版社/2017-3-1,《纽约时报》畅销作家罗伯特·杜格尼力作，妹妹的失踪之谜，牵引出一场饱含亲情的追凶之旅。
17 | https://img1.doubanio.com/mpic/s29372029.jpg,消失的星国,评价人数不足,墨熊/浦睿文化·湖南科学技术出版社/2017-3,华语科幻星云奖得主墨熊最新科幻力作，一个发生在时空边境线的故事，一场生死未卜的探险。
18 | https://img5.doubanio.com/mpic/s29383586.jpg,宛如昨日,7.2,蔡骏/湖南文艺出版社/2017-4,悬疑畅销作家蔡骏长篇游戏幻想推理小说，在数字游戏中触碰记忆深处的罪恶，揭开尘封已久的真相。
19 | https://img3.doubanio.com/mpic/s29379993.jpg,蝙蝠侠：黑与白1,8.9,[美]丹尼斯·奥尼尔等/世界图书出版公司/2017-3,黑暗骑士侦探的硬汉冒险短篇集，以完全的黑白基调呈现蝙蝠侠世界观。IGN评选为25部史上最佳蝙蝠侠漫画之一。
20 | https://img1.doubanio.com/mpic/s29376117.jpg,终极X战警2,评价人数不足,[英]马克·米勒/[美]亚当·库伯特/世界图书出版公司/2017-3-15,击败万磁王后，X战警即将面临新的挑战，伴随着来自过去的阴影，X战警能否化险为夷？
21 | https://img3.doubanio.com/mpic/s29407650.jpg,希腊棺材之谜,9.9,[美]埃勒里·奎因/新星出版社/2017-3,古董商留下了巨额遗产，葬礼后遗嘱却不翼而飞……推理小说大师埃勒里·奎因经典之作。
22 | https://img3.doubanio.com/mpic/s29372251.jpg,午夜起来听寂静,9.1,周云蓬/北京十月文艺出版社/2017-3,著名民谣歌手、诗人周云蓬新作，精选收入1999年至2016年间的诗作，完整呈现周云蓬的心灵成长轨迹。
23 | https://img3.doubanio.com/mpic/s29339735.jpg,全栈市场人,8.0,Lydia/人民邮电出版社/2017-2-1,知乎专栏作家Lydia倾心分享互联网运营精华，教你如何做一名互联网产品宣传项目操盘手。
24 | https://img1.doubanio.com/mpic/s29402558.jpg,致薇拉,评价人数不足,[美]弗拉基米尔·纳博科夫/人民文学出版社/2017-3,纳博科夫写给妻子薇拉的书信集，持续半个多世纪的婚姻中，纳博科夫写给薇拉的书信也从1932年的相识伴随到了最后。
25 | https://img3.doubanio.com/mpic/s29234990.jpg,达芬奇幽灵,9.3,[美]托比·莱斯特（TobyLester）/中信出版集团/楚尘文化/2017-3-10,作者托比·莱斯特以达芬奇名画《维特鲁威人》为钥匙，用侦探小说般的笔法打开文艺复兴之门，捕捉西方思想史上的关键时刻。
26 | https://img3.doubanio.com/mpic/s29385511.jpg,文明之光（第四册）,8.6,吴军/人民邮电出版社/2017-3-1,计算机科学家吴军博士《文明之光》系列的第四卷，以崭新视角全面展现人类文明史中那些绚烂多彩的璀璨文明。
27 | https://img3.doubanio.com/mpic/s29385675.jpg,青年斯大林,评价人数不足,[英]西蒙·蒙蒂菲奥里/浦睿文化·民主建设出版社/2017-3,《耶路撒冷三千年》的作者蒙蒂菲奥里历时10年，重述斯大林如何从鞋匠的儿子，最终成为列宁的左右手这一鲜为人知的成长历程。
28 | https://img5.doubanio.com/mpic/s29359446.jpg,食帖15：便当灵感集,9.1,林江/中信出版集团股份有限公司/2017-2,《食帖》系列第15辑《便当灵感集》，20位便当生活家经验分享，一本每日便当生活必备灵感全书。
29 | https://img3.doubanio.com/mpic/s29376053.jpg,中国1945,评价人数不足,[美]理查德·伯恩斯坦(RichardBernstein)/社会科学文献出版社/2017-3-1,作者理查德·伯恩斯坦用引人入胜的笔调，描述了1945年这个美国与中国关系的转折之年，挑战了现代中美关系起源的传统观点。
30 | https://img3.doubanio.com/mpic/s29370575.jpg,极简进步史,评价人数不足,[英]罗纳德·赖特/北京时代华文书局/2017-4-1,英国非虚构类创作大师罗纳德·赖特经典作品，对人类社会、文明、历史、科技、环境等多方面展开反思。
31 | https://img3.doubanio.com/mpic/s29287315.jpg,贩卖音乐,9.4,[美]大卫·伊斯曼/世界图书出版公司/2017-3-1,作者大卫·伊斯曼用通俗易懂的文字，讲述了美国镀金时代与进步时代音乐产业的起源、发展和变革等历史。
32 | https://img3.doubanio.com/mpic/s29398110.jpg,信仰与观看,评价人数不足,[法]罗兰·雷希特(RolandRecht)/北京大学出版社/2017-2-17,法国著名艺术史家罗兰·雷希特就哥特式大教堂艺术提出全新定义，为理解大教堂空间打开了崭新的维度。
33 | https://img3.doubanio.com/mpic/s29343324.jpg,庇护二世闻见录,评价人数不足,[意]皮科洛米尼/浙江大学出版社·启真馆/2017-2,《闻见录》是文艺复兴时期最伟大的教皇之一庇护二世（皮科洛米尼）的杰作，生动描写了一段典型的文艺复兴时期的经历。
34 | https://img3.doubanio.com/mpic/s29383791.jpg,何故为敌,评价人数不足,[德]李峻石（GüntherSchlee）/社会科学文献出版社/2017-3,德国著名民族学家李峻石从民族学研究中的身份认同问题出发，以欧洲、非洲的个案作为切入点，建构新的冲突理论。
35 | https://img3.doubanio.com/mpic/s29331664.jpg,共享经济没有告诉你的事,8.2,[加]汤姆·斯利（TomSlee）/后浪丨江西人民出版社/2017-3,作者汤姆·斯利从与众不同的角度，揭开共享经济鲜为人知的阴暗面，起底共享经济不为人知的行业内幕。
36 | https://img3.doubanio.com/mpic/s29404793.jpg,私人生活的变革,评价人数不足,阎云翔/世纪文景/上海人民出版社/2017-3,美国亚洲学会中国研究列文森图书大奖获奖作品，探讨了农民家庭生活中的个体性与情感生活这一从未被讨论过的议题。
37 | https://img1.doubanio.com/mpic/s29244537.jpg,石挥谈艺录：把生命交给舞台,8.4,石挥/后浪丨北京联合出版公司/2017-2,「话剧皇帝」石挥迄今为止最完整的著述辑录，从侧面完整呈现了中国话剧的历史演进。
38 | https://img1.doubanio.com/mpic/s29189179.jpg,托克维尔,9.1,[法]吕西安·若姆（LucienJaume）/三辉图书/漓江出版社/2017-2,围绕「托克维尔为什么写作《论美国的民主》」这一问题，剖析了托克维尔作为社会学家、道德家以及文学家的不同侧面。
39 | https://img3.doubanio.com/mpic/s29402102.jpg,几乎消失的偷闲艺术,评价人数不足,[加拿大]达尼·拉费里埃/海天出版社/2017-4,加拿大作家达尼·拉费里埃关于慢生活的一本美文集。几乎被人忘却的偷闲艺术，就是生活的艺术。
40 | https://img3.doubanio.com/mpic/s29373842.jpg,青苔不会消失,9.4,袁凌/中信出版集团/中信大方/2017-4,精选了袁凌十多年来记者生涯中最为杰出的十二篇非虚构作品，写出了一百位中国社会底层的人物故事。
41 | https://img1.doubanio.com/mpic/s29364988.jpg,鲍勃·迪伦：诗人之歌,9.2,[法]让-多米尼克·布里埃/湖南文艺出版社·读行者品牌/2017-4,基于鲍勃·迪伦从1962年到2015年的作品及访谈，作者让-多米尼克·布里埃重新勾勒了鲍勃·迪伦的传奇一生。
42 | 


--------------------------------------------------------------------------------
/Ch2Data/DataIO/Mxlsx/Myxlsxdata/result.csv:
--------------------------------------------------------------------------------
 1 | img_urls,titles,ratings,authors,details
 2 | https://img1.doubanio.com/mpic/s29343377.jpg,散步去,8.9,[日]谷口治郎/后浪丨北京联合出版公司/2017-3,享誉欧美的日本漫画大师谷口治郎的代表作，汇集了18篇散步的故事，以电影镜头般的画笔描绘了日常的散步之乐和生活之美。
 3 | https://img1.doubanio.com/mpic/s29385647.jpg,地下铁道,9.1,[美]科尔森·怀特黑德（ColsonWhitehead）/世纪文景/上海人民出版社/2017-3,美国国家图书奖获奖作品，讲述少女科拉无家可归，受到欺辱和强暴，搭乘秘密的地下铁道一路向北、投奔自由的传奇故事。
 4 | https://img3.doubanio.com/mpic/s29357091.jpg,驻马店伤心故事集,8.9,郑在欢/上海文艺出版社/2017-2,郑在欢的中国版《小城畸人》，分为「病人列传」和「cult家族」两部分，用魔幻与残酷印刻出这个世界存在的现象和本质。
 5 | https://img3.doubanio.com/mpic/s29390802.jpg,草原动物园,9.2,马伯庸/中信出版社/2017-3,一位传教士带着「半个动物园」勇闯蒙古草原的奇幻故事，马伯庸历史奇幻新作，「马亲王」新历史主义写作的进阶之路。
 6 | https://img3.doubanio.com/mpic/s29365601.jpg,遇见野兔的那一年,8.3,[芬]阿托·帕西林纳/中信出版集团/2017-3-1,芬兰国宝级作家阿托·帕西林纳里程碑式作品，一个关于追求自由、逃离都市的生命转折故事。
 7 | https://img3.doubanio.com/mpic/s29247833.jpg,被占的宅子,9.2,[阿根廷]胡利奥·科塔萨尔/南海出版公司/2017-3,阿根廷小说大师胡利奥·科塔萨尔短篇小说全集第一部，收录《彼岸》《动物寓言集》《游戏的终结》三部短篇集。
 8 | https://img3.doubanio.com/mpic/s29261915.jpg,原谅石,8.7,[美]洛里·斯皮尔曼/九州出版社/2017-2,一个女人回望过去，找到二十年前的自己。德国年度十大畅销书排行榜作品，一个关于勇气与和解的故事。
 9 | https://img1.doubanio.com/mpic/s29335378.jpg,公牛山,8.8,[美]布赖恩·帕诺威奇/上海译文出版社/2017-2,一部绵延三代人、横跨数十年的黑道世家史诗，国际惊险小说家协会2016年度首作奖获奖作品。
10 | https://img1.doubanio.com/mpic/s29360928.jpg,寻找时间的人,8.7,[爱尔兰]凯特·汤普森/江苏凤凰文艺出版社/2017-3,爱尔兰国家文学奖得主凯特·汤普森作品，一部关于成长、亲情，以及不断消失的传统的颂歌。
11 | https://img5.doubanio.com/mpic/s29245396.jpg,带艾伯特回家,8.8,[美]霍默·希卡姆/未读·北京联合出版公司/2017-3,美国畅销书作家霍默·希卡姆的治愈系小说，一只鳄鱼充满惊喜的公路之旅，一个关于爱与选择的真实故事。
12 | https://img3.doubanio.com/mpic/s29369960.jpg,眼泪的化学,评价人数不足,[澳]彼得·凯里/上海译文出版社/2017-2,当代澳大利亚文学领军人物彼得·凯里作品，一部以博物馆为背景，兼具寓言与科幻特征的小说。
13 | https://img1.doubanio.com/mpic/s29362779.jpg,风雪追击,8.8,东野圭吾/现代出版社/2017-4,日本推理小说大师东野圭吾最新长篇小说，讲述一个突然被指认为杀人凶手的男孩，为了洗清嫌疑而进行自救的故事。
14 | https://img1.doubanio.com/mpic/s29361219.jpg,生火,8.7,[法]克里斯多夫·夏布特（ChristopheChabouté）编绘/后浪丨北京联合出版公司/2017-3,改编自杰克·伦敦描写极北严寒生活的同名小说，《灯塔》作者克里斯托夫·夏布特以漫画形式，再现自然主义文学经典。
15 | https://img3.doubanio.com/mpic/s29381992.jpg,有匪2：离恨楼,9.5,Priest/湖南文艺出版社/2017-3,晋江网大神级作家Priest古言小说扛鼎之作，《有匪》系列第二部，新增独家番外内容。
16 | https://img3.doubanio.com/mpic/s29348545.jpg,妹妹的坟墓,7.9,[美]罗伯特·杜格尼(RobertDugoni)/悦读名品|化学工业出版社/2017-3-1,《纽约时报》畅销作家罗伯特·杜格尼力作，妹妹的失踪之谜，牵引出一场饱含亲情的追凶之旅。
17 | https://img1.doubanio.com/mpic/s29372029.jpg,消失的星国,评价人数不足,墨熊/浦睿文化·湖南科学技术出版社/2017-3,华语科幻星云奖得主墨熊最新科幻力作，一个发生在时空边境线的故事，一场生死未卜的探险。
18 | https://img5.doubanio.com/mpic/s29383586.jpg,宛如昨日,7.2,蔡骏/湖南文艺出版社/2017-4,悬疑畅销作家蔡骏长篇游戏幻想推理小说，在数字游戏中触碰记忆深处的罪恶，揭开尘封已久的真相。
19 | https://img3.doubanio.com/mpic/s29379993.jpg,蝙蝠侠：黑与白1,8.9,[美]丹尼斯·奥尼尔等/世界图书出版公司/2017-3,黑暗骑士侦探的硬汉冒险短篇集，以完全的黑白基调呈现蝙蝠侠世界观。IGN评选为25部史上最佳蝙蝠侠漫画之一。
20 | https://img1.doubanio.com/mpic/s29376117.jpg,终极X战警2,评价人数不足,[英]马克·米勒/[美]亚当·库伯特/世界图书出版公司/2017-3-15,击败万磁王后，X战警即将面临新的挑战，伴随着来自过去的阴影，X战警能否化险为夷？
21 | https://img3.doubanio.com/mpic/s29407650.jpg,希腊棺材之谜,9.9,[美]埃勒里·奎因/新星出版社/2017-3,古董商留下了巨额遗产，葬礼后遗嘱却不翼而飞……推理小说大师埃勒里·奎因经典之作。
22 | https://img3.doubanio.com/mpic/s29372251.jpg,午夜起来听寂静,9.1,周云蓬/北京十月文艺出版社/2017-3,著名民谣歌手、诗人周云蓬新作，精选收入1999年至2016年间的诗作，完整呈现周云蓬的心灵成长轨迹。
23 | https://img3.doubanio.com/mpic/s29339735.jpg,全栈市场人,8.0,Lydia/人民邮电出版社/2017-2-1,知乎专栏作家Lydia倾心分享互联网运营精华，教你如何做一名互联网产品宣传项目操盘手。
24 | https://img1.doubanio.com/mpic/s29402558.jpg,致薇拉,评价人数不足,[美]弗拉基米尔·纳博科夫/人民文学出版社/2017-3,纳博科夫写给妻子薇拉的书信集，持续半个多世纪的婚姻中，纳博科夫写给薇拉的书信也从1932年的相识伴随到了最后。
25 | https://img3.doubanio.com/mpic/s29234990.jpg,达芬奇幽灵,9.3,[美]托比·莱斯特（TobyLester）/中信出版集团/楚尘文化/2017-3-10,作者托比·莱斯特以达芬奇名画《维特鲁威人》为钥匙，用侦探小说般的笔法打开文艺复兴之门，捕捉西方思想史上的关键时刻。
26 | https://img3.doubanio.com/mpic/s29385511.jpg,文明之光（第四册）,8.6,吴军/人民邮电出版社/2017-3-1,计算机科学家吴军博士《文明之光》系列的第四卷，以崭新视角全面展现人类文明史中那些绚烂多彩的璀璨文明。
27 | https://img3.doubanio.com/mpic/s29385675.jpg,青年斯大林,评价人数不足,[英]西蒙·蒙蒂菲奥里/浦睿文化·民主建设出版社/2017-3,《耶路撒冷三千年》的作者蒙蒂菲奥里历时10年，重述斯大林如何从鞋匠的儿子，最终成为列宁的左右手这一鲜为人知的成长历程。
28 | https://img5.doubanio.com/mpic/s29359446.jpg,食帖15：便当灵感集,9.1,林江/中信出版集团股份有限公司/2017-2,《食帖》系列第15辑《便当灵感集》，20位便当生活家经验分享，一本每日便当生活必备灵感全书。
29 | https://img3.doubanio.com/mpic/s29376053.jpg,中国1945,评价人数不足,[美]理查德·伯恩斯坦(RichardBernstein)/社会科学文献出版社/2017-3-1,作者理查德·伯恩斯坦用引人入胜的笔调，描述了1945年这个美国与中国关系的转折之年，挑战了现代中美关系起源的传统观点。
30 | https://img3.doubanio.com/mpic/s29370575.jpg,极简进步史,评价人数不足,[英]罗纳德·赖特/北京时代华文书局/2017-4-1,英国非虚构类创作大师罗纳德·赖特经典作品，对人类社会、文明、历史、科技、环境等多方面展开反思。
31 | https://img3.doubanio.com/mpic/s29287315.jpg,贩卖音乐,9.4,[美]大卫·伊斯曼/世界图书出版公司/2017-3-1,作者大卫·伊斯曼用通俗易懂的文字，讲述了美国镀金时代与进步时代音乐产业的起源、发展和变革等历史。
32 | https://img3.doubanio.com/mpic/s29398110.jpg,信仰与观看,评价人数不足,[法]罗兰·雷希特(RolandRecht)/北京大学出版社/2017-2-17,法国著名艺术史家罗兰·雷希特就哥特式大教堂艺术提出全新定义，为理解大教堂空间打开了崭新的维度。
33 | https://img3.doubanio.com/mpic/s29343324.jpg,庇护二世闻见录,评价人数不足,[意]皮科洛米尼/浙江大学出版社·启真馆/2017-2,《闻见录》是文艺复兴时期最伟大的教皇之一庇护二世（皮科洛米尼）的杰作，生动描写了一段典型的文艺复兴时期的经历。
34 | https://img3.doubanio.com/mpic/s29383791.jpg,何故为敌,评价人数不足,[德]李峻石（GüntherSchlee）/社会科学文献出版社/2017-3,德国著名民族学家李峻石从民族学研究中的身份认同问题出发，以欧洲、非洲的个案作为切入点，建构新的冲突理论。
35 | https://img3.doubanio.com/mpic/s29331664.jpg,共享经济没有告诉你的事,8.2,[加]汤姆·斯利（TomSlee）/后浪丨江西人民出版社/2017-3,作者汤姆·斯利从与众不同的角度，揭开共享经济鲜为人知的阴暗面，起底共享经济不为人知的行业内幕。
36 | https://img3.doubanio.com/mpic/s29404793.jpg,私人生活的变革,评价人数不足,阎云翔/世纪文景/上海人民出版社/2017-3,美国亚洲学会中国研究列文森图书大奖获奖作品，探讨了农民家庭生活中的个体性与情感生活这一从未被讨论过的议题。
37 | https://img1.doubanio.com/mpic/s29244537.jpg,石挥谈艺录：把生命交给舞台,8.4,石挥/后浪丨北京联合出版公司/2017-2,「话剧皇帝」石挥迄今为止最完整的著述辑录，从侧面完整呈现了中国话剧的历史演进。
38 | https://img1.doubanio.com/mpic/s29189179.jpg,托克维尔,9.1,[法]吕西安·若姆（LucienJaume）/三辉图书/漓江出版社/2017-2,围绕「托克维尔为什么写作《论美国的民主》」这一问题，剖析了托克维尔作为社会学家、道德家以及文学家的不同侧面。
39 | https://img3.doubanio.com/mpic/s29402102.jpg,几乎消失的偷闲艺术,评价人数不足,[加拿大]达尼·拉费里埃/海天出版社/2017-4,加拿大作家达尼·拉费里埃关于慢生活的一本美文集。几乎被人忘却的偷闲艺术，就是生活的艺术。
40 | https://img3.doubanio.com/mpic/s29373842.jpg,青苔不会消失,9.4,袁凌/中信出版集团/中信大方/2017-4,精选了袁凌十多年来记者生涯中最为杰出的十二篇非虚构作品，写出了一百位中国社会底层的人物故事。
41 | https://img1.doubanio.com/mpic/s29364988.jpg,鲍勃·迪伦：诗人之歌,9.2,[法]让-多米尼克·布里埃/湖南文艺出版社·读行者品牌/2017-4,基于鲍勃·迪伦从1962年到2015年的作品及访谈，作者让-多米尼克·布里埃重新勾勒了鲍勃·迪伦的传奇一生。
42 | 


--------------------------------------------------------------------------------
/Ch1Spider/captures/yundamadoc.py:
--------------------------------------------------------------------------------
  1 | import http.client, mimetypes, urllib.parse, json, time
  2 | 
  3 | 
  4 | class YDMHttp:
  5 |     apiurl = 'http://api.yundama.com/api.php'
  6 | 
  7 |     username = ''
  8 |     password = ''
  9 |     appid = ''
 10 |     appkey = ''
 11 | 
 12 |     def __init__(self, username, password, appid, appkey):
 13 |         self.username = username
 14 |         self.password = password
 15 |         self.appid = str(appid)
 16 |         self.appkey = appkey
 17 | 
 18 |     def request(self, fields, files=[]):
 19 |         try:
 20 |             response = post_url(self.apiurl, fields, files)
 21 |             response = json.loads(response)
 22 |         except Exception as e:
 23 |             response = None
 24 |         return response
 25 | 
 26 |     def balance(self):
 27 |         data = {'method': 'balance', 'username': self.username, 'password': self.password, 'appid': self.appid,
 28 |                 'appkey': self.appkey}
 29 |         response = self.request(data)
 30 |         if (response):
 31 |             if (response['ret'] and response['ret'] < 0):
 32 |                 return response['ret']
 33 |             else:
 34 |                 return response['balance']
 35 |         else:
 36 |             return -9001
 37 | 
 38 |     def login(self):
 39 |         data = {'method': 'login', 'username': self.username, 'password': self.password, 'appid': self.appid,
 40 |                 'appkey': self.appkey}
 41 |         response = self.request(data)
 42 |         if (response):
 43 |             if (response['ret'] and response['ret'] < 0):
 44 |                 return response['ret']
 45 |             else:
 46 |                 return response['uid']
 47 |         else:
 48 |             return -9001
 49 | 
 50 |     def upload(self, filename, codetype, timeout):
 51 |         data = {'method': 'upload', 'username': self.username, 'password': self.password, 'appid': self.appid,
 52 |                 'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)}
 53 |         file = {'file': filename}
 54 |         response = self.request(data, file)
 55 |         if (response):
 56 |             if (response['ret'] and response['ret'] < 0):
 57 |                 return response['ret']
 58 |             else:
 59 |                 return response['cid']
 60 |         else:
 61 |             return -9001
 62 | 
 63 |     def result(self, cid):
 64 |         data = {'method': 'result', 'username': self.username, 'password': self.password, 'appid': self.appid,
 65 |                 'appkey': self.appkey, 'cid': str(cid)}
 66 |         response = self.request(data)
 67 |         return response and response['text'] or ''
 68 | 
 69 |     def decode(self, filename, codetype, timeout):
 70 |         cid = self.upload(filename, codetype, timeout)
 71 |         if (cid > 0):
 72 |             for i in range(0, timeout):
 73 |                 result = self.result(cid)
 74 |                 if (result != ''):
 75 |                     return cid, result
 76 |                 else:
 77 |                     time.sleep(1)
 78 |             return -3003, ''
 79 |         else:
 80 |             return cid, ''
 81 | 
 82 | 
 83 | ######################################################################
 84 | 
 85 | def post_url(url, fields, files=[]):
 86 |     urlparts = urllib.parse.urlsplit(url)
 87 |     return post_multipart(urlparts[1], urlparts[2], fields, files)
 88 | 
 89 | 
 90 | def post_multipart(host, selector, fields, files):
 91 |     print(host)
 92 |     content_type, body = encode_multipart_formdata(fields, files)
 93 |     h = http.client.HTTP(host)
 94 |     h.putrequest('POST', selector)
 95 |     h.putheader('Host', host)
 96 |     h.putheader('Content-Type', content_type)
 97 |     h.putheader('Content-Length', str(len(body)))
 98 |     h.endheaders()
 99 |     h.send(body)
100 |     errcode, errmsg, headers = h.getreply()
101 |     return h.file.read()
102 | 
103 | 
104 | def encode_multipart_formdata(fields, files=[]):
105 |     BOUNDARY = 'WebKitFormBoundaryJKrptX8yPbuAJLBQ'
106 |     CRLF = '\r\n'
107 |     L = []
108 |     for field in fields:
109 |         key = field
110 |         value = fields[key]
111 |         L.append('--' + BOUNDARY)
112 |         L.append('Content-Disposition: form-data; name="%s"' % key)
113 |         L.append('')
114 |         L.append(value)
115 |     for field in files:
116 |         key = field
117 |         filepath = files[key]
118 |         L.append('--' + BOUNDARY)
119 |         L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filepath))
120 |         L.append('Content-Type: %s' % get_content_type(filepath))
121 |         L.append('')
122 |         L.append(open(filepath, 'rb').read())
123 |     L.append('--' + BOUNDARY + '--')
124 |     L.append('')
125 |     body = CRLF.join(L)
126 |     content_type = 'multipart/form-data; boundary=%s' % BOUNDARY
127 |     return content_type, body
128 | 
129 | 
130 | def get_content_type(filename):
131 |     return mimetypes.guess_type(filename)[0] or 'application/octet-stream'
132 | 
133 | 
134 | ######################################################################
135 | 
136 | 
137 | username = 'xxx'
138 | password = 'xxx'
139 | appid = 1
140 | appkey = 'xxx'
141 | filename = 'douban.jpg'
142 | codetype = 3000
143 | timeout = 60
144 | 
145 | yundama = YDMHttp(username, password, appid, appkey)
146 | 
147 | uid = yundama.login()
148 | print('uid: %s' % uid)
149 | 
150 | balance = yundama.balance()
151 | print('balance: %s' % balance)
152 | 
153 | cid, result = yundama.decode(filename, codetype, timeout)
154 | print('cid: %s, result: %s' % (cid, result))
155 | 
156 | ######################################################################


--------------------------------------------------------------------------------
/Ch3Analysis-Visualization/Visualization/Iris.csv:
--------------------------------------------------------------------------------
  1 | Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
  2 | 1,5.1,3.5,1.4,0.2,Iris-setosa
  3 | 2,4.9,3.0,1.4,0.2,Iris-setosa
  4 | 3,4.7,3.2,1.3,0.2,Iris-setosa
  5 | 4,4.6,3.1,1.5,0.2,Iris-setosa
  6 | 5,5.0,3.6,1.4,0.2,Iris-setosa
  7 | 6,5.4,3.9,1.7,0.4,Iris-setosa
  8 | 7,4.6,3.4,1.4,0.3,Iris-setosa
  9 | 8,5.0,3.4,1.5,0.2,Iris-setosa
 10 | 9,4.4,2.9,1.4,0.2,Iris-setosa
 11 | 10,4.9,3.1,1.5,0.1,Iris-setosa
 12 | 11,5.4,3.7,1.5,0.2,Iris-setosa
 13 | 12,4.8,3.4,1.6,0.2,Iris-setosa
 14 | 13,4.8,3.0,1.4,0.1,Iris-setosa
 15 | 14,4.3,3.0,1.1,0.1,Iris-setosa
 16 | 15,5.8,4.0,1.2,0.2,Iris-setosa
 17 | 16,5.7,4.4,1.5,0.4,Iris-setosa
 18 | 17,5.4,3.9,1.3,0.4,Iris-setosa
 19 | 18,5.1,3.5,1.4,0.3,Iris-setosa
 20 | 19,5.7,3.8,1.7,0.3,Iris-setosa
 21 | 20,5.1,3.8,1.5,0.3,Iris-setosa
 22 | 21,5.4,3.4,1.7,0.2,Iris-setosa
 23 | 22,5.1,3.7,1.5,0.4,Iris-setosa
 24 | 23,4.6,3.6,1.0,0.2,Iris-setosa
 25 | 24,5.1,3.3,1.7,0.5,Iris-setosa
 26 | 25,4.8,3.4,1.9,0.2,Iris-setosa
 27 | 26,5.0,3.0,1.6,0.2,Iris-setosa
 28 | 27,5.0,3.4,1.6,0.4,Iris-setosa
 29 | 28,5.2,3.5,1.5,0.2,Iris-setosa
 30 | 29,5.2,3.4,1.4,0.2,Iris-setosa
 31 | 30,4.7,3.2,1.6,0.2,Iris-setosa
 32 | 31,4.8,3.1,1.6,0.2,Iris-setosa
 33 | 32,5.4,3.4,1.5,0.4,Iris-setosa
 34 | 33,5.2,4.1,1.5,0.1,Iris-setosa
 35 | 34,5.5,4.2,1.4,0.2,Iris-setosa
 36 | 35,4.9,3.1,1.5,0.1,Iris-setosa
 37 | 36,5.0,3.2,1.2,0.2,Iris-setosa
 38 | 37,5.5,3.5,1.3,0.2,Iris-setosa
 39 | 38,4.9,3.1,1.5,0.1,Iris-setosa
 40 | 39,4.4,3.0,1.3,0.2,Iris-setosa
 41 | 40,5.1,3.4,1.5,0.2,Iris-setosa
 42 | 41,5.0,3.5,1.3,0.3,Iris-setosa
 43 | 42,4.5,2.3,1.3,0.3,Iris-setosa
 44 | 43,4.4,3.2,1.3,0.2,Iris-setosa
 45 | 44,5.0,3.5,1.6,0.6,Iris-setosa
 46 | 45,5.1,3.8,1.9,0.4,Iris-setosa
 47 | 46,4.8,3.0,1.4,0.3,Iris-setosa
 48 | 47,5.1,3.8,1.6,0.2,Iris-setosa
 49 | 48,4.6,3.2,1.4,0.2,Iris-setosa
 50 | 49,5.3,3.7,1.5,0.2,Iris-setosa
 51 | 50,5.0,3.3,1.4,0.2,Iris-setosa
 52 | 51,7.0,3.2,4.7,1.4,Iris-versicolor
 53 | 52,6.4,3.2,4.5,1.5,Iris-versicolor
 54 | 53,6.9,3.1,4.9,1.5,Iris-versicolor
 55 | 54,5.5,2.3,4.0,1.3,Iris-versicolor
 56 | 55,6.5,2.8,4.6,1.5,Iris-versicolor
 57 | 56,5.7,2.8,4.5,1.3,Iris-versicolor
 58 | 57,6.3,3.3,4.7,1.6,Iris-versicolor
 59 | 58,4.9,2.4,3.3,1.0,Iris-versicolor
 60 | 59,6.6,2.9,4.6,1.3,Iris-versicolor
 61 | 60,5.2,2.7,3.9,1.4,Iris-versicolor
 62 | 61,5.0,2.0,3.5,1.0,Iris-versicolor
 63 | 62,5.9,3.0,4.2,1.5,Iris-versicolor
 64 | 63,6.0,2.2,4.0,1.0,Iris-versicolor
 65 | 64,6.1,2.9,4.7,1.4,Iris-versicolor
 66 | 65,5.6,2.9,3.6,1.3,Iris-versicolor
 67 | 66,6.7,3.1,4.4,1.4,Iris-versicolor
 68 | 67,5.6,3.0,4.5,1.5,Iris-versicolor
 69 | 68,5.8,2.7,4.1,1.0,Iris-versicolor
 70 | 69,6.2,2.2,4.5,1.5,Iris-versicolor
 71 | 70,5.6,2.5,3.9,1.1,Iris-versicolor
 72 | 71,5.9,3.2,4.8,1.8,Iris-versicolor
 73 | 72,6.1,2.8,4.0,1.3,Iris-versicolor
 74 | 73,6.3,2.5,4.9,1.5,Iris-versicolor
 75 | 74,6.1,2.8,4.7,1.2,Iris-versicolor
 76 | 75,6.4,2.9,4.3,1.3,Iris-versicolor
 77 | 76,6.6,3.0,4.4,1.4,Iris-versicolor
 78 | 77,6.8,2.8,4.8,1.4,Iris-versicolor
 79 | 78,6.7,3.0,5.0,1.7,Iris-versicolor
 80 | 79,6.0,2.9,4.5,1.5,Iris-versicolor
 81 | 80,5.7,2.6,3.5,1.0,Iris-versicolor
 82 | 81,5.5,2.4,3.8,1.1,Iris-versicolor
 83 | 82,5.5,2.4,3.7,1.0,Iris-versicolor
 84 | 83,5.8,2.7,3.9,1.2,Iris-versicolor
 85 | 84,6.0,2.7,5.1,1.6,Iris-versicolor
 86 | 85,5.4,3.0,4.5,1.5,Iris-versicolor
 87 | 86,6.0,3.4,4.5,1.6,Iris-versicolor
 88 | 87,6.7,3.1,4.7,1.5,Iris-versicolor
 89 | 88,6.3,2.3,4.4,1.3,Iris-versicolor
 90 | 89,5.6,3.0,4.1,1.3,Iris-versicolor
 91 | 90,5.5,2.5,4.0,1.3,Iris-versicolor
 92 | 91,5.5,2.6,4.4,1.2,Iris-versicolor
 93 | 92,6.1,3.0,4.6,1.4,Iris-versicolor
 94 | 93,5.8,2.6,4.0,1.2,Iris-versicolor
 95 | 94,5.0,2.3,3.3,1.0,Iris-versicolor
 96 | 95,5.6,2.7,4.2,1.3,Iris-versicolor
 97 | 96,5.7,3.0,4.2,1.2,Iris-versicolor
 98 | 97,5.7,2.9,4.2,1.3,Iris-versicolor
 99 | 98,6.2,2.9,4.3,1.3,Iris-versicolor
100 | 99,5.1,2.5,3.0,1.1,Iris-versicolor
101 | 100,5.7,2.8,4.1,1.3,Iris-versicolor
102 | 101,6.3,3.3,6.0,2.5,Iris-virginica
103 | 102,5.8,2.7,5.1,1.9,Iris-virginica
104 | 103,7.1,3.0,5.9,2.1,Iris-virginica
105 | 104,6.3,2.9,5.6,1.8,Iris-virginica
106 | 105,6.5,3.0,5.8,2.2,Iris-virginica
107 | 106,7.6,3.0,6.6,2.1,Iris-virginica
108 | 107,4.9,2.5,4.5,1.7,Iris-virginica
109 | 108,7.3,2.9,6.3,1.8,Iris-virginica
110 | 109,6.7,2.5,5.8,1.8,Iris-virginica
111 | 110,7.2,3.6,6.1,2.5,Iris-virginica
112 | 111,6.5,3.2,5.1,2.0,Iris-virginica
113 | 112,6.4,2.7,5.3,1.9,Iris-virginica
114 | 113,6.8,3.0,5.5,2.1,Iris-virginica
115 | 114,5.7,2.5,5.0,2.0,Iris-virginica
116 | 115,5.8,2.8,5.1,2.4,Iris-virginica
117 | 116,6.4,3.2,5.3,2.3,Iris-virginica
118 | 117,6.5,3.0,5.5,1.8,Iris-virginica
119 | 118,7.7,3.8,6.7,2.2,Iris-virginica
120 | 119,7.7,2.6,6.9,2.3,Iris-virginica
121 | 120,6.0,2.2,5.0,1.5,Iris-virginica
122 | 121,6.9,3.2,5.7,2.3,Iris-virginica
123 | 122,5.6,2.8,4.9,2.0,Iris-virginica
124 | 123,7.7,2.8,6.7,2.0,Iris-virginica
125 | 124,6.3,2.7,4.9,1.8,Iris-virginica
126 | 125,6.7,3.3,5.7,2.1,Iris-virginica
127 | 126,7.2,3.2,6.0,1.8,Iris-virginica
128 | 127,6.2,2.8,4.8,1.8,Iris-virginica
129 | 128,6.1,3.0,4.9,1.8,Iris-virginica
130 | 129,6.4,2.8,5.6,2.1,Iris-virginica
131 | 130,7.2,3.0,5.8,1.6,Iris-virginica
132 | 131,7.4,2.8,6.1,1.9,Iris-virginica
133 | 132,7.9,3.8,6.4,2.0,Iris-virginica
134 | 133,6.4,2.8,5.6,2.2,Iris-virginica
135 | 134,6.3,2.8,5.1,1.5,Iris-virginica
136 | 135,6.1,2.6,5.6,1.4,Iris-virginica
137 | 136,7.7,3.0,6.1,2.3,Iris-virginica
138 | 137,6.3,3.4,5.6,2.4,Iris-virginica
139 | 138,6.4,3.1,5.5,1.8,Iris-virginica
140 | 139,6.0,3.0,4.8,1.8,Iris-virginica
141 | 140,6.9,3.1,5.4,2.1,Iris-virginica
142 | 141,6.7,3.1,5.6,2.4,Iris-virginica
143 | 142,6.9,3.1,5.1,2.3,Iris-virginica
144 | 143,5.8,2.7,5.1,1.9,Iris-virginica
145 | 144,6.8,3.2,5.9,2.3,Iris-virginica
146 | 145,6.7,3.3,5.7,2.5,Iris-virginica
147 | 146,6.7,3.0,5.2,2.3,Iris-virginica
148 | 147,6.3,2.5,5.0,1.9,Iris-virginica
149 | 148,6.5,3.0,5.2,2.0,Iris-virginica
150 | 149,6.2,3.4,5.4,2.3,Iris-virginica
151 | 150,5.9,3.0,5.1,1.8,Iris-virginica
152 | 


--------------------------------------------------------------------------------
/Report/Source/data_process.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import re\n",
 10 |     "import pandas as pd"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 2,
 16 |    "metadata": {},
 17 |    "outputs": [],
 18 |    "source": [
 19 |     "# 读入数据，给定各字段的名字\n",
 20 |     "df = pd.read_csv('../data/douban.csv', header=None,\n",
 21 |     "                 names=['p_name', 'p_url', 'c_date_time', 'c_data', 'c_rank', 'c_recom'])"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": 3,
 27 |    "metadata": {},
 28 |    "outputs": [
 29 |     {
 30 |      "data": {
 31 |       "text/html": [
 32 |        "<div>\n",
 33 |        "<style>\n",
 34 |        "    .dataframe thead tr:only-child th {\n",
 35 |        "        text-align: right;\n",
 36 |        "    }\n",
 37 |        "\n",
 38 |        "    .dataframe thead th {\n",
 39 |        "        text-align: left;\n",
 40 |        "    }\n",
 41 |        "\n",
 42 |        "    .dataframe tbody tr th {\n",
 43 |        "        vertical-align: top;\n",
 44 |        "    }\n",
 45 |        "</style>\n",
 46 |        "<table border=\"1\" class=\"dataframe\">\n",
 47 |        "  <thead>\n",
 48 |        "    <tr style=\"text-align: right;\">\n",
 49 |        "      <th></th>\n",
 50 |        "      <th>p_name</th>\n",
 51 |        "      <th>p_url</th>\n",
 52 |        "      <th>c_date_time</th>\n",
 53 |        "      <th>c_data</th>\n",
 54 |        "      <th>c_rank</th>\n",
 55 |        "      <th>c_recom</th>\n",
 56 |        "    </tr>\n",
 57 |        "  </thead>\n",
 58 |        "  <tbody>\n",
 59 |        "    <tr>\n",
 60 |        "      <th>0</th>\n",
 61 |        "      <td>王大根</td>\n",
 62 |        "      <td>https://www.douban.com/people/diewithme/</td>\n",
 63 |        "      <td>2018-01-19 18:17:25</td>\n",
 64 |        "      <td>在这种家庭里做一条狗都好啊\\n</td>\n",
 65 |        "      <td>力荐</td>\n",
 66 |        "      <td>6463</td>\n",
 67 |        "    </tr>\n",
 68 |        "    <tr>\n",
 69 |        "      <th>1</th>\n",
 70 |        "      <td>李阿斗</td>\n",
 71 |        "      <td>https://www.douban.com/people/gailsylee/</td>\n",
 72 |        "      <td>2017-11-25 02:12:27</td>\n",
 73 |        "      <td>当时出国以后的第一最大感受就是尊重，不论老弱病孕还是任何“与众不同”，都不会有人上下打量你...</td>\n",
 74 |        "      <td>力荐</td>\n",
 75 |        "      <td>3429</td>\n",
 76 |        "    </tr>\n",
 77 |        "    <tr>\n",
 78 |        "      <th>2</th>\n",
 79 |        "      <td>光明小卖部</td>\n",
 80 |        "      <td>https://www.douban.com/people/gooooooooooohe/</td>\n",
 81 |        "      <td>2017-12-06 15:10:45</td>\n",
 82 |        "      <td>所有人都知道是化妆只有我一个人以为请的真实病人出演吗。。。\\n</td>\n",
 83 |        "      <td>力荐</td>\n",
 84 |        "      <td>3349</td>\n",
 85 |        "    </tr>\n",
 86 |        "    <tr>\n",
 87 |        "      <th>3</th>\n",
 88 |        "      <td>同志亦凡人中文站</td>\n",
 89 |        "      <td>https://www.douban.com/people/3540441/</td>\n",
 90 |        "      <td>2017-11-24 15:57:52</td>\n",
 91 |        "      <td>有种糖放多了的感觉，精华基本都在预告里了。但对孩子们的纯真友情毫无抵抗力啊，就像被温柔的抚...</td>\n",
 92 |        "      <td>推荐</td>\n",
 93 |        "      <td>1814</td>\n",
 94 |        "    </tr>\n",
 95 |        "    <tr>\n",
 96 |        "      <th>4</th>\n",
 97 |        "      <td>桃桃淘电影</td>\n",
 98 |        "      <td>https://www.douban.com/people/qijiuzhiyue/</td>\n",
 99 |        "      <td>2018-01-19 14:18:28</td>\n",
100 |        "      <td>其实这更像当代童话，因为，实在是太暖了。里面每个人都那么暖，怎么可以那么暖，怎么可以那么暖...</td>\n",
101 |        "      <td>推荐</td>\n",
102 |        "      <td>1711</td>\n",
103 |        "    </tr>\n",
104 |        "  </tbody>\n",
105 |        "</table>\n",
106 |        "</div>"
107 |       ],
108 |       "text/plain": [
109 |        "     p_name                                          p_url  \\\n",
110 |        "0       王大根       https://www.douban.com/people/diewithme/   \n",
111 |        "1       李阿斗       https://www.douban.com/people/gailsylee/   \n",
112 |        "2     光明小卖部  https://www.douban.com/people/gooooooooooohe/   \n",
113 |        "3  同志亦凡人中文站         https://www.douban.com/people/3540441/   \n",
114 |        "4     桃桃淘电影     https://www.douban.com/people/qijiuzhiyue/   \n",
115 |        "\n",
116 |        "           c_date_time                                             c_data  \\\n",
117 |        "0  2018-01-19 18:17:25                            在这种家庭里做一条狗都好啊\\n           \n",
118 |        "1  2017-11-25 02:12:27   当时出国以后的第一最大感受就是尊重，不论老弱病孕还是任何“与众不同”，都不会有人上下打量你...   \n",
119 |        "2  2017-12-06 15:10:45            所有人都知道是化妆只有我一个人以为请的真实病人出演吗。。。\\n           \n",
120 |        "3  2017-11-24 15:57:52   有种糖放多了的感觉，精华基本都在预告里了。但对孩子们的纯真友情毫无抵抗力啊，就像被温柔的抚...   \n",
121 |        "4  2018-01-19 14:18:28   其实这更像当代童话，因为，实在是太暖了。里面每个人都那么暖，怎么可以那么暖，怎么可以那么暖...   \n",
122 |        "\n",
123 |        "  c_rank  c_recom  \n",
124 |        "0     力荐     6463  \n",
125 |        "1     力荐     3429  \n",
126 |        "2     力荐     3349  \n",
127 |        "3     推荐     1814  \n",
128 |        "4     推荐     1711  "
129 |       ]
130 |      },
131 |      "execution_count": 3,
132 |      "metadata": {},
133 |      "output_type": "execute_result"
134 |     }
135 |    ],
136 |    "source": [
137 |     "# 预览数据\n",
138 |     "df.head(5)"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": 4,
144 |    "metadata": {},
145 |    "outputs": [
146 |     {
147 |      "name": "stdout",
148 |      "output_type": "stream",
149 |      "text": [
150 |       "p_name         0\n",
151 |       "p_url          0\n",
152 |       "c_date_time    0\n",
153 |       "c_data         0\n",
154 |       "c_rank         0\n",
155 |       "c_recom        0\n",
156 |       "dtype: int64\n"
157 |      ]
158 |     }
159 |    ],
160 |    "source": [
161 |     "# 缺失值检测与去除\n",
162 |     "print(df.isnull().sum())\n",
163 |     "#df.dropna(inplace=True)"
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "code",
168 |    "execution_count": 5,
169 |    "metadata": {},
170 |    "outputs": [],
171 |    "source": [
172 |     "# 拆分原c_date_time为c_date和c_time\n",
173 |     "def get_date(date_time):\n",
174 |     "    # 有时会格式不对\n",
175 |     "    if len(date_time) < 10:\n",
176 |     "        return None\n",
177 |     "    return re.findall(r'(\\d+-\\d+-\\d+) \\d+.*?', date_time)[0]\n",
178 |     "\n",
179 |     "\n",
180 |     "def get_time(date_time):\n",
181 |     "    if len(date_time) < 10:\n",
182 |     "        return None\n",
183 |     "    return re.findall(r'.*? (\\d+:\\d+:\\d+)', date_time)[0]\n",
184 |     "\n",
185 |     "\n",
186 |     "df['c_date'] = df['c_date_time'].apply(get_date)\n",
187 |     "df['c_time'] = df['c_date_time'].apply(get_time)"
188 |    ]
189 |   },
190 |   {
191 |    "cell_type": "code",
192 |    "execution_count": 6,
193 |    "metadata": {},
194 |    "outputs": [
195 |     {
196 |      "data": {
197 |       "text/html": [
198 |        "<div>\n",
199 |        "<style>\n",
200 |        "    .dataframe thead tr:only-child th {\n",
201 |        "        text-align: right;\n",
202 |        "    }\n",
203 |        "\n",
204 |        "    .dataframe thead th {\n",
205 |        "        text-align: left;\n",
206 |        "    }\n",
207 |        "\n",
208 |        "    .dataframe tbody tr th {\n",
209 |        "        vertical-align: top;\n",
210 |        "    }\n",
211 |        "</style>\n",
212 |        "<table border=\"1\" class=\"dataframe\">\n",
213 |        "  <thead>\n",
214 |        "    <tr style=\"text-align: right;\">\n",
215 |        "      <th></th>\n",
216 |        "      <th>p_name</th>\n",
217 |        "      <th>p_url</th>\n",
218 |        "      <th>c_date_time</th>\n",
219 |        "      <th>c_data</th>\n",
220 |        "      <th>c_rank</th>\n",
221 |        "      <th>c_recom</th>\n",
222 |        "      <th>c_date</th>\n",
223 |        "      <th>c_time</th>\n",
224 |        "    </tr>\n",
225 |        "  </thead>\n",
226 |        "  <tbody>\n",
227 |        "    <tr>\n",
228 |        "      <th>0</th>\n",
229 |        "      <td>王大根</td>\n",
230 |        "      <td>https://www.douban.com/people/diewithme/</td>\n",
231 |        "      <td>2018-01-19 18:17:25</td>\n",
232 |        "      <td>在这种家庭里做一条狗都好啊\\n</td>\n",
233 |        "      <td>力荐</td>\n",
234 |        "      <td>6463</td>\n",
235 |        "      <td>2018-01-19</td>\n",
236 |        "      <td>18:17:25</td>\n",
237 |        "    </tr>\n",
238 |        "    <tr>\n",
239 |        "      <th>1</th>\n",
240 |        "      <td>李阿斗</td>\n",
241 |        "      <td>https://www.douban.com/people/gailsylee/</td>\n",
242 |        "      <td>2017-11-25 02:12:27</td>\n",
243 |        "      <td>当时出国以后的第一最大感受就是尊重，不论老弱病孕还是任何“与众不同”，都不会有人上下打量你...</td>\n",
244 |        "      <td>力荐</td>\n",
245 |        "      <td>3429</td>\n",
246 |        "      <td>2017-11-25</td>\n",
247 |        "      <td>02:12:27</td>\n",
248 |        "    </tr>\n",
249 |        "    <tr>\n",
250 |        "      <th>2</th>\n",
251 |        "      <td>光明小卖部</td>\n",
252 |        "      <td>https://www.douban.com/people/gooooooooooohe/</td>\n",
253 |        "      <td>2017-12-06 15:10:45</td>\n",
254 |        "      <td>所有人都知道是化妆只有我一个人以为请的真实病人出演吗。。。\\n</td>\n",
255 |        "      <td>力荐</td>\n",
256 |        "      <td>3349</td>\n",
257 |        "      <td>2017-12-06</td>\n",
258 |        "      <td>15:10:45</td>\n",
259 |        "    </tr>\n",
260 |        "    <tr>\n",
261 |        "      <th>3</th>\n",
262 |        "      <td>同志亦凡人中文站</td>\n",
263 |        "      <td>https://www.douban.com/people/3540441/</td>\n",
264 |        "      <td>2017-11-24 15:57:52</td>\n",
265 |        "      <td>有种糖放多了的感觉，精华基本都在预告里了。但对孩子们的纯真友情毫无抵抗力啊，就像被温柔的抚...</td>\n",
266 |        "      <td>推荐</td>\n",
267 |        "      <td>1814</td>\n",
268 |        "      <td>2017-11-24</td>\n",
269 |        "      <td>15:57:52</td>\n",
270 |        "    </tr>\n",
271 |        "    <tr>\n",
272 |        "      <th>4</th>\n",
273 |        "      <td>桃桃淘电影</td>\n",
274 |        "      <td>https://www.douban.com/people/qijiuzhiyue/</td>\n",
275 |        "      <td>2018-01-19 14:18:28</td>\n",
276 |        "      <td>其实这更像当代童话，因为，实在是太暖了。里面每个人都那么暖，怎么可以那么暖，怎么可以那么暖...</td>\n",
277 |        "      <td>推荐</td>\n",
278 |        "      <td>1711</td>\n",
279 |        "      <td>2018-01-19</td>\n",
280 |        "      <td>14:18:28</td>\n",
281 |        "    </tr>\n",
282 |        "  </tbody>\n",
283 |        "</table>\n",
284 |        "</div>"
285 |       ],
286 |       "text/plain": [
287 |        "     p_name                                          p_url  \\\n",
288 |        "0       王大根       https://www.douban.com/people/diewithme/   \n",
289 |        "1       李阿斗       https://www.douban.com/people/gailsylee/   \n",
290 |        "2     光明小卖部  https://www.douban.com/people/gooooooooooohe/   \n",
291 |        "3  同志亦凡人中文站         https://www.douban.com/people/3540441/   \n",
292 |        "4     桃桃淘电影     https://www.douban.com/people/qijiuzhiyue/   \n",
293 |        "\n",
294 |        "           c_date_time                                             c_data  \\\n",
295 |        "0  2018-01-19 18:17:25                            在这种家庭里做一条狗都好啊\\n           \n",
296 |        "1  2017-11-25 02:12:27   当时出国以后的第一最大感受就是尊重，不论老弱病孕还是任何“与众不同”，都不会有人上下打量你...   \n",
297 |        "2  2017-12-06 15:10:45            所有人都知道是化妆只有我一个人以为请的真实病人出演吗。。。\\n           \n",
298 |        "3  2017-11-24 15:57:52   有种糖放多了的感觉，精华基本都在预告里了。但对孩子们的纯真友情毫无抵抗力啊，就像被温柔的抚...   \n",
299 |        "4  2018-01-19 14:18:28   其实这更像当代童话，因为，实在是太暖了。里面每个人都那么暖，怎么可以那么暖，怎么可以那么暖...   \n",
300 |        "\n",
301 |        "  c_rank  c_recom      c_date    c_time  \n",
302 |        "0     力荐     6463  2018-01-19  18:17:25  \n",
303 |        "1     力荐     3429  2017-11-25  02:12:27  \n",
304 |        "2     力荐     3349  2017-12-06  15:10:45  \n",
305 |        "3     推荐     1814  2017-11-24  15:57:52  \n",
306 |        "4     推荐     1711  2018-01-19  14:18:28  "
307 |       ]
308 |      },
309 |      "execution_count": 6,
310 |      "metadata": {},
311 |      "output_type": "execute_result"
312 |     }
313 |    ],
314 |    "source": [
315 |     "# 预览数据\n",
316 |     "df.head(5)"
317 |    ]
318 |   },
319 |   {
320 |    "cell_type": "code",
321 |    "execution_count": 7,
322 |    "metadata": {},
323 |    "outputs": [
324 |     {
325 |      "name": "stdout",
326 |      "output_type": "stream",
327 |      "text": [
328 |       "Before->\n",
329 |       " p_name         object\n",
330 |       "p_url          object\n",
331 |       "c_date_time    object\n",
332 |       "c_data         object\n",
333 |       "c_rank         object\n",
334 |       "c_recom         int64\n",
335 |       "c_date         object\n",
336 |       "c_time         object\n",
337 |       "dtype: object\n",
338 |       "After->\n",
339 |       " p_name                 object\n",
340 |       "p_url                  object\n",
341 |       "c_date_time    datetime64[ns]\n",
342 |       "c_data                 object\n",
343 |       "c_rank                 object\n",
344 |       "c_recom                 int64\n",
345 |       "c_date                 object\n",
346 |       "c_time                 object\n",
347 |       "dtype: object\n"
348 |      ]
349 |     }
350 |    ],
351 |    "source": [
352 |     "# 如果需要，也可以进行数据类型的转换\n",
353 |     "print('Before->\\n', df.dtypes)\n",
354 |     "df['c_date_time'] = df['c_date_time'].astype('datetime64[ns]')\n",
355 |     "print('After->\\n',  df.dtypes)"
356 |    ]
357 |   },
358 |   {
359 |    "cell_type": "code",
360 |    "execution_count": 8,
361 |    "metadata": {},
362 |    "outputs": [],
363 |    "source": [
364 |     "# 也可方便地进行数据转换[Encoding Categorical Values]\n",
365 |     "# 将汉字对应编码为数字\n",
366 |     "def trans(words):\n",
367 |     "    if words == '力荐':\n",
368 |     "        return 5\n",
369 |     "    elif words == '推荐':\n",
370 |     "        return 4\n",
371 |     "    elif words == '还行':\n",
372 |     "        return 3\n",
373 |     "    elif words == '较差':\n",
374 |     "        return 2\n",
375 |     "    elif words == '很差':\n",
376 |     "        return 1\n",
377 |     "    else:\n",
378 |     "        return None\n",
379 |     "\n",
380 |     "\n",
381 |     "df['c_rank_num'] = df['c_rank'].apply(trans)"
382 |    ]
383 |   },
384 |   {
385 |    "cell_type": "code",
386 |    "execution_count": 9,
387 |    "metadata": {},
388 |    "outputs": [
389 |     {
390 |      "data": {
391 |       "text/html": [
392 |        "<div>\n",
393 |        "<style>\n",
394 |        "    .dataframe thead tr:only-child th {\n",
395 |        "        text-align: right;\n",
396 |        "    }\n",
397 |        "\n",
398 |        "    .dataframe thead th {\n",
399 |        "        text-align: left;\n",
400 |        "    }\n",
401 |        "\n",
402 |        "    .dataframe tbody tr th {\n",
403 |        "        vertical-align: top;\n",
404 |        "    }\n",
405 |        "</style>\n",
406 |        "<table border=\"1\" class=\"dataframe\">\n",
407 |        "  <thead>\n",
408 |        "    <tr style=\"text-align: right;\">\n",
409 |        "      <th></th>\n",
410 |        "      <th>p_name</th>\n",
411 |        "      <th>p_url</th>\n",
412 |        "      <th>c_date_time</th>\n",
413 |        "      <th>c_data</th>\n",
414 |        "      <th>c_rank</th>\n",
415 |        "      <th>c_recom</th>\n",
416 |        "      <th>c_date</th>\n",
417 |        "      <th>c_time</th>\n",
418 |        "      <th>c_rank_num</th>\n",
419 |        "    </tr>\n",
420 |        "  </thead>\n",
421 |        "  <tbody>\n",
422 |        "    <tr>\n",
423 |        "      <th>0</th>\n",
424 |        "      <td>王大根</td>\n",
425 |        "      <td>https://www.douban.com/people/diewithme/</td>\n",
426 |        "      <td>2018-01-19 18:17:25</td>\n",
427 |        "      <td>在这种家庭里做一条狗都好啊\\n</td>\n",
428 |        "      <td>力荐</td>\n",
429 |        "      <td>6463</td>\n",
430 |        "      <td>2018-01-19</td>\n",
431 |        "      <td>18:17:25</td>\n",
432 |        "      <td>5</td>\n",
433 |        "    </tr>\n",
434 |        "    <tr>\n",
435 |        "      <th>1</th>\n",
436 |        "      <td>李阿斗</td>\n",
437 |        "      <td>https://www.douban.com/people/gailsylee/</td>\n",
438 |        "      <td>2017-11-25 02:12:27</td>\n",
439 |        "      <td>当时出国以后的第一最大感受就是尊重，不论老弱病孕还是任何“与众不同”，都不会有人上下打量你...</td>\n",
440 |        "      <td>力荐</td>\n",
441 |        "      <td>3429</td>\n",
442 |        "      <td>2017-11-25</td>\n",
443 |        "      <td>02:12:27</td>\n",
444 |        "      <td>5</td>\n",
445 |        "    </tr>\n",
446 |        "    <tr>\n",
447 |        "      <th>2</th>\n",
448 |        "      <td>光明小卖部</td>\n",
449 |        "      <td>https://www.douban.com/people/gooooooooooohe/</td>\n",
450 |        "      <td>2017-12-06 15:10:45</td>\n",
451 |        "      <td>所有人都知道是化妆只有我一个人以为请的真实病人出演吗。。。\\n</td>\n",
452 |        "      <td>力荐</td>\n",
453 |        "      <td>3349</td>\n",
454 |        "      <td>2017-12-06</td>\n",
455 |        "      <td>15:10:45</td>\n",
456 |        "      <td>5</td>\n",
457 |        "    </tr>\n",
458 |        "    <tr>\n",
459 |        "      <th>3</th>\n",
460 |        "      <td>同志亦凡人中文站</td>\n",
461 |        "      <td>https://www.douban.com/people/3540441/</td>\n",
462 |        "      <td>2017-11-24 15:57:52</td>\n",
463 |        "      <td>有种糖放多了的感觉，精华基本都在预告里了。但对孩子们的纯真友情毫无抵抗力啊，就像被温柔的抚...</td>\n",
464 |        "      <td>推荐</td>\n",
465 |        "      <td>1814</td>\n",
466 |        "      <td>2017-11-24</td>\n",
467 |        "      <td>15:57:52</td>\n",
468 |        "      <td>4</td>\n",
469 |        "    </tr>\n",
470 |        "    <tr>\n",
471 |        "      <th>4</th>\n",
472 |        "      <td>桃桃淘电影</td>\n",
473 |        "      <td>https://www.douban.com/people/qijiuzhiyue/</td>\n",
474 |        "      <td>2018-01-19 14:18:28</td>\n",
475 |        "      <td>其实这更像当代童话，因为，实在是太暖了。里面每个人都那么暖，怎么可以那么暖，怎么可以那么暖...</td>\n",
476 |        "      <td>推荐</td>\n",
477 |        "      <td>1711</td>\n",
478 |        "      <td>2018-01-19</td>\n",
479 |        "      <td>14:18:28</td>\n",
480 |        "      <td>4</td>\n",
481 |        "    </tr>\n",
482 |        "  </tbody>\n",
483 |        "</table>\n",
484 |        "</div>"
485 |       ],
486 |       "text/plain": [
487 |        "     p_name                                          p_url  \\\n",
488 |        "0       王大根       https://www.douban.com/people/diewithme/   \n",
489 |        "1       李阿斗       https://www.douban.com/people/gailsylee/   \n",
490 |        "2     光明小卖部  https://www.douban.com/people/gooooooooooohe/   \n",
491 |        "3  同志亦凡人中文站         https://www.douban.com/people/3540441/   \n",
492 |        "4     桃桃淘电影     https://www.douban.com/people/qijiuzhiyue/   \n",
493 |        "\n",
494 |        "          c_date_time                                             c_data  \\\n",
495 |        "0 2018-01-19 18:17:25                            在这种家庭里做一条狗都好啊\\n           \n",
496 |        "1 2017-11-25 02:12:27   当时出国以后的第一最大感受就是尊重，不论老弱病孕还是任何“与众不同”，都不会有人上下打量你...   \n",
497 |        "2 2017-12-06 15:10:45            所有人都知道是化妆只有我一个人以为请的真实病人出演吗。。。\\n           \n",
498 |        "3 2017-11-24 15:57:52   有种糖放多了的感觉，精华基本都在预告里了。但对孩子们的纯真友情毫无抵抗力啊，就像被温柔的抚...   \n",
499 |        "4 2018-01-19 14:18:28   其实这更像当代童话，因为，实在是太暖了。里面每个人都那么暖，怎么可以那么暖，怎么可以那么暖...   \n",
500 |        "\n",
501 |        "  c_rank  c_recom      c_date    c_time  c_rank_num  \n",
502 |        "0     力荐     6463  2018-01-19  18:17:25           5  \n",
503 |        "1     力荐     3429  2017-11-25  02:12:27           5  \n",
504 |        "2     力荐     3349  2017-12-06  15:10:45           5  \n",
505 |        "3     推荐     1814  2017-11-24  15:57:52           4  \n",
506 |        "4     推荐     1711  2018-01-19  14:18:28           4  "
507 |       ]
508 |      },
509 |      "execution_count": 9,
510 |      "metadata": {},
511 |      "output_type": "execute_result"
512 |     }
513 |    ],
514 |    "source": [
515 |     "# 预览数据\n",
516 |     "df.head(5)"
517 |    ]
518 |   },
519 |   {
520 |    "cell_type": "code",
521 |    "execution_count": 10,
522 |    "metadata": {},
523 |    "outputs": [],
524 |    "source": [
525 |     "# 设置索引列为c_date_time\n",
526 |     "df.index = df['c_date_time']\n",
527 |     "\n",
528 |     "# 去除多余的c_date_time列\n",
529 |     "df = df.drop(['c_date_time'], axis=1)\n",
530 |     "\n",
531 |     "# 其他的一些操作..."
532 |    ]
533 |   },
534 |   {
535 |    "cell_type": "code",
536 |    "execution_count": 11,
537 |    "metadata": {},
538 |    "outputs": [],
539 |    "source": [
540 |     "# 去除操作产生的缺失值\n",
541 |     "df.dropna(inplace=True)\n",
542 |     "# 保存预处理后的文件\n",
543 |     "df.to_csv('douban_processed.csv')"
544 |    ]
545 |   },
546 |   {
547 |    "cell_type": "code",
548 |    "execution_count": null,
549 |    "metadata": {},
550 |    "outputs": [],
551 |    "source": []
552 |   }
553 |  ],
554 |  "metadata": {
555 |   "kernelspec": {
556 |    "display_name": "Python 3",
557 |    "language": "python",
558 |    "name": "python3"
559 |   },
560 |   "language_info": {
561 |    "codemirror_mode": {
562 |     "name": "ipython",
563 |     "version": 3
564 |    },
565 |    "file_extension": ".py",
566 |    "mimetype": "text/x-python",
567 |    "name": "python",
568 |    "nbconvert_exporter": "python",
569 |    "pygments_lexer": "ipython3",
570 |    "version": "3.6.1"
571 |   }
572 |  },
573 |  "nbformat": 4,
574 |  "nbformat_minor": 2
575 | }
576 | 


--------------------------------------------------------------------------------