├── Rplot.png
├── field.jpg
├── wordcloud.png
├── full_invest.csv
├── full_invest_1.csv
├── full_invest_2.csv
├── full_invest_3.csv
├── full_invest_4.csv
├── invest_event.csv
├── data_analysis.R
├── data_analysis.py
├── itjuzi.py
└── README.md


/Rplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zluckyhou/itjuzi/HEAD/Rplot.png


--------------------------------------------------------------------------------
/field.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zluckyhou/itjuzi/HEAD/field.jpg


--------------------------------------------------------------------------------
/wordcloud.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zluckyhou/itjuzi/HEAD/wordcloud.png


--------------------------------------------------------------------------------
/full_invest.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zluckyhou/itjuzi/HEAD/full_invest.csv


--------------------------------------------------------------------------------
/full_invest_1.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zluckyhou/itjuzi/HEAD/full_invest_1.csv


--------------------------------------------------------------------------------
/full_invest_2.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zluckyhou/itjuzi/HEAD/full_invest_2.csv


--------------------------------------------------------------------------------
/full_invest_3.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zluckyhou/itjuzi/HEAD/full_invest_3.csv


--------------------------------------------------------------------------------
/full_invest_4.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zluckyhou/itjuzi/HEAD/full_invest_4.csv


--------------------------------------------------------------------------------
/invest_event.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zluckyhou/itjuzi/HEAD/invest_event.csv


--------------------------------------------------------------------------------
/data_analysis.R:
--------------------------------------------------------------------------------
 1 | #投资事件分析
 2 | 
 3 | invest_event = read.csv('C:\\Users\\zluck\\Documents\\统计\\IT橘子投资事件分析\\invest_event.csv',header = TRUE,as.is = T)
 4 | head(invest_event)
 5 | 
 6 | # 中国市场上最活跃的投资方都有哪些
 7 | investor = invest_event$investor
 8 | inv = NULL
 9 | for (i in investor){
10 |   inv = c(inv,strsplit(i,',')[[1]])
11 | }
12 | 
13 | inv_freq = as.data.frame(sort(table(inv),decreasing = T)[-1])
14 | head(inv_freq,11)
15 | barplot(sort(table(inv),decreasing = T)[2:11],col = 'red')
16 | 
17 | # 腾讯作为投资方竟然排在第7，我们看一下腾讯的投资都有哪些，第一笔投资时什么时候
18 | inv_tx = invest_event %>% filter(investor == '腾讯')
19 | inv_tx[1,]
20 | inv_tx[dim(inv_tx)[1],]
21 | 
22 | #排名前10的创业首选地方
23 | loc = invest_event %>% group_by(place) %>% summarise(cnt = n()) %>% arrange(desc(cnt)) %>% head(10)
24 | loc
25 | barplot(sort(table(invest_event$place),decreasing = T)[1:10],col = 'orange')
26 | 
27 | # 哪些创业企业获得了成功
28 | invest_event %>% group_by(company) %>% summarise(cnt = n()) %>% arrange(desc(cnt))
29 | # 获得投资次数最多的20家公司
30 | sus_cp = invest_event %>% group_by(company) %>% summarise(cnt = n()) %>% arrange(desc(cnt)) %>% head(20)
31 | 
32 | 


--------------------------------------------------------------------------------
/data_analysis.py:
--------------------------------------------------------------------------------
 1 | # coding = utf-8
 2 | import os
 3 | import pandas as pd
 4 | from pandas import Series,DataFrame
 5 | from dateutil.parser import parse
 6 | import pymysql
 7 | 
 8 | 
 9 | from wordcloud import WordCloud, STOPWORDS
10 | import matplotlib.pyplot as plt
11 | 
def merge_file(path, exclude=('invest_event.csv',)):
    """Concatenate the crawler's CSV exports found directly in *path*.

    Every file whose extension is exactly ``.csv`` is read as gb18030 text.
    Column 0 of each file is skipped, columns 1-8 are kept and the first of
    the kept columns becomes the index.  Files are read in sorted name order
    so the result does not depend on the order ``os.listdir`` returns.

    Args:
        path: Directory that holds the CSV files.
        exclude: File names to skip.  Defaults to ``invest_event.csv``, the
            merged file this script later writes into the same directory;
            it has a different column layout and must not be merged into
            itself when the script is run again.

    Returns:
        One DataFrame with the rows of all files stacked vertically.

    Raises:
        ValueError: Raised by ``pd.concat`` when no CSV file is found.
    """
    skipped = set(exclude)
    csv_names = sorted(
        name for name in os.listdir(path)
        if os.path.splitext(name)[1] == '.csv' and name not in skipped
    )
    frames = [
        pd.read_csv(os.path.join(path, name), index_col=0,
                    usecols=list(range(1, 9)), encoding='gb18030')
        for name in csv_names
    ]
    return pd.concat(frames, axis=0)
path = r'C:\Users\zluck\Documents\GitHub\itjuzi'
invest_info = merge_file(path)


def _parse_or_none(raw):
    """Return ``parse(raw)``, or None when *raw* is not a parseable date."""
    try:
        return parse(raw)
    except (ValueError, OverflowError, TypeError):
        return None


def _repair_dates(raw_dates):
    """Parse every entry of *raw_dates*, filling in the unparseable ones.

    An unparseable entry takes the nearest valid date that follows it.  If no
    valid date follows (bad values at the very end), it takes the nearest
    valid date before it instead of running past the end of the index.  Each
    bad raw value and its substitute are printed.

    Raises:
        ValueError: If entries exist but none of them can be parsed.
    """
    parsed = [_parse_or_none(raw) for raw in raw_dates]
    broken = [pos for pos, value in enumerate(parsed) if value is None]

    # Backward sweep: a broken entry inherits the nearest valid date after it.
    substitute = None
    for pos in range(len(parsed) - 1, -1, -1):
        if parsed[pos] is None:
            parsed[pos] = substitute
        else:
            substitute = parsed[pos]

    # Forward sweep: only broken entries at the tail are still None here.
    substitute = None
    for pos, value in enumerate(parsed):
        if value is None:
            parsed[pos] = substitute
        else:
            substitute = value

    if broken and parsed[broken[0]] is None:
        raise ValueError('no parseable date in the index')

    for pos in broken:
        print (raw_dates[pos])
        print (parsed[pos])
    return parsed


# Some dates in the index are malformed; they are replaced by the date of a
# neighbouring row (see _repair_dates).
dates = _repair_dates(invest_info.index)

# Replace the index by the parsed datetimes.
invest_info.index = dates
invest_info.index.name = 'date'
# Written as gb18030 because this script reads the file back with
# encoding='gb18030'.
invest_info.to_csv(os.path.join(path, 'invest_event.csv'), encoding='gb18030')
43 | 
# Store the investment events in MySQL.
# The password may be supplied through the ITJUZI_MYSQL_PASSWORD environment
# variable; the literal fallback keeps the previous behaviour.
conn = pymysql.connect(host = 'localhost',port = 3306,user = 'root',passwd = os.environ.get('ITJUZI_MYSQL_PASSWORD', 'helloworld'),db = 'python',charset = 'utf8')
cur = conn.cursor()
# MySQL needs its character set configured before it can store Chinese text:
#   ALTER DATABASE python CHARACTER SET = utf8mb4 COLLATE = utf8mb4_unicode_ci;
# See http://stackoverflow.com/questions/34305587/uploading-python-pandas-dataframe-to-mysql-internalerror-1366-incorrect-str
# Some values of the investor column are long while the column defaults to
# varchar(63), so the column length has to be raised as well:
#   alter table invest_event modify investor varchar(255)
# NOTE(review): the `flavor` argument and raw DBAPI connections for MySQL are
# legacy pandas features that newer pandas releases no longer accept; there
# `con` must be an SQLAlchemy connectable and `dtype` values SQLAlchemy
# types.  Confirm against the installed pandas version.
try:
    invest_info.to_sql(name = 'invest_event',con = conn,if_exists = 'replace',flavor = 'mysql',dtype = {'investor':'varchar(255)'})
finally:
    # Release the connection even when the export fails.
    conn.close()
52 | 
invest_info = pd.read_csv(r'C:\Users\zluck\Documents\GitHub\itjuzi\invest_event.csv',encoding='gb18030',index_col=0)

# Which investors are the most active?
# Rows without an investor are reported once and left out of the count.
missing_investor = invest_info.investor.isnull()
if missing_investor.any():
    print (invest_info[missing_investor])

# The investor field holds one or more names separated by commas.
investors = []
for names in invest_info.investor[~missing_investor]:
    investors.extend(names.split(','))

active_investors = Series(investors).value_counts()

# Build the investor word cloud from (name, count) pairs, most frequent first.
ls = list(zip(active_investors.index,active_investors.values))

# wordcloud's default font is DroidSansMono; a font with Chinese glyphs has to
# be set explicitly.
wc = WordCloud(font_path = r'C:\Windows\Fonts\simkai.ttf')
# The most frequent entry is events whose investor was not disclosed, so the
# first pair is dropped.
# NOTE(review): newer wordcloud releases document this argument as a dict --
# confirm against the installed version.
wc.generate_from_frequencies(ls[1:])

plt.imshow(wc)
plt.axis('off')
plt.show()
76 | 
# When was the earliest investment?
# NOTE(review): this shows the last row, so it presumes the rows are ordered
# newest first -- confirm against the CSV.
invest_info.iloc[-1]

# Startups that were funded most often.

# Number of events per company, largest first.
events_per_company = invest_info.groupby('company').size()
cnt = events_per_company.sort_values(ascending=False)
# Attach each row's company total as a new column.
invest_info['cnt'] = invest_info['company'].map(cnt)

# Rows of companies with at least 7 events, largest totals first.
frequently_funded = invest_info['cnt'] >= 7
invest_info[frequently_funded].sort_values(by = ['cnt','company'],ascending=False)

invest_info
88 | 


--------------------------------------------------------------------------------
/itjuzi.py:
--------------------------------------------------------------------------------
  1 | #coding:utf-8
  2 | import requests
  3 | from bs4 import BeautifulSoup
  4 | from pandas import DataFrame
  5 | import datetime,time,random
  6 | import codecs
  7 | 
  8 | def download_page(download_url):
  9 |     headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'}
 10 |     html = requests.get(download_url,headers=headers).content.decode('utf-8')
 11 |     return html
 12 | 
 13 | 
 14 | date_list, link_list, company_list, field_list,place_list,round_list, amount_list, investor_list = [], [], [], [], [], [],[],[]
 15 | 
 16 | 
 17 | def parse_html(html):
 18 |     date, link, company, field, place, round, amount, investor = [], [], [], [], [], [], [], []
 19 |     body = BeautifulSoup(html,'lxml')
 20 |     eventset = body.find_all('ul',attrs={'class':'list-main-eventset'})[1]
 21 |     for event in eventset.find_all('li'):
 22 |         t = event.find('i',attrs={'class':'cell round'}).find('span').get_text().strip()
 23 |         date.append(t)
 24 |         amount.append(event.find('i',attrs = {'class':'cell fina'}).get_text().strip())
 25 |         a = event.find_all('a')
 26 |         link.append(a[0]['href'].strip())
 27 |         company.append(a[1].get_text().strip())
 28 |         field.append(a[2].get_text().strip())
 29 |         place.append(a[3].get_text().strip())
 30 |         round.append(a[4].get_text().strip())
 31 | 
 32 |         #investor 可能有多个，对应多个链接，也可能是‘投资方未透露’，需要处理这两种情况
 33 |         if len(a) >= 6:
 34 |             inv = ','.join([i.get_text().strip() for i in a[5:]])
 35 |         else:
 36 |             span = event.find('span',attrs = {'class':'investorset'}).find_all('span')
 37 |             inv = ','.join([i.get_text().strip() for i in span])
 38 |         investor.append(inv)
 39 | 
 40 |     page_url = body.find('div',attrs={'class':'ui-pagechange for-sec-bottom'}).find_all('a')
 41 |     if page_url[-1]['href']:
 42 |         download_url = page_url[-2]['href']
 43 |     else:
 44 |         download_url = False
 45 | 
 46 |     return date,link,company,field,place,round,amount,investor,download_url
 47 | 
 48 | 
 49 | download_url = 'https://www.itjuzi.com/investevents?page=1430'
 50 | def loop():
 51 |     global download_url
 52 |     while download_url:
 53 |         html = download_page(download_url)
 54 |         date, link, company, field, place, round, amount, investor, download_url = parse_html(html)
 55 | 
 56 |         date_list.extend(date)
 57 |         link_list.extend(link)
 58 |         company_list.extend(company)
 59 |         field_list.extend(field)
 60 |         place_list.extend(place)
 61 |         round_list.extend(round)
 62 |         amount_list.extend(amount)
 63 |         investor_list.extend(investor)
 64 |         time.sleep(random.random() * 0.5)
 65 | 
 66 | #由于截止目前共有1534页，数量太多，所以提取前10页
 67 | def main():
 68 |     start = datetime.datetime.now()
 69 |     try:
 70 |         loop()
 71 |     except Exception as e:
 72 |         print ('Next url is: {}, second chance.'.format(download_url))
 73 |         try:
 74 |             loop()
 75 |         except Exception as e:
 76 |             print ('Next url is: {}, last chance.'.format(download_url))
 77 |             try:
 78 |                 loop()
 79 |             except Exception as e:
 80 |                 print ('Next url is: {}, stop here.'.format(download_url))
 81 |                 raise
 82 |             raise
 83 |         raise
 84 |     finally:
 85 |         d = dict(date=date_list, link=link_list, company=company_list, field=field_list, place=place_list,
 86 |                  round=round_list, amount=amount_list, investor=investor_list)
 87 |         df = DataFrame(d, columns=['date', 'link', 'company', 'field', 'place', 'round', 'amount', 'investor'])
 88 |         df.to_csv(r'C:\Users\zluck\Documents\Python\web crawler\full_invest_4.csv', encoding='gb18030')
 89 |         end = datetime.datetime.now()
 90 |         timespan = end - start
 91 |         print ('Done! The process costs {}.'.format(timespan))
 92 | 
 93 | if __name__ == '__main__':
 94 |     main()
 95 | 
 96 | 
 97 | 
 98 | 
 99 | 
100 | 
101 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # ITjuzi
  2 | 
  3 | IT桔子投资事件分析
  4 | ---------------
  5 | 
  6 | 学习了Python爬虫和pandas数据分析，可以利用这些技术来做一些有趣的事，今天要做的就是对[IT桔子](https://www.itjuzi.com/)上的投资事件进行统计分析
  7 | 
  8 | 首先从该网站爬取1999年至今的所有投资事件信息，爬虫代码见`itjuzi.py`，由于爬取数据的过程中出现了几次中断（不知道什么原因，可能是爬取太频繁），所以数据存储在好几个csv文件中，爬完之后对数据进行了整合，放在`invest_event.csv`中，爬取的投资事件共15304条。
  9 | 
 10 | 我们看一下能从这些数据中发现什么
 11 | 
 12 | **最活跃的10家投资机构**
 13 | 
 14 | - IDG资本         489    
 15 | - 经纬中国          408   
 16 | - 红杉资本中国        371   
 17 | - 真格基金          345	
 18 | - 创新工场          262	
 19 | - 险峰长青(险峰华兴)    242	
 20 | - 腾讯            224	
 21 | - PreAngel      196	
 22 | - 达晨创投          163	
 23 | - 晨兴资本          147	
 24 | 
 25 | 
 26 | ![](wordcloud.png)
 27 | 
 28 | **最早的一笔投资**
 29 | 
 30 | date                | link                                      | company       | field    | place | round     | amount     | investor                                                 |
 31 | ---|---|---|---|---|---|---|---
 32 | 1999-06-07 | https://www.itjuzi.com/investevents/10241 | 搜房网-房天下 | 房产服务 | 北京  | A轮       | 数百万美元 | IDG资本  
 33 | 
 34 | **融资次数前20的创业公司都有哪些**
 35 | 
 36 | | date                | company            | count(company) | field    | round      |
 37 |  ---------------------|--------------------|----------------|----------|------------
 38 | | 2016-09-09 00:00:00 | 滴滴出行           |             13 | 汽车交通 | 战略投资   |
 39 | | 2016-01-23 00:00:00 | 途牛旅游网         |             10 | 旅游     | IPO上市后  |
 40 | | 2014-10-03 00:00:00 | 58同城             |              9 | 本地生活 | IPO上市后  |
 41 | | 2015-11-09 00:00:00 | 酒仙网             |              8 | 电子商务 | 新三板     |
 42 | | 2016-04-13 00:00:00 | 饿了么             |              8 | 本地生活 | F轮-上市前 |
 43 | | 2016-02-04 00:00:00 | 驴妈妈旅游网       |              8 | 旅游     | 战略投资   |
 44 | | 2014-12-11 00:00:00 | PPTV聚力传媒       |              8 | 文化娱乐 | 战略投资   |
 45 | | 2016-08-19 00:00:00 | 京东               |              8 | 电子商务 | IPO上市后  |
 46 | | 2015-07-21 00:00:00 | 一嗨租车           |              8 | 汽车交通 | 战略投资   |
 47 | | 2014-11-12 00:00:00 | 优酷土豆-合一集团  |              8 | 文化娱乐 | IPO上市后  |
 48 | | 2016-04-20 00:00:00 | 美图秀秀           |              7 | 文化娱乐 | D轮        |
 49 | | 2014-03-01 00:00:00 | AeroHive艾诺威     |              7 | 硬件     | IPO上市    |
 50 | | 2014-02-01 00:00:00 | VANCL凡客诚品      |              7 | 电子商务 | F轮-上市前 |
 51 | | 2016-03-14 00:00:00 | 百姓网             |              7 | 本地生活 | 新三板     |
 52 | | 2015-08-15 00:00:00 | 明星衣橱           |              7 | 电子商务 | D轮        |
 53 | | 2014-12-11 00:00:00 | 陌陌               |              7 | 社交网络 | IPO上市    |
 54 | | 2016-09-08 00:00:00 | 百世物流(百世汇通) |              7 | 物流     | 不明确     |
 55 | | 2013-12-01 00:00:00 | UC优视科技         |              7 | 工具软件 | F轮-上市前 |
 56 | | 2015-10-21 00:00:00 | 大姨吗             |              7 | 医疗健康 | E轮        |
 57 | | 2016-07-07 00:00:00 | 趣分期(趣店集团)   |              7 | 金融     | F轮-上市前 |
 58 | 
 59 | 
 60 | **创业公司所在地分布情况**
 61 | 
 62 | | place  | count(place) |
 63 |  --------|--------------
 64 | | 北京   |         6639 |
 65 | | 上海   |         3124 |
 66 | | 广东   |         2280 |
 67 | | 浙江   |         1143 |
 68 | | 江苏   |          520 |
 69 | | 四川   |          357 |
 70 | | 福建   |          258 |
 71 | | 湖北   |          199 |
 72 | | 台湾   |          104 |
 73 | | 陕西   |           91 |
 74 | | 天津   |           88 |
 75 | | 重庆   |           78 |
 76 | | 山东   |           78 |
 77 | | 香港   |           76 |
 78 | | 湖南   |           56 |
 79 | | 河南   |           40 |
 80 | | 辽宁   |           36 |
 81 | | 安徽   |           34 |
 82 | | 海南   |           16 |
 83 | | 贵州   |           16 |
 84 | | 河北   |           14 |
 85 | | 内蒙古 |           11 |
 86 | | 广西   |           10 |
 87 | | 山西   |            7 |
 88 | | 黑龙江 |            6 |
 89 | | 江西   |            5 |
 90 | | 甘肃   |            4 |
 91 | | 云南   |            4 |
 92 | | 吉林   |            4 |
 93 | | 新疆   |            3 |
 94 | | 国外   |            1 |
 95 | | 宁夏   |            1 |
 96 | 
 97 | **创业公司最青睐的地方**
 98 | 
 99 | ![](Rplot.png)
100 | 
101 | 
102 | **创业公司所处的领域**
103 | 
104 | ![](field.jpg)
105 | 
106 | 
107 | 通过以上数据，我们发现获得融资次数最多的公司基本上都是互联网公司，创业者最喜欢去的地方是北京，其次是上海、广东、浙江。所以对于有志于创立一家公司的创业者，北京是首选地，其次你得是一个技术达人。
108 | 


--------------------------------------------------------------------------------