├── Rplot.png ├── field.jpg ├── wordcloud.png ├── full_invest.csv ├── full_invest_1.csv ├── full_invest_2.csv ├── full_invest_3.csv ├── full_invest_4.csv ├── invest_event.csv ├── data_analysis.R ├── data_analysis.py ├── itjuzi.py └── README.md /Rplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zluckyhou/itjuzi/HEAD/Rplot.png -------------------------------------------------------------------------------- /field.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zluckyhou/itjuzi/HEAD/field.jpg -------------------------------------------------------------------------------- /wordcloud.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zluckyhou/itjuzi/HEAD/wordcloud.png -------------------------------------------------------------------------------- /full_invest.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zluckyhou/itjuzi/HEAD/full_invest.csv -------------------------------------------------------------------------------- /full_invest_1.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zluckyhou/itjuzi/HEAD/full_invest_1.csv -------------------------------------------------------------------------------- /full_invest_2.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zluckyhou/itjuzi/HEAD/full_invest_2.csv -------------------------------------------------------------------------------- /full_invest_3.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zluckyhou/itjuzi/HEAD/full_invest_3.csv -------------------------------------------------------------------------------- /full_invest_4.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zluckyhou/itjuzi/HEAD/full_invest_4.csv -------------------------------------------------------------------------------- /invest_event.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zluckyhou/itjuzi/HEAD/invest_event.csv -------------------------------------------------------------------------------- /data_analysis.R: -------------------------------------------------------------------------------- 1 | #投资事件分析 2 | 3 | invest_event = read.csv('C:\\Users\\zluck\\Documents\\统计\\IT橘子投资事件分析\\invest_event.csv',header = TRUE,as.is = T) 4 | head(invest_event) 5 | 6 | # 中国市场上最活跃的投资方都有哪些 7 | investor = invest_event$investor 8 | inv = NULL 9 | for (i in investor){ 10 | inv = c(inv,strsplit(i,',')[[1]]) 11 | } 12 | 13 | inv_freq = as.data.frame(sort(table(inv),decreasing = T)[-1]) 14 | head(inv_freq,11) 15 | barplot(sort(table(inv),decreasing = T)[2:11],col = 'red') 16 | 17 | # 腾讯作为投资方竟然排在第7,我们看一下腾讯的投资都有哪些,第一笔投资时什么时候 18 | inv_tx = invest_event %>% filter(investor == '腾讯') 19 | inv_tx[1,] 20 | inv_tx[dim(inv_tx)[1],] 21 | 22 | #排名前10的创业首选地方 23 | loc = invest_event %>% group_by(place) %>% summarise(cnt = n()) %>% arrange(desc(cnt)) %>% head(10) 24 | loc 25 | barplot(sort(table(invest_event$place),decreasing = T)[1:10],col = 'orange') 26 | 27 | # 哪些创业企业获得了成功 28 | invest_event %>% group_by(company) %>% summarise(cnt = n()) %>% arrange(desc(cnt)) 29 | # 获得投资次数最多的20家公司 30 | sus_cp = invest_event %>% group_by(company) %>% summarise(cnt = n()) %>% arrange(desc(cnt)) %>% head(20) 31 | 32 | -------------------------------------------------------------------------------- /data_analysis.py: -------------------------------------------------------------------------------- 1 | # coding = utf-8 2 | import os 3 | import pandas as pd 4 | from pandas import Series,DataFrame 5 | from dateutil.parser import parse 6 | import pymysql 7 | 8 | 9 | from wordcloud import WordCloud, STOPWORDS 10 | import matplotlib.pyplot as plt 11 | 12 | def merge_file(path): 13 | fls = os.listdir(path) 14 | ls_csv = [i for i in fls if os.path.splitext(i)[1] == '.csv'] 15 | invest_ls = [] 16 | for i in ls_csv: 17 | invest_ls.append(pd.read_csv(path+r'\\'+i,index_col=0,usecols=range(1,9),encoding='gb18030')) 18 | invest_info = pd.concat(invest_ls,axis=0) 19 | return invest_info 20 | path = r'C:\Users\zluck\Documents\GitHub\itjuzi' 21 | invest_info = merge_file(path) 22 | 23 | 24 | #日期存在一些错误值,首先把这些错误值筛选出来,然后将错误日期解析为该日期的前一天 25 | 26 | dates = [] 27 | for i in range(len(invest_info)): 28 | try: 29 | dates.append(parse(invest_info.index[i])) 30 | except Exception as e: 31 | print (invest_info.index[i]) 32 | try: 33 | dates.append(parse(invest_info.index[i+1])) 34 | print (parse(invest_info.index[i+1])) 35 | except Exception as e: 36 | dates.append(parse(invest_info.index[i+2])) 37 | print (parse(invest_info.index[i+2])) 38 | 39 | # 将索引修改为解析后的日期格式 40 | invest_info.index = dates 41 | invest_info.index.name = 'date' 42 | invest_info.to_csv(path + '\\'+ 'invest_event.csv') 43 | 44 | # 将投资事件信息存储到mysql中 45 | conn = pymysql.connect(host = 'localhost',port = 3306,user = 'root',passwd = 'helloworld',db = 'python',charset = 'utf8') 46 | cur = conn.cursor() 47 | #mysql需要设置编码才能存储中文 ALTER DATABASE python CHARACTER SET = utf8mb4 COLLATE = utf8mb4_unicode_ci; 48 | # 参见http://stackoverflow.com/questions/34305587/uploading-python-pandas-dataframe-to-mysql-internalerror-1366-incorrect-str 49 | # 由于investor列的有些值长度比较长,而mysql默认为varchar(63),所以还需要修改字段长度 50 | # alter table invest_event modify investor varchar(255) 51 | invest_info.to_sql(name = 'invest_event',con = conn,if_exists = 'replace',flavor = 'mysql',dtype = {'investor':'varchar(255)'}) 52 | 53 | invest_info = pd.read_csv(r'C:\Users\zluck\Documents\GitHub\itjuzi\invest_event.csv',encoding='gb18030',index_col=0) 54 | # 最活跃的投资机构都有哪些 55 | investors = [] 56 | for i in invest_info.investor: 57 | if pd.notnull(i): 58 | investors.extend(i.split(',')) 59 | else: 60 | print (invest_info[invest_info.investor.isnull()]) 61 | 62 | active_investors = Series(investors).value_counts() 63 | 64 | # 构造投资机构云图 65 | 66 | ls = [] 67 | for i in zip(active_investors.index,active_investors.values): 68 | ls.append(i) 69 | 70 | wc = WordCloud(font_path = r'C:\Windows\Fonts\simkai.ttf') #wordcloud默认字体为DroidSansMono,如果要支持中文,需要设置字体 71 | wc.generate_from_frequencies(ls[1:]) # 投资方未透露的投资事件最多,所以去掉第一个 72 | 73 | plt.imshow(wc) 74 | plt.axis('off') 75 | plt.show() 76 | 77 | # 最早的一笔投资是什么时候 78 | invest_info.iloc[-1,] 79 | 80 | # 获得融资最多的创业公司 81 | 82 | cnt = invest_info.groupby('company').size().sort_values(ascending=False) 83 | invest_info['cnt'] = invest_info.company.map(cnt) 84 | 85 | invest_info[invest_info.cnt >= 7 ].sort_values(by = ['cnt','company'],ascending=False) 86 | 87 | invest_info 88 | -------------------------------------------------------------------------------- /itjuzi.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import requests 3 | from bs4 import BeautifulSoup 4 | from pandas import DataFrame 5 | import datetime,time,random 6 | import codecs 7 | 8 | def download_page(download_url): 9 | headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'} 10 | html = requests.get(download_url,headers=headers).content.decode('utf-8') 11 | return html 12 | 13 | 14 | date_list, link_list, company_list, field_list,place_list,round_list, amount_list, investor_list = [], [], [], [], [], [],[],[] 15 | 16 | 17 | def parse_html(html): 18 | date, link, company, field, place, round, amount, investor = [], [], [], [], [], [], [], [] 19 | body = BeautifulSoup(html,'lxml') 20 | eventset = body.find_all('ul',attrs={'class':'list-main-eventset'})[1] 21 | for event in eventset.find_all('li'): 22 | t = event.find('i',attrs={'class':'cell round'}).find('span').get_text().strip() 23 | date.append(t) 24 | amount.append(event.find('i',attrs = {'class':'cell fina'}).get_text().strip()) 25 | a = event.find_all('a') 26 | link.append(a[0]['href'].strip()) 27 | company.append(a[1].get_text().strip()) 28 | field.append(a[2].get_text().strip()) 29 | place.append(a[3].get_text().strip()) 30 | round.append(a[4].get_text().strip()) 31 | 32 | #investor 可能有多个,对应多个链接,也可能是‘投资方未透露’,需要处理这两种情况 33 | if len(a) >= 6: 34 | inv = ','.join([i.get_text().strip() for i in a[5:]]) 35 | else: 36 | span = event.find('span',attrs = {'class':'investorset'}).find_all('span') 37 | inv = ','.join([i.get_text().strip() for i in span]) 38 | investor.append(inv) 39 | 40 | page_url = body.find('div',attrs={'class':'ui-pagechange for-sec-bottom'}).find_all('a') 41 | if page_url[-1]['href']: 42 | download_url = page_url[-2]['href'] 43 | else: 44 | download_url = False 45 | 46 | return date,link,company,field,place,round,amount,investor,download_url 47 | 48 | 49 | download_url = 'https://www.itjuzi.com/investevents?page=1430' 50 | def loop(): 51 | global download_url 52 | while download_url: 53 | html = download_page(download_url) 54 | date, link, company, field, place, round, amount, investor, download_url = parse_html(html) 55 | 56 | date_list.extend(date) 57 | link_list.extend(link) 58 | company_list.extend(company) 59 | field_list.extend(field) 60 | place_list.extend(place) 61 | round_list.extend(round) 62 | amount_list.extend(amount) 63 | investor_list.extend(investor) 64 | time.sleep(random.random() * 0.5) 65 | 66 | #由于截止目前共有1534页,数量太多,所以提取前10页 67 | def main(): 68 | start = datetime.datetime.now() 69 | try: 70 | loop() 71 | except Exception as e: 72 | print ('Next url is: {}, second chance.'.format(download_url)) 73 | try: 74 | loop() 75 | except Exception as e: 76 | print ('Next url is: {}, last chance.'.format(download_url)) 77 | try: 78 | loop() 79 | except Exception as e: 80 | print ('Next url is: {}, stop here.'.format(download_url)) 81 | raise 82 | raise 83 | raise 84 | finally: 85 | d = dict(date=date_list, link=link_list, company=company_list, field=field_list, place=place_list, 86 | round=round_list, amount=amount_list, investor=investor_list) 87 | df = DataFrame(d, columns=['date', 'link', 'company', 'field', 'place', 'round', 'amount', 'investor']) 88 | df.to_csv(r'C:\Users\zluck\Documents\Python\web crawler\full_invest_4.csv', encoding='gb18030') 89 | end = datetime.datetime.now() 90 | timespan = end - start 91 | print ('Done! The process costs {}.'.format(timespan)) 92 | 93 | if __name__ == '__main__': 94 | main() 95 | 96 | 97 | 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ITjuzi 2 | 3 | IT桔子投资事件分析 4 | --------------- 5 | 6 | 学习了Python爬虫和pandas数据分析,可以利用这些技术来做一些有趣的事,今天要做的就是对[IT桔子](https://www.itjuzi.com/)上的投资事件进行统计分析 7 | 8 | 首先从该网站爬取1999年至今的所有投资事件信息,爬虫代码见`itjuzi.py`,由于爬取数据的过程中出现了几次中断(不知道什么原因,可能是爬取太频繁),所以数据存储在好几个csv文件中,爬完之后对数据进行了整合,放在`invest_event.csv`中,爬取的投资事件共15304条。 9 | 10 | 我们看一下能从这些数据中发现什么 11 | 12 | **最活跃的10家投资机构** 13 | 14 | - IDG资本 489 15 | - 经纬中国 408 16 | - 红杉资本中国 371 17 | - 真格基金 345 18 | - 创新工场 262 19 | - 险峰长青(险峰华兴) 242 20 | - 腾讯 224 21 | - PreAngel 196 22 | - 达晨创投 163 23 | - 晨兴资本 147 24 | 25 | 26 | ![](wordcloud.png) 27 | 28 | **最早的一笔投资** 29 | 30 | date | link | company | field | place | round | amount | investor | 31 | ---|---|---|---|---|---|---|--- 32 | 1999-06-07 | https://www.itjuzi.com/investevents/10241 | 搜房网-房天下 | 房产服务 | 北京 | A轮 | 数百万美元 | IDG资本 33 | 34 | **融资轮次前20的创业公司都有哪些** 35 | 36 | | date | company | count(company) | field | round | 37 | ---------------------|--------------------|----------------|----------|------------ 38 | | 2016-09-09 00:00:00 | 滴滴出行 | 13 | 汽车交通 | 战略投资 | 39 | | 2016-01-23 00:00:00 | 途牛旅游网 | 10 | 旅游 | IPO上市后 | 40 | | 2014-10-03 00:00:00 | 58同城 | 9 | 本地生活 | IPO上市后 | 41 | | 2015-11-09 00:00:00 | 酒仙网 | 8 | 电子商务 | 新三板 | 42 | | 2016-04-13 00:00:00 | 饿了么 | 8 | 本地生活 | F轮-上市前 | 43 | | 2016-02-04 00:00:00 | 驴妈妈旅游网 | 8 | 旅游 | 战略投资 | 44 | | 2014-12-11 00:00:00 | PPTV聚力传媒 | 8 | 文化娱乐 | 战略投资 | 45 | | 2016-08-19 00:00:00 | 京东 | 8 | 电子商务 | IPO上市后 | 46 | | 2015-07-21 00:00:00 | 一嗨租车 | 8 | 汽车交通 | 战略投资 | 47 | | 2014-11-12 00:00:00 | 优酷土豆-合一集团 | 8 | 文化娱乐 | IPO上市后 | 48 | | 2016-04-20 00:00:00 | 美图秀秀 | 7 | 文化娱乐 | D轮 | 49 | | 2014-03-01 00:00:00 | AeroHive艾诺威 | 7 | 硬件 | IPO上市 | 50 | | 2014-02-01 00:00:00 | VANCL凡客诚品 | 7 | 电子商务 | F轮-上市前 | 51 | | 2016-03-14 00:00:00 | 百姓网 | 7 | 本地生活 | 新三板 | 52 | | 2015-08-15 00:00:00 | 明星衣橱 | 7 | 电子商务 | D轮 | 53 | | 2014-12-11 00:00:00 | 陌陌 | 7 | 社交网络 | IPO上市 | 54 | | 2016-09-08 00:00:00 | 百世物流(百世汇通) | 7 | 物流 | 不明确 | 55 | | 2013-12-01 00:00:00 | UC优视科技 | 7 | 工具软件 | F轮-上市前 | 56 | | 2015-10-21 00:00:00 | 大姨吗 | 7 | 医疗健康 | E轮 | 57 | | 2016-07-07 00:00:00 | 趣分期(趣店集团) | 7 | 金融 | F轮-上市前 | 58 | 59 | 60 | **创业公司所在地分布情况** 61 | 62 | | place | count(place) | 63 | --------|-------------- 64 | | 北京 | 6639 | 65 | | 上海 | 3124 | 66 | | 广东 | 2280 | 67 | | 浙江 | 1143 | 68 | | 江苏 | 520 | 69 | | 四川 | 357 | 70 | | 福建 | 258 | 71 | | 湖北 | 199 | 72 | | 台湾 | 104 | 73 | | 陕西 | 91 | 74 | | 天津 | 88 | 75 | | 重庆 | 78 | 76 | | 山东 | 78 | 77 | | 香港 | 76 | 78 | | 湖南 | 56 | 79 | | 河南 | 40 | 80 | | 辽宁 | 36 | 81 | | 安徽 | 34 | 82 | | 海南 | 16 | 83 | | 贵州 | 16 | 84 | | 河北 | 14 | 85 | | 内蒙古 | 11 | 86 | | 广西 | 10 | 87 | | 山西 | 7 | 88 | | 黑龙江 | 6 | 89 | | 江西 | 5 | 90 | | 甘肃 | 4 | 91 | | 云南 | 4 | 92 | | 吉林 | 4 | 93 | | 新疆 | 3 | 94 | | 国外 | 1 | 95 | | 宁夏 | 1 | 96 | 97 | **创业公司最青睐的地方** 98 | 99 | ![](Rplot.png) 100 | 101 | 102 | **创业公司所处的领域** 103 | 104 | ![](field.jpg) 105 | 106 | 107 | 通过以上数据,我们发现获得融资次数最多的公司基本上都是互联网公司,创业者最喜欢去的地方是北京,其次是上海、广东、浙江。所以对于有志于创立一家公司的创业者,北京是首选地,其次你得是一个技术达人。 108 | --------------------------------------------------------------------------------