├── README.md ├── spark_script ├── utils.py ├── analyze_active_user.py ├── extract_ai_question.py ├── analyze_user.py ├── parse_basic_info.py ├── analyze_answers.py └── analyze_question.py ├── LICENSE ├── .gitignore └── analyze_program_lang.ipynb /README.md: -------------------------------------------------------------------------------- 1 | ### 爬去一年多知乎的数据,一共爬去了一亿多条回答。最近拿来做点数据统计。 2 | 3 | 数据正在统计处理中,如有感兴趣的内容欢迎 issue 提出 4 | 5 | 6 | 7 | - [ **关于回答数分布统计**](analyze_answer_time_series.ipynb) 8 | 9 | - [ **关于问题分词统计**](analyze_question_title_words.ipynb) 10 | 11 | - [ **关于问题编程语言统计**](analyze_program_lang.ipynb) 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /spark_script/utils.py: -------------------------------------------------------------------------------- 1 | # coding=u8 2 | import pandas as pd 3 | 4 | 5 | def _map_to_pandas(rdds): 6 | """ Needs to be here due to pickling issues """ 7 | return [pd.DataFrame(list(rdds))] 8 | 9 | def toPandas(df, n_partitions=None): 10 | """ 11 | Returns the contents of `df` as a local `pandas.DataFrame` in a speedy fashion. The DataFrame is 12 | repartitioned if `n_partitions` is passed. 
13 | :param df: pyspark.sql.DataFrame 14 | :param n_partitions: int or None 15 | :return: pandas.DataFrame 16 | """ 17 | if n_partitions is not None: df = df.repartition(n_partitions) 18 | df_pand = df.rdd.mapPartitions(_map_to_pandas).collect() 19 | df_pand = pd.concat(df_pand) 20 | df_pand.columns = df.columns 21 | return df_pand 22 | 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 mikolaj 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /spark_script/analyze_active_user.py: -------------------------------------------------------------------------------- 1 | # coding=u8 2 | import json 3 | import operator 4 | import pyspark 5 | from pyspark import SparkContext, SparkConf 6 | from pyspark.sql.functions import udf, col, lit, split, collect_list, struct 7 | import pyspark.sql.functions as F 8 | from pyspark.sql.types import * 9 | import datetime 10 | import sys 11 | reload(sys) 12 | sys.setdefaultencoding('u8') 13 | 14 | 15 | 16 | def main(): 17 | print(spark.sparkContext.getConf().getAll()) 18 | spark.sparkContext.setLogLevel('WARN') 19 | 20 | column_list = ['answer_id', 'answer_created', 'answer_updated', 'author_id', 'author_name', 'author_url_token', 'badge_num',\ 21 | 'can_comment', 'comment_count', 'gender', 'insert_time', 'last_update_time', 'question_created', 'question_id',\ 22 | 'reward_member_count', 'reward_total_money', 'voteup_count'] 23 | 24 | df = spark.read.parquet("hdfs://device1/zhihu/users/max_answer_created") 25 | df.createOrReplaceTempView("table1") 26 | 27 | df = spark.sql("select * from table1 where max_answer_created >= '2018-01-01'") 28 | """ 29 | df = df.select(['author_id', 'author_url_token', 'answer_created']) 30 | df = df.withColumn('count',lit(1)) 31 | df = df.groupby(['author_id', 'author_url_token']).agg({'answer_created': 'max', 'count': 'sum'}) 32 | df = df.withColumnRenamed('max(answer_created)', 'max_answer_created').withColumnRenamed('sum(count)', 'sum_count') 33 | 34 | df = df.cache() 35 | 36 | my_udf = udf(analyze_url_token, ArrayType(StringType())) 37 | df = df.groupby(['author_id']).agg(collect_list(struct("max_answer_created", "author_url_token", "sum_count")).alias('struct')) \ 38 | .withColumn('struct', my_udf('struct')) 39 | df = df.cache() # 多用cache以减少重复计算 40 | df = df.withColumn('url_token', col('struct').getItem(0)).withColumn('old_url_token', 
def is_ai_title(title):
    """Return True when a question title looks AI-related.

    A title qualifies if it contains the standalone (word-bounded) token
    "ai" case-insensitively, or any of a fixed set of Chinese AI keywords.
    """
    # \b keeps words like "openai"/"air" from matching; note that CJK
    # characters count as word characters, so "ai" glued to hanzi does
    # not match either (same as the original behavior).
    if re.search(r'\bai\b', title.lower()):
        return True
    keywords = ('人工智能', '机器学习', '神经网络', '自动驾驶')
    return any(keyword in title for keyword in keywords)
81 | conf.set('spark.sql.warehouse.dir', 'hdfs://device1/user/dennis/spark/') 82 | conf.set('spark.executor.cores', '6') 83 | conf.set('spark.executor.instances', '6') 84 | 85 | 86 | sc = pyspark.SparkContext(conf=conf) 87 | spark = pyspark.sql.SparkSession.builder \ 88 | .config(conf=conf) \ 89 | .getOrCreate() 90 | spark.sparkContext.setLogLevel('INFO') 91 | parse_question() 92 | 93 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.txt 2 | *.csv 3 | *.xlsx 4 | *.xls 5 | *.ttf 6 | 7 | setting.py 8 | 9 | # Logs 10 | logs 11 | *.log 12 | npm-debug.log* 13 | yarn-debug.log* 14 | yarn-error.log* 15 | 16 | # Runtime data 17 | pids 18 | *.pid 19 | *.seed 20 | *.pid.lock 21 | 22 | # Directory for instrumented libs generated by jscoverage/JSCover 23 | lib-cov 24 | 25 | # Coverage directory used by tools like istanbul 26 | coverage 27 | 28 | # nyc test coverage 29 | .nyc_output 30 | 31 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 32 | .grunt 33 | 34 | # Bower dependency directory (https://bower.io/) 35 | bower_components 36 | 37 | # node-waf configuration 38 | .lock-wscript 39 | 40 | # Compiled binary addons (https://nodejs.org/api/addons.html) 41 | build/Release 42 | 43 | # Dependency directories 44 | node_modules/ 45 | jspm_packages/ 46 | 47 | # TypeScript v1 declaration files 48 | typings/ 49 | 50 | # Optional npm cache directory 51 | .npm 52 | 53 | # Optional eslint cache 54 | .eslintcache 55 | 56 | # Optional REPL history 57 | .node_repl_history 58 | 59 | # Output of 'npm pack' 60 | *.tgz 61 | 62 | # Yarn Integrity file 63 | .yarn-integrity 64 | 65 | # dotenv environment variables file 66 | .env 67 | 68 | # next.js build output 69 | .next 70 | 71 | # Byte-compiled / optimized / DLL files 72 | __pycache__/ 73 | *.py[cod] 74 | *$py.class 75 | 76 | # C extensions 77 | *.so 78 | 79 | # Distribution 
/ packaging 80 | .Python 81 | build/ 82 | develop-eggs/ 83 | dist/ 84 | downloads/ 85 | eggs/ 86 | .eggs/ 87 | lib/ 88 | lib64/ 89 | parts/ 90 | sdist/ 91 | var/ 92 | wheels/ 93 | pip-wheel-metadata/ 94 | share/python-wheels/ 95 | *.egg-info/ 96 | .installed.cfg 97 | *.egg 98 | MANIFEST 99 | 100 | # PyInstaller 101 | # Usually these files are written by a python script from a template 102 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 103 | *.manifest 104 | *.spec 105 | 106 | # Installer logs 107 | pip-log.txt 108 | pip-delete-this-directory.txt 109 | 110 | # Unit test / coverage reports 111 | htmlcov/ 112 | .tox/ 113 | .nox/ 114 | .coverage 115 | .coverage.* 116 | .cache 117 | nosetests.xml 118 | coverage.xml 119 | *.cover 120 | .hypothesis/ 121 | .pytest_cache/ 122 | 123 | # Translations 124 | *.mo 125 | *.pot 126 | 127 | # Django stuff: 128 | *.log 129 | local_settings.py 130 | db.sqlite3 131 | 132 | # Flask stuff: 133 | instance/ 134 | .webassets-cache 135 | 136 | # Scrapy stuff: 137 | .scrapy 138 | 139 | # Sphinx documentation 140 | docs/_build/ 141 | 142 | # PyBuilder 143 | target/ 144 | 145 | # Jupyter Notebook 146 | .ipynb_checkpoints 147 | 148 | # IPython 149 | profile_default/ 150 | ipython_config.py 151 | 152 | # pyenv 153 | .python-version 154 | 155 | # pipenv 156 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 157 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 158 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 159 | # install all needed dependencies. 
def analyze_url_token(data_list):
    """Merge duplicate url_token rows collected for a single author_id.

    Each element of `data_list` carries (author_url_token,
    max_answer_created, sum_count).  Returns a list of four strings:
    [current_token, previous_token, total_answer_count, latest_date],
    where previous_token is '' when the author never changed url_token.
    """
    latest_by_token = {}
    total = 0
    for row in data_list:
        latest_by_token[row.author_url_token] = row.max_answer_created
        total += row.sum_count

    if len(latest_by_token) > 1:
        # url_token was changed at least once: the token with the most
        # recent activity is the current handle, the stalest one is kept
        # as the author's previous handle.
        current = max(latest_by_token, key=latest_by_token.get)
        previous = min(latest_by_token, key=latest_by_token.get)
    else:
        current, = latest_by_token
        previous = ''

    latest_date = max(latest_by_token.values())
    return [current, previous, str(total), latest_date.strip()]
'sum_count', 'max_answer_created']) 72 | df = df.filter("count >=5").filter("`max_answer_created` >= '2018-01-01'").filter("author_id != '0'").orderBy('count', ascending=False) 73 | print(df.show(50, False)) 74 | 75 | url_token_list = df.select('url_token').collect() 76 | 77 | print(len(url_token_list)) 78 | with open('active_url_token.txt', 'w') as f: 79 | for row in url_token_list: 80 | f.write(row.url_token + '\n') 81 | 82 | """ 83 | df.createOrReplaceTempView("table1") 84 | df = spark.sql("select * from table1 where answer_created >= '2018-01-01'") 85 | df_group = df.groupby(['author_id']).agg(F.sum('count').alias('count')) 86 | df_group.cache() 87 | 88 | df_filter = df_group.filter("`count` >= 5").orderBy('count', ascending=False) 89 | 90 | df_filter.cache() 91 | print(df_filter.show(50, False)) 92 | print(df_filter.count()) 93 | """ 94 | 95 | if __name__ == '__main__': 96 | conf = SparkConf().setMaster("local[*]").setAppName("zhihu_parse_basic_info") 97 | #conf.set("spark.driver.memory","30g") # 要写对参数不然不会生效,之前写的是driver-memory ;注意:一定要在命令行中设置参数--driver-memory 不然无效! 
def map_line(line):
    """Parse one JSON answer record into a flat 18-field tuple matching
    the parquet schema built in main().

    Two dump formats exist: records exported from elasticsearch (from
    20180123 on) wrap every timestamp as {'$date': 'ISO...'}, while the
    earlier dumps store plain strings.  The answer_created field decides
    which format the whole record uses.
    """
    rec = json.loads(line)

    if '$date' in rec['answer_created']:
        def _ts(key):
            # keep only 'YYYY-MM-DDTHH:MM:SS', dropping millis/timezone
            return rec[key]['$date'][:19]
    else:
        def _ts(key):
            return rec[key]

    return (rec['answer_id'], rec['answer_content'],
            _ts('answer_created'), _ts('answer_updated'),
            rec['author_id'], rec['author_name'], rec['author_url_token'],
            rec['badge_num'], rec['can_comment'], rec['comment_count'],
            rec['gender'], _ts('insert_time'), _ts('question_created'),
            rec['question_id'], rec['question_title'],
            rec['reward_member_count'], rec['reward_total_money'],
            rec['voteup_count'])
StringType(), True), 81 | StructField('reward_member_count', IntegerType(), True), 82 | StructField('reward_total_money', IntegerType(), True), 83 | StructField('voteup_count', IntegerType(), True)]) 84 | 85 | #df = rdd.toDF(column_list) 86 | df = spark.createDataFrame(rdd, schema) 87 | 88 | print(df.printSchema()) 89 | #print(df.show(30, False)) 90 | #df.write.mode('overwrite').json('hdfs://device1/zhihu/basic_info2/%s/'%file_name) 91 | 92 | df.write.mode('overwrite').parquet('hdfs://device1/zhihu/basic_info/%s/'%file_name) 93 | 94 | if __name__ == '__main__': 95 | conf = SparkConf().setMaster("local[*]").setAppName("zhihu_parse_basic_info") 96 | #conf.set("spark.driver.memory","30g") # 要写对参数不然不会生效,之前写的是driver-memory ;注意:一定要在命令行中设置参数--driver-memory 不然无效! 97 | conf.set('spark.sql.warehouse.dir', 'hdfs://device1/user/dennis/spark/') 98 | conf.set('spark.executor.cores', '6') 99 | conf.set('spark.executor.instances', '6') 100 | 101 | sc = pyspark.SparkContext(conf=conf) 102 | spark = pyspark.sql.SparkSession.builder \ 103 | .config(conf=conf) \ 104 | .getOrCreate() 105 | spark.sparkContext.setLogLevel('WARN') 106 | 107 | 108 | 109 | if len(sys.argv) < 2 : 110 | for file_ in os.listdir("/data/zhihu/zhihu2/"): 111 | if file_.endswith(".json"): 112 | file_name = file_.split('.')[0] 113 | #path = os.path.join("/data/zhihu/zhihu2/", file) 114 | #if '2018110' in file_name: 115 | main(file_name) 116 | 117 | sys.exit() 118 | else: 119 | file_name = sys.argv[1] 120 | main(file_name) 121 | 122 | 123 | -------------------------------------------------------------------------------- /spark_script/analyze_answers.py: -------------------------------------------------------------------------------- 1 | # coding=u8 2 | from utils import toPandas 3 | import json 4 | import pyspark 5 | from pyspark import SparkContext, SparkConf 6 | from pyspark.sql.functions import udf, col, lit, lower 7 | import pyspark.sql.functions as F 8 | from pyspark.sql.types import * 9 | import datetime 10 | 
def split_date(_datetime):
    """Explode an ISO 'YYYY-MM-DDTHH:MM:SS' string into components.

    Returns the tuple (year, month, day, hour, minute, second, weekday,
    'YYYY-MM-DD'), where weekday is Monday=0 per datetime.weekday().
    """
    parsed = datetime.datetime.strptime(_datetime, '%Y-%m-%dT%H:%M:%S')
    return (parsed.year, parsed.month, parsed.day,
            parsed.hour, parsed.minute, parsed.second,
            parsed.weekday(), str(parsed.date()))
'second', 'weekday', 'date', 'count']) 57 | df = df.groupby(['year', 'month', 'day', 'hour', 'date', 'weekday']).agg(F.sum('count').alias('count')) 58 | 59 | df.write.mode('overwrite').parquet('hdfs://device1/zhihu/answers_datetime_scatter/') 60 | 61 | def parse_data2(): 62 | df = spark.read.parquet("hdfs://device1/zhihu/basic_info/*") 63 | #df = spark.read.json("hdfs://device1/zhihu/basic_info2/*") 64 | df = df.select(['answer_id', 'gender', 'author_id', 'author_name', 'answer_created']) 65 | df = df.withColumn('count',lit(1)) 66 | 67 | my_udf = udf(split_date, ArrayType(StringType())) 68 | df = df.withColumn('new_date', my_udf('answer_created')) 69 | 70 | df = df.withColumn('year', col('new_date').getItem(0)).withColumn('month', col('new_date').getItem(1)) \ 71 | .withColumn('day', col('new_date').getItem(2)).withColumn('hour', col('new_date').getItem(3)) \ 72 | .withColumn('minute', col('new_date').getItem(4)).withColumn('second', col('new_date').getItem(5)) \ 73 | .withColumn('weekday', col('new_date').getItem(6)).withColumn('date', col('new_date').getItem(7)) 74 | 75 | df = df.select(['author_id', 'gender', 'author_name', 'year', 'month', 'day', 'hour', 'minute', 'second', 'weekday', 'date', 'count']) 76 | df = df.persist() 77 | df = df.groupby(['author_id', 'gender', 'author_name', 'year', 'month', 'day', 'hour', 'date', 'weekday']).agg(F.sum('count').alias('count')) 78 | df.write.mode('overwrite').json('hdfs://device1/zhihu/gender_name/') 79 | #df_pandas.to_csv('./gender_name.csv') 80 | 81 | def parse_data3(): 82 | df = spark.read.parquet("hdfs://device1/zhihu/basic_info/*") 83 | #df = spark.read.json("hdfs://device1/zhihu/basic_info2/*") 84 | df = df.select(['question_id', 'question_created', 'gender', 'author_id', 'answer_id', 'answer_created']) 85 | df = df.withColumn('count',lit(1)) 86 | df_group = df.groupby(['question_id']).agg(F.count('answer_id').alias('count')) 87 | df_group = df_group.cache() 88 | 89 | df_group = df_group.orderBy('count', 
ascending=False) 90 | print(df_group.show()) 91 | 92 | 93 | def analyse(): 94 | df = spark.read.parquet('hdfs://device1/zhihu/answers_datetime_scatter/') 95 | df_pandas = df.toPandas() 96 | df_pandas.to_csv('./datetime_distribution_answers.csv') 97 | 98 | 99 | def analyse2(): 100 | df = spark.read.json('hdfs://device1/zhihu/gender_name/') 101 | df = df.select(['author_id', 'gender', 'year', 'month', 'day', 'hour', 'weekday', 'date', 'count']) 102 | df = df.withColumn('count',lit(1)) 103 | print(df.show()) 104 | 105 | df_group = df.groupby(['gender', 'hour', 'weekday']).agg(F.sum('count').alias('count')) 106 | print(df_group.show()) 107 | df_pandas = df_group.toPandas() 108 | 109 | #df_pandas = toPandas(df, 10) 110 | df_pandas.to_csv('./gender_hour_count.csv') 111 | 112 | 113 | if __name__ == '__main__': 114 | conf = SparkConf().setMaster("local[*]").setAppName("zhihu_parse_basic_info") 115 | #conf.set("spark.driver.memory","30g") # 要写对参数不然不会生效,之前写的是driver-memory ;注意:一定要在命令行中设置参数--driver-memory 不然无效! 
116 | conf.set('spark.sql.warehouse.dir', 'hdfs://device1/user/dennis/spark/') 117 | conf.set('spark.executor.cores', '6') 118 | conf.set('spark.executor.instances', '6') 119 | conf.set('spark.driver.maxResultSize', '11g') 120 | conf.set('spark.network.timeout', '5800s') 121 | conf.set("spark.executor.heartbeatInterval","4800s") 122 | 123 | 124 | 125 | sc = pyspark.SparkContext(conf=conf) 126 | spark = pyspark.sql.SparkSession.builder \ 127 | .config(conf=conf) \ 128 | .getOrCreate() 129 | spark.sparkContext.setLogLevel('WARN') 130 | 131 | #parse_data() 132 | parse_data2() 133 | #parse_data3() 134 | #analyse() 135 | #analyse2() 136 | 137 | -------------------------------------------------------------------------------- /spark_script/analyze_question.py: -------------------------------------------------------------------------------- 1 | # coding=u8 2 | import json 3 | import re 4 | import pyspark 5 | from pyspark import SparkContext, SparkConf 6 | from pyspark.sql.functions import udf, col, lit, lower 7 | import pyspark.sql.functions as F 8 | from pyspark.sql.types import * 9 | import datetime 10 | import sys 11 | from setting import MYSQL_PASSWORD 12 | 13 | try: 14 | reload(sys) 15 | sys.setdefaultencoding('u8') 16 | except: 17 | pass 18 | 19 | 20 | def map_line(line): 21 | 22 | ret_tuple = answer_id, answer_created, answer_updated, author_id, author_name, author_url_token, badge_num, can_comment,\ 23 | comment_count, gender, insert_time, last_update_time, question_created, question_id,\ 24 | reward_member_count, reward_total_money, voteup_count 25 | 26 | return ret_tuple 27 | 28 | 29 | 30 | 31 | def analyse_question(): 32 | df = spark.read.csv('hdfs://device1/zhihu/question_count') 33 | print(df.show()) 34 | 35 | 36 | def extract_question_title(): 37 | """提取最新的question title 保存到MySQL中。""" 38 | df = spark.read.parquet("hdfs://device1/zhihu/basic_info/*") 39 | df = df.select(['question_id', 'question_title', 'answer_created', 'question_created']) 40 | 41 | df = 
df.groupby(['question_id', 'question_title', 'question_created']).agg(F.max('answer_created').alias('latest_answer_time')) 42 | df.write.format('jdbc').options( 43 | url='jdbc:mysql://localhost/zhihu?characterEncoding=utf8', 44 | driver='com.mysql.jdbc.Driver', 45 | dbtable='question_title', 46 | user='root', 47 | password=MYSQL_PASSWORD) \ 48 | .option("encoding", "UTF-8").mode('overwrite').save() 49 | 50 | print(df.show()) 51 | 52 | 53 | def analyze_question_answers_count(): 54 | 55 | column_list = ['answer_id', 'answer_created', 'answer_updated', 'author_id', 'author_name', 'author_url_token', 'badge_num',\ 56 | 'can_comment', 'comment_count', 'gender', 'insert_time', 'question_created', 'question_title', 'question_id',\ 57 | 'reward_member_count', 'reward_total_money', 'voteup_count'] 58 | 59 | df = spark.read.parquet("hdfs://device1/zhihu/basic_info/*") 60 | #df = spark.read.json("hdfs://device1/zhihu/basic_info2/*") 61 | df = df.select(['question_id', 'question_created']) 62 | df = df.withColumn('count',lit(1)) 63 | df.createOrReplaceTempView("table1") 64 | #df = spark.sql("select * from table1 where question_created >= '2018-01-01'") 65 | df_group = df.groupby('question_id').agg(F.sum('count').alias('count')) 66 | df_group.cache() 67 | 68 | df_filter = df_group.filter("`count` >= 30").orderBy('count', ascending=False) 69 | 70 | df_filter.cache() 71 | question_list = df_filter.select('question_id').collect() 72 | with open('active_questions.txt', 'w') as f: 73 | for row in question_list: 74 | f.write(str(row.question_id) + '\n') 75 | 76 | print(df_filter.show(50)) 77 | # print(df_filter.count()) 78 | 79 | 80 | def is_code_question(question_title): 81 | keywords = ['java', 'python', 'php', 'scala', 'c#', 'c++', 'html', 'c语言', 'node', 'go'] 82 | 83 | 84 | def is_city_question(question_title): 85 | cities = ['北京', '上海', '广州', '深圳', '杭州', '天津', '厦门', '成都', '西安', '重庆', '香港'] 86 | is_city = False 87 | 88 | for city in cities: 89 | if city in question_title: 90 | 
is_city = True 91 | 92 | return is_city 93 | 94 | 95 | def parse_question(): 96 | 97 | column_list = ['answer_id', 'answer_created', 'answer_updated', 'author_id', 'author_name', 'author_url_token', 'badge_num',\ 98 | 'can_comment', 'comment_count', 'gender', 'insert_time', 'question_created', 'question_title', 'question_id',\ 99 | 'reward_member_count', 'reward_total_money', 'voteup_count'] 100 | 101 | df = spark.read.json("hdfs://device1/zhihu/basic_info2/*") 102 | #df = spark.read.option("mergeSchema", False).parquet("hdfs://device1/zhihu/basic_info/2017*") 103 | 104 | df = df.select(['gender', 'question_id', 'question_created', 'question_title']) 105 | 106 | df = df.withColumn('count',lit(1)) 107 | df = df.groupby(['question_id', 'gender', 'question_created', 'question_title']).agg(F.sum('count').alias('answer_count')) 108 | 109 | df.write.mode('overwrite').csv('hdfs://device1/zhihu/question_count') 110 | 111 | def check_program_lang(title): 112 | ret = [0, 0, 0, 0] #java,python,javascript,php 113 | title = title.lower() 114 | if re.search(r'\bjava\b', title): 115 | ret[0] = 1 116 | if 'python' in title: 117 | ret[1] = 1 118 | if 'javascript' in title or re.search(r'\bjs\b', title): 119 | ret[2] = 1 120 | if 'php' in title: 121 | ret[3] = 1 122 | if sum(ret) > 0: 123 | ret.append(1) 124 | else: 125 | ret.append(0) 126 | 127 | return ret 128 | 129 | def analyze_program_question(): 130 | df = spark.read.json('hdfs://device1/zhihu/question_count') 131 | 132 | schema = StructType([ 133 | StructField("java", IntegerType(), False), 134 | StructField("python", IntegerType(), False), 135 | StructField("js", IntegerType(), False), 136 | StructField("php", IntegerType(), False), 137 | StructField("is_lang", IntegerType(), False) 138 | ]) 139 | 140 | check_lang_udf = udf(check_program_lang, schema) 141 | df = df.withColumn('lang', check_lang_udf('question_title')) 142 | df = df.select(['question_id', 'question_created', 'question_title', 'answer_count', 'lang.*']) 143 | 
df = df.filter(df.is_lang > 0) 144 | df = df.cache() 145 | print(df.count()) 146 | df_pandas = df.toPandas() 147 | print(df_pandas) 148 | df_pandas.to_csv('./question_lang.csv') 149 | 150 | 151 | if __name__ == '__main__': 152 | conf = SparkConf().setMaster("local[*]").setAppName("zhihu_parse_basic_info") 153 | #conf.set("spark.driver.memory","30g") # 要写对参数不然不会生效,之前写的是driver-memory ;注意:一定要在命令行中设置参数--driver-memory 不然无效! 154 | conf.set('spark.sql.warehouse.dir', 'hdfs://device1/user/dennis/spark/') 155 | conf.set('spark.executor.cores', '6') 156 | conf.set('spark.executor.instances', '6') 157 | 158 | 159 | sc = pyspark.SparkContext(conf=conf) 160 | spark = pyspark.sql.SparkSession.builder \ 161 | .config(conf=conf) \ 162 | .getOrCreate() 163 | spark.sparkContext.setLogLevel('INFO') 164 | #parse_question() 165 | #analyse_question() 166 | #analyze_program_question() 167 | #analyze_question_answers_count() 168 | extract_question_title() 169 | -------------------------------------------------------------------------------- /analyze_program_lang.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import matplotlib.pyplot as plt\n", 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "import seaborn as sns\n", 13 | "sns.set(style=\"whitegrid\")\n", 14 | "import warnings\n", 15 | "warnings.simplefilter(action='ignore', category=FutureWarning)\n", 16 | "\n", 17 | "%matplotlib inline\n", 18 | "\n", 19 | "pd.options.mode.chained_assignment = None\n", 20 | "\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 30, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "df_original = pd.read_csv('data/question_lang.csv')\n", 30 | "df_original = df_original[['question_id', 'question_title', 'answer_count', 'java', 'python','js','php']]\n", 31 | "df = 
df_original.sort_values('answer_count', ascending=False)\n" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "### 各编程语言问题数 比较" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 95, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAn8AAAHrCAYAAAC+UfszAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAHWFJREFUeJzt3Xu0pXdd3/HPZIYM4RYgKJhKEkXypTLBwCGWCohapMUl1gqLa0IBbxE1VItAEa3axVoppjZAgKSACoIoXZbLAiWKSmVErBwM4YB+uTXJgFDuBcScQDL94zxjD5PJzNkz5+x9Zn6v11p7zd7Pbz9n/87Kkz3v+T37smP//v0BAGAMJy16AgAAzI/4AwAYiPgDABiI+AMAGIj4AwAYiPgDABiI+AMAGIj4AwAYiPgDABiI+AMAGMiuRU9gu1leXt6d5LwkH09y44KnAwBwODuTfEOSv1paWlrdyA7i7+bOS/L2RU8CAGAGD06ydyN3FH839/EkOfvss3PyyScvei4AALfohhtuyAc+8IFk6peNEH83d2OSnHzyydm9e/ei5wIAsBEbfqmaN3wAAAxE/AEADET8AQAMRPwBAAxE/AEADET8AQAMRPwBAAxE/AEADET8AQAMRPwBAAxE/AEADET8AQAMRPwBAAxE/AEADET8AQAMRPwBAAxE/AEADET8AQAMRPxtohu+cuOip8BR8t8OgFHsWvQETiQn32pnHv+MVy96GhyF337eExY9BQCYCyt/AAADEX8AAAMRfwAAAxF/AAADEX8AAAMRfwAAAxF/AAADEX8AAAMRfwAAAxF/AAADEX8AAAOZ23f7VtUlSR6Z5Kwk53T3SlWdleT16+52xyR36O47T/tck+T66ZIkz+zuK6exByS5IskpSa5Jcn53f/JIYwAAI5tb/GUt8p6f5O0HNnT3NUnOPXC7qi49xJwe1d0r6zdU1Y4kr0rypO7eW1XPSXJxkqccbmzzfyUAgOPL3E77dvfe7t53S+NVdXKSJyT59Q38uPsnub679063L0/y6A2MAQAMbZ4rf0fyA0k+1t3vPmj7q6fVvL1Jnt3dn09yRpJrD9yhuz9dVSdV1Z0PN9bdn93oZFZWVo58p4MsLS3NvA/bx/Ly8qKnAABbbjvF31Ny81W/B3f3vqraneTSJJclOX8ek9mzZ0927949j4dimxDvABxvVldXZ16w2hbv9q2q05M8JMmr128/cJq4u1eTvDjJA6eh65KcuW7/uyTZP63sHW4MAGBo2yL+kjwpyZu7+zMHNlTVbavq1On6jiSPTXLVNLyc5JSqetB0+8Ikr93AGADA0OYWf1X1gqr6aJJvTPLWqnrfuuEn5eanfO+a5G1VdXWSlSRnJ3lqknT3TUkuSPKSqvpg1lYNn3WkMdgubvrqVxY9BY6B/37A8Wxur/nr7ouSXHQLY2cfYttHktz3MD/vHUnOmXUMtoOTdt0qy8/7kUVPg6O09IyXLXoKAEdtu5z2BQBgDsQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8
AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEB2zeuBquqSJI9MclaSc7p7Zdp+TZLrp0uSPLO7r5zGHpDkiiSnJLkmyfnd/cljGQMAGNk8V/5en+Q7k1x7iLFHdfe50+VA+O1I8qokP9ndZyf5syQXH8sYAMDo5hZ/3b23u/fNsMv9k1zf3Xun25cnefQxjgEADG1up32P4NXTit3eJM/u7s8nOSPrVgm7+9NVdVJV3flox7r7sxud0MrKysy/xNLS0sz7sH0sLy/P7bEcK8e/eR4vAJtpO8Tfg7t7X1XtTnJpksuSnL/gOWXPnj3ZvXv3oqfBHAkyZuF4AbaD1dXVmResFv5u3wOngrt7NcmLkzxwGrouyZkH7ldVd0myf1q9O9oxAIChLTT+quq2VXXqdH1HkscmuWoaXk5ySlU9aLp9YZLXHuMYAMDQ5vlRLy9I8kNJ7pbkrVX1mSSPSPJ7VbUzyc4k70/y1CTp7puq6oIkV1TVrTN9ZMuxjAEAjG5u8dfdFyW56BBD9z3MPu9Ics5mjgEAjGzhr/kDAGB+xB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQHbN64Gq6pIkj0xyVpJzunulqk5L8ltJ7pFkNcmHkvx4d39q2md/kvcmuWn6MRd093unsUck+dXpd1hO8uTu/vKRxgAARjbPlb/XJ/nOJNeu27Y/yfO6u7r7Pkk+nOTig/b7ju4+d7ocCL/bJXlpkkd097ck+WKSpx9pDABgdHOLv+7e2937Dtr22e5+27pN70xy5gZ+3MOTvKu7PzjdvjzJYzYwBgAwtLmd9j2SqjopyU8keeNBQ2+rql1J/iDJL3X3apIz8rUriNcluft0/XBjG7aysjLrLllaWpp5H7aP5eXluT2WY+X4N8/jBWAzbZv4S/LCJF9Kctm6bWd0976qukPWXhv4C0meM4/J7NmzJ7t3757HQ7FNCDJm4XgBtoPV1dWZF6y2xbt9pzeD3DPJY7r7wJs7cuA0cXd/IcnLkjxwGrouX3t6+Iwk+zYwBgAwtIXHX1U9N8lSkh+cTuke2H6nqjplur4ryaOSXDUNvyXJeVV1z+n2hUleu4ExAIChzS3+quoFVfXRJN+Y5K1V9b6quneSZyc5Pck7quqqqnrdtMu9kvxlVb0nydVJvpK1077p7i8m+bEkb6qqDyU5NcklRxoDABjd3F7z190XJbnoEEM7buH+f5HkPof5eW9I8oZZxwAARrbw074AAMyP+AMAGIj4AwAYiPgDABiI+AMAGIj4AwAYiPgDABiI+AMAGIj4AwAYiPgDABiI+AMAGIj4AwAYiPgDABiI+AMAGIj4AwAYiPgDABiI+AMAGIj4AwAYiPgDABiI+AMAGIj4AwAYiPgDABiI+AMAGIj4AwAYiPgDABiI+AMAGIj4AwAYiPgDABiI+AMAGIj4AwAYiPgDABiI+AMAGIj4AwAYiPgDABiI+AMAGIj4AwAYiPgDABiI+AMAGIj4AwAYiPgDABiI+AMAGIj4AwAYiPgDABiI+AMAGIj4AwAYiPgDABiI+AMAGMiG46+qnn4L239286YDAMBW2jXDfX8xySWH2P6
cJL92uB2r6pIkj0xyVpJzuntl2n52klckOS3JZ5I8sbs/uFVjAACjO+LKX1V9T1V9T5KdVfXdB25Plx9J8sUNPM7rk3xnkmsP2n55khd199lJXpTkii0eAwAY2kZW/l4+/XnrJL++bvv+JJ9I8tNH+gHdvTdJquoft1XV1ye5X5LvnTa9JsllVfV1SXZs9lh3f2oDvysAwAntiPHX3d+UJFX1yu5+4iY+9t2TfKy7b5we58aq+rtp+44tGJsp/lZWVmb+hZaWlmbeh+1jeXl5bo/lWDn+zfN4AdhMG37N3/rwq6qTDhq7aTMntR3s2bMnu3fvXvQ0mCNBxiwcL8B2sLq6OvOC1Szv9r1fVf1FVf19kq9Ml69Ofx6NfUn+SVXtnH7+ziSnT9u3YgwAYHizfM7fK5L8aZL7J/nm6fJN058z6+5PJrkqyeOmTY9L8tfd/amtGDuaOQIAnGhm+aiXM5P8fHfvn/VBquoFSX4oyd2SvLWqPtPd905yYZJXVNUvJvlckvWvKdyKMQCAoc0Sf69L8rAkV876IN19UZKLDrH9b5P8s1vYZ9PHAABGN0v83TrJ66pqb9Y+4uUfbfK7gAEA2CKzxN/7pwsAAMepWT7q5Ze3ciIAAGy9Dcff9BVvh9Tdf7I50wEAYCvNctr35Qfd/rokJyf5aI7y414AAJivWU77ftP629MHKD8nyRc3e1IAAGyNWT7k+WtM35/73CTP2LzpAACwlY46/ibfm+SE+15fAIAT1Sxv+NiXZP23e9wma5/999TNnhQAAFtjljd8nH/Q7b9P8oHu/sImzgcAgC00yxs+/meSVNVJSe6a5P90t1O+AADHkQ2/5q+qbl9Vr0zyD0k+luQfquoVVXXqls0OAIBNNcsbPl6Y5LZJzklyyvTnbZK8YAvmBQDAFpjlNX//Ksk3d/eXp9sfqKonJ/nw5k8LAICtMMvK3/VZ+1aP9e6SZHXzpgMAwFaaZeXvZUn+qKp+Lcm1Sc5M8jNJXroVEwMAYPPNEn/PzdobPZ6Q5PQkf5fked198Hf+AgCwTc1y2vf5Sbq7H9rd39rdD03yN1V16RbNDQCATTZL/D0uybsO2rac5PGbNx0AALbSLPG3P8nOg7btnPFnAACwQLOE29uT/KfpGz4OfNPHL03bAQA4Dszyho+nJXlTko9X1bVJzkjy8SSP2IqJAQCw+Wb5bt+PVtX9knx7krsn2Zfkf/l+XwCA48csK3+ZQu+d0wUAgOOMN2sAAAxE/AEADET8AQAMRPwBAAxE/AEADET8AWxzN3z1K4ueAkfJfzu2o5k+6gWA+Tt5163ypN942qKnwVH4zSc/f9FTgJux8gcAMBDxBwAwEPEHADAQ8QcAMBDxBwAwEPEHADAQ8QcAMBDxBwAwEPEHADAQ8QcAMBDxBwAwEPEHADAQ8QcAMBDxBwAwEPEHADAQ8QcAMBDxBwAwkF2LnkBVnZXk9es23THJHbr7zlV1TZLrp0uSPLO7r5z2e0CSK5KckuSaJOd39yePNAYAMLKFx193X5Pk3AO3q+rSfO28HtXdK+v3qaodSV6V5EndvbeqnpPk4iRPOdzY1v4mAADb37Y67VtVJyd5QpJfP8Jd75/k+u7eO92+PMmjNzAGADC0ha/8HeQHknysu9+9bturp9W8vUme3d2fT3JGkmsP3KG7P11VJ1XVnQ831t2f3ehEVlZWjnyngywtLc28D9vH8vLy3B7LsXL8c7ywUfM8VmAjtlv8PSVfu+r34O7eV1W7k1ya5LIk589jInv27Mnu3bvn8VBsE/6CZRaOFzbKscJWWl1dnXnBatuc9q2q05M8JMmrD2zr7n3Tn6tJXpzkgdPQdUnOXLfvXZLsn1b2DjcGADC0bRN/SZ6U5M3d/ZkkqarbVtWp0/UdSR6b5KrpvstJTqmqB023L0zy2g2MAQAMbbvF3/pTvndN8raqujrJSpKzkzw1Sbr7piQXJHlJVX0wayuGzzrSGADA6LbNa/6
6++yDbn8kyX0Pc/93JDln1jEAgJFtp5U/AAC2mPgDABiI+AOAE8SNN3xl0VPgGMzrv9+2ec0fAHBsdp58q/z+E5+86GlwlL7vlb8xl8ex8gcAMBDxBwAwEPEHADAQ8QcAMBDxBwAwEPEHADAQ8QcAMBDxBwAwEPEHADAQ8QcAMBDxBwAwEPEHADAQ8QcAMBDxBwAwEPEHADAQ8QcAMBDxBwAwEPEHADAQ8QcAMBDxBwAwEPEHADAQ8QcAMBDxBwAwEPEHADAQ8QcAMBDxBwAwEPEHADAQ8QcAMBDxBwAwEPEHADAQ8QcAMBDxBwAwEPEHADAQ8QcAMBDxBwAwEPEHADAQ8QcAMBDxBwAwEPEHADAQ8QcAMBDxBwAwEPEHADAQ8QcAMJBdi55AklTVNUmuny5J8szuvrKqHpDkiiSnJLkmyfnd/clpn6MaAwAY2XZa+XtUd587Xa6sqh1JXpXkJ7v77CR/luTiJDnaMQCA0W2n+DvY/ZNc3917p9uXJ3n0MY4BAAxtO8Xfq6vq6qp6cVXdMckZSa49MNjdn05yUlXd+RjGAACGti1e85fkwd29r6p2J7k0yWVJXrfICa2srMy8z9LS0hbMhHlZXl6e22M5Vo5/jhc2yrHCLOZxvGyL+OvufdOfq1X14iRvTPL8JGceuE9V3SXJ/u7+bFVddzRjs8xpz5492b1797H8WhxnPGkyC8cLG+VYYRazHi+rq6szL1gt/LRvVd22qk6dru9I8tgkVyVZTnJKVT1ouuuFSV47XT/aMQCAoS08/pLcNcnbqurqJCtJzk7y1O6+KckFSV5SVR9M8pAkz0qSox0DABjdwk/7dvdHktz3FsbekeSczRwDABjZdlj5AwBgTsQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEDEHwDAQMQfAMBAxB8AwEB2LXoCVXVakt9Kco8kq0k+lOTHu/tTVbU/yXuT3DTd/YLufu+03yOS/GrWfoflJE/u7i8faQwAYGTbYeVvf5LndXd1932SfDjJxevGv6O7z50uB8LvdklemuQR3f0tSb6Y5OlHGgMAGN3C46+7P9vdb1u36Z1JzjzCbg9P8q7u/uB0+/Ikj9nAGADA0BZ+2ne9qjopyU8keeO6zW+rql1J/iDJL3X3apIzkly77j7XJbn7dP1wYxu2srIy6y5ZWlqaeR+2j+Xl5bk9lmPl+Od4YaMcK8xiHsfLtoq/JC9M8qUkl023z+jufVV1h6y9LvAXkjxnHhPZs2dPdu/ePY+HYpvwpMksHC9slGOFWcx6vKyurs68YLXw074HVNUlSe6Z5DHdfVOSdPe+6c8vJHlZkgdOd78uX3tq+Iwk+zYwBgAwtG0Rf1X13CRLSX5wOq2bqrpTVZ0yXd+V5FFJrpp2eUuS86rqntPtC5O8dgNjAABDW3j8VdW9kzw7yelJ3lFVV1XV65LcK8lfVtV7klyd5CtZO+2b7v5ikh9L8qaq+lCSU5NccqQxAIDRLfw1f939viQ7bmH4PofZ7w1J3jDrGADAyBa+8gcAwPyIPwCAgYg/AICBiD8AgIGIPwCAgYg/AICBiD8AgIGIPwCAgYg/AICBiD8AgIGIPwCAgYg/AICBiD8AgIGIPwCAgYg/AICBiD8AgIGIPwCAgYg/AICBiD8AgIGIPwCAgYg/AICBiD8AgIGIPwCAgYg/AICBiD8AgIGIPwCAgYg/AICBiD8AgIGIPwCAgYg/AICBiD8AgIG
IPwCAgYg/AICBiD8AgIGIPwCAgYg/AICBiD8AgIGIPwCAgYg/AICBiD8AgIGIPwCAgYg/AICBiD8AgIGIPwCAgYg/AICBiD8AgIGIPwCAgYg/AICB7Fr0BLZKVZ2d5BVJTkvymSRP7O4PLnZWAACLdSKv/F2e5EXdfXaSFyW5YsHzAQBYuBNy5a+qvj7J/ZJ877TpNUkuq6qv6+5PHWH3nUlyww03HNVj3+E2tzqq/Vis1dXV+T/orW8//8dkUyzieLn9rW4798fk2C3iWDnp9p5bjldHc7ys65WdG91nx/79+2d+oO2uqpaSvLK7771u2/uTnN/d7z7cvsvLyw9K8vYtniIAwGZ68NLS0t6N3PGEXPk7Rn+V5MFJPp7kxgXPBQDgcHYm+Yas9cuGnKgrf1+f5ANJTuvuG6tqZ9be9HHPDZz2BQA4YZ2Qb/jo7k8muSrJ46ZNj0vy18IPABjdCbnylyRVda+sfdTLnZJ8Lmsf9dKLnRUAwGKdsPEHAMDNnZCnfQEAODTxBwAwEPEHADAQ8QcAMBDxBwAwEPFHquqqqjpl0fPg+FRV31VVD1t3+6yq+vQi58T253mHjaqq/VV1u0XP40Ti691Id5+76DlwXPuuJLdL8ocLngfHEc87sDg+549U1f4kt0/yS0kekuTkJJ9O8pTuvraqXp7k6u5+/nT/PUnemOQeWfv2lKdN+yTJ07v7j+f7G7BZpmPhl5M8LMlpSZ7d3b9XVc9IckZ3/9R0v7smuXq635VZO4vwd0l+Z7q8K8kVSb4vyW2S/HB37532fWKSn0uyP8mHk/x4d3+yqp6U5PFZ+1D2PUk+n+SR3f2JOfzqzNl0rJ2a5OIk35NkNcmXuvuBC50YC3FLzz3rxn4+yb+Zxn7uoLFD7sctc9qX9S7u7vO6+9uSvCbJf562/2aSf7vufk9O8pvdvT9rf/E/oLvvm+SxWftWFY5vN3X3dyT5gST/bfqu7JcmedS6Uy8/luS3u/s9SS5P8sruPre7L57GT0vyF9Nx8SuZjqXpHw4XJ3lYd98nyUqSF6577POy9g+Ieyd5f5Kf3spflIX7tiQPTfKt0/PO9y94PizWoZ57DvhCd5+X5IIkL5hhPw5B/LHew6vqnVW1kuTpSc5Nku5+e5LbV9V9qmpX1lb7DkTePZJcWVXvS/K7Se5WVXdbwNzZPC9PkunrEN+dtbj/XNZWey+YjoEfTfKSw/yML3X3m6br78zacZIk353k97v749PtK7L2l/8Bf97d+w6xHyemq5PsTPLyqrpg0ZNh4W723LNu7HemP9+Z5PSquvUG9+MQxB8HnJbkvyZ5XHfvSfKUJOv/53pl1lb/Hp7kb7r72mn7a5K8eFqpuV+Srx60H8e3HVk7PZus/Wv7J5L866wdAx84zH6r667fmP//+uL1P++A9bevv4X9ODHdmOTeWfuH432SvM8/Hpkc/FxxfZJ0943T7Vt6bjjUcwwHEX8ccIckNyT5RFWdlOTCg8ZfkbUVvx9J8hvrtt8xyf+erv9wkt1bPE+23pOTpKrumbXV379Mku5eSfKZJJcmedG6+38ha6/d2og/TvJ96/6C/9Ekb92EOXN8ulOSU7r7LUmeleT/JvnmxU6JBTrkc88W7jcs8ccBH0ny35O8L8mf5P8HXZKku6/L2muwvivJ/1g39O+SvL6q9iY5K2txwPFttar+PMmbMr0ZY93Yy5LclOTN67a9Lsn9p4/ueNbhfnB3vy/Jf0jyR1V1ddZe8/W0TZ09x5Mzkry1qt6TtVPAf5C103qM6XDPPVux37C823dw0wtjr01ym+kNHAzswDu/u/tLtzD+sqy9tOZX5zszTiSedzjYkZ57Nnu/0Xk9zcCq6r5Jfi/Jr3gC5nCq6vQkf5rkE0kuWvB0OI553oHFs/IHADAQr/kDABiI+AMAGIj4AwAYiPgDOEpVdU1VPfTI9wTYPsQfAMBAxB8AwEB8zh/AMaqqb0/y/CT/NMk/ZO1z7H6
2u2+Yxvdn7XuR/32SuyT57SQ/1d37q2pnkudl7buzv5jkvyR5YZJbdfdX5/27ACc+K38Ax+7GJD+TtbD750n+RZKnHnSf709yXta+0u7RSf7ltP1Hkzw8a99Jer8kPziH+QIDs/IHcIy6e3ndzWuq6ookD0ly6brtF3f355N8vqr+NGux95asheDzu/ujSVJVF2ctHgG2hPgDOEZVdXaSX0ty/yS3ydpz6/JBd/vEuutfTnK76frpSfatG1t/HWDTOe0LcOxekuRvk9yzu++Q5NlJdmxw348n+cZ1t+++yXMD+BpW/gCO3e2TfCHJl6rqXll7c8enNrjva5M8rarenOTvkzxza6YIsMbKH8Cxe3qSx2ft3bovTfK7M+z70iR/mOTqJH+d5PeTfDVrbyIB2HQ79u/fv+g5ADCpqocnuby7z1z0XIATk9O+AAtUVack+e6srf7dNcl/TPK6hU4KOKE57QuwWDuS/HKSz2XttO/fJPnFhc4IOKE57QsAMBArfwAAAxF/AAADEX8AAAMRfwAAAxF/AAAD+X+B8zdDx3NWlQAAAABJRU5ErkJggg==\n", 49 | "text/plain": [ 50 | "
" 51 | ] 52 | }, 53 | "metadata": { 54 | "needs_background": "light" 55 | }, 56 | "output_type": "display_data" 57 | } 58 | ], 59 | "source": [ 60 | "#print(df)\n", 61 | "#print(df.loc[df.js>0])\n", 62 | "df_lang_count = df[['answer_count', 'java','python','js','php']]\n", 63 | "df_sum = df_lang_count.sum()\n", 64 | "df_sum = pd.DataFrame({'lang': df_sum.index, 'count': df_sum.values}).loc[1:]\n", 65 | "\n", 66 | "\n", 67 | "a4_dims = (10.0, 8)\n", 68 | "fig, ax = plt.subplots(figsize=a4_dims)\n", 69 | "ax = sns.barplot(x='lang', y='count', data=df_sum, palette=None)\n" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 93, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "### 各编成语言的回答数 比较" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 94, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAn8AAAHrCAYAAAC+UfszAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAGGlJREFUeJzt3XvUZXdd3/HPZCYzSSSghpuRm7CYH8qAmCGCFRaXAhZKq6y2tEGSGhQNqKVaoIgUEBdrxViVW0pSQEjkIrRKq0jFG60ExOojyBou31CEECAxCZdCgEzI5Okf50x9nGSS50zOZZ75vl5rzXrO2b+z5/zOmj1n3rP32WdvW19fDwAAPRy36gkAALA84g8AoBHxBwDQiPgDAGhE/AEANCL+AAAaEX8AAI2IPwCARsQfAEAj4g8AoJEdq57A0WZtbW1XktOTXJHkwIqnAwBwS7Yn+bYkf7F37979m1lB/N3U6Unes+pJAADM4OFJLtnMA8XfTV2RJLt3787OnTtXPRcAgMO6/vrrc+mllybTftkM8XdTB5Jk586d2bVr16rnAgCwGZv+qJoTPgAAGhF/AACNiD8AgEbEHwBAI+IPAKAR8QcA0Ij4AwBoRPwBADQi/gAAGhF/AACNiD8AgEbEHwBAI+IPAKAR8QcA0Ij4AwBoRPwBADQi/gAAGhF/AACNiL85uv4bB1Y9BY6QPzsAutix6gkcS3Yevz1Pee6bVj0NjsCbz/vhVU8BAJbCnj8AgEbEHwBAI+IPAKAR8QcA0Ij4AwBoRPwBADQi/gAAGhF/AACNiD8AgEbEHwBAI+IPAKAR8QcA0Ij4AwBoRPwBADQi/gAAGhF/AACNiD8AgEbEHwBAI+IPAKAR8QcA0Ij4AwBoRPwBADQi/gAAGhF/AACNiD8AgEbEHwBAI+IPAKAR8QcA0Ij4AwBoRPwBADQi/gAAGhF/AACNiD8AgEbEHwBAI+IPAKAR8QcA0Ij4AwBoRPwBADQi/gAAGhF/AACNiD8AgEbEHwBAI+IPAKAR8QcA0Ij4AwBoRPwBADQi/gAAGhF/AACNiD8AgEbEHwBAI+IPAKAR8QcA0Ij4A
wBoRPwBADQi/gAAGhF/AACNiD8AgEbEHwBAIzuW9URjjCcm+cUk2zKJzhdX1W+PMXYnuSjJKUk+n+Ssqvr4dJ25jwEAdLaUPX9jjG1JfiPJmVX1oCRPTXLRGOO4JBckOb+qdic5P8mFG1ZdxBgAQFtL2/OX5MYkd5je/uYkVyS5Y5LTkjx2uvwtSV41xrhTJnsI5zpWVVcv6LUBAGwJS4m/qlofYzw5yX8fY3w1yclJ/nGSuyf5bFUdmD7uwBjjc9Pl2xYwtun427dv38yvc+/evTOvw9FjbW1t1VMAgIVbSvyNMXYk+bkkP1hV7x1jfH+StyY5cxnPfyT27NmTXbt2rXoaLJF4B2Cr2b9//8w7rJZ1tu+DkpxaVe9NkunPrya5Lsm3jzG2J8n056lJLp/+mvcYAEBry4q/zyS52xhjJMkY4zuT3DXJx5N8MMkZ08edkeQDVXV1VV0177GFvkIAgC1gWZ/5u3KM8Ywk/3WMceN08dlV9YUxxjmZnPn7wiRfTHLWhlUXMQYA0NbSzvatqjcledPNLP9YkoccZp25jwEAdOYKH7ACN97wjVVPgdvAnx+wlS3ze/6AqeN2HJ+1835s1dPgCO197mtXPQWAI2bPHwBAI+IPAKAR8QcA0Ij4AwBoRPwBADQi/gAAGhF/AACNiD8AgEbEHwBAI+IPAKAR8QcA0Ij4AwBoRPwBADQi/gAAGhF/AACNiD8AgEbEHwBAI+IPAKAR8QcA0Ij4AwBoRPwBADQi/gAAGhF/AACNiD8AgEbEHwBAI+IPAKAR8QcA0Ij4AwBoRPwBADQi/gAAGhF/AACNiD8AgEbEHwBAI+IPAKAR8QcA0Ij4AwBoRPwBADQi/gAAGhF/AACNiD8AgEbEHwBAI+IPAKAR8QcA0Ij4AwBoRPwBADQi/gAAGhF/AACNiD8AgEbEHwBAI+IPAKAR8QcA0Ij4AwBoRPwBADQi/gAAGhF/AACNiD8AgEbEHwBAI+IPAKAR8QcA0Ij4AwBoRPwBADQi/gAAGhF/AACNiD8AgEbEHwBAI+IPAKAR8QcA0Ij4AwBoRPwBADQi/gAAGhF/AACNiD8AgEbEHwBAI+IPAKAR8QcA0Ij4AwBoRPwBADSyY1lPNMY4IcmvJXlMkuuS/FlV/fgYY3eSi5KckuTzSc6qqo9P15n7GABAZ8vc83deJtG3u6oekOQ/TJdfkOT8qtqd5PwkF25YZxFjAABtLWXP3xjjdknOSnK3qlpPkqr62zHGnZOcluSx04e+Jcmrxhh3SrJt3mNVdfUCXyYAwFFvWYd975PJ4dcXjTEeleTaJC9I8vUkn62qA0lSVQfGGJ9LcvdMIm7eY5uOv3379s38Ivfu3TvzOhw91tbWlvZctpWtb5nbC8A8LSv+diS5d5IPVNVzxhgPSfK7Sf7Fkp5/Znv27MmuXbtWPQ2WSJAxC9sLcDTYv3//zDuslvWZv8uS3JDJIdhU1Z8nuSaTPX/fPsbYniTTn6cmuXz6a95jAACtLSX+quqaJO/O9HN407Nx75zk0iQfTHLG9KFnZLJ38OqqumreY4t8jQAAW8HSvuolyTlJfn2M8StJvpHkzKr60hjjnCQXjTFemOSLmZwYsnGdeY8BALS1tPirqr9J8sibWf6xJA85zDpzHwMA6MwVPgAAGhF/AACNiD8AgEbEHwBAI+IPAKAR8QcA0Ij4AwBoRPwBADQi/gAAGhF/AACNiD8AgEbEHwBAI+IPAKAR8QcA0Ij4AwBoRPwBADQi/gAAGhF/AACNiD8AgEbEHwBAI+IPAKAR8QcA0Ij4AwBoRPwBADQi/gAAGhF/AACNbDr+xhjPPszyn53fdAAAWKRZ9vy98DDLXzCPiQAAsHg7bu0BY4xHT29uH2M8Ksm2DcP3TvKVRUwMAID5u9X4S/K66c8Tkvz6huXrSa5M8tPznhQAAItxq/FXVd+RJGOMi6vqrMVPCQCARdnMnr8kycbwG2Mcd8jYjfOcFAAAi7Hp+
BtjnJbk/CQPzOQQcDL5/N96ku3znxoAAPO26fhLclGS303ytCRfW8x0AABYpFni755Jfr6q1hc1GQAAFmuW7/l7e5LHLWoiAAAs3ix7/k5I8vYxxiWZfMXL/+csYACArWGW+PvI9BcAAFvULF/18guLnAgAAIs3y1e9PPpwY1X1J/OZDgAAizTLYd/XHXL/Tkl2JvlMJtf4BQDgKDfLYd/v2Hh/jLE9yQuSfGXekwIAYDFm+aqXv6eqDiR5aZLnzm86AAAs0hHH39Rjk7iuLwDAFjHLCR+XZ3Id34NOyuS7/54570kBALAYs5zw8dRD7n81yaVV9eU5zgcAgAWa5YSP/5UkY4zjktwlyd9WlUO+AABbyKY/8zfGOHmMcXGSryf5bJKvjzEuGmPcYWGzAwBgrmY54eOVSb4pyQOSnDj9eVKSVyxgXgAALMAsn/n7R0nuXVVfm96/dIxxdpJPzH9aAAAswix7/q7L5KoeG90xyf75TQcAgEWaZc/fa5P84RjjV5NcluSeSX4myWsWMTEAAOZvlvh7aSYnevxwklOTfC7JeVV16DV/AQA4Ss1y2PflSaqqHlNV31VVj0ny0THGyxY0NwAA5myW+DsjyV8esmwtyVPmNx0AABZplvhbT7L9kGXbZ/w9AABYoVnC7T1JfnF6hY+DV/p48XQ5AABbwCwnfDwryTuSXDHGuCzJPZJckeSfLGJiAADM3yzX9v3MGOO0JN+b5O5JLk/yv13fFwBg65hlz1+moff+6S8AALYYJ2sAADQi/gAAGhF/AACNiD8AgEbEHwBAI+IPAKAR8QcA0Ij4AwBoRPwBADQi/gAAGhF/AEe562/4xqqnwBHyZ8fRaKZr+wKwfDt3HJ8fef2zVj0NjsAbzn75qqcAN2HPHwBAI+IPAKAR8QcA0Ij4AwBoRPwBADQi/gAAGln6V72MMV6U5MVJHlBV+8YYD01yYZITk3wqyVOr6qrpY+c+BgDQ2VL3/I0xTkvy0CSfnt7fluSNSX6yqnYn+dMk5y5qDACgu6XF3xhjV5Lzkzwzyfp08YOTXFdVl0zvX5DkyQscAwBobZl7/l6S5I1V9ckNy+6R5LKDd6rqmiTHjTG+dUFjAACtLeUzf2OM70tyepLnLeP55mHfvn0zr7N3794FzIRlWVtbW9pz2Va2PtsLm7XMbQU2Y1knfDwiyf2SfHKMkSR3S/KuJK9Ics+DDxpj3DHJelV9YYzx6XmPzTLhPXv2ZNeuXbO/UrYs/8AyC9sLm2VbYZH2798/8w6rpRz2rapzq+rUqrpXVd0ryWeS/ECSX05y4hjjYdOHnpPkbdPbawsYAwBobaXf81dVNyY5M8mrxxgfz2QP4fMWNQYA0N3Sv+cvSaZ7/w7efl+SBxzmcXMfAwDozBU+AAAaEX8AAI2IPwCARsQfAEAj4g8AoBHxBwDQiPgDAGhE/AEANCL+AAAaEX8AAI2IPwCARsQfAEAj4g8AoBHxBwDQiPgDAGhE/AEANCL+AAAaEX8AAI2IPwCARsQfABwjDlz/jVVPgdtgWX9+O5byLADAwm3feXzeedbZq54GR+gJF79+Kc9jzx8AQCPiDwCgEfEHANCI+AMAaET8AQA0Iv4AABoRfwAAjYg/AIBGxB8AQCPiDwCgEfEHANCI+AMAaET8AQA0Iv4AABoRfwAAjYg/AIBGxB8AQCPiDwCgEfEHANCI+AMAaET8AQA0Iv4AABoRfwAAjYg/AIBGxB8AQCPiDwCgEfEHANCI+AMAaET8AQA0Iv4AABoRfwAAjYg/AIBGxB8AQCPiDwCgEfEHANCI+AMAaET8AQA0Iv4AABoRfwAAjYg/AIBGxB8AQCPiDwCgEfEHANCI+AMAaET8AQA0Iv4AABoRfwAAjYg/AIBGxB8AQCPiDwCgEfEHANCI+AMAaET8AQA0Iv4AABoRfwAAjYg/AIBGxB8AQCPiDwCgkR3LeJIxxilJfiPJfZLsT/J/kvxEVV09xnhokguTn
JjkU0meWlVXTdeb+xgAQGfL2vO3nuS8qhpV9cAkn0hy7hhjW5I3JvnJqtqd5E+TnJskixgDAOhuKfFXVV+oqv+5YdH7k9wzyYOTXFdVl0yXX5DkydPbixgDAGhtKYd9NxpjHJfkGUl+J8k9klx2cKyqrhljHDfG+NZFjFXVFzY7z3379s382vbu3TvzOhw91tbWlvZctpWtz/bCZtlWmMUytpelx1+SVya5NsmrkjxpBc+/KXv27MmuXbtWPQ2WyJsms7C9sFm2FWYx6/ayf//+mXdYLfVs3zHGf0xy3yT/sqpuTPLpTA7/Hhy/Y5L16R66RYwBALS2tPgbY7w0yd4kP1RV+6eL15KcOMZ42PT+OUnetsAxAIDWlvVVL/dP8vwklyZ53xgjST5ZVU8aY5yZ5MIxxgmZfi1LklTVjfMeAwDobinxV1UfTrLtMGPvS/KAZY0BAHTmCh8AAI2IPwCARsQfAEAj4g8AoBHxBwDQiPgDAGhE/AEANCL+AAAaEX8AAI2IPwCARsQfAEAj4g8AoBHxBwDQiPgDAGhE/AEANCL+AAAaEX8AAI2IPwCARsQfAEAj4g8AoBHxBwDQiPgDAGhE/AEANCL+AAAaEX8AAI2IPwCARsQfAEAj4g8AoBHxBwDQiPgDAGhE/AEANCL+AAAaEX8AAI2IPwCARsQfAEAj4g8AoBHxBwDQiPgDAGhE/AEANCL+AAAaEX8AAI2IPwCARsQfAEAj4g8AoBHxBwDQiPgDAGhE/AEANCL+AAAaEX8AAI2IPwCARsQfAEAj4g8AoBHxBwDQiPgDAGhE/AEANCL+AAAaEX8AAI2IPwCARsQfAEAj4g8AoBHxBwDQiPgDAGhE/AEANCL+AAAaEX8AAI2IPwCARsQfAEAj4g8AoBHxBwDQiPgDAGhE/AEANCL+AAAaEX8AAI2IPwCARsQfAEAj4g8AoBHxBwDQiPgDAGhE/AEANLJj1RNYlDHG7iQXJTklyeeTnFVVH1/trAAAVutY3vN3QZLzq2p3kvOTXLji+QAArNwxuedvjHHnJKcleex00VuSvGqMcaequvpWVt+eJNdff/0RPfftTzr+iNZjtfbv37/8Jz3h5OU/J3Oxiu3l5OO/aenPyW23im3luJO9t2xVR7K9bOiV7ZtdZ9v6+vrMT3S0G2PsTXJxVd1/w7KPJHlqVf3VLa27trb2sCTvWfAUAQDm6eF79+69ZDMPPCb3/N1Gf5Hk4UmuSHJgxXMBALgl25N8Wyb9sinH6p6/Oye5NMkpVXVgjLE9k5M+7ruJw74AAMesY/KEj6q6KskHk5wxXXRGkg8IPwCgu2Nyz1+SjDHul8lXvXxLki9m8lUvtdpZAQCs1jEbfwAA3NQxedgXAICbJ/4AABoRfwAAjYg/AIBGxB8AQCPij4wxPjjGOHHV82BrGmM8cozxuA337zXGuGaVc+Lo532HzRpjrI8xbrfqeRxLXN6NVNWDVj0HtrRHJrldkj9Y8TzYQrzvwOr4nj8yxlhPcnKSFyd5RJKdSa5J8rSqumyM8bokH6qql08fvyfJ7yS5TyZXT3nWdJ0keXZV/fFyXwHzMt0WfiHJ45KckuT5VfVbY4znJrlHVf3U9HF3SfKh6ePelclRhM8l+c3pr79McmGSJyQ5KcmPVtUl03XPSvKcJOtJPpHkJ6rqqjHGjyR5SiZfyr4nyZeS/LOqunIJL50lm25rd0hybpJHJ9mf5Nqq+v6VToyVONx7z4axn0/ypOnYcw4Zu9n1ODyHfdno3Ko6vaq+O8lbkvzSdPkbkvzrDY87O8kbqmo9k3/4H1pV35PkX2VyVRW2thur6h8k+adJ/vP0WtmvSfLPNxx6+fEkb66qv05yQZKLq+pBVXXudPyUJH823S5ekum2NP2Pw7lJHldVD0yyL8krNzz36Zn8B+L+ST6S5KcX+UJZue9O8pgk3zV933niiufDat3ce89BX66q05OcmeQVM6zHzRB/bPT4Mcb7xxj7k
jw7yYOSpKrek+TkMcYDxxg7MtnbdzDy7pPkXWOMDyd5a5K7jjHuuoK5Mz+vS5Lp5RD/KpO4/2Ime3vPnG4DT0/y6lv4Pa6tqndMb78/k+0kSR6V5J1VdcX0/oWZ/ON/0Hur6vKbWY9j04eSbE/yujHGmaueDCt3k/eeDWO/Of35/iSnjjFO2OR63Azxx0GnJPm1JGdU1Z4kT0uy8S/XxZns/Xt8ko9W1WXT5W9J8p+me2pOS3LDIeuxtW3L5PBsMvnf9jOS/GAm28Clt7De/g23D+TvPl+88fc7aOP96w6zHsemA0nun8l/HB+Y5MP+88jUoe8V1yVJVR2Y3j/ce8PNvcdwCPHHQbdPcn2SK8cYxyU555DxizLZ4/djSV6/Yfk3J/nk9PaPJtm14HmyeGcnyRjjvpns/f3zJKmqfUk+n+RlSc7f8PgvZ/LZrc344yRP2PAP/NOT/NEc5szW9C1JTqyq30/yvCT/N8m9VzslVuhm33sWuF5b4o+D/ibJf0ny4SR/kr8LuiRJVX06k89gPTLJb28Y+rdJ/tsY45Ik98okDtja9o8x3pvkHZmejLFh7LVJbkzyexuWvT3Jg6df3fG8W/qNq+rDSX4uyR+OMT6UyWe+njXX2bOV3CPJH40x/jqTQ8D/I5PDevR0S+89i1ivLWf7Njf9YOxlSU6ansBBYwfP/K6qaw8z/tpMPlrzy8udGccS7zsc6tbee+a9Xnc+T9PYGON7kvxWkpd4A+aWjDFOTfLuJFcm+Tcrng5bmPcdWD17/gAAGvGZPwCARsQfAEAj4g8AoBHxB3CExhifGmM85tYfCXD0EH8AAI2IPwCARnzPH8BtNMb43iQvT/KdSb6eyffY/WxVXT8dX8/kusj/Lskdk7w5yU9V1foYY3uS8zK5dvZXkvxKklcmOb6qblj2awGOffb8Adx2B5L8TCZh931J/mGSZx7ymCcmOT2TS9o9OckPTJc/PcnjM7km6WlJfmgJ8wUas+cP4DaqqrUNdz81xrgwySOSvGzD8nOr6ktJvjTGeHcmsff7mYTgy6vqM0kyxjg3k3gEWAjxB3AbjTF2J/nVJA9OclIm761rhzzsyg23v5bkdtPbpya5fMPYxtsAc+ewL8Bt9+okH0ty36q6fZLnJ9m2yXWvSHK3DffvPue5Afw99vwB3HYnJ/lykmvHGPfL5OSOqze57tuSPGuM8XtJvprk3y9migAT9vwB3HbPTvKUTM7WfU2St86w7muS/EGSDyX5QJJ3Jrkhk5NIAOZu2/r6+qrnAMDUGOPxSS6oqnuuei7AsclhX4AVGmOcmORRmez9u0uSFyV5+0onBRzTHPYFWK1tSX4hyRczOez70SQvXOmMgGOaw74AAI3Y8wcA0Ij4AwBoRPwBADQi/gAAGhF/AACN/D9AppI+kkBHeQAAAABJRU5ErkJggg==\n", 89 | "text/plain": [ 90 | "
" 91 | ] 92 | }, 93 | "metadata": { 94 | "needs_background": "light" 95 | }, 96 | "output_type": "display_data" 97 | } 98 | ], 99 | "source": [ 100 | "df_melt = df[['question_id', 'answer_count', 'java','python','js','php']]\n", 101 | "\n", 102 | "df_melt = pd.melt(df_melt, id_vars=['question_id','answer_count'], value_vars=['java','python','js','php'])\n", 103 | "df_melt = df_melt.loc[df_melt.value > 0]\n", 104 | "\n", 105 | "df_melt = df_melt[['question_id','answer_count','variable']]\n", 106 | "df_lang_ans = df_melt.groupby(['variable'])['answer_count'].sum()\n", 107 | "df_lang_ans = pd.DataFrame({'lang': df_lang_ans.index, 'count': df_lang_ans.values})\n", 108 | "df_lang_ans = df_lang_ans.sort_values(['count'], ascending=False)\n", 109 | "\n", 110 | "\n", 111 | "a4_dims = (10.0, 8)\n", 112 | "fig, ax = plt.subplots(figsize=a4_dims)\n", 113 | "ax = sns.barplot(x='lang', y='count', data=df_lang_ans, palette=None)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 63, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "JAVA 语言回答数最多的问题: 如何看待一些大学生说 3 天学会了 Java?\n", 126 | "https://www.zhihu.com/question/66535555\n", 127 | "回答数:389\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "#print(df)\n", 133 | "def most_answers_lang(lang):\n", 134 | " first = df.loc[df[lang] > 0].iloc[0]\n", 135 | " question_title = first['question_title']\n", 136 | " question_id = first['question_id']\n", 137 | " answer_count = first['answer_count']\n", 138 | " question_url = 'https://www.zhihu.com/question/%s' % question_id\n", 139 | " print('%s 语言回答数最多的问题: %s\\n%s\\n回答数:%s' % (lang.upper(), question_title, question_url, answer_count))\n", 140 | " \n", 141 | "most_answers_lang('java')\n", 142 | " " 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 64, 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | 
"text": [ 154 | "PYTHON 语言回答数最多的问题: 学习 Python 很吃力,我是不是可以放弃编程了?\n", 155 | "https://www.zhihu.com/question/60766946\n", 156 | "回答数:637\n" 157 | ] 158 | } 159 | ], 160 | "source": [ 161 | "most_answers_lang('python')" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 65, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "name": "stdout", 171 | "output_type": "stream", 172 | "text": [ 173 | "JS 语言回答数最多的问题: 如何看待哔哩哔哩的 flv.js 作者月薪不到 5000 元?\n", 174 | "https://www.zhihu.com/question/53686737\n", 175 | "回答数:425\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "most_answers_lang('js')" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 66, 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "PHP 语言回答数最多的问题: 零基础应该选择学习 java、php、前端 还是 python?\n", 193 | "https://www.zhihu.com/question/40801731\n", 194 | "回答数:334\n" 195 | ] 196 | } 197 | ], 198 | "source": [ 199 | "most_answers_lang('php')" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [] 236 | } 237 | ], 238 | "metadata": { 239 | "kernelspec": { 240 | "display_name": "Python 3", 241 | "language": "python", 242 | "name": "python3" 243 | }, 244 | "language_info": { 245 | "codemirror_mode": { 246 | "name": "ipython", 247 | "version": 3 248 | }, 249 | "file_extension": 
".py", 250 | "mimetype": "text/x-python", 251 | "name": "python", 252 | "nbconvert_exporter": "python", 253 | "pygments_lexer": "ipython3", 254 | "version": "3.7.0" 255 | } 256 | }, 257 | "nbformat": 4, 258 | "nbformat_minor": 2 259 | } 260 | --------------------------------------------------------------------------------