├── reportgen.egg-info ├── dependency_links.txt ├── top_level.txt ├── requires.txt ├── SOURCES.txt └── PKG-INFO ├── reportgen ├── associate │ ├── __init__.py │ ├── _fpgrowth.pyx │ └── fpgrowth.py ├── font │ ├── readme.txt │ └── DroidSansFallback.ttf ├── images │ └── logo.png ├── template │ └── template.pptx ├── utils │ ├── __init__.py │ ├── utils.py │ ├── preprocessing.py │ ├── delaunay.py │ └── metrics.py ├── questionnaire │ ├── __init__.py │ └── README.md ├── __init__.py ├── config.py ├── README.rst ├── analysis.py └── report.py ├── example ├── datasets │ ├── [问卷星数据]800_800_0.xls │ ├── [问卷星数据]800_800_2.xls │ └── LendingClub_Sample.xlsx ├── analysis_example.py └── questionnaire_example.py ├── MANIFEST.in ├── LICENSE.txt ├── setup.py └── README.rst /reportgen.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /reportgen.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | reportgen 2 | -------------------------------------------------------------------------------- /reportgen/associate/__init__.py: -------------------------------------------------------------------------------- 1 | from .fpgrowth import * 2 | del fpgrowth 3 | -------------------------------------------------------------------------------- /reportgen.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | seaborn 4 | python-pptx 5 | Pillow 6 | -------------------------------------------------------------------------------- /reportgen/font/readme.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gasongjian/reportgen/HEAD/reportgen/font/readme.txt -------------------------------------------------------------------------------- /reportgen/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gasongjian/reportgen/HEAD/reportgen/images/logo.png -------------------------------------------------------------------------------- /reportgen/template/template.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gasongjian/reportgen/HEAD/reportgen/template/template.pptx -------------------------------------------------------------------------------- /reportgen/font/DroidSansFallback.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gasongjian/reportgen/HEAD/reportgen/font/DroidSansFallback.ttf -------------------------------------------------------------------------------- /example/datasets/[问卷星数据]800_800_0.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gasongjian/reportgen/HEAD/example/datasets/[问卷星数据]800_800_0.xls -------------------------------------------------------------------------------- /example/datasets/[问卷星数据]800_800_2.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gasongjian/reportgen/HEAD/example/datasets/[问卷星数据]800_800_2.xls -------------------------------------------------------------------------------- /example/datasets/LendingClub_Sample.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gasongjian/reportgen/HEAD/example/datasets/LendingClub_Sample.xlsx -------------------------------------------------------------------------------- /reportgen/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 2 | ''' 3 | 存在一些工具 4 | ''' 5 | from .utils import iqr 6 | from .delaunay import Delaunay2D 7 | 8 | __all__=['iqr', 9 | 'Delaunay2D'] 10 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | #phonetic representation 2 | include reportgen/images/logo.png 3 | include reportgen/template/template.pptx 4 | include reportgen/font/readme.txt 5 | include reportgen/font/DroidSansFallback.ttf 6 | -------------------------------------------------------------------------------- /reportgen/questionnaire/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 2 | ''' 3 | ''' 4 | from __future__ import division 5 | 6 | from . import questionnaire 7 | 8 | from .questionnaire import * 9 | 10 | del questionnaire -------------------------------------------------------------------------------- /reportgen/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 2 | from . import report 3 | from .report import * 4 | from . import analysis 5 | from .analysis import * 6 | from reportgen.utils import preprocessing 7 | from reportgen.utils import metrics 8 | from reportgen import questionnaire 9 | from reportgen import utils 10 | from reportgen import associate 11 | 12 | del report 13 | del analysis 14 | 15 | __version__ = '0.1.8' 16 | -------------------------------------------------------------------------------- /reportgen.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | MANIFEST.in 2 | README.rst 3 | setup.py 4 | reportgen/__init__.py 5 | reportgen/analysis.py 6 | reportgen/config.py 7 | reportgen/report.py 8 | reportgen.egg-info/PKG-INFO 9 | reportgen.egg-info/SOURCES.txt 10 | reportgen.egg-info/dependency_links.txt 11 | reportgen.egg-info/requires.txt 12 | reportgen.egg-info/top_level.txt 13 | reportgen/associate/__init__.py 14 | reportgen/associate/fpgrowth.py 15 | reportgen/font/DroidSansFallback.ttf 16 | reportgen/font/readme.txt 17 | reportgen/images/logo.png 18 | reportgen/questionnaire/__init__.py 19 | reportgen/questionnaire/questionnaire.py 20 | reportgen/template/template.pptx 21 | reportgen/utils/__init__.py 22 | reportgen/utils/delaunay.py 23 | reportgen/utils/metrics.py 24 | reportgen/utils/preprocessing.py 25 | reportgen/utils/utils.py -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 The Python Packaging Authority (PyPA) 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice 
shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from setuptools import setup, find_packages 4 | 5 | setup( 6 | name='reportgen', 7 | version='0.1.8', 8 | description=( 9 | 'reportgen is a Python library for creating and updating analysis report.' 10 | ), 11 | long_description=open('README.rst').read(), 12 | author='JSong', 13 | author_email='gasongjian@126.com', 14 | maintainer='JSong', 15 | maintainer_email='gasongjian@126.com', 16 | license='BSD License', 17 | packages=find_packages(), 18 | include_package_data=True, 19 | # relative to the vfclust directory 20 | package_data={ 21 | 'images':[ 22 | 'logo.png'], 23 | 'template': 24 | ['template.pptx'], 25 | 'font':['readme.txt','DroidSansFallback.ttf'] 26 | }, 27 | platforms=["all"], 28 | url='https://github.com/gasongjian/reportgen', 29 | classifiers=[ 30 | 'Development Status :: 4 - Beta', 31 | 'Intended Audience :: Developers', 32 | 'License :: OSI Approved :: BSD License', 33 | 'Programming Language :: Python', 34 | 'Programming Language :: Python :: Implementation', 35 | 'Programming Language :: Python :: 3.4', 36 | 'Programming Language :: Python :: 3.5', 37 | 'Programming Language :: Python :: 3.6', 38 | 'Topic :: Software Development :: Libraries' 39 | ], 40 | install_requires=[ 41 | 'pandas', 42 | 'numpy', 43 | 'seaborn', 44 | 'python-pptx', 45 | 'Pillow' 46 | ] 47 | ) 48 | -------------------------------------------------------------------------------- /reportgen/associate/_fpgrowth.pyx: -------------------------------------------------------------------------------- 1 | #cython: boundscheck=False 2 | #cython: wraparound=False 3 | #cython: initializedcheck=False 4 | #cython: cdivision=True 5 | #cython: embedsignature=True 6 | #cython: language_level=3 7 | #cython: language=c++ 8 | 9 | from libcpp.set cimport set as cppset 10 | from libcpp.vector cimport vector 11 | from libcpp.utility cimport pair 12 | from libcpp.unordered_map cimport unordered_map as hashmap 13 | 14 | 15 | cdef int _BUCKETING_FEW_ITEMS = 16 16 | BUCKETING_FEW_ITEMS = _BUCKETING_FEW_ITEMS 17 | 18 | ctypedef cppset[int] itemset_t 19 | ctypedef vector[pair[itemset_t, int]] itemsets_t 20 | 21 | 22 | cpdef itemsets_t bucketing_count(list db, 23 | cppset[int] frequent_items, 24 | int min_support): 25 | """ The bucketing count operation. 
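Projects every transaction in ``db`` onto the frequent items as a bit mask, accumulates the transaction weights in ``2**k`` buckets (where ``k = len(frequent_items)`` is assumed to be at most ``BUCKETING_FEW_ITEMS``), aggregates the buckets so that each bucket holds the support of the itemset encoded by its index ([2], Figure 5), and returns every itemset whose support reaches ``min_support``.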
""" 26 | cdef: 27 | int i, j, k = frequent_items.size() 28 | 29 | vector[int] inv_map = vector[int]() 30 | hashmap[int, int] fwd_map = hashmap[int, int]() 31 | int index = 0 32 | 33 | vector[int] buckets = vector[int](2**k, 0) 34 | pair[int, vector[int]] transaction 35 | int tid = 0 36 | int item 37 | 38 | int count 39 | itemset_t result 40 | itemsets_t results = itemsets_t() 41 | 42 | # Forward and inverse mapping of frequent_items to [0, n_items) 43 | for item in frequent_items: 44 | inv_map.push_back(item) 45 | fwd_map[item] = index 46 | index += 1 47 | # Project transactions 48 | for transaction in db: 49 | tid = 0 50 | for item in transaction.second: 51 | if not frequent_items.count(item): continue 52 | tid |= 1 << fwd_map.at(item) 53 | buckets[tid] += transaction.first 54 | # Aggregate bucketing counts ([2], Figure 5) 55 | for i in range(0, k): 56 | i = 1 << i 57 | for j in range(1 << k): 58 | if j & i == 0: 59 | buckets[j] += buckets[j + i] 60 | # Count results 61 | for tid in range(1, buckets.size()): 62 | count = buckets[tid] 63 | if count >= min_support: 64 | result = itemset_t() 65 | for i in range(_BUCKETING_FEW_ITEMS): 66 | if tid & 1 << i: 67 | result.insert(inv_map[i]) 68 | results.push_back(pair[itemset_t, int](result, count)) 69 | return results 70 | -------------------------------------------------------------------------------- /reportgen/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Aug 14 09:39:10 2017 4 | 5 | @author: JSong 6 | """ 7 | 8 | import os 9 | import sys 10 | import pandas as pd 11 | 12 | 13 | _thisdir = os.path.realpath(os.path.split(__file__)[0]) 14 | 15 | __all__=['template_pptx', 16 | 'font_path', 17 | 'chart_type_list', 18 | 'number_format_data', 19 | 'number_format_tick', 20 | 'font_default_size', 21 | 'summary_loc', 22 | 'chart_loc'] 23 | 24 | def _get_element_path(dir_name,suffix=None): 25 | if not(os.path.exists(os.path.join(_thisdir,dir_name))): 26 | element_path=None 27 | return element_path 28 | element_path=None 29 | filelist=os.listdir(os.path.join(_thisdir,dir_name)) 30 | if isinstance(suffix,str): 31 | suffix=[suffix] 32 | elif suffix is not None: 33 | suffix=list(suffix) 34 | for f in filelist: 35 | if isinstance(suffix,list) and os.path.splitext(f)[1][1:] in suffix: 36 | element_path=os.path.join(_thisdir,dir_name,f) 37 | return element_path 38 | 39 | 40 | # default pptx template 41 | template_pptx=_get_element_path('template',suffix=['pptx']) 42 | #template='template.pptx' 43 | 44 | 45 | # default font of chinese 46 | font_path=_get_element_path('font',suffix=['ttf','ttc']) 47 | if font_path is None: 48 | if sys.platform.startswith('win'): 49 | #font_path='C:\\windows\\fonts\\msyh.ttc' 50 | fontlist=['calibri.ttf','simfang.ttf','simkai.ttf','simhei.ttf','simsun.ttc','msyh.ttf','MSYH.TTC','msyh.ttc'] 51 | for f in fontlist: 52 | if os.path.exists(os.path.join('C:\\windows\\fonts\\',f)): 53 | font_path=os.path.join('C:\\windows\\fonts\\',f) 54 | 55 | chart_type_list={\ 56 | "COLUMN_CLUSTERED":['柱状图','ChartData','pptx'],\ 57 | "BAR_CLUSTERED":['条形图','ChartData','pptx'], 58 | 'HIST':['分布图,KDE','XChartData','matplotlib']} 59 | chart_type_list=pd.DataFrame(chart_type_list) 60 | 61 | 62 | # PPT图表中的数字位数 63 | number_format_data='0"%"' 64 | 65 | # PPT图表中坐标轴的数字标签格式 66 | number_format_tick='0"%"' 67 | 68 | # 默认字体大小 69 | ''' 70 | Pt(8):101600, Pt(10):127000, Pt(12):152400, Pt(14):177800 71 | Pt(16):203200, Pt(18):228600, Pt(20):254000, Pt(22):279400 72 
| Pt(24):304800, Pt(26):330200 73 | ''' 74 | font_default_size=127000# Pt(10) 75 | 76 | 77 | # PPT中结论文本框所在的位置 78 | # 四个值依次为left、top、width、height 79 | summary_loc=[0.10,0.14,0.80,0.15] 80 | 81 | 82 | # PPT中结论文本框所在的位置 83 | # 四个值依次为left、top、width、height 84 | chart_loc=[0.10,0.30,0.80,0.60] 85 | -------------------------------------------------------------------------------- /example/analysis_example.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Mar 25 17:57:48 2018 4 | 5 | @author: gason 6 | """ 7 | import pandas as pd 8 | import numpy as np 9 | import reportgen as rpt 10 | from sklearn import preprocessing 11 | from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier 12 | from sklearn.linear_model import LogisticRegressionCV 13 | 14 | import warnings 15 | warnings.filterwarnings('ignore') #为了整洁,去除弹出的warnings 16 | pd.set_option('precision', 5) #设置精度 17 | pd.set_option('display.float_format', lambda x: '%.5f' % x) #为了直观的显示数字,不采用科学计数法 18 | pd.options.display.max_rows = 200 #最多显示200行 19 | 20 | 21 | 22 | 23 | # 数据导入 24 | data=pd.read_excel('.\\datasets\\LendingClub_Sample.xlsx') 25 | 26 | # 数据预览 27 | rpt.AnalysisReport(data.copy(),filename='LendingClub 数据预览'); 28 | 29 | # 机器学习相关函数补充 30 | 31 | # 只作工具包测试,所以不区分训练集和测试集 32 | y=data['target'] 33 | X=data.drop(['target'],axis=1) 34 | 35 | 36 | # convert into dummies 37 | categorical_var=list(set(X.columns[X.apply(pd.Series.nunique)<30])|set(X.select_dtypes(include=['O']).columns)) 38 | #categorical_var = ['collections_12_mths_ex_med', 'home_ownership', 'sub_grade',\ 39 | #'inq_last_6mths', 'initial_list_status', 'emp_length', 'application_type', \ 40 | #'acc_now_delinq', 'grade', 'purpose', 'verification_status', 'addr_state', 'term', 'pub_rec', 'delinq_2yrs'] 41 | 42 | continuous_var=list(set(X.columns)-set(categorical_var)) 43 | #continuous_var=['open_acc', 'total_rev_hi_lim', 'loan_amnt', 'tot_coll_amt', \ 44 | #'total_acc', 'tot_cur_bal', 'dti', 'annual_inc', 'earliest_cr_line', 'int_rate', 'installment'] 45 | 46 | # WOE 编码 47 | woe=rpt.preprocessing.WeightOfEvidence(categorical_features=categorical_var,encoder_na=False) 48 | X=woe.fit_transform(X,y) 49 | 50 | # 离散化 51 | #dis=rpt.preprocessing.Discretization(continous_features=continuous_var) 52 | #X2=dis.fit_transform(X,y) 53 | 54 | # 补缺和标准化 55 | X=X.fillna(-99) 56 | X[continuous_var]=preprocessing.MinMaxScaler().fit_transform(X[continuous_var]) 57 | 58 | 59 | clfs={'LogisticRegression':LogisticRegressionCV(),\ 60 | 'RandomForest':RandomForestClassifier(),'GradientBoosting':GradientBoostingClassifier()} 61 | y_preds,y_probas={},{} 62 | for clf in clfs: 63 | clfs[clf].fit(X, y) 64 | y_preds[clf] =clfs[clf].predict(X) 65 | y_probas[clf] = clfs[clf].predict_proba(X)[:,1] 66 | 67 | models_report,conf_matrix=rpt.ClassifierReport(y,y_preds,y_probas) 68 | print(models_report) 69 | 70 | 71 | # 信息论度量 72 | p=y_probas['LogisticRegression'][y==1] 73 | q=y_probas['LogisticRegression'][y==0] 74 | print(rpt.metrics.entropyc.kl_div(p,q)) 75 | 76 | 77 | def xiu(data): 78 | data.iloc[:,0]=1 79 | return 2 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /reportgen/utils/utils.py: -------------------------------------------------------------------------------- 1 | """Small plotting-related utility functions.""" 2 | from __future__ import print_function, division 3 | import colorsys 4 | import warnings 5 | import os 6 | 
7 | import numpy as np 8 | from scipy import stats 9 | import pandas as pd 10 | 11 | 12 | 13 | 14 | 15 | def _kde_support(data, bw, gridsize, cut, clip): 16 | """Establish support for a kernel density estimate.""" 17 | support_min = max(data.min() - bw * cut, clip[0]) 18 | support_max = min(data.max() + bw * cut, clip[1]) 19 | return np.linspace(support_min, support_max, gridsize) 20 | 21 | 22 | def percentiles(a, pcts, axis=None): 23 | """Like scoreatpercentile but can take and return array of percentiles. 24 | 25 | Parameters 26 | ---------- 27 | a : array 28 | data 29 | pcts : sequence of percentile values 30 | percentile or percentiles to find score at 31 | axis : int or None 32 | if not None, computes scores over this axis 33 | 34 | Returns 35 | ------- 36 | scores: array 37 | array of scores at requested percentiles 38 | first dimension is length of object passed to ``pcts`` 39 | 40 | """ 41 | scores = [] 42 | try: 43 | n = len(pcts) 44 | except TypeError: 45 | pcts = [pcts] 46 | n = 0 47 | for i, p in enumerate(pcts): 48 | if axis is None: 49 | score = stats.scoreatpercentile(a.ravel(), p) 50 | else: 51 | score = np.apply_along_axis(stats.scoreatpercentile, axis, a, p) 52 | scores.append(score) 53 | scores = np.asarray(scores) 54 | if not n: 55 | scores = scores.squeeze() 56 | return scores 57 | 58 | 59 | def ci(a, which=95, axis=None): 60 | """Return a percentile range from an array of values.""" 61 | p = 50 - which / 2, 50 + which / 2 62 | return percentiles(a, p, axis) 63 | 64 | 65 | def iqr(a): 66 | """Calculate the IQR for an array of numbers.""" 67 | a = np.asarray(a) 68 | q1 = stats.scoreatpercentile(a, 25) 69 | q3 = stats.scoreatpercentile(a, 75) 70 | return q3 - q1 71 | 72 | 73 | 74 | def categorical_order(values, order=None): 75 | """Return a list of unique data values. 76 | 77 | Determine an ordered list of levels in ``values``. 78 | 79 | Parameters 80 | ---------- 81 | values : list, array, Categorical, or Series 82 | Vector of "categorical" values 83 | order : list-like, optional 84 | Desired order of category levels to override the order determined 85 | from the ``values`` object. 86 | 87 | Returns 88 | ------- 89 | order : list 90 | Ordered list of category levels not including null values. 
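Numeric values are returned sorted; other values keep their order of appearance. Examples -------- >>> categorical_order(pd.Series([2, 1, 2])) [1, 2] >>> categorical_order(pd.Series(['b', 'a', 'b'])) ['b', 'a']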
91 | 92 | """ 93 | if order is None: 94 | if hasattr(values, "categories"): 95 | order = values.categories 96 | else: 97 | try: 98 | order = values.cat.categories 99 | except (TypeError, AttributeError): 100 | try: 101 | order = values.unique() 102 | except AttributeError: 103 | order = pd.unique(values) 104 | try: 105 | np.asarray(values).astype(np.float) 106 | order = np.sort(order) 107 | except (ValueError, TypeError): 108 | order = order 109 | order = filter(pd.notnull, order) 110 | return list(order) 111 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /example/questionnaire_example.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | # reportgen v 0.1.8 4 | ------------------- 5 | 6 | ## **问卷模块** :问卷类型的数据分析 7 | 8 | ------------------ 9 | 10 | 问卷数据涉及到各种题型,包括单选题、多选题、填空题、矩阵多选题、排序题等等。不管是频数统计还是交叉分析,单选题都很好处理。但其他题目就相对复杂的多,比如单选题和多选题之间的交叉统计,多选题和多选题之间的交叉统计。 11 | 12 | 为了区分题目类型和统计处理方法,本工具包统一使用新型的数据类型(或者说编码方式)。在这种类型中,每一份问卷都有两个文件,data 和 code ,它们的含义如下: 13 | 14 | - 1)、data:按序号编码的数据(csv、xlsx等都可以),示例如下: 15 | 16 | |Q1|Q2|Q3_A1|Q3_A2|Q3_A3|Q3_A4| 17 | |:----:|:---:|:----:|:----:|:---:|:----:| 18 | |1|1|1|0|1|0| 19 | |1|2|0|0|1|0| 20 | |1|1|1|0|0|1| 21 | |2|3|0|1|1|0| 22 | |1|2|1|0|1|0| 23 | |1|4|0|1|0|1| 24 | |2|2|1|0|1|0| 25 | |1|1|0|1|0|1| 26 | |2|2|1|0|1|0| 27 | 28 | - 2)、code:编码文件( json格式,就是 python中的字典类型), 给定每道题的题号、序号编码等内容, 29 | 每一个题目都有如下字段: 30 | 31 | - content: 题目内容 32 | - code:题目对应的编码 33 | - code_r: 题目对应的编码(矩阵单选题专有) 34 | - qtype:题目类型,单选题、多选题、矩阵单选题、排序题、填空题等 35 | - qlist:该题的索引,如多选题的 ['Q1_A1','Q1_A2',..] 36 | - code_order: 非必须,题目类别的顺序,用于PPT报告的生成[一般后期添加] 37 | - name: 非必须,特殊题型的标注 38 | - weight:非必须,dict,每个选项的权重,用于如月收入等的平均数统计 39 | 40 | 示例: 41 | 42 | ```json 43 | code={'Q1':{ 44 | 'content':'性别', 45 | 'code':{ 46 | 1:'男', 47 | 2:'女' 48 | } 49 | 'qtype':'单选题', 50 | 'qlist':['Q1'] 51 | }, 52 | 'Q2':{ 53 | 'content':'年龄', 54 | 'code':{ 55 | 1:'17岁以下', 56 | 2:'18-25岁', 57 | 3:'26-35岁', 58 | 4:'36-46岁' 59 | }, 60 | 'qtype':'单选题', 61 | 'qlist':['Q2'] 62 | }, 63 | 'Q3':{ 64 | 'content':'爱好', 65 | 'code':{ 66 | 'Q3_A1':'17岁以下', 67 | 'Q3_A2':'18-25岁', 68 | 'Q3_A3':'26-35岁', 69 | 'Q3_A4':'36-46岁' 70 | }, 71 | 'qtype':'多选题', 72 | 'qlist':['Q3_A1','Q3_A2','Q3_A3','Q3_A4'] 73 | } 74 | } 75 | 76 | ##该工具包包含如下函数: 77 | 78 | ### 文件 IO 79 | 80 | - `read_code`, 从本地读取code数据,支持excel文件和json文件 81 | - `save_code`, 将code 保存为 xlsx 或json数据 82 | - `load_data`, 支持打开文件窗口来选择问卷数据 83 | - `read_data`, 读取本地的数据,自适应xlsx、csv等 84 | - `save_data`, 将问卷数据(data和code)保存到本地 85 | - `wenjuanwang`, 编码问卷网平台的问卷数据,输入为问卷网上下载的三个文件 86 | - `wenjuanxing`, 编码问卷星平台的问卷数据,输入为问卷星网站上下载的两个xls文件(按选项序号和按选项文本) 87 | 88 | ### 数据处理 89 | - `spec_rcode`: 对问卷中的一些特殊题型进行处理,如将城市题分类成省份、城市、城市级别等 90 | - `dataText_to_code`: 91 | - `dataCode_to_text`: 92 | - `var_combine`: 见data_merge 93 | - `data_merge`: 合并两份问卷数据,常见于多个推动渠道的问卷合并 94 | - `clean_ftime`: 根据用户填写时间来筛选问卷,会根据填问卷累计时间曲线的拐点来给出剔除的时间点 95 | - `data_auto_code`: 96 | - `qdata_flatten`: 将问卷数据展平,便于将多份问卷数据存储在同一个数据库中 97 | 98 | ### 统计检验等 99 | - `sample_size_cal`: 样本量计算公式 100 | - `confidence_interval`: 置信区间计算公式 101 | - `gof_test`: 拟合优度检验 102 | - `chi2_test`: 卡方检验 103 | - `fisher_exact`: 卡方检验,适用于观察频数过少的情形 104 | - `anova`: 方差分析 105 | 106 | ### 数据分析 107 | - `mca`: 对应分析,目前只支持两个变量 108 | - `cluster`: 态度题的聚类分析,会根据轮廓系数自动选择最佳类别数 109 | - `association_rules`: 关联分析,用于多选题的进一步分析 110 | 111 | ### 统计 112 | - `contingency`: 列联表分析,统一给出列联表的各种数据,包含fo、fop、TGI等 113 | - `qtable`: 单个题目的统计分析和两个题目的交叉分析,给出频数表和频率表 114 | 115 | ### 可视化 116 | - 
`summary_chart`: 整体统计报告,针对每一道题,选择合适的图表进行展示,并输出为pptx文件 117 | - `cross_chart`: 交叉分析报告,如能将年龄与每一道题目进行交叉分析,并输出为pptx文件 118 | - `onekey_gen`: 综合上两个,一键生成 119 | - `scorpion`: 生成一个表格,内含每个题目的相关统计信息 120 | - `scatter`: 散点图绘制,不同于matplotlib的是,其能给每个点加文字标签 121 | - `sankey`: 桑基图绘制,不画图,只提供 R 需要的数据 122 | 123 | """ 124 | 125 | import reportgen.questionnaire as ques 126 | 127 | 128 | # 导入问卷星数据 129 | datapath=['.\\datasets\\[问卷星数据]800_800_0.xls','.\\datasets\\[问卷星数据]800_800_2.xls'] 130 | data,code=ques.wenjuanxing(datapath) 131 | 132 | # 导出 133 | ques.save_data(data,filename='data.xlsx') 134 | ques.save_data(data,filename='data.xlsx',code=code)# 会将选项编码替换成文本 135 | ques.save_code(code,filename='code.xlsx') 136 | 137 | 138 | # 对单变量进行统计分析 139 | result=ques.qtable(data,code,'Q1') 140 | print(result['fo']) 141 | 142 | # 两个变量的交叉分析 143 | result=ques.qtable(data,code,'Q1','Q2') 144 | print(result['fop']) 145 | 146 | # 聚类分析,会在原数据上添加一列,类别题 147 | #ques.cluster(data,code,'态度题') 148 | 149 | # 在.\\out\\下 生成 pptx文件 150 | ques.summary_chart(data,code,filename='整体统计报告'); 151 | ques.cross_chart(data,code,cross_class='Q4',filename='交叉分析报告_年龄'); 152 | ques.scorpion(data,code,filename='详细分析数据') 153 | ques.onekey_gen(data,code,filename='reportgen 自动生成报告'); 154 | -------------------------------------------------------------------------------- /reportgen/questionnaire/README.md: -------------------------------------------------------------------------------- 1 | # reportgen v 0.1.8 2 | ------------------- 3 | 4 | ## **问卷模块** :问卷类型的数据分析 5 | 6 | ------------------ 7 | 8 | 问卷数据涉及到各种题型,包括单选题、多选题、填空题、矩阵多选题、排序题等等。不管是频数统计还是交叉分析,单选题都很好处理。但其他题目就相对复杂的多,比如单选题和多选题之间的交叉统计,多选题和多选题之间的交叉统计。 9 | 10 | 为了区分题目类型和统计处理方法,本工具包统一使用新型的数据类型(或者说编码方式)。在这种类型中,每一份问卷都有两个文件,data 和 code ,它们的含义如下: 11 | 12 | - 1)、data:按序号编码的数据(csv、xlsx等都可以),示例如下: 13 | 14 | |Q1|Q2|Q3_A1|Q3_A2|Q3_A3|Q3_A4| 15 | |:----:|:---:|:----:|:----:|:---:|:----:| 16 | |1|1|1|0|1|0| 17 | |1|2|0|0|1|0| 18 | |1|1|1|0|0|1| 19 | |2|3|0|1|1|0| 20 | |1|2|1|0|1|0| 21 | |1|4|0|1|0|1| 22 | |2|2|1|0|1|0| 23 | |1|1|0|1|0|1| 24 | |2|2|1|0|1|0| 25 | 26 | - 2)、code:编码文件( json格式,就是 python中的字典类型), 给定每道题的题号、序号编码等内容, 27 | 每一个题目都有如下字段: 28 | 29 | - content: 题目内容 30 | - code:题目对应的编码 31 | - code_r: 题目对应的编码(矩阵单选题专有) 32 | - qtype:题目类型,单选题、多选题、矩阵单选题、排序题、填空题等 33 | - qlist:该题的索引,如多选题的 ['Q1_A1','Q1_A2',..] 
34 | - code_order: 非必须,题目类别的顺序,用于PPT报告的生成[一般后期添加] 35 | - name: 非必须,特殊题型的标注 36 | - weight:非必须,dict,每个选项的权重,用于如月收入等的平均数统计 37 | 38 | 示例: 39 | 40 | ```json 41 | code={'Q1':{ 42 | 'content':'性别', 43 | 'code':{ 44 | 1:'男', 45 | 2:'女' 46 | } 47 | 'qtype':'单选题', 48 | 'qlist':['Q1'] 49 | }, 50 | 'Q2':{ 51 | 'content':'年龄', 52 | 'code':{ 53 | 1:'17岁以下', 54 | 2:'18-25岁', 55 | 3:'26-35岁', 56 | 4:'36-46岁' 57 | }, 58 | 'qtype':'单选题', 59 | 'qlist':['Q2'] 60 | }, 61 | 'Q3':{ 62 | 'content':'爱好', 63 | 'code':{ 64 | 'Q3_A1':'17岁以下', 65 | 'Q3_A2':'18-25岁', 66 | 'Q3_A3':'26-35岁', 67 | 'Q3_A4':'36-46岁' 68 | }, 69 | 'qtype':'多选题', 70 | 'qlist':['Q3_A1','Q3_A2','Q3_A3','Q3_A4'] 71 | } 72 | } 73 | 74 | ##该工具包包含如下函数: 75 | 76 | ### 文件 IO 77 | 78 | - `read_code`, 从本地读取code数据,支持excel文件和json文件 79 | - `save_code`, 将code 保存为 xlsx 或json数据 80 | - `load_data`, 支持打开文件窗口来选择问卷数据 81 | - `read_data`, 读取本地的数据,自适应xlsx、csv等 82 | - `save_data`, 将问卷数据(data和code)保存到本地 83 | - `wenjuanwang`, 编码问卷网平台的问卷数据,输入为问卷网上下载的三个文件 84 | - `wenjuanxing`, 编码问卷星平台的问卷数据,输入为问卷星网站上下载的两个xls文件(按选项序号和按选项文本) 85 | 86 | ### 数据处理 87 | - `spec_rcode`: 对问卷中的一些特殊题型进行处理,如将城市题分类成省份、城市、城市级别等 88 | - `dataText_to_code`: 89 | - `dataCode_to_text`: 90 | - `var_combine`: 见data_merge 91 | - `data_merge`: 合并两份问卷数据,常见于多个推动渠道的问卷合并 92 | - `clean_ftime`: 根据用户填写时间来筛选问卷,会根据填问卷累计时间曲线的拐点来给出剔除的时间点 93 | - `data_auto_code`: 94 | - `qdata_flatten`: 将问卷数据展平,便于将多份问卷数据存储在同一个数据库中 95 | 96 | ### 统计检验等 97 | - `sample_size_cal`: 样本量计算公式 98 | - `confidence_interval`: 置信区间计算公式 99 | - `gof_test`: 拟合优度检验 100 | - `chi2_test`: 卡方检验 101 | - `fisher_exact`: 卡方检验,适用于观察频数过少的情形 102 | - `anova`: 方差分析 103 | 104 | ### 数据分析 105 | - `mca`: 对应分析,目前只支持两个变量 106 | - `cluster`: 态度题的聚类分析,会根据轮廓系数自动选择最佳类别数 107 | - `association_rules`: 关联分析,用于多选题的进一步分析 108 | 109 | ### 统计 110 | - `contingency`: 列联表分析,统一给出列联表的各种数据,包含fo、fop、TGI等 111 | - `qtable`: 单个题目的统计分析和两个题目的交叉分析,给出频数表和频率表 112 | 113 | ### 可视化 114 | - `summary_chart`: 整体统计报告,针对每一道题,选择合适的图表进行展示,并输出为pptx文件 115 | - `cross_chart`: 交叉分析报告,如能将年龄与每一道题目进行交叉分析,并输出为pptx文件 116 | - `onekey_gen`: 综合上两个,一键生成 117 | - `scorpion`: 生成一个表格,内含每个题目的相关统计信息 118 | - `scatter`: 散点图绘制,不同于matplotlib的是,其能给每个点加文字标签 119 | - `sankey`: 桑基图绘制,不画图,只提供 R 需要的数据 120 | """ 121 | 122 | 123 | ## 一些实践: 124 | 125 | 数据在 .\\example\\datasets\\ 126 | 127 | ```python 128 | import reportgen.questionnaire as ques 129 | 130 | 131 | # 导入问卷星数据 132 | datapath=['.\\datasets\\[问卷星数据]800_800_0.xls','.\\datasets\\[问卷星数据]800_800_2.xls'] 133 | data,code=ques.wenjuanxing(datapath) 134 | 135 | # 导出 136 | ques.save_data(data,filename='data.xlsx') 137 | ques.save_data(data,filename='data.xlsx',code=code)# 会将选项编码替换成文本 138 | ques.save_code(code,filename='code.xlsx') 139 | 140 | 141 | # 对单变量进行统计分析 142 | result=ques.qtable(data,code,'Q1') 143 | print(result['fo']) 144 | 145 | # 两个变量的交叉分析 146 | result=ques.qtable(data,code,'Q1','Q2') 147 | print(result['fop']) 148 | 149 | # 聚类分析,会在原数据上添加一列,类别题 150 | #ques.cluster(data,code,'态度题') 151 | 152 | # 在.\\out\\下 生成 pptx文件 153 | ques.summary_chart(data,code,filename='整体统计报告'); 154 | ques.cross_chart(data,code,cross_class='Q4',filename='交叉分析报告_年龄'); 155 | ques.scorpion(data,code,filename='详细分析数据') 156 | ques.onekey_gen(data,code,filename='reportgen 自动生成报告'); 157 | ``` 158 | -------------------------------------------------------------------------------- /reportgen/README.rst: -------------------------------------------------------------------------------- 1 | reportgen 2 | =========== 3 | 4 | Release v0.1.8 5 | 6 | *reportgen* is a Python library for creating and updating analysis report. 
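Installation ------------ A minimal sketch, assuming a checkout of this repository and the setuptools-based ``setup.py`` shipped with it: :: python setup.py install If the package has been published to PyPI under the same name, ``pip install reportgen`` should work as well; the dependencies declared in ``setup.py`` (pandas, numpy, seaborn, python-pptx, Pillow) are installed automatically.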
7 | 8 | Release History 9 | ------------------ 10 | 0.1.8(2018-03-28) 11 | 12 | - Add subpackages metrics and preprocessing which contain entropy, WOE, discretization, etc. 13 | - Add association analysis (FP-growth): frequent_itemsets and association_rules. 14 | - Add functions: ClassifierReport, type_of_var. 15 | - Fix the package logic. 16 | - Fix some bugs. 17 | 18 | 0.1.6(2017-12-06) 19 | 20 | - Add function rpt.plot(). 21 | - Support drawing on an existing matplotlib figure and Report file 22 | - Fix some bugs. 23 | 24 | 0.1.5(2017-11-29) 25 | 26 | - Add function AnalysisReport, which can plot general data to pptx files. 27 | - Fix some bugs. 28 | 29 | 0.1.0(2017-11-18) 30 | 31 | - Create. 32 | 33 | 34 | Feature Support 35 | ------------------ 36 | 37 | **reportgen** has the following capabilities, with many more on the roadmap: 38 | 39 | - get all the texts in a pptx file 40 | - get all the images in a pptx file 41 | - simply add one slide of charts/tables/images built from pandas data to a pptx file 42 | - simply add multiple slides of charts/tables/images built from pandas data to a pptx file 43 | 44 | Quick Start 45 | ------------ 46 | 47 | 1. Get texts or images in a pptx file. 48 | 49 | :: 50 | 51 | import reportgen as rpt 52 | # Open a pptx file 53 | p=rpt.Report('analysis.pptx') 54 | # We can get the texts and images simply. 55 | result=p.get_texts() 56 | print('\n'.join(result)) 57 | # All the images will be saved in the folder '.\\images\\'. 58 | p.get_images() 59 | 60 | 2. Create an analysis report. 61 | 62 | :: 63 | 64 | import reportgen as rpt 65 | import pandas as pd 66 | # Open a pptx file 67 | p=rpt.Report('template.pptx')# The parameter can be omitted 68 | # add a cover 69 | p.add_cover(title='An analysis report powered by reportgen') 70 | # add a chart slide 71 | data=pd.DataFrame({'Jack':[90,80,100],'David':[100,70,85]},index=['Math','English','Physics']) 72 | p.add_slide(data={'data':data,'slide_type':'chart','type':'COLUMN_CLUSTERED'},\ 73 | title='the scores report',summary='Our class got excellent results',footnote='This is a footnote.') 74 | # add a table slide 75 | data=pd.DataFrame({'Jack':[90,80,100],'David':[100,70,85]},index=['Math','English','Physics']) 76 | p.add_slide(data={'data':data,'slide_type':'table'},title='the scores report',summary='Our class got excellent results',footnote='This is a footnote.') 77 | # add a textbox slide 78 | data='This is a paragraph. \n'*4 79 | p.add_slide(data={'data':data,'slide_type':'textbox'},title='This is a textbox slide',summary='',footnote='') 80 | # add a picture slide 81 | data='.\\images\\images.png' 82 | p.add_slide(data={'data':data,'slide_type':'picture'},title='This is a picture slide') 83 | p.save('analysis report.pptx') 84 | 85 | 86 | 87 | 88 | In general, I divide a slide of an analysis report into four parts: title, summary, footnote and the body data. The body is one or more charts/textboxes/tables/pictures. 89 | 90 | The *add_slide* function, which is the most commonly used one, has the following parameters: 91 | 92 | :: 93 | 94 | add_slide(data=[{'data':,'slide_type':,'type':},],title='',summary='',footnote='',layouts='auto') 95 | 96 | For example, we can draw a chart on the left side, and insert a picture on the right.
97 | 98 | :: 99 | 100 | import reportgen as rpt 101 | import pandas as pd 102 | p=rpt.Report() 103 | scores=pd.DataFrame({'Jack':[90,80,100],'David':[100,70,85]},index=['Math','English','Physics']) 104 | data=[{'data':scores,'slide_type':'chart','type':'COLUMN_CLUSTERED'},\ 105 | {'data':'.\\images2.jpg','slide_type':'picture'}] 106 | p.add_slide(data=data) 107 | p.save('add_slide.pptx') 108 | 109 | As a lazy person, I also provide a solution with less code. 110 | 111 | :: 112 | 113 | import reportgen as rpt 114 | p=rpt.Report() 115 | imgs=['.\\images\\'+img for img in os.listdir('.\\images\\')] 116 | p.add_slides(data=imgs) 117 | # a more flexible way 118 | slides_data=[{'title':'ppt{}'.format(i),'data':data} for i in range(10)] 119 | p.add_slides(slides_data) 120 | p.save('add_slides.pptx') 121 | 122 | 123 | Now you can get a quick overview of any data. 124 | 125 | :: 126 | 127 | import pandas as pd 128 | import reportgen as rpt 129 | 130 | data=pd.read_excel('Scores.xlsx') 131 | rpt.AnalysisReport(data,filename='Analysis Report of Scores.pptx'); 132 | 133 | The script will create a pptx file that analyzes all the fields of the data in a visual way. 134 | 135 | TO DO 136 | ------- 137 | 138 | - support exporting analysis reports to html 139 | - make the chart_type recommendation more intelligent 140 | 141 | 142 | Contact 143 | -------- 144 | 145 | If you have any questions, you can email gasongjian AT 126.com. And if you have a WeChat account, you can follow my WeChat Official Account: gasongjian. 146 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | reportgen 2 | =========== 3 | 4 | Release v0.1.8 5 | 6 | *reportgen* is a Python library for creating and updating analysis report. 7 | 8 | Release History 9 | ------------------ 10 | 0.1.8(2018-03-28) 11 | 12 | - Add subpackages metrics and preprocessing which contain entropy, WOE, discretization, etc. 13 | - Add association analysis (FP-growth): frequent_itemsets and association_rules. 14 | - Add functions: ClassifierReport, type_of_var. 15 | - Fix the package logic. 16 | - Fix some bugs. 17 | 18 | 0.1.6(2017-12-06) 19 | 20 | - Add function rpt.plot(). 21 | - Support drawing on an existing matplotlib figure and Report file 22 | - Fix some bugs. 23 | 24 | 0.1.5(2017-11-29) 25 | 26 | - Add function AnalysisReport, which can plot general data to pptx files. 27 | - Fix some bugs. 28 | 29 | 0.1.0(2017-11-18) 30 | 31 | - Create. 32 | 33 | 34 | Feature Support 35 | ------------------ 36 | 37 | **reportgen** has the following capabilities, with many more on the roadmap: 38 | 39 | - get all the texts in a pptx file 40 | - get all the images in a pptx file 41 | - simply add one slide of charts/tables/images built from pandas data to a pptx file 42 | - simply add multiple slides of charts/tables/images built from pandas data to a pptx file 43 | 44 | Quick Start 45 | ------------ 46 | 47 | 1. Get texts or images in a pptx file. 48 | 49 | :: 50 | 51 | # import 52 | import reportgen as rpt 53 | # Open a pptx file 54 | p=rpt.Report('analysis.pptx') 55 | # We can get the texts and images simply. 56 | result=p.get_texts() 57 | print('\n'.join(result)) 58 | # All the images will be saved in the folder '.\\images\\'. 59 | p.get_images() 60 | 61 | 2. Create an analysis report.
62 | 63 | :: 64 | 65 | # 66 | import reportgen as rpt 67 | import pandas as pd 68 | # Open a pptx file 69 | p=rpt.Report('template.pptx')# The parameter can be omitted 70 | # add a cover 71 | p.add_cover(title='An analysis report powered by reportgen') 72 | # add a chart slide 73 | data=pd.DataFrame({'Jack':[90,80,100],'David':[100,70,85]},index=['Math','English','Physics']) 74 | p.add_slide(data={'data':data,'slide_type':'chart','type':'COLUMN_CLUSTERED'},\ 75 | title='the scores report',summary='Our class got excellent results',footnote='This is a footnote.') 76 | # add a table slide 77 | data=pd.DataFrame({'Jack':[90,80,100],'David':[100,70,85]},index=['Math','English','Physics']) 78 | p.add_slide(data={'data':data,'slide_type':'table'},title='the scores report',summary='Our class got excellent results',footnote='This is a footnote.') 79 | # add a textbox slide 80 | data='This is a paragraph. \n'*4 81 | p.add_slide(data={'data':data,'slide_type':'textbox'},title='This is a textbox slide',summary='',footnote='') 82 | # add a picture slide 83 | data='.\\images\\images.png' 84 | p.add_slide(data={'data':data,'slide_type':'picture'},title='This is a picture slide') 85 | p.save('analysis report.pptx') 86 | 87 | 88 | 89 | 90 | In general, I divide a slide of an analysis report into four parts: title, summary, footnote and the body data. The body is one or more charts/textboxes/tables/pictures. 91 | 92 | The *add_slide* function, which is the most commonly used one, has the following parameters: 93 | 94 | :: 95 | 96 | add_slide(data=[{'data':,'slide_type':,'type':},],title='',summary='',footnote='',layouts='auto') 97 | 98 | For example, we can draw a chart on the left side, and insert a picture on the right. 99 | 100 | :: 101 | 102 | import reportgen as rpt 103 | import pandas as pd 104 | p=rpt.Report() 105 | scores=pd.DataFrame({'Jack':[90,80,100],'David':[100,70,85]},index=['Math','English','Physics']) 106 | data=[{'data':scores,'slide_type':'chart','type':'COLUMN_CLUSTERED'},\ 107 | {'data':'.\\images2.jpg','slide_type':'picture'}] 108 | p.add_slide(data=data) 109 | p.save('add_slide.pptx') 110 | 111 | As a lazy person, I also provide a solution with less code. 112 | 113 | :: 114 | 115 | import reportgen as rpt 116 | p=rpt.Report() 117 | imgs=['.\\images\\'+img for img in os.listdir('.\\images\\')] 118 | p.add_slides(data=imgs) 119 | # a more flexible way 120 | slides_data=[{'title':'ppt{}'.format(i),'data':data} for i in range(10)] 121 | p.add_slides(slides_data) 122 | p.save('add_slides.pptx') 123 | 124 | 125 | Now you can get a quick overview of any data. 126 | 127 | :: 128 | 129 | import pandas as pd 130 | import reportgen as rpt 131 | 132 | data=pd.read_excel('Scores.xlsx') 133 | rpt.AnalysisReport(data,filename='Analysis Report of Scores.pptx'); 134 | 135 | The script will create a pptx file that analyzes all the fields of the data in a visual way. 136 | 137 | TO DO 138 | ------- 139 | 140 | - support exporting analysis reports to html 141 | - make the chart_type recommendation more intelligent 142 | 143 | 144 | Contact 145 | -------- 146 | 147 | If you have any questions, you can email gasongjian AT 126.com. And if you have a WeChat account, you can follow my WeChat Official Account: gasongjian.
148 | -------------------------------------------------------------------------------- /reportgen.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.1 2 | Name: reportgen 3 | Version: 0.1.8 4 | Summary: reportgen is a Python library for creating and updating analysis report. 5 | Home-page: https://github.com/gasongjian/reportgen 6 | Author: JSong 7 | Author-email: gasongjian@126.com 8 | License: BSD License 9 | Description-Content-Type: UNKNOWN 10 | Description: reportgen 11 | =========== 12 | 13 | Release v0.1.8 14 | 15 | *reportgen* is a Python library for creating and updating analysis report. 16 | 17 | Release History 18 | ------------------ 19 | 0.1.8(2018-03-28) 20 | 21 | - Add subpackages metrics and preprocessing which contain entropy, WOE, discretization, etc. 22 | - Add association analysis (FP-growth): frequent_itemsets and association_rules. 23 | - Add functions: ClassifierReport, type_of_var. 24 | - Fix the package logic. 25 | - Fix some bugs. 26 | 27 | 0.1.6(2017-12-06) 28 | 29 | - Add function rpt.plot(). 30 | - Support drawing on an existing matplotlib figure and Report file 31 | - Fix some bugs. 32 | 33 | 0.1.5(2017-11-29) 34 | 35 | - Add function AnalysisReport, which can plot general data to pptx files. 36 | - Fix some bugs. 37 | 38 | 0.1.0(2017-11-18) 39 | 40 | - Create. 41 | 42 | 43 | Feature Support 44 | ------------------ 45 | 46 | **reportgen** has the following capabilities, with many more on the roadmap: 47 | 48 | - get all the texts in a pptx file 49 | - get all the images in a pptx file 50 | - simply add one slide of charts/tables/images built from pandas data to a pptx file 51 | - simply add multiple slides of charts/tables/images built from pandas data to a pptx file 52 | 53 | Quick Start 54 | ------------ 55 | 56 | 1. Get texts or images in a pptx file. 57 | 58 | :: 59 | 60 | # import 61 | import reportgen as rpt 62 | # Open a pptx file 63 | p=rpt.Report('analysis.pptx') 64 | # We can get the texts and images simply. 65 | result=p.get_texts() 66 | print('\n'.join(result)) 67 | # All the images will be saved in the folder '.\\images\\'. 68 | p.get_images() 69 | 70 | 2. Create an analysis report. 71 | 72 | :: 73 | 74 | # 75 | import reportgen as rpt 76 | import pandas as pd 77 | # Open a pptx file 78 | p=rpt.Report('template.pptx')# The parameter can be omitted 79 | # add a cover 80 | p.add_cover(title='An analysis report powered by reportgen') 81 | # add a chart slide 82 | data=pd.DataFrame({'Jack':[90,80,100],'David':[100,70,85]},index=['Math','English','Physics']) 83 | p.add_slide(data={'data':data,'slide_type':'chart','type':'COLUMN_CLUSTERED'},\ 84 | title='the scores report',summary='Our class got excellent results',footnote='This is a footnote.') 85 | # add a table slide 86 | data=pd.DataFrame({'Jack':[90,80,100],'David':[100,70,85]},index=['Math','English','Physics']) 87 | p.add_slide(data={'data':data,'slide_type':'table'},title='the scores report',summary='Our class got excellent results',footnote='This is a footnote.') 88 | # add a textbox slide 89 | data='This is a paragraph. \n'*4 90 | p.add_slide(data={'data':data,'slide_type':'textbox'},title='This is a textbox slide',summary='',footnote='') 91 | # add a picture slide 92 | data='.\\images\\images.png' 93 | p.add_slide(data={'data':data,'slide_type':'picture'},title='This is a picture slide') 94 | p.save('analysis report.pptx') 95 | 96 | 97 | 98 | 99 | In general, I divide a slide of an analysis report into four parts: title, summary, footnote and the body data.
The body is one or more charts/textboxes/tables/pictures. 100 | 101 | The *add_slide* function, which is the most commonly used one, has the following parameters: 102 | 103 | :: 104 | 105 | add_slide(data=[{'data':,'slide_type':,'type':},],title='',summary='',footnote='',layouts='auto') 106 | 107 | For example, we can draw a chart on the left side, and insert a picture on the right. 108 | 109 | :: 110 | 111 | import reportgen as rpt 112 | import pandas as pd 113 | p=rpt.Report() 114 | scores=pd.DataFrame({'Jack':[90,80,100],'David':[100,70,85]},index=['Math','English','Physics']) 115 | data=[{'data':scores,'slide_type':'chart','type':'COLUMN_CLUSTERED'},\ 116 | {'data':'.\\images2.jpg','slide_type':'picture'}] 117 | p.add_slide(data=data) 118 | p.save('add_slide.pptx') 119 | 120 | As a lazy person, I also provide a solution with less code. 121 | 122 | :: 123 | 124 | import reportgen as rpt 125 | p=rpt.Report() 126 | imgs=['.\\images\\'+img for img in os.listdir('.\\images\\')] 127 | p.add_slides(data=imgs) 128 | # a more flexible way 129 | slides_data=[{'title':'ppt{}'.format(i),'data':data} for i in range(10)] 130 | p.add_slides(slides_data) 131 | p.save('add_slides.pptx') 132 | 133 | 134 | Now you can get a quick overview of any data. 135 | 136 | :: 137 | 138 | import pandas as pd 139 | import reportgen as rpt 140 | 141 | data=pd.read_excel('Scores.xlsx') 142 | rpt.AnalysisReport(data,filename='Analysis Report of Scores.pptx'); 143 | 144 | The script will create a pptx file that analyzes all the fields of the data in a visual way. 145 | 146 | TO DO 147 | ------- 148 | 149 | - support exporting analysis reports to html 150 | - make the chart_type recommendation more intelligent 151 | 152 | 153 | Contact 154 | -------- 155 | 156 | If you have any questions, you can email gasongjian AT 126.com. And if you have a WeChat account, you can follow my WeChat Official Account: gasongjian. 157 | 158 | Platform: all 159 | Classifier: Development Status :: 4 - Beta 160 | Classifier: Intended Audience :: Developers 161 | Classifier: License :: OSI Approved :: BSD License 162 | Classifier: Programming Language :: Python 163 | Classifier: Programming Language :: Python :: Implementation 164 | Classifier: Programming Language :: Python :: 3.4 165 | Classifier: Programming Language :: Python :: 3.5 166 | Classifier: Programming Language :: Python :: 3.6 167 | Classifier: Topic :: Software Development :: Libraries 168 | -------------------------------------------------------------------------------- /reportgen/utils/preprocessing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Mar 25 14:09:46 2018 4 | 5 | @author: JSong 6 | """ 7 | 8 | import pandas as pd 9 | import numpy as np 10 | from scipy import stats 11 | from sklearn.utils.multiclass import type_of_target 12 | 13 | 14 | __all__=['WeightOfEvidence', 15 | 'chimerge', 16 | 'Discretization'] 17 | 18 | 19 | def check_array(X,ensure_DataFrame=True,copy=True): 20 | '''Convert X to DataFrame 21 | ''' 22 | X=X.copy() 23 | if not(np.issubdtype(type(X),np.ndarray)): 24 | X=np.array(X) 25 | X=pd.DataFrame(X) 26 | return X 27 | 28 | 29 | def _features_selected(X, selected="all"): 30 | """Return the column names of the selected features 31 | 32 | Parameters 33 | ---------- 34 | X : {array-like}, shape [n_samples, n_features] 35 | 36 | selected: "all" or array of indices or mask 37 | Specify which features to apply the transform to.
38 | 39 | Returns 40 | ------- 41 | n_features_new : array 42 | """ 43 | 44 | X=check_array(X) 45 | if selected == "all": 46 | return np.array(X.columns) 47 | n_features = X.shape[1] 48 | sel = pd.Series(np.zeros(n_features, dtype=bool),index=X.columns) 49 | sel[np.asarray(selected)] = True 50 | return np.array(X.columns[sel]) 51 | 52 | 53 | class WeightOfEvidence(): 54 | """ WOE Encoder 55 | 56 | parameters: 57 | ----------- 58 | 59 | categorical_features : "all" or array of indices or mask 60 | Specify what features are treated as categorical. 61 | 62 | - 'all' (default): All features are treated as categorical. 63 | - array of indices: Array of categorical feature indices. 64 | - mask: Array of length n_features and with dtype=bool. 65 | encoder_na: default False, take nan as a single class of the features 66 | 67 | attribute: 68 | ----------- 69 | woe (Dict): - the woe of trained data 70 | iv (Dict): - info value of trained data 71 | """ 72 | 73 | def __init__(self,categorical_features='all',encoder_na=False,woe_min=-20, woe_max=20): 74 | self.woe = {} 75 | self.iv = {} 76 | self.encoder_na=encoder_na 77 | self.woe_min=woe_min 78 | self.woe_max=woe_max 79 | self.categorical_features=categorical_features 80 | 81 | def _posibility(self, x, tag, event=1): 82 | """计算触发概率 83 | Parameters: 84 | ---------- 85 | x (Sequence): - 离散特征序列 86 | tag (Sequence): - 用于训练的标签序列 87 | event (any): - True指代的触发事件 88 | Returns: 89 | ---------- 90 | Dict[str,Tuple[rate_T, rate_F]]: - 训练好后的好坏触发概率 91 | """ 92 | if type_of_target(tag) not in ['binary']: 93 | raise AttributeError("tag must be a binary array") 94 | #if type_of_target(x) in ['continuous']: 95 | # raise AttributeError("input array must not continuous") 96 | tag = np.array(tag) 97 | x = np.array(x) 98 | event_total = (tag == event).sum() 99 | non_event_total = tag.shape[-1] - event_total 100 | x_labels = pd.unique(x[pd.notnull(x)]) 101 | pos_dic = {} 102 | for x1 in x_labels: 103 | # 当 x1 是nan时,y1 也为空 104 | y1 = tag[np.where(x == x1)[0]] 105 | event_count = (y1 == event).sum() 106 | non_event_count = y1.shape[-1] - event_count 107 | rate_event = 1.0 * event_count / event_total 108 | rate_non_event = 1.0 * non_event_count / non_event_total 109 | pos_dic[x1] = (rate_event, rate_non_event) 110 | return pos_dic 111 | 112 | def fit(self, X, y, event=1): 113 | """训练对单独一项自变量(列,特征)的woe值. 
114 | WOE_k=log (该特征中正类占比/该特征中负类占比) 115 | Parameters: 116 | ----------- 117 | X : DataFrame, 训练数据 118 | y (Sequence): 标签 119 | event: - True指代的触发事件 120 | woe_min (munber): - woe的最小值,默认值为 -20 121 | woe_max (munber): - woe的最大值,默认值为 20 122 | """ 123 | X = check_array(X,ensure_DataFrame=True) 124 | y = np.array(y) 125 | if np.isnan(y).sum()>0: 126 | raise AttributeError("y contain NaN number!") 127 | feartures_new=_features_selected(X,self.categorical_features) 128 | if self.encoder_na: 129 | X[feartures_new]=X[feartures_new].fillna('np.nan') 130 | for v in feartures_new: 131 | woe_dict = {} 132 | iv = 0 133 | pos_dic = self._posibility(x=X[v], tag=y, event=event) 134 | for l, (rate_event, rate_non_event) in pos_dic.items(): 135 | if rate_event == 0: 136 | woe1 = self.woe_min 137 | elif rate_non_event == 0: 138 | woe1 = self.woe_max 139 | else: 140 | woe1 = np.log(rate_event / rate_non_event) # np.log就是ln 141 | iv += (rate_event - rate_non_event) * woe1 142 | woe_dict[l] = woe1 143 | self.woe[v] = woe_dict 144 | self.iv[v] = iv 145 | 146 | def transform(self, X): 147 | """将离散特征序列转换为woe值组成的序列 148 | Parameters: 149 | X : DataFrame, 训练数据 150 | Returns: 151 | DataFrame: - 替换特征序列枚举值为woe对应数值后的序列 152 | """ 153 | X=check_array(X) 154 | feartures_new=_features_selected(X,self.categorical_features) 155 | if self.encoder_na: 156 | X[feartures_new]=X[feartures_new].fillna('np.nan') 157 | for v in feartures_new: 158 | X[v]=X[v].replace(self.woe[v]) 159 | return X 160 | def fit_transform(self,X,y,event=1): 161 | self.fit(X, y, event=event) 162 | return self.transform(X) 163 | 164 | 165 | 166 | def _chisqure_fo(fo): 167 | if any(fo==0): 168 | fo=fo+1 169 | s=stats.chi2_contingency(fo) 170 | return s[0],s[1] 171 | 172 | 173 | def chimerge(x,y,max_intervals=30,threshold=5,sample=None): 174 | '''卡方分箱 175 | parameter 176 | --------- 177 | x: {array-like}, shape [n_samples, 1] 178 | y: target, connot contain nan 179 | max_intervals: 最大的区间数 180 | threshold:卡方阈值(两个变量) 181 | sample: int,当样本数过大时,对数据进行取样 182 | 183 | return 184 | ------ 185 | bins: 186 | 187 | ''' 188 | 189 | x=pd.Series(x) 190 | y=pd.Series(y) 191 | class_y=list(pd.unique(y[pd.notnull(y)])) 192 | value_max=x.max() 193 | #value_max=np.sort(x)[-1] 194 | value_min=x.min() 195 | # 随机取样,且确保取样后的y能包含class_y中的所有类别 196 | if isinstance(sample,int): 197 | sample=min(sample,len(x)) 198 | tmp=set() 199 | while tmp!=set(class_y): 200 | cc=np.random.choice([True,False],size=len(x),p=[sample/len(x),1-sample/len(x)]) 201 | tmp=set(np.unique(y[cc])) 202 | x=x[cc] 203 | y=y[cc] 204 | fo=pd.crosstab(x,y)# 列联表 205 | fo=fo.sort_index() 206 | 207 | while fo.shape[0] > max_intervals: 208 | chitest={} 209 | index=list(fo.index) 210 | for r in range(len(fo)-1): 211 | #chi2,_=stats.chi2_contingency(fo.iloc[[r,r+1],:]) 212 | chi2,_=_chisqure_fo(fo.iloc[[r,r+1],:]) 213 | if chi2 not in chitest: 214 | chitest[chi2]=[] 215 | chitest[chi2].append((r,r+1)) 216 | smallest = min(chitest.keys()) 217 | if smallest <= threshold: 218 | #print('最小的chi2值: {}'.format(smallest)) 219 | #print([(index[r[0]],index[r[1]]) for r in list(reversed(chitest[smallest]))]) 220 | for (lower,upper) in list(reversed(chitest[smallest])): 221 | fo.loc[index[lower],:]=fo.loc[index[lower],:]+fo.loc[index[upper],:] 222 | fo = fo.drop(index[upper],axis=0) 223 | #print('已经删除 {}'.format(index[upper])) 224 | else: 225 | break 226 | bins=list(fo.index)+[value_max] 227 | bins[0]=value_min 228 | # 如果bins都是数值,则最左和最右都扩大1%以囊括最小最大值 229 | if np.issubdtype(type(bins[0]),np.number): 230 | bins[0]=bins[0]*0.99 if bins[0]>0 
else bins[0]-0.01 231 | bins[-1]=bins[-1]*1.01 232 | return bins 233 | 234 | 235 | class Discretization(): 236 | """离散化连续数据.需要实例化以保存bins状态. 237 | parameter: 238 | bins (Sequence): - 用于分段的列表,第一位为下限,最后一位为上限 239 | method: 离散的方法 240 | """ 241 | 242 | def __init__(self, bins=None,method='auto',continous_features='all',**kwargs): 243 | self.bins = bins 244 | self.method=method 245 | self.continous_features=continous_features 246 | if 'max_intervals' in kwargs: 247 | self.max_intervals=kwargs['max_intervals'] 248 | else: 249 | self.max_intervals=10 250 | if 'threshold' in kwargs: 251 | self.threshold=kwargs['threshold'] 252 | else: 253 | self.threshold=5 254 | if 'sample' in kwargs: 255 | self.sample=kwargs['sample'] 256 | else: 257 | self.sample=None 258 | 259 | def fit(self,X,y=None): 260 | if self.method == 'auto': 261 | if y is not None: 262 | method='chimerge' 263 | elif self.bins is not None: 264 | method='' 265 | else: 266 | method='' 267 | else: 268 | method=self.method 269 | X=check_array(X) 270 | feartures_new=_features_selected(X,self.continous_features) 271 | if method.lower() in ['chimerge']: 272 | self.bins={} 273 | for v in feartures_new: 274 | bins=chimerge(X[v],y,max_intervals=self.max_intervals,threshold=self.threshold,sample=self.sample) 275 | self.bins[v]=bins 276 | 277 | def transform(self, X): 278 | X=check_array(X) 279 | feartures_new=_features_selected(X,self.continous_features) 280 | for v in feartures_new: 281 | bins=self.bins[v] 282 | labels=['[{},{})'.format(bins[i],bins[i+1]) for i in range(len(bins)-1)] 283 | X[v] = pd.cut(X[v], bins=bins,labels=labels,right=False) 284 | return X 285 | 286 | def fit_transform(self,X,y=None): 287 | self.fit(X,y) 288 | return self.transform(X) 289 | 290 | 291 | 292 | 293 | 294 | -------------------------------------------------------------------------------- /reportgen/utils/delaunay.py: -------------------------------------------------------------------------------- 1 | # -*- coding: ascii -*- 2 | """ 3 | Simple structured Delaunay triangulation in 2D with Bowyer-Watson algorithm. 4 | 5 | Written by Jose M. Espadero ( http://github.com/jmespadero/pyDelaunay2D ) 6 | Based on code from Ayron Catteau. Published at http://github.com/ayron/delaunay 7 | 8 | Just pretend to be simple and didactic. The only requisite is numpy. 9 | Robust checks disabled by default. May not work in degenerate set of points. 10 | """ 11 | 12 | import numpy as np 13 | from math import sqrt 14 | 15 | 16 | class Delaunay2D: 17 | """ 18 | Class to compute a Delaunay triangulation in 2D 19 | ref: http://en.wikipedia.org/wiki/Bowyer-Watson_algorithm 20 | ref: http://www.geom.uiuc.edu/~samuelp/del_project.html 21 | """ 22 | 23 | def __init__(self, center=(0, 0), radius=9999): 24 | """ Init and create a new frame to contain the triangulation 25 | center -- Optional position for the center of the frame. Default (0,0) 26 | radius -- Optional distance from corners to the center. 27 | """ 28 | center = np.asarray(center) 29 | # Create coordinates for the corners of the frame 30 | self.coords = [center+radius*np.array((-1, -1)), 31 | center+radius*np.array((+1, -1)), 32 | center+radius*np.array((+1, +1)), 33 | center+radius*np.array((-1, +1))] 34 | 35 | # Create two dicts to store triangle neighbours and circumcircles. 
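# self.triangles maps each triangle (a CCW tuple of three vertex indices) to the list of its three neighbouring triangles, and self.circles maps it to a (circumcenter, squared circumradius) pair.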
36 | self.triangles = {} 37 | self.circles = {} 38 | 39 | # Create two CCW triangles for the frame 40 | T1 = (0, 1, 3) 41 | T2 = (2, 3, 1) 42 | self.triangles[T1] = [T2, None, None] 43 | self.triangles[T2] = [T1, None, None] 44 | 45 | # Compute circumcenters and circumradius for each triangle 46 | for t in self.triangles: 47 | self.circles[t] = self.Circumcenter(t) 48 | 49 | def Circumcenter(self, tri): 50 | """Compute Circumcenter and circumradius of a triangle in 2D. 51 | Uses an extension of the method described here: 52 | http://www.ics.uci.edu/~eppstein/junkyard/circumcenter.html 53 | """ 54 | pts = np.asarray([self.coords[v] for v in tri]) 55 | pts2 = np.dot(pts, pts.T) 56 | A = np.bmat([[2 * pts2, [[1], 57 | [1], 58 | [1]]], 59 | [[[1, 1, 1, 0]]]]) 60 | 61 | b = np.hstack((np.sum(pts * pts, axis=1), [1])) 62 | x = np.linalg.solve(A, b) 63 | bary_coords = x[:-1] 64 | center = np.dot(bary_coords, pts) 65 | 66 | # radius = np.linalg.norm(pts[0] - center) # euclidean distance 67 | radius = np.sum(np.square(pts[0] - center)) # squared distance 68 | return (center, radius) 69 | 70 | def inCircleFast(self, tri, p): 71 | """Check if point p is inside of precomputed circumcircle of tri. 72 | """ 73 | center, radius = self.circles[tri] 74 | return np.sum(np.square(center - p)) <= radius 75 | 76 | def inCircleRobust(self, tri, p): 77 | """Check if point p is inside of circumcircle around the triangle tri. 78 | This is a robust predicate, slower than compare distance to centers 79 | ref: http://www.cs.cmu.edu/~quake/robust.html 80 | """ 81 | m1 = np.asarray([self.coords[v] - p for v in tri]) 82 | m2 = np.sum(np.square(m1), axis=1).reshape((3, 1)) 83 | m = np.hstack((m1, m2)) # The 3x3 matrix to check 84 | return np.linalg.det(m) <= 0 85 | 86 | def AddPoint(self, p): 87 | """Add a new point to the current DT, and refine it using Bowyer-Watson. 88 | """ 89 | p = np.asarray(p) 90 | idx = len(self.coords) 91 | # print("coords[", idx,"] ->",p) 92 | self.coords.append(p) 93 | 94 | # Search the triangle(s) whose circumcircle contains p 95 | bad_triangles = [] 96 | for T in self.triangles: 97 | # Choose one method: inCircleRobust(T, p) or inCircleFast(T, p) 98 | if self.inCircleFast(T, p): 99 | bad_triangles.append(T) 100 | 101 | # Find the CCW boundary (star shape) of the bad triangles, 102 | # expressed as a list of edges (point pairs) and the opposite 103 | # triangle to each edge. 104 | boundary = [] 105 | # Choose a "random" triangle and edge 106 | T = bad_triangles[0] 107 | edge = 0 108 | # get the opposite triangle of this edge 109 | while True: 110 | # Check if edge of triangle T is on the boundary... 
111 | # if opposite triangle of this edge is external to the list 112 | tri_op = self.triangles[T][edge] 113 | if tri_op not in bad_triangles: 114 | # Insert edge and external triangle into boundary list 115 | boundary.append((T[(edge+1) % 3], T[(edge-1) % 3], tri_op)) 116 | 117 | # Move to next CCW edge in this triangle 118 | edge = (edge + 1) % 3 119 | 120 | # Check if boundary is a closed loop 121 | if boundary[0][0] == boundary[-1][1]: 122 | break 123 | else: 124 | # Move to next CCW edge in opposite triangle 125 | edge = (self.triangles[tri_op].index(T) + 1) % 3 126 | T = tri_op 127 | 128 | # Remove triangles too near of point p of our solution 129 | for T in bad_triangles: 130 | del self.triangles[T] 131 | del self.circles[T] 132 | 133 | # Retriangle the hole left by bad_triangles 134 | new_triangles = [] 135 | for (e0, e1, tri_op) in boundary: 136 | # Create a new triangle using point p and edge extremes 137 | T = (idx, e0, e1) 138 | 139 | # Store circumcenter and circumradius of the triangle 140 | self.circles[T] = self.Circumcenter(T) 141 | 142 | # Set opposite triangle of the edge as neighbour of T 143 | self.triangles[T] = [tri_op, None, None] 144 | 145 | # Try to set T as neighbour of the opposite triangle 146 | if tri_op: 147 | # search the neighbour of tri_op that use edge (e1, e0) 148 | for i, neigh in enumerate(self.triangles[tri_op]): 149 | if neigh: 150 | if e1 in neigh and e0 in neigh: 151 | # change link to use our new triangle 152 | self.triangles[tri_op][i] = T 153 | 154 | # Add triangle to a temporal list 155 | new_triangles.append(T) 156 | 157 | # Link the new triangles each another 158 | N = len(new_triangles) 159 | for i, T in enumerate(new_triangles): 160 | self.triangles[T][1] = new_triangles[(i+1) % N] # next 161 | self.triangles[T][2] = new_triangles[(i-1) % N] # previous 162 | 163 | def exportTriangles(self): 164 | """Export the current list of Delaunay triangles 165 | """ 166 | # Filter out triangles with any vertex in the extended BBox 167 | return [(a-4, b-4, c-4) 168 | for (a, b, c) in self.triangles if a > 3 and b > 3 and c > 3] 169 | 170 | def exportCircles(self): 171 | """Export the circumcircles as a list of (center, radius) 172 | """ 173 | # Remember to compute circumcircles if not done before 174 | # for t in self.triangles: 175 | # self.circles[t] = self.Circumcenter(t) 176 | 177 | # Filter out triangles with any vertex in the extended BBox 178 | # Do sqrt of radius before of return 179 | return [(self.circles[(a, b, c)][0], sqrt(self.circles[(a, b, c)][1])) 180 | for (a, b, c) in self.triangles if a > 3 and b > 3 and c > 3] 181 | 182 | def exportDT(self): 183 | """Export the current set of Delaunay coordinates and triangles. 184 | """ 185 | # Filter out coordinates in the extended BBox 186 | coord = self.coords[4:] 187 | 188 | # Filter out triangles with any vertex in the extended BBox 189 | tris = [(a-4, b-4, c-4) 190 | for (a, b, c) in self.triangles if a > 3 and b > 3 and c > 3] 191 | return coord, tris 192 | 193 | def exportExtendedDT(self): 194 | """Export the Extended Delaunay Triangulation (with the frame vertex). 195 | """ 196 | return self.coords, list(self.triangles) 197 | 198 | def exportVoronoiRegions(self): 199 | """Export coordinates and regions of Voronoi diagram as indexed data. 
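As a minimal usage sketch (illustrative, not part of the original module), the class is driven point by point and the results are read back through the export helpers:

    import numpy as np
    from reportgen.utils.delaunay import Delaunay2D   # assumed import path

    np.random.seed(0)
    seeds = np.random.rand(10, 2)                 # 10 random points in the unit square

    dt = Delaunay2D(center=(0.5, 0.5), radius=50)
    for p in seeds:
        dt.AddPoint(p)

    coords, tris = dt.exportDT()                  # vertices and triangles without the frame
    vor_coords, regions = dt.exportVoronoiRegions()
    print(len(tris), 'triangles,', len(regions), 'Voronoi regions')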
200 | """ 201 | # Remember to compute circumcircles if not done before 202 | # for t in self.triangles: 203 | # self.circles[t] = self.Circumcenter(t) 204 | useVertex = {i:[] for i in range(len(self.coords))} 205 | vor_coors = [] 206 | index={} 207 | # Build a list of coordinates and a index per triangle/region 208 | for tidx, (a, b, c) in enumerate(self.triangles): 209 | vor_coors.append(self.circles[(a,b,c)][0]) 210 | # Insert triangle, rotating it so the key is the "last" vertex 211 | useVertex[a]+=[(b, c, a)] 212 | useVertex[b]+=[(c, a, b)] 213 | useVertex[c]+=[(a, b, c)] 214 | # Set tidx as the index to use with this triangles 215 | index[(a, b, c)] = tidx; 216 | index[(c, a, b)] = tidx; 217 | index[(b, c, a)] = tidx; 218 | 219 | # init regions per coordinate dictionary 220 | regions = {} 221 | # Sort each region in a coherent order, and substitude each triangle 222 | # by its index 223 | for i in range (4, len(self.coords)): 224 | v = useVertex[i][0][0] # Get a vertex of a triangle 225 | r=[] 226 | for _ in range(len(useVertex[i])): 227 | # Search the triangle beginning with vertex v 228 | t = [t for t in useVertex[i] if t[0] == v][0] 229 | r.append(index[t]) # Add the index of this triangle to region 230 | v = t[1] # Choose the next vertex to search 231 | regions[i-4]=r # Store region. 232 | 233 | return vor_coors, regions 234 | -------------------------------------------------------------------------------- /reportgen/utils/metrics.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import pandas as pd 4 | import numpy as np 5 | from scipy import stats 6 | import scipy.spatial as ss 7 | from scipy.special import digamma 8 | from math import log 9 | import numpy.random as nr 10 | import random 11 | 12 | #from collections import Iterable 13 | 14 | __all__=['entropy', 15 | 'entropyc', 16 | 'entropyd', 17 | 'chi2', 18 | 'info_value'] 19 | 20 | 21 | # 待定,还未修改好 22 | class feature_encoder(): 23 | ''' 24 | 用于单个特征对因变量的分析,如 25 | - 该特征中每个item的影响力 26 | - 对item重编码 27 | 28 | ''' 29 | 30 | def chi2(X,y): 31 | N=pd.Series(y).count() 32 | fo=pd.crosstab(X,y) 33 | fe=stats.contingency.expected_freq(fo) 34 | weight_chi2=(fo-fe)**2/fe/N/min(fo.shape[0],fo.shape[1]) 35 | weight_chi2=weight_chi2.sum(axis=1) 36 | return weight_chi2 37 | 38 | 39 | def woe(X,y): 40 | ctable=pd.crosstab(X,y) 41 | # 如果有0则每一项都加1 42 | ctable=ctable+1 if (ctable==0).any().any() else ctable 43 | if ctable.shape[1]==2: 44 | n_g,n_b=ctable.sum() 45 | ctable=(ctable/ctable.sum()).assign(woe=lambda x:np.log2(x.iloc[:,0]/x.iloc[:,1]))\ 46 | .assign(ivi=lambda x:(x.iloc[:,0]-x.iloc[:,1])*x['woe']) 47 | return ctable.loc[:,['woe','ivi']] 48 | else: 49 | woe_dict={} 50 | p=ctable.sum()/ctable.sum().sum() 51 | for cc in ctable.columns: 52 | ctable_bin=pd.DataFrame(index=ctable.index,columns=['one','rest']) 53 | ctable_bin['one']=ctable.loc[:,cc] 54 | ctable_bin['rest']=ctable.loc[:,~(ctable.columns==cc)].sum(axis=1) 55 | n_o,n_r=ctable_bin.sum() 56 | ctable_bin=ctable_bin/ctable_bin.sum() 57 | ctable_bin['woe']=np.log2(ctable_bin['one']/ctable_bin['rest']) 58 | ctable_bin['ivi']=(ctable_bin['one']-ctable_bin['rest'])*ctable_bin['woe'] 59 | woe_dict[cc]=ctable_bin.loc[:,['woe','ivi']] 60 | tmp=0 61 | for cc in ctable.columns: 62 | tmp+=woe_dict[cc]*p[cc] 63 | woe_dict['avg']=tmp 64 | return woe_dict 65 | 66 | 67 | 68 | def chi2(X,y): 69 | '''计算一组数据的卡方值,弥补sklearn中的chi2只支持2*2的缺憾 70 | parameter 71 | ---------- 72 | X:可以是单个特征,也可以是一组特征 73 | y:目标变量 74 | 75 | return 76 | ------ 
77 | chi2_value: np.array 数组 78 | chi2_pvalue:np.array 数组 79 | ''' 80 | X=np.asarray(X) 81 | if len(X.shape)==1: 82 | X=X.reshape((len(X),1)) 83 | X=pd.DataFrame(X) 84 | chi2_value=[] 85 | chi2_pvalue=[] 86 | for c in X.columns: 87 | fo=pd.crosstab(X[c],y) 88 | s=stats.chi2_contingency(fo) 89 | chi2_value.append(s[0]) 90 | chi2_pvalue.append(s[1]) 91 | return (np.array(chi2_value),np.array(chi2_pvalue)) 92 | 93 | 94 | 95 | # 待定 96 | def info_value(X,y,bins='auto'): 97 | '''计算连续变量的IV值 98 | 计算X和y之间的IV值 99 | IV=\sum (g_k/n_g-b_k/n_b)*log2(g_k*n_b/n_g/) 100 | ''' 101 | threshold=[] 102 | for q in [0.05,0.04,0.03,0.02,0.01,1e-7]: 103 | t_down=max([X[y==k].quantile(q) for k in y.dropna().unique()]) 104 | t_up=min([X[y==k].quantile(1-q) for k in y.dropna().unique()]) 105 | threshold.append((t_down,t_up)) 106 | 107 | if bins is not None: 108 | X=pd.cut(X,bins) 109 | ctable=pd.crosstab(X,y) 110 | p=ctable.sum()/ctable.sum().sum() 111 | if ctable.shape[1]==2: 112 | ctable=ctable/ctable.sum() 113 | IV=((ctable.iloc[:,0]-ctable.iloc[:,1])*np.log2(ctable.iloc[:,0]/ctable.iloc[:,1])).sum() 114 | return IV 115 | 116 | IV=0 117 | for cc in ctable.columns: 118 | ctable_bin=pd.concat([ctable[cc],ctable.loc[:,~(ctable.columns==cc)].sum(axis=1)],axis=1) 119 | ctable_bin=ctable_bin/ctable_bin.sum() 120 | IV_bin=((ctable_bin.iloc[:,0]-ctable_bin.iloc[:,1])*np.log2(ctable_bin.iloc[:,0]/ctable_bin.iloc[:,1])).sum() 121 | IV+=IV_bin*p[cc] 122 | return IV 123 | 124 | 125 | 126 | # 计算离散随机变量的熵 127 | class entropy: 128 | 129 | ''' 130 | 计算样本的熵以及相关的指标 131 | 函数的输入默认均为原始的样本集 132 | 133 | ''' 134 | def entropy(X): 135 | ''' 136 | 计算随机变量的信息熵 137 | H(X)=-\sum p_i log2(p_i) 138 | ''' 139 | X=pd.Series(X) 140 | p=X.value_counts(normalize=True) 141 | p=p[p>0] 142 | h=-(p*np.log2(p)).sum() 143 | return h 144 | 145 | 146 | def cond_entropy(x,y): 147 | ''' 148 | 计算随机变量的条件熵 149 | y必须是因子型变量 150 | H(X,y)=\sum p(y_i)H(X|y=y_i) 151 | ''' 152 | #h=entropy_combination(X,y)-entropy(y) 153 | y=pd.Series(y) 154 | p=y.value_counts(normalize=True) 155 | h=0 156 | for yi in y.dropna().unique(): 157 | h+=p[yi]*entropy.entropy(x[y==yi]) 158 | return h 159 | 160 | def comb_entropy(x,y): 161 | ''' 162 | 计算随机变量的联合熵 163 | H(X,y)=-\sum p(x_i,y_i)*log2(p(x_i,y_i))=H(X)+H(y|X) 164 | ''' 165 | ''' 166 | w=pd.crosstab(X,y) 167 | N=w.sum().sum() 168 | w=w/N 169 | w=w.values.flatten() 170 | w=w[w>0] 171 | h=-(w*np.log2(w)).sum() 172 | ''' 173 | h=entropy.entropy(y)+entropy.cond_entropy(x,y) 174 | return h 175 | 176 | def mutual_info(x,y): 177 | ''' 178 | 计算随机变量的互信息 179 | I(X;y)=H(X)-H(X|y)=H(y)-H(y|X) 180 | ''' 181 | h=entropy.entropy(x)-entropy.cond_entropy(x,y) 182 | return h 183 | 184 | def info_gain(x,y): 185 | ''' 186 | 计算随机变量的互信息 187 | I(X;y)=H(X)-H(X|y)=H(y)-H(y|X) 188 | ''' 189 | h=entropy.entropy(x)-entropy.cond_entropy(x,y) 190 | return h 191 | 192 | def info_gain_ratio(x,y): 193 | ''' 194 | 计算随机变量的信息增益比,此时X是总体,y是某个特征 195 | I(X;y)=H(X)-H(X|y)=H(y)-H(y|X) 196 | IG(X;y)=I(X;y)/H(y) 197 | ''' 198 | h=entropy.entropy(x)-entropy.cond_entropy(x,y) 199 | hy=entropy.entropy(y) 200 | h=h/hy if hy>0 else 0 201 | return h 202 | 203 | 204 | 205 | def cross_entropy(x,y): 206 | ''' 207 | 计算随机变量的交叉熵 208 | 要求X和y的测度空间相同,此时X和y的样本数量可以不一致 209 | 210 | H(p,q)=-\sum p(x)log2(q(x)) 211 | 212 | parameter 213 | -------- 214 | ''' 215 | X=pd.Series(x) 216 | y=pd.Series(y) 217 | p=X.value_counts(normalize=True) 218 | q=y.value_counts(normalize=True) 219 | h=-(p*np.log2(q)).sum() 220 | return h 221 | 222 | 223 | def relative_entropy(x,y): 224 | ''' 225 | 计算随机变量的相对熵 226 | 
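A small illustrative example for the chi2 helper above (the import path and column names are assumptions):

    import numpy as np
    import pandas as pd
    from reportgen.utils.metrics import chi2      # assumed import path

    np.random.seed(0)
    X = pd.DataFrame({'gender': np.random.choice(['M', 'F'], 300),
                      'grade':  np.random.choice(['A', 'B', 'C'], 300)})
    y = np.random.choice([0, 1], 300)

    chi2_value, chi2_pvalue = chi2(X, y)          # one statistic and p-value per column
    print(dict(zip(X.columns, np.round(chi2_pvalue, 3))))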
要求X和y的测度空间相同,此时X和y的样本数量可以不一致 227 | D=\sum p(x) log2(p(x)/q(x))=H(p,q)-H(p) 228 | 229 | parameter 230 | -------- 231 | dtype: X和y的数据类型,因子变量category和数值变量numeric,默认是category 232 | ''' 233 | 234 | X=pd.Series(x) 235 | y=pd.Series(y) 236 | p=X.value_counts(normalize=True) 237 | q=y.value_counts(normalize=True) 238 | #h=entropy.entropy_cross(p,q)-entropy.entropy(p) 239 | h=(p*np.log2(p/q)).sum() 240 | return h 241 | 242 | 243 | 244 | 245 | # 计算连续变量的熵(利用分布进行近似 CONTINUOUS ESTIMATORS) 246 | class entropyc: 247 | 248 | ''' 249 | 原作者:Greg Ver Steeg 250 | GitHub:https://github.com/gregversteeg/NPEET 251 | Or go to http://www.isi.edu/~gregv/npeet.html 252 | 253 | ref:Alexander Kraskov etc. Estimating mutual information. Phys. Rev. E, 69:066138, Jun 2004 254 | 255 | 连续分布的熵估计 256 | ''' 257 | 258 | def __reshape(x): 259 | x=np.asarray(x) 260 | if len(x.shape)==1: 261 | x=x.reshape((len(x),1)) 262 | return x 263 | 264 | def entropy(x, k=3, base=2): 265 | """ 266 | The classic K-L k-nearest neighbor continuous entropy estimator 267 | 268 | if x is a one-dimensional scalar and we have: 269 | H(X)=-\sum p_i log2(p_i) 270 | if we only have random sample (x1 . . . xN) of N realizations of X, 271 | we can estimator H(X): 272 | 273 | H(X) = −ψ(k) + ψ(N) + \log c_d + d/N \sum_{i=1}^{N} \log eps(i) 274 | 275 | where ψ(x) is digammer funciton,d is the dimention of x, 276 | c_d is the volume of the d-dimensional unit ball 277 | eps(i) is twice the distance from xi to its k-th neighbour 278 | 279 | parameter 280 | --------- 281 | x: 某个分布的抽样,且支持多维。 282 | k: k近邻的 283 | base:2 284 | 285 | return 286 | ------- 287 | entropy 288 | """ 289 | x=entropyc.__reshape(x) 290 | assert k <= len(x) - 1, "Set k smaller than num. samples - 1" 291 | d = len(x[0]) 292 | N = len(x) 293 | intens = 1e-10 # small noise to break degeneracy, see doc. 294 | x = [list(p + intens * nr.rand(len(x[0]))) for p in x] 295 | tree = ss.cKDTree(x) 296 | nn = [tree.query(point, k + 1, p=float('inf'))[0][k] for point in x] 297 | const = digamma(N) - digamma(k) + d * log(base) 298 | return (const + d * np.mean(list(map(log, nn)))) / log(base) 299 | 300 | def cond_entropy(x, y, k=3, base=2): 301 | """ The classic K-L k-nearest neighbor continuous entropy estimator for the 302 | entropy of X conditioned on Y. 303 | """ 304 | hxy = entropyc.entropy([xi + yi for (xi, yi) in zip(x, y)], k, base) 305 | hy = entropyc.entropy(y, k, base) 306 | return hxy - hy 307 | 308 | def __column(xs, i): 309 | return [[x[i]] for x in xs] 310 | 311 | def tc(xs, k=3, base=2): 312 | xis = [entropyc.entropy(entropyc.__column(xs, i), k, base) for i in range(0, len(xs[0]))] 313 | return np.sum(xis) - entropyc.entropy(xs, k, base) 314 | 315 | def ctc(xs, y, k=3, base=2): 316 | xis = [entropyc.cond_entropy(entropyc.__column(xs, i), y, k, base) for i in range(0, len(xs[0]))] 317 | return np.sum(xis) - entropyc.cond_entropy(xs, y, k, base) 318 | 319 | def corex(xs, ys, k=3, base=2): 320 | cxis = [entropyc.mutual_info(entropyc.__column(xs, i), ys, k, base) for i in range(0, len(xs[0]))] 321 | return np.sum(cxis) - entropyc.mutual_info(xs, ys, k, base) 322 | 323 | def mutual_info(x, y, k=3, base=2): 324 | """ Mutual information of x and y 325 | x, y should be a list of vectors, e.g. x = [[1.3], [3.7], [5.1], [2.4]] 326 | if x is a one-dimensional scalar and we have four samples 327 | """ 328 | x=entropyc.__reshape(x) 329 | y=entropyc.__reshape(y) 330 | assert len(x) == len(y), "Lists should have same length" 331 | assert k <= len(x) - 1, "Set k smaller than num. 
samples - 1" 332 | intens = 1e-10 # small noise to break degeneracy, see doc. 333 | x = [list(p + intens * nr.rand(len(x[0]))) for p in x] 334 | y = [list(p + intens * nr.rand(len(y[0]))) for p in y] 335 | points = zip2(x, y) 336 | # Find nearest neighbors in joint space, p=inf means max-norm 337 | tree = ss.cKDTree(points) 338 | dvec = [tree.query(point, k + 1, p=float('inf'))[0][k] for point in points] 339 | a, b, c, d = avgdigamma(x, dvec), avgdigamma(y, dvec), digamma(k), digamma(len(x)) 340 | return (-a - b + c + d) / log(base) 341 | 342 | 343 | def cond_mutual_info(x, y, z, k=3, base=2): 344 | """ Mutual information of x and y, conditioned on z 345 | x, y, z should be a list of vectors, e.g. x = [[1.3], [3.7], [5.1], [2.4]] 346 | if x is a one-dimensional scalar and we have four samples 347 | """ 348 | x=entropyc.__reshape(x) 349 | y=entropyc.__reshape(y) 350 | z=entropyc.__reshape(z) 351 | assert len(x) == len(y), "Lists should have same length" 352 | assert k <= len(x) - 1, "Set k smaller than num. samples - 1" 353 | intens = 1e-10 # small noise to break degeneracy, see doc. 354 | x = [list(p + intens * nr.rand(len(x[0]))) for p in x] 355 | y = [list(p + intens * nr.rand(len(y[0]))) for p in y] 356 | z = [list(p + intens * nr.rand(len(z[0]))) for p in z] 357 | points = zip2(x, y, z) 358 | # Find nearest neighbors in joint space, p=inf means max-norm 359 | tree = ss.cKDTree(points) 360 | dvec = [tree.query(point, k + 1, p=float('inf'))[0][k] for point in points] 361 | a, b, c, d = avgdigamma(zip2(x, z), dvec), avgdigamma(zip2(y, z), dvec), avgdigamma(z, dvec), digamma(k) 362 | return (-a - b + c + d) / log(base) 363 | 364 | 365 | def kl_div(x, xp, k=3, base=2): 366 | """ KL Divergence between p and q for x~p(x), xp~q(x) 367 | x, xp should be a list of vectors, e.g. x = [[1.3], [3.7], [5.1], [2.4]] 368 | if x is a one-dimensional scalar and we have four samples 369 | """ 370 | x=entropyc.__reshape(x) 371 | xp=entropyc.__reshape(xp) 372 | assert k <= len(x) - 1, "Set k smaller than num. samples - 1" 373 | assert k <= len(xp) - 1, "Set k smaller than num. samples - 1" 374 | assert len(x[0]) == len(xp[0]), "Two distributions must have same dim." 
375 | d = len(x[0]) 376 | n = len(x) 377 | m = len(xp) 378 | const = log(m) - log(n - 1) 379 | tree = ss.cKDTree(x) 380 | treep = ss.cKDTree(xp) 381 | nn = [tree.query(point, k + 1, p=float('inf'))[0][k] for point in x] 382 | nnp = [treep.query(point, k, p=float('inf'))[0][k - 1] for point in x] 383 | return (const + d * np.mean(list(map(log, nnp))) - d * np.mean(list(map(log, nn)))) / log(base) 384 | 385 | 386 | 387 | # 计算随机变量的熵(直接离散话估计 DISCRETE ESTIMATORS) 388 | class entropyd: 389 | 390 | def entropy(sx, base=2): 391 | """ Discrete entropy estimator 392 | Given a list of samples which can be any hashable object 393 | """ 394 | return entropyd.entropyfromprobs(entropyd.hist(sx), base=base) 395 | 396 | 397 | def mutual_info(x, y, base=2): 398 | """ Discrete mutual information estimator 399 | Given a list of samples which can be any hashable object 400 | """ 401 | return -entropyd.entropy(zip(x, y), base) + entropyd.entropy(x, base) + entropyd.entropy(y, base) 402 | 403 | def cond_mutual_info(x, y, z): 404 | """ Discrete mutual information estimator 405 | Given a list of samples which can be any hashable object 406 | """ 407 | return entropyd.entropy(zip(y, z))+entropyd.entropy(zip(x, z))-entropyd.entropy(zip(x, y, z))-entropyd.entropy(z) 408 | 409 | def cond_entropy(x, y, base=2): 410 | """ The classic K-L k-nearest neighbor continuous entropy estimator for the 411 | entropy of X conditioned on Y. 412 | """ 413 | return entropyd.entropy(zip(x, y), base) - entropyd.entropy(y, base) 414 | 415 | def tcd(xs, base=2): 416 | xis = [entropyd.entropy(entropyd._column(xs, i), base) for i in range(0, len(xs[0]))] 417 | hx = entropyd.entropy(xs, base) 418 | return np.sum(xis) - hx 419 | 420 | def ctcd(xs, y, base=2): 421 | xis = [entropyd.cond_entropy(entropyd._column(xs, i), y, base) for i in range(0, len(xs[0]))] 422 | return np.sum(xis) - entropyd.cond_entropy(xs, y, base) 423 | 424 | def corexd(xs, ys, base=2): 425 | cxis = [entropyd.mutual_infod(entropyd._column(xs, i), ys, base) for i in range(0, len(xs[0]))] 426 | return np.sum(cxis) - entropyd.mutual_info(xs, ys, base) 427 | 428 | def hist(sx): 429 | sx = discretize(sx) 430 | # Histogram from list of samples 431 | d = dict() 432 | for s in sx: 433 | if type(s) == list: 434 | s = tuple(s) 435 | d[s] = d.get(s, 0) + 1 436 | return map(lambda z: float(z) / len(sx), d.values()) 437 | 438 | 439 | def entropyfromprobs(probs, base=2): 440 | # Turn a normalized list of probabilities of discrete outcomes into entropy (base 2) 441 | return -sum(map(entropyd.elog, probs)) / log(base) 442 | 443 | def _column(xs, i): 444 | return [[x[i]] for x in xs] 445 | 446 | def elog(x): 447 | # for entropy, 0 log 0 = 0. but we get an error for putting log 0 448 | if x <= 0. or x >= 1.: 449 | return 0 450 | else: 451 | return x * log(x) 452 | 453 | 454 | 455 | 456 | 457 | # UTILITY FUNCTIONS 458 | def vectorize(scalarlist): 459 | """ Turn a list of scalars into a list of one-d vectors 460 | """ 461 | return [[x] for x in scalarlist] 462 | 463 | 464 | def shuffle_test(measure, x, y, z=False, ns=200, ci=0.95, **kwargs): 465 | """ Shuffle test 466 | Repeatedly shuffle the x-values and then estimate measure(x, y, [z]). 467 | Returns the mean and conf. interval ('ci=0.95' default) over 'ns' runs. 468 | 'measure' could me mi, cmi, e.g. Keyword arguments can be passed. 469 | Mutual information and CMI should have a mean near zero. 
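A small illustrative check (import path, variable names and sample sizes are assumptions): for correlated data the Kraskov estimator entropyc.mutual_info should come out clearly positive, while shuffle_test gives a null reference interval around zero:

    import numpy as np
    from reportgen.utils.metrics import entropyc, shuffle_test   # assumed import path

    np.random.seed(0)
    x = list(np.random.randn(500))
    y = list(0.8 * np.asarray(x) + 0.6 * np.random.randn(500))

    mi = entropyc.mutual_info(x, y, k=3)                          # > 0 for dependent variables
    null_mean, ci95 = shuffle_test(entropyc.mutual_info, x, y, ns=100, k=3)
    print(round(mi, 3), round(null_mean, 3), ci95)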
470 | """ 471 | xp = x[:] # A copy that we can shuffle 472 | outputs = [] 473 | for i in range(ns): 474 | random.shuffle(xp) 475 | if z: 476 | outputs.append(measure(xp, y, z, **kwargs)) 477 | else: 478 | outputs.append(measure(xp, y, **kwargs)) 479 | outputs.sort() 480 | return np.mean(outputs), (outputs[int((1. - ci) / 2 * ns)], outputs[int((1. + ci) / 2 * ns)]) 481 | 482 | def _freedman_diaconis_bins(a): 483 | """Calculate number of hist bins using Freedman-Diaconis rule.""" 484 | # From http://stats.stackexchange.com/questions/798/ 485 | a = np.asarray(a) 486 | iqr = stats.scoreatpercentile(a, 75)-stats.scoreatpercentile(a, 25) 487 | h = 2*iqr/(len(a)**(1/3)) 488 | bins=int(np.ceil((a.max()-a.min())/h)) if h!=0 else int(np.sqrt(a.size)) 489 | return bins 490 | 491 | # INTERNAL FUNCTIONS 492 | 493 | def avgdigamma(points, dvec): 494 | # This part finds number of neighbors in some radius in the marginal space 495 | # returns expectation value of 496 | N = len(points) 497 | tree = ss.cKDTree(points) 498 | avg = 0. 499 | for i in range(N): 500 | dist = dvec[i] 501 | # subtlety, we don't include the boundary point, 502 | # but we are implicitly adding 1 to kraskov def bc center point is included 503 | num_points = len(tree.query_ball_point(points[i], dist - 1e-15, p=float('inf'))) 504 | avg += digamma(num_points) / N 505 | return avg 506 | 507 | 508 | def zip2(*args): 509 | # zip2(x, y) takes the lists of vectors and makes it a list of vectors in a joint space 510 | # E.g. zip2([[1], [2], [3]], [[4], [5], [6]]) = [[1, 4], [2, 5], [3, 6]] 511 | return [sum(sublist, []) for sublist in zip(*args)] 512 | 513 | def discretize(xs): 514 | def discretize_one(x): 515 | if len(x) > 1: 516 | return tuple(x) 517 | else: 518 | return x[0] 519 | # discretize(xs) takes a list of vectors and makes it a list of tuples or scalars 520 | return [discretize_one(x) for x in xs] 521 | -------------------------------------------------------------------------------- /reportgen/associate/fpgrowth.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | JSong:直接从 python 包 orangecontrib 中 fork,未作修改 4 | 原文: 5 | This module implements FP-growth [1] frequent pattern mining algorithm with 6 | bucketing optimization [2] for conditional databases of few items. 7 | 8 | The entry points are :obj:`frequent_itemsets()`, :obj:`association_rules()`, and 9 | :obj:`rules_stats()` functions below. 10 | 11 | 12 | [1]: J. Han, J. Pei, Y. Yin, R. Mao. 13 | Mining Frequent Patterns without Candidate Generation: A 14 | Frequent-Pattern Tree Approach. 2004. 15 | https://www.cs.sfu.ca/~jpei/publications/dami03_fpgrowth.pdf 16 | 17 | [2]: R. Agrawal, C. Aggarwal, V. Prasad. 18 | Depth first generation of long patterns. 2000. 19 | http://www.cs.tau.ac.il/~fiat/dmsem03/Depth%20First%20Generation%20of%20Long%20Patterns%20-%202000.pdf 20 | 21 | [3]: R. Agrawal, et al. 22 | Fast Discovery of Association Rules. 1996. 23 | http://cs-people.bu.edu/evimaria/cs565/advances.pdf 24 | 25 | 26 | Examples 27 | -------- 28 | Here's an example from R. Agrawal's original Apriori article [3 § 12.2.2]. 29 | Given a database of transactions: 30 | 31 | >>> T = [[1, 3, 4 ], 32 | ... [ 2, 3, 5], 33 | ... [1, 2, 3, 5], 34 | ... 
[ 2, 5]] 35 | 36 | We can enumerate all frequent itemsets with support greater than two 37 | transactions: 38 | 39 | >>> from orangecontrib.associate.fpgrowth import * # doctest: +SKIP 40 | >>> itemsets = frequent_itemsets(T, 2) 41 | 42 | Note, functions in this module produce generators. 43 | The results space can explode quite quickly 44 | and can easily be too large to fit in your RAM. By using generators, you can 45 | filter the results to your liking `as you pass them`. 46 | 47 | >>> itemsets 48 | 49 | >>> list(itemsets) 50 | [(frozenset({1}), 2), 51 | (frozenset({2}), 3), 52 | (frozenset({3}), 3), 53 | (frozenset({1, 3}), 2), 54 | (frozenset({2, 3}), 2), 55 | (frozenset({5}), 3), 56 | (frozenset({2, 5}), 3), 57 | (frozenset({3, 5}), 2), 58 | (frozenset({2, 3, 5}), 2)] 59 | 60 | We can try it with a larger and more real-world database of categorical values: 61 | 62 | >>> import Orange 63 | >>> data = Orange.data.Table('zoo') 64 | >>> data 65 | [[1, 0, 0, 1, 0, ... | mammal] {aardvark}, 66 | [1, 0, 0, 1, 0, ... | mammal] {antelope}, 67 | [0, 0, 1, 0, 0, ... | fish] {bass}, 68 | [1, 0, 0, 1, 0, ... | mammal] {bear}, 69 | [1, 0, 0, 1, 0, ... | mammal] {boar}, 70 | ... 71 | ] 72 | 73 | We can't use table data directly; we first have to one-hot transform it: 74 | 75 | >>> X, mapping = OneHot.encode(data, include_class=True) 76 | 77 | We get a database we can use to find frequent itemsets, and a mapping we will 78 | use later to revert the transformation. 79 | 80 | >>> X 81 | array([[False, True, ..., True, False], 82 | [False, True, ..., True, False], 83 | [ True, False, ..., False, False], 84 | ..., 85 | [False, True, ..., True, False], 86 | [ True, False, ..., False, False], 87 | [ True, False, ..., False, False]], dtype=bool) 88 | >>> sorted(mapping.items()) 89 | [(0, (0, 0)), 90 | (1, (0, 1)), 91 | (2, (1, 0)), 92 | (3, (1, 1)), 93 | ... 94 | (40, (16, 4)), 95 | (41, (16, 5)), 96 | (42, (16, 6))] 97 | 98 | We want itemsets with >40% support: 99 | 100 | >>> itemsets = dict(frequent_itemsets(X, .4)) 101 | >>> len(itemsets) 102 | 520 103 | 104 | The transaction-coded items corresponding to class values are: 105 | 106 | >>> class_items = {item 107 | ... for item, var, _ in OneHot.decode(mapping, data, mapping) 108 | ... if var is data.domain.class_var} 109 | >>> sorted(class_items) 110 | [36, 37, 38, 39, 40, 41, 42] 111 | 112 | That makes sense as our class variable has seven values: 113 | 114 | >>> data.domain.class_var.values 115 | ['amphibian', 'bird', 'fish', 'insect', 'invertebrate', 'mammal', 'reptile'] 116 | 117 | Now we can generate all association rules that have consequent equal to one 118 | of the class values and >80% confidence (i.e. classification rules): 119 | 120 | >>> rules = [(P, Q, supp, conf) 121 | ... for P, Q, supp, conf in association_rules(itemsets, .8) 122 | ... if len(Q) == 1 and Q & class_items] 123 | >>> len(rules) 124 | 18 125 | >>> rules 126 | [(frozenset({17, 2, 19, 20, 7}), frozenset({41}), 41, 1.0), 127 | (frozenset({17, 2, 19, 7}), frozenset({41}), 41, 1.0), 128 | ... 129 | (frozenset({20, 7}), frozenset({41}), 41, 1.0), 130 | (frozenset({7}), frozenset({41}), 41, 1.0)] 131 | 132 | To make them more helpful, we can use ``mapping`` to transform the rules' items 133 | back into table domain values, e.g. for first five rules: 134 | 135 | >>> names = {item: '{}={}'.format(var.name, val) 136 | ... for item, var, val in OneHot.decode(mapping, data, mapping)} 137 | >>> for ante, cons, supp, conf in rules[:5]: 138 | ... 
print(', '.join(names[i] for i in ante), '-->', 139 | ... names[next(iter(cons))], 140 | ... '(supp: {}, conf: {})'.format(supp, conf)) 141 | backbone=1, feathers=0, breathes=1, venomous=0, milk=1 --> type=mammal (supp: 41, conf: 1.0) 142 | backbone=1, feathers=0, breathes=1, milk=1 --> type=mammal (supp: 41, conf: 1.0) 143 | backbone=1, breathes=1, venomous=0, milk=1 --> type=mammal (supp: 41, conf: 1.0) 144 | feathers=0, breathes=1, venomous=0, milk=1 --> type=mammal (supp: 41, conf: 1.0) 145 | backbone=1, feathers=0, breathes=1, venomous=0 --> type=mammal (supp: 41, conf: 0.87...) 146 | 147 | 148 | Reference with further examples below. 149 | """ 150 | 151 | # TODO: Consider FPClose from "Efficiently using prefix-trees in mining frequent itemsets" 152 | # TODO: Consider ExAnte: Anticipated data reduction in constrained pattern mining 153 | 154 | from collections import defaultdict, Iterator 155 | from itertools import combinations, chain 156 | from functools import reduce 157 | 158 | import numpy as np 159 | from scipy.sparse import issparse, spmatrix 160 | 161 | __all__=['frequent_itemsets', 'association_rules', 'rules_stats', 'OneHot', 'preprocess'] 162 | 163 | _FP_TREE_EMPTY = (None, []) 164 | _BUCKETING_FEW_ITEMS = 10 165 | 166 | 167 | class _Node(dict): 168 | def __init__(self, item=None, parent=None, count=None): 169 | self.item = item 170 | self.parent = parent 171 | self.count = count 172 | 173 | 174 | def _bucketing_count(db, frequent_items, min_support): 175 | """ 176 | Bucket counting (bucketing) optimization for databases where few items 177 | are frequent ([2] § 5). 178 | """ 179 | # Forward and inverse mapping of frequent_items to [0, n_items) 180 | inv_map = dict(enumerate(frequent_items)).__getitem__ 181 | fwd_map = {v: k for k, v in inv_map.__self__.items()}.__getitem__ 182 | # Project transactions 183 | k = len(frequent_items) 184 | buckets = [0] * 2**k 185 | for count, transaction in db: 186 | set_bits = (fwd_map(i) for i in frequent_items.intersection(transaction)) 187 | tid = reduce(lambda a, b: a | 1 << b, set_bits, 0) 188 | buckets[tid] += count 189 | # Aggregate bucketing counts ([2], Figure 5) 190 | for i in range(0, k): 191 | i = 2**i 192 | for j in range(2**k): 193 | if j & i == 0: 194 | buckets[j] += buckets[j + i] 195 | # Announce results 196 | buckets = enumerate(buckets) 197 | next(buckets) # Skip 000...0 198 | for tid, count in buckets: 199 | if count >= min_support: 200 | yield frozenset(inv_map(i) for i, b in enumerate(reversed(bin(tid))) if b == '1'), count 201 | 202 | 203 | # Replace above bucketing count with the one from C module 204 | try: 205 | from orangecontrib.associate._fpgrowth import bucketing_count as _bucketing_count, \ 206 | BUCKETING_FEW_ITEMS as _BUCKETING_FEW_ITEMS 207 | except ImportError: 208 | # The module may not have been compiled due to compiler missing (e.g. on WinDOS); 209 | # just use above Python code 210 | pass 211 | 212 | 213 | def _fp_tree_insert(item, T, node_links, count): 214 | """ Insert item into _Node-tree T and return the new node """ 215 | node = T.get(item) 216 | if node is None: 217 | node = T[item] = _Node(item, T, count) 218 | node_links[item].append(node) 219 | else: # Node for this item already in T, just inc its count 220 | node.count += count 221 | return node 222 | 223 | 224 | def _fp_tree(db, min_support): 225 | """ 226 | FP-tree construction ([1] § 2.1, Algorithm 1). 227 | 228 | If frequent items in db are determined to be less than threshold, 229 | "bucketing" [2] is used instead. 
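For illustration (an added sketch, not from the original docstring), db is an iterable of (count, transaction) pairs, and with only a handful of frequent items the call short-circuits into the bucketing branch:

    db = [(1, {1, 3, 4}), (1, {2, 3, 5}), (1, {1, 2, 3, 5}), (1, {2, 5})]
    tree, itemsets = _fp_tree(db, min_support=2)
    # tree is None here; itemsets is an iterable of (frozenset, support) pairs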
230 | 231 | Returns 232 | ------- 233 | tuple 234 | (FP-tree, None) or (None, list of frequent itemsets with support) 235 | """ 236 | if not isinstance(db, list): db = list(db) 237 | 238 | if not db: 239 | return _FP_TREE_EMPTY 240 | 241 | # Used to count item support so it can be reported when generating itemsets 242 | item_support = defaultdict(int) 243 | # Used for ordering transactions' items for "optimally" "compressed" tree 244 | node_support = defaultdict(int) 245 | for count, transaction in db: 246 | for item in transaction: 247 | item_support[item] += count 248 | node_support[item] += 1 249 | # Only ever consider items that have min_support 250 | frequent_items = {item 251 | for item, support in item_support.items() 252 | if support >= min_support} 253 | 254 | # Short-circuit, if possible 255 | n_items = len(frequent_items) 256 | if 0 == n_items: 257 | return _FP_TREE_EMPTY 258 | if 1 == n_items: 259 | item = frequent_items.pop() 260 | return None, ((frozenset({item}), item_support[item]),) 261 | if n_items <= _BUCKETING_FEW_ITEMS: 262 | return None, ((frozenset(itemset), support) 263 | for itemset, support in _bucketing_count(db, frequent_items, min_support)) 264 | 265 | # "The items [...] should be ordered in the frequency descending order of 266 | # node occurrence of each item instead of its support" ([1], p. 12, bottom) 267 | sort_index = {item: i 268 | for i, item in 269 | enumerate(sorted(frequent_items, 270 | key=node_support.__getitem__, 271 | reverse=True))}.__getitem__ 272 | # Only retain frequent items and sort them 273 | db = ((count, sorted(frequent_items.intersection(transaction), 274 | key=sort_index)) 275 | for count, transaction in db) 276 | 277 | root = _Node() 278 | node_links = defaultdict(list) 279 | for count, transaction in db: 280 | T = root 281 | for item in transaction: 282 | T = _fp_tree_insert(item, T, node_links, count) 283 | # Sorted support-descending (in reverse because popping from the back for efficiency) 284 | root.node_links = sorted(node_links.items(), key=lambda i: -sort_index(i[0])) 285 | return root, None 286 | 287 | 288 | def _powerset(lst): 289 | """ 290 | >>> list(_powerset([1, 2, 3])) 291 | [(1,), (2,), (3,), (1, 2), (1, 3), (2, 3), (1, 2, 3)] 292 | """ 293 | return chain.from_iterable(combinations(lst, r) 294 | for r in range(1, len(lst) + 1)) 295 | 296 | 297 | def _single_prefix_path(root): 298 | """ Return (single-prefix path, rest of tree with new root) """ 299 | path = [] 300 | tree = root 301 | node_links = root.node_links 302 | while len(tree) == 1: 303 | tree = next(iter(tree.values())) 304 | path.append((tree.item, tree.count)) 305 | node_links.pop() 306 | tree.parent, tree.item, tree.node_links = None, None, node_links 307 | return path, tree 308 | 309 | 310 | def _prefix_paths(tree, nodes): 311 | """ Generate all paths of tree leading to all item nodes """ 312 | for node in nodes: 313 | path = [] 314 | support = node.count 315 | node = node.parent 316 | while node.item is not None: 317 | path.append(node.item) 318 | node = node.parent 319 | if path: 320 | yield support, path 321 | 322 | 323 | def _freq_patterns_single(P, alpha, min_support): 324 | """ Yield subsets of P as (frequent itemset, support) """ 325 | for itemset in _powerset(P): 326 | yield alpha.union(i[0] for i in itemset), itemset[-1][1] 327 | 328 | 329 | def _freq_patterns_multi(Q, alpha, min_support): 330 | """ Mine multi-path FP-tree """ 331 | for item, nodes in reversed(Q.node_links): 332 | support = sum(n.count for n in nodes) 333 | beta = 
alpha.union({item}) 334 | yield beta, support 335 | tree, got_itemsets = _fp_tree(_prefix_paths(Q, nodes), min_support) 336 | if got_itemsets: 337 | for itemset, support in got_itemsets: 338 | yield beta.union(itemset), support 339 | elif tree is not None: 340 | yield from _fp_growth(tree, beta, min_support) 341 | 342 | 343 | def _fp_growth(tree, alpha, min_support): 344 | """ FP-growth ([1], § 3.3, Algorithm 2). """ 345 | # Single prefix path optimization ([1] § 3.1) 346 | P, Q = _single_prefix_path(tree) if len(tree) == 1 else ([], tree) 347 | # Return P×Q 348 | yield from _freq_patterns_single(P, alpha, min_support) 349 | for itemsetQ, supportQ in _freq_patterns_multi(Q, alpha, min_support): 350 | yield itemsetQ, supportQ 351 | for itemsetP, supportP in _freq_patterns_single(P, alpha, min_support): 352 | yield itemsetQ | itemsetP, supportQ 353 | 354 | 355 | def frequent_itemsets(X, min_support=.2): 356 | """ 357 | Generator yielding frequent itemsets from database X. 358 | 359 | Parameters 360 | ---------- 361 | X : list or numpy.ndarray or scipy.sparse.spmatrix or iterator 362 | The database of transactions where each transaction is a collection 363 | of integer items. If `numpy.ndarray`, the items are considered to be 364 | indices of non-zero columns. 365 | min_support : float or int 366 | If float in range (0, 1), percent of minimal support for itemset to 367 | be considered frequent. If int > 1, the absolute number of instances. 368 | For example, general iterators don't have defined length, so you need 369 | to pass the absolute minimal support as int. 370 | 371 | Yields 372 | ------ 373 | itemset: frozenset 374 | Iteratively yields all itemsets (as frozensets of item indices) with 375 | support greater or equal to specified `min_support`. 376 | support: int 377 | Itemset's support as number of instaances. 378 | 379 | Examples 380 | -------- 381 | Have a database of 50 transactions, 100 possible items: 382 | 383 | >>> import numpy as np 384 | >>> np.random.seed(0) 385 | >>> X = np.random.random((50, 100)) > .9 386 | 387 | Convert it to sparse so we show this type is supported: 388 | 389 | >>> from scipy.sparse import lil_matrix # other types would convert to LIL anyway 390 | >>> X = lil_matrix(X) 391 | 392 | Count the number of itemsets of at least two items with support greater 393 | than 4%: 394 | 395 | >>> sum(1 for itemset, support in frequent_itemsets(X, .05) 396 | ... if len(itemset) >= 2) 397 | 72 398 | 399 | Let's get all the itemsets with at least 20% support: 400 | 401 | >>> gen = frequent_itemsets(X, .2) 402 | >>> gen 403 | 404 | 405 | >>> itemsets = list(gen) 406 | >>> itemsets 407 | [(frozenset({4}), 11), (frozenset({25}), 10)] 408 | 409 | We get the same result by specifying the support as absolute number: 410 | 411 | >>> list(frequent_itemsets(X, 10)) == itemsets 412 | True 413 | 414 | So the items '4' and '25' (fifth and twenty sixth columns of X) are the 415 | only items (and itemsets) that appear 10 or more times. Let's check this: 416 | 417 | >>> (X.sum(axis=0) >= 10).nonzero()[1] 418 | array([ 4, 25]) 419 | 420 | Conclusion: Given databases of uniformly distributed random data, 421 | there's not much to work with. 
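A plain Python list of transactions works as well (an illustrative addition to the examples above); with an int, min_support is the absolute number of transactions:

    >>> T = [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
    >>> dict(frequent_itemsets(T, 2))[frozenset({2, 5})]
    3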
422 | """ 423 | if not isinstance(X, (np.ndarray, spmatrix, list, Iterator)): 424 | raise TypeError('X must be (sparse) array of boolean values, or' 425 | 'list of lists of hashable items, or iterator') 426 | if not (isinstance(min_support, int) and min_support > 0 or 427 | isinstance(min_support, float) and 0 < min_support <= 1): 428 | raise ValueError('min_support must be an integer number of instances,' 429 | 'or a percent fraction in (0, 1]') 430 | 431 | min_support *= (1 if isinstance(min_support, int) else 432 | len(X) if isinstance(X, list) else 433 | X.shape[0]) 434 | min_support = max(1, int(np.ceil(min_support))) 435 | 436 | if issparse(X): 437 | X = X.tolil().rows 438 | elif isinstance(X, np.ndarray): 439 | X = (t.nonzero()[-1] for t in X) 440 | 441 | db = ((1, transaction) for transaction in X) # 1 is initial item support 442 | tree, itemsets = _fp_tree(db, min_support) 443 | if itemsets: 444 | yield from itemsets 445 | if tree: 446 | yield from _fp_growth(tree, frozenset(), min_support) 447 | 448 | 449 | def _association_rules(left, right, last_item, support, min_confidence, itemsets): 450 | if not left: return 451 | confidence = support / itemsets[left] 452 | if confidence >= min_confidence: 453 | yield left, right, support, confidence 454 | for item in left: 455 | if item > last_item: continue # This ensures same rules aren't visited twice 456 | yield from _association_rules( 457 | left - {item}, right | {item}, 458 | item, support, min_confidence, itemsets) 459 | 460 | 461 | def association_rules(itemsets, min_confidence, itemset=None): 462 | """ 463 | Generate association rules ([3] § 12.3) from dict of itemsets' supports 464 | (from :obj:`frequent_itemsets()`). If `itemset` is provided, only generate 465 | its rules. 466 | 467 | Parameters 468 | ---------- 469 | itemsets: dict 470 | A `dict` mapping itemsets to their supports. Can be generated by 471 | feeding the output of `frequent_itemsets()` to `dict` constructor. 472 | min_confidence: float 473 | Confidence percent. Defined as `itemset_support / antecedent_support`. 474 | itemset: frozenset 475 | Itemset the association rules of which we are interested in. 476 | 477 | Yields 478 | ------ 479 | antecedent: frozenset 480 | The LHS of the association rule. 481 | consequent: frozenset 482 | The RHS of the association rule. 483 | support: int 484 | The number of instances supporting (containing) this rule. 485 | confidence: float 486 | ``total_support / lhs_support``. 487 | 488 | Examples 489 | -------- 490 | >>> np.random.seed(0) 491 | >>> N = 100 492 | >>> X = np.random.random((N, 100)) > .9 493 | 494 | Find all itemsets with at least 5% support: 495 | 496 | >>> itemsets = dict(frequent_itemsets(X, .05)) 497 | >>> len(itemsets) 498 | 116 499 | 500 | Generate all association rules from these itemsets with minimum 501 | 50% confidence: 502 | 503 | >>> rules = association_rules(itemsets, .5) 504 | >>> rules 505 | 506 | >>> rules = list(rules) 507 | >>> len(rules) 508 | 7 509 | >>> rules 510 | [(frozenset({36}), frozenset({25}), 5, 0.55...), 511 | (frozenset({63}), frozenset({58}), 5, 0.5), 512 | ... 
513 | (frozenset({30}), frozenset({32}), 5, 0.55...), 514 | (frozenset({75}), frozenset({98}), 5, 0.5)] 515 | 516 | Or only the rules for a particular itemset: 517 | 518 | >>> list(association_rules(itemsets, .3, frozenset({75, 98}))) 519 | [(frozenset({75}), frozenset({98}), 5, 0.5), 520 | (frozenset({98}), frozenset({75}), 5, 0.45...)] 521 | 522 | """ 523 | assert (isinstance(itemsets, dict) and 524 | isinstance(next(iter(itemsets), frozenset()), frozenset)) 525 | assert 0 < min_confidence <= 1 526 | from_itemsets = (itemset,) if itemset else sorted(itemsets, key=len, reverse=True) 527 | for itemset in from_itemsets: 528 | support = itemsets[itemset] 529 | for item in itemset: 530 | right = frozenset({item}) 531 | yield from _association_rules( 532 | itemset - right, right, 533 | item, support, min_confidence, itemsets) 534 | 535 | 536 | def rules_stats(rules, itemsets, n_examples): 537 | """ 538 | Generate additional stats for rules generated by :obj:`association_rules()`. 539 | 540 | Parameters 541 | ---------- 542 | rules: iterable 543 | Rules as output by `association_rules()`. 544 | itemsets: dict 545 | The itemsets as obtained by `dict(frequent_itemsets(...))`. 546 | n_examples: int 547 | The total number of instances (for calculating coverage, lift, 548 | and leverage). 549 | 550 | Yields 551 | ------ 552 | atecedent: frozenset 553 | The LHS of the association rule. 554 | consequent: frozenset 555 | The RHS of the association rule. 556 | support: int 557 | Support as an absolute number of instances. 558 | confidence: float 559 | The confidence percent, calculated as: ``total_support / lhs_rupport``. 560 | coverage: float 561 | Calculated as: ``lhs_support / n_examples`` 562 | strength: float 563 | Calculated as: ``rhs_support / lhs_examples`` 564 | lift: float 565 | Calculated as: ``n_examples * total_support / lhs_support / rhs_support`` 566 | leverage: float 567 | Calculated as: ``(total_support * n_examples - lhs_support * rhs_support) / n_examples**2`` 568 | 569 | Examples 570 | -------- 571 | >>> N = 30 572 | >>> X = np.random.random((N, 50)) > .9 573 | >>> itemsets = dict(frequent_itemsets(X, .1)) 574 | >>> rules = association_rules(itemsets, .6) 575 | >>> list(rules_stats(rules, itemsets, N)) 576 | [(frozenset({15}), frozenset({0}), 3, 0.75, 0.13..., 1.5, 3.75, 0.073...), 577 | (frozenset({47}), frozenset({22}), 3, 0.6, 0.16..., 1.4, 2.57..., 0.061...), 578 | (frozenset({27}), frozenset({22}), 4, 0.66..., 0.2, 1.16..., 2.85..., 0.086...), 579 | (frozenset({19}), frozenset({22}), 3, 0.6, 0.16..., 1.4, 2.57..., 0.061...)] 580 | 581 | """ 582 | assert (isinstance(itemsets, dict) and 583 | isinstance(next(iter(itemsets), frozenset()), frozenset)) 584 | assert n_examples > 0 585 | for left, right, support, confidence in rules: 586 | l_support, r_support = itemsets[left], itemsets[right] 587 | coverage = l_support / n_examples 588 | strength = r_support / l_support 589 | lift = n_examples * confidence / r_support 590 | leverage = (support*n_examples - l_support*r_support) / n_examples**2 591 | yield (left, right, support, confidence, 592 | coverage, strength, lift, leverage) 593 | 594 | 595 | def __fp_tree_count_nodes(tree): 596 | count = 1 if tree.item is not None else 0 597 | for t in tree.values(): 598 | count += __fp_tree_count_nodes(t) 599 | return count 600 | 601 | 602 | def __fp_tree_max_height(tree): 603 | if tree: 604 | return max((1 if tree.item is not None else 0) + 605 | __fp_tree_max_height(child) for child in tree.values()) 606 | return 1 if tree.item is not None 
else 0 607 | 608 | 609 | class OneHot: 610 | """ 611 | Encode discrete Orange.data.Table into a 2D array of binary attributes. 612 | """ 613 | @staticmethod 614 | def encode(table, include_class=False): 615 | """ 616 | Return a tuple of 617 | (bool (one hot) ndarray, {col: (variable_index, value_index)} mapping) 618 | 619 | If the input table is sparse, a list of nonzero column indices 620 | per row (LIL rows) is returned instead of the one-hot ndarray. 621 | """ 622 | X, encoded, mapping = table.X, [], {} 623 | if issparse(X): 624 | encoded = X.tolil().rows.tolist() 625 | for i, var in enumerate(table.domain.attributes): 626 | mapping[i] = i, 0 627 | else: 628 | for i, var in enumerate(table.domain.attributes): 629 | if not var.is_discrete: continue 630 | for j, val in enumerate(var.values): 631 | mapping[len(mapping)] = i, j 632 | encoded.append(X[:, i] == j) 633 | 634 | if include_class and table.domain.has_discrete_class: 635 | i, var = len(table.domain.attributes), table.domain.class_var 636 | for j, val in enumerate(var.values): 637 | mapping[len(mapping)] = i, j 638 | if issparse(X): 639 | for row in encoded: 640 | row.append(i + j) 641 | else: 642 | encoded.append(table.Y == j) 643 | 644 | if not issparse(X): 645 | encoded = np.column_stack(encoded) if encoded else None 646 | return encoded, mapping 647 | 648 | @staticmethod 649 | def decode(itemset, table, mapping): 650 | """Yield sorted (item, variable, value) tuples (one for each item)""" 651 | attributes = table.domain.attributes 652 | for item in itemset: 653 | ivar, ival = mapping[item] 654 | var = attributes[ivar] if ivar < len(attributes) else table.domain.class_var 655 | yield item, var, (var.values[ival] if var.is_discrete else 0) 656 | 657 | 658 | def preprocess(table): 659 | """ 660 | This function applies a one-hot transform to Orange data table, making it 661 | suitable as an `X` input into :obj:`frequent_itemsets()` above. 662 | 663 | For a more fine-grained control, use :obj:`OneHot` methods directly. 664 | 665 | Parameters 666 | ---------- 667 | table: Orange.data.Table 668 | The table to encode into `X` compatible with `frequent_itemsets()` 669 | above. 670 | 671 | Returns 672 | ------- 673 | X: numpy.ndarray 674 | The table's `X` with one-hot tranfsorm applied. 675 | 676 | 677 | Examples 678 | -------- 679 | For a more concrete example, i.e. using non-uniform data: 680 | 681 | >>> from Orange.data import Table 682 | >>> table = Table('voting') 683 | >>> table 684 | [[n, y, n, y, y, ... | republican], 685 | [n, y, n, y, y, ... | republican], 686 | [?, y, y, ?, y, ... | democrat], 687 | [n, y, y, n, ?, ... | democrat], 688 | [y, y, y, n, y, ... | democrat], 689 | ... 690 | ] 691 | 692 | Table, as-is, can't be used with :obj:`frequent_itemsets()` directly (it can, 693 | but it would produce garbage). We first need to one-hot transform it, i.e. 694 | make binary columns for each value of each of its discrete variables. 695 | 696 | >>> X = preprocess(table) 697 | >>> X 698 | array([[ True, False, False, ..., True, True, False], 699 | [ True, False, False, ..., False, True, False], 700 | ..., 701 | [ True, False, True, ..., True, True, False], 702 | [ True, False, False, ..., False, True, False]], dtype=bool) 703 | 704 | Now we `can` use it. 705 | 706 | Note: the transformation includes class if it's discrete. For a 707 | finer-grained control, including the variable values to columns mapping, 708 | use :obj:`OneHot` class directly. 
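Continuing the example above (an illustrative addition, not from the original docstring): the encoded array plugs straight into the mining pipeline defined in this module.

    >>> itemsets = dict(frequent_itemsets(X, .4))
    >>> rules = association_rules(itemsets, .8)
    >>> stats = list(rules_stats(rules, itemsets, len(table)))  # adds coverage, strength, lift, leverage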
709 | """ 710 | if table.domain.has_continuous_attributes(): 711 | raise ValueError('Frequent itemsets require all variables to be discrete') 712 | encoded, mapping = OneHot.encode(table, table.domain.has_discrete_class) 713 | return encoded 714 | 715 | 716 | if __name__ == '__main__': 717 | import doctest 718 | import __main__, builtins 719 | 720 | class Context(dict): 721 | # See http://bugs.python.org/issue26303 722 | def copy(self): return self 723 | def clear(self): pass 724 | 725 | globals = __main__.__dict__.copy() 726 | globals.update(builtins.__dict__) 727 | 728 | doctest.testmod(globs=Context(globals), 729 | optionflags=doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS) 730 | -------------------------------------------------------------------------------- /reportgen/analysis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Nov 23 21:53:32 2017 4 | 5 | @author: gason 6 | """ 7 | 8 | import pandas as pd 9 | import numpy as np 10 | import re 11 | import time 12 | import os 13 | from collections import Iterable 14 | 15 | from pandas.api.types import is_string_dtype 16 | from pandas.api.types import is_numeric_dtype 17 | from pandas.api.types import is_number 18 | from pandas.api.types import is_datetime64_any_dtype 19 | from pandas.api.types import is_categorical_dtype 20 | from scipy import stats 21 | from sklearn import metrics 22 | 23 | from . import report as _rpt 24 | from . import config 25 | from .report import genwordcloud 26 | from .utils.metrics import entropyc 27 | 28 | from .utils import iqr 29 | 30 | #from sklearn.neighbors import KernelDensity 31 | import matplotlib.pyplot as plt 32 | import seaborn as sns 33 | 34 | _thisdir = os.path.split(__file__)[0] 35 | # default chinese font 36 | from matplotlib.font_manager import FontProperties 37 | font_path=config.font_path 38 | if font_path: 39 | myfont=FontProperties(fname=font_path) 40 | sns.set(font=myfont.get_name()) 41 | 42 | 43 | __all__=['type_of_var', 44 | 'describe', 45 | 'plot', 46 | 'features_analysis', 47 | 'distributions', 48 | 'AnalysisReport', 49 | 'ClassifierReport'] 50 | 51 | 52 | def _freedman_diaconis_bins(a): 53 | """Calculate number of hist bins using Freedman-Diaconis rule.""" 54 | # From http://stats.stackexchange.com/questions/798/ 55 | a = np.asarray(a) 56 | assert len(a.shape)>0 57 | assert len(a)>0 58 | h = 2 * iqr(a) / (len(a) ** (1 / 3)) 59 | # fall back to sqrt(a) bins if iqr is 0 60 | if h == 0: 61 | return int(np.sqrt(a.size)) 62 | else: 63 | return int(np.ceil((a.max() - a.min()) / h)) 64 | 65 | 66 | 67 | def distributions(a,hist=True,bins=None,norm_hist=True,kde=False,grid=None,gridsize=100,clip=None): 68 | '''数组的分布信息 69 | hist=True,则返回分布直方图(counts,bins) 70 | kde=True,则返回核密度估计数组(grid,y) 71 | 72 | example 73 | ------- 74 | a=np.random.randint(1,50,size=(1000,1)) 75 | ''' 76 | a = np.asarray(a).squeeze() 77 | if hist: 78 | if bins is None: 79 | bins = min(_freedman_diaconis_bins(a), 50) 80 | counts,bins=np.histogram(a,bins=bins) 81 | if norm_hist: 82 | counts=counts/counts.sum() 83 | if kde: 84 | bw='scott' 85 | cut=3 86 | if clip is None: 87 | clip = (-np.inf, np.inf) 88 | try: 89 | kdemodel = stats.gaussian_kde(a, bw_method=bw) 90 | except TypeError: 91 | kdemodel = stats.gaussian_kde(a) 92 | bw = "scotts" if bw == "scott" else bw 93 | bw = getattr(kdemodel, "%s_factor" % bw)() * np.std(a) 94 | if grid is None: 95 | support_min = max(a.min() - bw * cut, clip[0]) 96 | support_max = min(a.max() + bw * cut, 
clip[1]) 97 | grid=np.linspace(support_min, support_max, gridsize) 98 | y = kdemodel(grid) 99 | if hist and not(kde): 100 | return counts,bins 101 | elif not(hist) and kde: 102 | return grid,y 103 | elif hist and kde: 104 | return ((counts,bins),(grid,y)) 105 | else: 106 | return None 107 | 108 | 109 | def dtype_detection(data,category_detection=True,StructureText_detection=True,\ 110 | datetime_to_category=True,criterion='sqrt',min_mean_counts=5,fix=False): 111 | '''检测数据中单个变量的数据类型 112 | 将数据类型分为以下4种 113 | 1. number,数值型 114 | 2. category,因子 115 | 3. datetime,时间类型 116 | 4. text,文本型 117 | 5. text_st,结构性文本,比如ID, 118 | 6. group_number,连续 119 | 120 | parameter 121 | --------- 122 | data: pd.Series 数据, 仅支持一维 123 | # 如果有data,则函数会改变原来data的数据类型 124 | category_detection: bool,根据 nunique 检测是否是因子类型 125 | StructureText_detection: bool, 结构化文本,如列中都有一个分隔符"-" 126 | datetime_to_category: 时间序列如果 nunique过少是否转化成因子变量 127 | criterion: string or int, optional (default="sqrt",即样本数的开根号) 128 | 支持:'sqrt':样本量的开根号, int: 绝对数, 0-1的float:样本数的百分多少 129 | 检测因子变量时,如果一个特征的nunique小于criterion,则判定为因子变量 130 | min_mean_counts: default 5,数值型判定为因子变量时,需要满足每个类别的平均频数要大于min_mean_counts 131 | fix: bool,是否返回修改好类型的数据 132 | 133 | 134 | return: 135 | result:dict{ 136 | 'name':列名, 137 | 'vtype':变量类型, 138 | 'ordered':是否是有序因子, 139 | 'categories':所有的因子} 140 | 141 | ''' 142 | 143 | assert len(data.shape)==1 144 | data=data.copy() 145 | data=pd.Series(data) 146 | dtype,name,n_sample=data.dtype,data.name,data.count() 147 | 148 | min_mean_counts=5 149 | if criterion=='sqrt': 150 | max_nuniques=np.sqrt(n_sample) 151 | elif isinstance(criterion,int): 152 | max_nuniques=criterion 153 | elif isinstance(criterion,float) and (0=min_mean_counts: 170 | data=data.astype('category') 171 | ordered=data.cat.ordered 172 | vtype='category' 173 | categories=list(data.dropna().cat.categories) 174 | result={'name':name,'vtype':vtype,'ordered':ordered,'categories':categories} 175 | elif is_string_dtype(dtype): 176 | # 处理时间类型 177 | tmp=data.map(lambda x: np.nan if '%s'%x == 'nan' else len('%s'%x)) 178 | tmp=tmp.dropna().astype(np.int64) 179 | if not(any(data.dropna().map(is_number))) and 70.2112 196 | if is_string_dtype(data.dtype) and not(is_categorical_dtype(data.dtype)) and all(data.str.contains('%')): 197 | data=data.str.strip('%').astype(np.float64)/100 198 | 199 | if is_categorical_dtype(data.dtype): 200 | vtype='category' 201 | categories=list(data.cat.categories) 202 | ordered=data.cat.ordered 203 | # 时间格式 204 | elif np.issubdtype(data.dtype,np.datetime64): 205 | vtype='datetime' 206 | # 是否是结构化数组 207 | elif StructureText_detection and tmp.dropna().std()==0: 208 | # 不可迭代,不是字符串 209 | if not(isinstance(data.dropna().iloc[0],Iterable)): 210 | vtype='text' 211 | else: 212 | k=set(list(data.dropna().iloc[0])) 213 | for x in data: 214 | if isinstance(x,str) and len(x)>0: 215 | k&=set(list(x)) 216 | if len(k)>0: 217 | vtype='text_st' 218 | else: 219 | vtype='text' 220 | elif is_numeric_dtype(data.dtype): 221 | vtype='number' 222 | ordered=False 223 | categories=[] 224 | else: 225 | vtype='text' 226 | result={'name':name,'vtype':vtype,'ordered':ordered,'categories':categories} 227 | elif is_datetime64_any_dtype(dtype): 228 | vtype='datetime' 229 | result={'name':name,'vtype':vtype,'ordered':ordered,'categories':categories} 230 | else: 231 | print('unknown dtype!') 232 | result=None 233 | 234 | if fix: 235 | return result,data 236 | else: 237 | return result 238 | 239 | 240 | 241 | def type_of_var(data,category_detection=True,criterion='sqrt',min_mean_counts=5,copy=True): 
242 | '''返回各个变量的类型 243 | 将数据类型分为以下4种 244 | 1. number,数值型 245 | 2. category,因子 246 | 3. datetime,时间类型 247 | 4. text,文本型 248 | 5. text_st,结构性文本,比如ID, 249 | 250 | parameters 251 | ---------- 252 | data: pd.DataFrame类型 253 | category_detection: bool,根据 nunique 检测是否是因子类型 254 | criterion: string or int, optional (default="sqrt",即样本数的开根号) 255 | 支持:'sqrt':样本量的开根号, int: 绝对数, 0-1的float:样本数的百分多少 256 | 检测因子变量时,如果一个特征的nunique小于criterion,则判定为因子变量 257 | min_mean_counts: default 5,数值型判定为因子变量时,需要满足每个类别的平均频数要大于min_mean_counts 258 | copy: bool, 是否更改数据类型,如时间格式、因子变量等 259 | 260 | return: 261 | -------- 262 | var_type:dict{ 263 | ColumnName:type,} 264 | 265 | ''' 266 | assert isinstance(data,pd.core.frame.DataFrame) 267 | var_type={} 268 | for c in data.columns: 269 | #print('type_of_var : ',c) 270 | if copy: 271 | data=data.copy() 272 | result=dtype_detection(data[c],category_detection=category_detection,\ 273 | criterion=criterion,min_mean_counts=min_mean_counts,datetime_to_category=False,fix=False) 274 | if result is not None: 275 | var_type[c]=result['vtype'] 276 | else: 277 | var_type[c]='unknown' 278 | else: 279 | result,tmp=dtype_detection(data[c],category_detection=category_detection,\ 280 | criterion=criterion,min_mean_counts=min_mean_counts,datetime_to_category=False,fix=True) 281 | data[c]=tmp 282 | if result is not None: 283 | var_type[c]=result['vtype'] 284 | else: 285 | var_type[c]='unknown' 286 | return var_type 287 | 288 | 289 | 290 | def var_detection(data,combine=True): 291 | '''检测整个数据的变量类型,内部使用,外部请用type_of_var 292 | parameter 293 | --------- 294 | data: 数据,DataFrame格式 295 | combine: 检测变量中是否有类似的变量,有的话则会合并。 296 | 297 | return 298 | ------ 299 | var_list:[{'name':,'vtype':,'vlist':,'ordered':,'categories':,},] 300 | 301 | ''' 302 | var_list=[] 303 | for c in data.columns: 304 | result,tmp=dtype_detection(data[c],fix=True) 305 | data[c]=tmp 306 | if result is not None: 307 | result['vlist']=[c] 308 | var_list.append(result) 309 | if not(combine): 310 | return var_list,data 311 | var_group=[] 312 | i=0 313 | pattern=re.compile(r'(.*?)(\d+)') 314 | while i < len(var_list)-1: 315 | v=var_list[i] 316 | vnext=var_list[i+1] 317 | if v['vtype']!='number' or vnext['vtype']!='number': 318 | i+=1 319 | continue 320 | tmp1=[] 321 | for vv in var_list[i:]: 322 | if vv['vtype']!='number': 323 | break 324 | w=re.findall(pattern,'%s'%vv['name']) 325 | if len(w)==0 or (len(w)>0 and len(w[0])<2): 326 | break 327 | tmp1.append((w[0][0],w[0][1])) 328 | if len(tmp1)<2: 329 | i+=1 330 | continue 331 | flag1=len(set([t[0] for t in tmp1]))==1 332 | flag2=np.diff([int(t[1]) for t in tmp1]).sum()==len(tmp1)-1 333 | if flag1 and flag2: 334 | var_group.append(list(range(i,i+len(tmp1)))) 335 | i+=len(tmp1) 336 | var_group_new={} 337 | var_group_total=[]#将所有的分组ind加起来 338 | for vi in var_group: 339 | var_group_total+=vi 340 | name='{}-->{}'.format(var_list[vi[0]]['name'],var_list[vi[-1]]['name']) 341 | vlist=[var_list[v]['name'] for v in vi] 342 | vtype='group_number' 343 | tmp={'name':name,'vtype':vtype,'vlist':vlist,'ordered':True,'categories':vlist} 344 | var_group_new[vi[0]]=tmp 345 | var_list_new=[] 346 | var_list_have=[] 347 | for i,v in enumerate(var_list): 348 | if i not in var_group_total: 349 | v['vlist']=[v['name']] 350 | var_list_new.append(v) 351 | var_list_have+=v['vlist'] 352 | elif i in var_group_total and v['name'] not in var_list_have: 353 | var_list_new.append(var_group_new[i]) 354 | var_list_have+=var_group_new[i]['vlist'] 355 | return var_list_new,data 356 | 357 | def describe(data): 358 | ''' 359 | 
对每个变量生成统计指标特征 360 | 对于每一个变量,生成如下字段: 361 | 数据类型: 362 | 最大值/频数最大的那个: 363 | 最小值/频数最小的那个: 364 | 均值/频数中间的那个: 365 | 缺失率: 366 | 范围/唯一数: 367 | ''' 368 | 369 | data=pd.DataFrame(data) 370 | n_sample=len(data) 371 | var_type=type_of_var(data,copy=True) 372 | summary=pd.DataFrame(columns=data.columns,index=['dtype','max','min','mean','missing_pct','std/nuniue']) 373 | for c in data.columns: 374 | missing_pct=1-data[c].count()/n_sample 375 | if var_type[c] == 'number': 376 | max_value,min_value,mean_value=data[c].max(),data[c].min(),data[c].mean() 377 | std_value=data[c].std() 378 | summary.loc[:,c]=[var_type[c],max_value,min_value,mean_value,missing_pct,std_value] 379 | elif var_type[c] == 'category' or is_categorical_dtype(data[c].dtype): 380 | tmp=data[c].value_counts() 381 | max_value,min_value=tmp.argmax(),tmp.argmin() 382 | mean_value_index=tmp[tmp==tmp.median()].index 383 | mean_value=mean_value_index[0] if len(mean_value_index)>0 else np.nan 384 | summary.loc[:,c]=[var_type[c],max_value,min_value,mean_value,missing_pct,len(tmp)] 385 | elif var_type[c] == 'datetime': 386 | max_value,min_value=data[c].max(),data[c].min() 387 | summary.loc[:,c]=[var_type[c],max_value,min_value,np.nan,missing_pct,np.nan] 388 | else: 389 | summary.loc[:,c]=[var_type[c],np.nan,np.nan,np.nan,missing_pct,np.nan] 390 | return summary 391 | 392 | 393 | 394 | def plot(data,figure_type='auto',chart_type='auto',vertical=False,ax=None): 395 | '''auto choose the best chart type to draw the data 【还没完全写好】 396 | paremeter 397 | ----------- 398 | figure_type: 'mpl' or 'pptx' or 'html' 399 | chart_type: 'hist' or 'dist' or 'kde' or 'bar' ...... 400 | 401 | return 402 | ------- 403 | chart:dict format. 404 | .type: equal to figure_type 405 | .fig: only return if type == 'mpl' 406 | .ax: 407 | .chart_data: 408 | 409 | ''' 410 | 411 | # 判别部分 412 | 413 | # 绘制部分 414 | data=pd.DataFrame(data) 415 | assert len(data.dropna())>0 416 | chart={} 417 | if figure_type in ['mpl','matplotlib']: 418 | chart['type']='mpl' 419 | if ax is None: 420 | fig,ax=plt.subplots() 421 | if chart_type in ['hist','kde']: 422 | for c in data.columns: 423 | if len(data[c].dropna())>10: 424 | sns.kdeplot(data[c].dropna(),shade=True,ax=ax) 425 | else: 426 | print('reportgen.plot:: ',c,'have no valid data!') 427 | legend_label=ax.get_legend_handles_labels() 428 | if len(legend_label)>0 and len(legend_label[0])>1: 429 | ax.legend() 430 | else: 431 | try: 432 | ax.legend_.remove() 433 | except: 434 | pass 435 | ax.axis('auto') 436 | elif chart_type in ['dist']: 437 | for c in data.columns: 438 | if len(data[c].dropna())>10: 439 | sns.distplot(data[c].dropna(),ax=ax) 440 | else: 441 | print('reportgen.plot:: ',c,'have no valid data!') 442 | legend_label=ax.get_legend_handles_labels() 443 | if len(legend_label)>0 and len(legend_label[0])>1: 444 | ax.legend() 445 | else: 446 | try: 447 | ax.legend_.remove() 448 | except: 449 | pass 450 | ax.axis('auto') 451 | elif chart_type in ['scatter']: 452 | ax.xaxis.set_ticks_position('none') 453 | ax.yaxis.set_ticks_position('none') 454 | ax.axhline(y=0, linestyle='-', linewidth=1.2, alpha=0.6) 455 | ax.axvline(x=0, linestyle='-', linewidth=1.2, alpha=0.6) 456 | color=['blue','red','green','dark'] 457 | if not isinstance(data,list): 458 | data=[data] 459 | for i,dd in enumerate(data): 460 | if '%s'%dd.iloc[:,0] != 'nan' or '%s'%dd.iloc[:,1] != 'nan': 461 | ax.scatter(dd.iloc[:,0], dd.iloc[:,1], c=color[i], s=50, 462 | label=dd.columns[1]) 463 | for _, row in dd.iterrows(): 464 | ax.annotate(row.name, (row.iloc[0], row.iloc[1]), 
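# ---- illustrative usage sketch (editor's note, not part of the library) ----
# The 'mpl' branch of plot() above draws seaborn kde/dist curves onto a
# matplotlib axis and returns figure and axis in the chart dict; the DataFrame
# below is hypothetical.
#
# >>> import numpy as np, pandas as pd
# >>> df = pd.DataFrame({'income': np.random.lognormal(10, 1, 500)})
# >>> chart = plot(df, figure_type='mpl', chart_type='kde')  # doctest: +SKIP
# >>> chart['fig'].savefig('income_kde.png', dpi=200)        # doctest: +SKIP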
color=color[i],fontproperties=myfont,fontsize=10) 465 | ax.axis('equal') 466 | legend_label=ax.get_legend_handles_labels() 467 | if len(legend_label)>0 and len(legend_label[0])>0: 468 | ax.legend() 469 | try: 470 | chart['fig']=fig 471 | except: 472 | pass 473 | chart['ax']=ax 474 | return chart 475 | if figure_type in ['pptx']: 476 | chart['type']='pptx' 477 | count,bins=distributions(data.iloc[:,0].dropna(),kde=False) 478 | if all(pd.Series(bins).astype(int)==bins): 479 | decimals_format='{:.0f}~' 480 | else: 481 | decimals_format='{:.2f}~' 482 | bins_index=[decimals_format.format(b) for b in bins[:-1]] 483 | decimals_format=decimals_format[:-1] 484 | bins_index[-1]=bins_index[-1]+decimals_format.format(bins[-1]) 485 | 486 | chart_data=pd.DataFrame({'frequency':count*100},index=bins_index) 487 | chart['chart_data']=chart_data 488 | if isinstance(ax,_rpt.Report): 489 | slide_data={'data':chart_data,'slide_type':'chart'} 490 | ax.add_slide(data=slide_data,title='',summary='',footnote='') 491 | # 暂时空缺,后期会将ax修改为Report接口 492 | chart['ax']=ax 493 | return chart 494 | 495 | 496 | # 仅测试用 497 | def features_analysis(X,y=None,out_file=None,categorical_features=[],number_features=[],\ 498 | max_leafs=5): 499 | ''' 500 | categorical_features=None 501 | number_features=None 502 | categorical_features=[] if categorical_features is None else categorical_features 503 | number_features=[] if number_features is None else number_features 504 | X=data 505 | ''' 506 | from graphviz import Digraph 507 | import pydotplus 508 | N=len(X) 509 | X=X.copy() 510 | if len(categorical_features)==0: 511 | var_type=type_of_var(X) 512 | categorical_features=[k for k in var_type if var_type[k]=='category'] 513 | 514 | #categorical_features=['grade','target','term'] 515 | #number_features=['tot_cur_bal','annual_inc'] 516 | X['_count_']=range(len(X)) 517 | # 根据唯一值个数的不同从小到大排列特征的顺序 518 | nunique=X[categorical_features].apply(pd.Series.nunique).sort_values() 519 | categorical_features=list(nunique.index) 520 | for k in nunique[nunique>5].index: 521 | topitems=X[k].value_counts().sort_values(ascending=False) 522 | X[k]=X[k].replace(dict(zip(topitems.index[(max_leafs-1):],['others']*(len(topitems)-max_leafs+1)))) 523 | tmp=X.groupby(categorical_features) 524 | 525 | # 针对因子变量计数,针对数值变量,计算分组均值 526 | aggfun={'_count_':'count'} 527 | for k in number_features: 528 | aggfun.update({k:'mean'}) 529 | count_data=tmp.agg(aggfun) 530 | 531 | # 每一个节点,定义一些属性1,父节点, 特征名称, value, 532 | 533 | # 生成节点的索引表格 534 | names=count_data.index.names 535 | levels=count_data.index.levels 536 | labels=pd.DataFrame(count_data.index.labels).T 537 | labels.columns=names 538 | for i in range(len(names)): 539 | labels[names[i]]=labels[names[i]].replace(dict(zip(range(len(levels[i])),levels[i]))) 540 | labels_node=pd.DataFrame(index=labels.index,columns=labels.columns) 541 | #labels_prenode=pd.DataFrame(index=labels.index,columns=labels.columns) 542 | dot=Digraph() 543 | nodes=[{'id':0,'column':'start','value':None}] 544 | dot.node(str(nodes[-1]['id']),'Total\n{} , 100%'.format(N),shape="diamond") 545 | 546 | for c in range(len(labels.columns)): 547 | if c==len(labels.columns)-1: 548 | count_data_tmp=count_data.copy() 549 | else: 550 | count_data_tmp=X.groupby(names[:c+1]).agg(aggfun) 551 | for i in range(len(labels.index)): 552 | value=labels.iloc[i,c] 553 | if value!=nodes[-1]['value'] and c!=nodes[-1]['column']: 554 | # 增加一个新节点 555 | addnode={'id':nodes[-1]['id']+1,'column':names[c],'value':value} 556 | nodes.append(addnode) 557 | node_id=str(nodes[-1]['id']) 558 
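# ---- illustrative sketch (editor's note, not part of the library) ----
# The 'pptx' branch of plot() above bins the first column with distributions(),
# labels each bin as "lower~" (the last bin also gets its upper edge) and
# stores the frequencies, scaled by 100, in chart['chart_data']. Roughly:
#
# >>> import numpy as np, pandas as pd
# >>> values = pd.Series([1.2, 3.4, 2.2, 5.1, 4.8, 2.9])
# >>> counts, bins = np.histogram(values, bins=3)
# >>> index = ['{:.2f}~'.format(b) for b in bins[:-1]]
# >>> pd.DataFrame({'frequency': counts / counts.sum() * 100}, index=index)  # doctest: +SKIP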
| #cond=labels.iloc[i,:c+1] 559 | #n=_cal_count(X,labels.iloc[i,:c+1]) 560 | if len(count_data_tmp.index.names)==1: 561 | n=count_data_tmp.loc[labels.iloc[i,c],'_count_'] 562 | else: 563 | n=count_data_tmp.xs(list(labels.iloc[i,:c+1]))['_count_'] 564 | label='{} = {}\ncount:{:.0f} , {:.2f}%'.format(names[c],value,n,n*100/N) 565 | for k in number_features: 566 | if len(count_data_tmp.index.names)==1: 567 | vmean=count_data_tmp.loc[labels.iloc[i,c],k] 568 | else: 569 | vmean=count_data_tmp.xs(list(labels.iloc[i,:c+1]))[k] 570 | label=label+'\n{}: {:.1f}'.format(k,vmean) 571 | dot.node(node_id,label) 572 | if c==0: 573 | pre_node_id='0' 574 | else: 575 | pre_node_id=labels_node.iloc[i,c-1] 576 | dot.edge(pre_node_id,node_id) 577 | #print('---创建节点{},节点信息如下'.format(node_id)) 578 | #print(label) 579 | #print('{} 连接节点{}'.format(node_id,pre_node_id)) 580 | #labels_prenode.iloc[i,c]=pre_node_id 581 | labels_node.iloc[i,c]=str(nodes[-1]['id']) 582 | if out_file is not None: 583 | graph=pydotplus.graphviz.graph_from_dot_data(dot.source) 584 | graph.write(out_file,format=os.path.splitext(out_file)[1][1:]) 585 | #graph.write_png(out_file) 586 | else: 587 | dot.view() 588 | return dot 589 | 590 | 591 | 592 | def AnalysisReport(data,filename=None,var_list=None,save_pptx=True,return_report=False,combine=False): 593 | ''' 594 | 直接生成报告 595 | ''' 596 | if var_list is None: 597 | var_list,data=var_detection(data,combine=combine) 598 | #print(var_list) 599 | #print('============') 600 | 601 | slides_data=[] 602 | 603 | if filename is None: 604 | filename='AnalysisReport'+time.strftime('_%Y%m%d%H%M', time.localtime()) 605 | p=_rpt.Report() 606 | p.add_cover(title=os.path.splitext(filename)[0]) 607 | elif isinstance(filename,str): 608 | p=_rpt.Report() 609 | p.add_cover(title=os.path.splitext(filename)[0]) 610 | elif isinstance(filename,_rpt.Report): 611 | p=filename 612 | filename='AnalysisReport'+time.strftime('_%Y%m%d%H%M', time.localtime()) 613 | else: 614 | print('reportgen.AnalysisReport::cannot understand the filename') 615 | return None 616 | 617 | summary=describe(data) 618 | f_cut=10# 每一页展示的最大字段数 619 | n_cut=round(summary.shape[1]/f_cut) 620 | n_cut=1 if n_cut==0 else n_cut 621 | for i in range(n_cut): 622 | if i!=n_cut-1: 623 | summary_tmp=summary.iloc[:,f_cut*i:f_cut*i+f_cut] 624 | else: 625 | summary_tmp=summary.iloc[:,f_cut*i:] 626 | slide_data={'data':summary_tmp,'slide_type':'table'} 627 | title='数据字段描述{}-{}'.format(i*f_cut+1,min(summary.shape[1],i*f_cut+f_cut)) 628 | p.add_slide(data=slide_data,title=title) 629 | 630 | for v in var_list: 631 | vtype=v['vtype'] 632 | name=v['name'] 633 | vlist=v['vlist'] 634 | #print(name,':',vtype) 635 | if len(data.loc[:,vlist].dropna())==0: 636 | print('the field: ',name,'have no valid data!') 637 | continue 638 | # 之前的方案,暂时留着测试用,后期稳定后删除 639 | if vtype == 'number_test': 640 | chart=plot(data[name],figure_type='mpl',chart_type='kde') 641 | chart['fig'].savefig('kdeplot1.png',dpi=200) 642 | chart['fig'].clf() 643 | del chart 644 | chart=plot(data[name],figure_type='mpl',chart_type='dist') 645 | chart['fig'].savefig('kdeplot2.png',dpi=200) 646 | chart['fig'].clf() 647 | del chart 648 | summary='''平均数为:{:.2f},标准差为:{:.2f},最大为:{}'''\ 649 | .format(data[name].mean(),data[name].std(),data[name].max()) 650 | footnote='注: 样本N={}'.format(data[name].count()) 651 | slide_data=[{'data':'kdeplot1.png','slide_type':'picture'},{'data':'kdeplot2.png','slide_type':'picture'}] 652 | p.add_slide(data=slide_data,title=name+' 的分析',summary=summary,footnote=footnote) 653 | 
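# ---- illustrative usage sketch (editor's note, not part of the library) ----
# features_analysis() above builds a graphviz Digraph that splits the sample by
# each categorical feature in turn (capping rare levels at max_leafs) and
# annotates every node with counts and the means of number_features. It needs
# the optional graphviz and pydotplus packages; the DataFrame and file name
# below are hypothetical.
#
# >>> dot = features_analysis(df, categorical_features=['grade', 'term'],
# ...                         number_features=['annual_inc'],
# ...                         out_file='segments.png')       # doctest: +SKIP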
slides_data.append(slide_data) 654 | os.remove('kdeplot1.png') 655 | os.remove('kdeplot2.png') 656 | 657 | if vtype == 'number': 658 | if len(data[name].dropna())==1: 659 | print('the fiele ',name,' of number type must have more than two items.') 660 | continue 661 | chart=plot(data[name],figure_type='mpl',chart_type='kde') 662 | chart['fig'].savefig('kdeplot.png',dpi=200) 663 | chart['fig'].clf() 664 | del chart 665 | chart=plot(data[name],figure_type='pptx',chart_type='bar') 666 | summary='''MIN: {}, MAX: {}, MEAN: {:.2f}, STD: {:.2f}'''\ 667 | .format(data[name].min(),data[name].max(),data[name].mean(),data[name].std()) 668 | footnote='注: 样本N={}'.format(data[name].count()) 669 | slide_data=[{'data':chart['chart_data'],'slide_type':'chart'},{'data':'kdeplot.png','slide_type':'picture'}] 670 | p.add_slide(data=slide_data,title=name+' 的分析',summary=summary,footnote=footnote) 671 | slides_data.append(slide_data) 672 | os.remove('kdeplot.png') 673 | elif vtype == 'category': 674 | tmp=pd.DataFrame(data[name].value_counts()) 675 | tmp=tmp*100/tmp.sum()#转换成百分数 676 | if ('ordered' in v) and v['ordered']: 677 | tmp=pd.DataFrame(tmp,index=v['categories']) 678 | footnote='注: 样本N={}'.format(data[name].count()) 679 | slide_data={'data':tmp,'slide_type':'chart','type':'COLUMN_CLUSTERED'} 680 | summary='{}占比最大为: {:.2f}%'.format(tmp.iloc[:,0].argmax(),tmp.iloc[:,0].max()) 681 | p.add_slide(data=slide_data,title=name+' 的分析',summary=summary,footnote=footnote) 682 | slides_data.append(slide_data) 683 | elif vtype == 'datetime': 684 | if data[name].value_counts().max()==1: 685 | print('the dtype of {} column is datetime, continue...') 686 | continue 687 | tmp=pd.DataFrame(data[name].astype('object').value_counts()) 688 | tmp=tmp*100/tmp.sum()#转换成百分数 689 | tmp=tmp.sort_index()#排序 690 | if ('ordered' in v) and v['ordered']: 691 | tmp=pd.DataFrame(tmp,index=v['categories']) 692 | footnote='注: 样本N={}'.format(data[name].count()) 693 | slide_data={'data':tmp,'slide_type':'chart','type':'COLUMN_CLUSTERED'} 694 | summary='{}占比最大为: {:.2f}%'.format(tmp.iloc[:,0].argmax(),tmp.iloc[:,0].max()) 695 | p.add_slide(data=slide_data,title=name+' 的分析',summary=summary,footnote=footnote) 696 | slides_data.append(slide_data) 697 | elif vtype == 'text': 698 | try: 699 | tmp=','.join(data[name].dropna()) 700 | if len(tmp)>1: 701 | img=genwordcloud(tmp,font_path=font_path) 702 | img.save('tmp.png') 703 | footnote='注: 样本N={}'.format(data[name].count()) 704 | slide_data={'data':'tmp.png','slide_type':'picture'} 705 | p.add_slide(data=slide_data,title=name+' 的词云分析',footnote=footnote) 706 | slides_data.append(slide_data) 707 | os.remove('tmp.png') 708 | except: 709 | print('cannot understand the field: {}'.format(name)) 710 | pass 711 | elif vtype == 'group_number': 712 | tmp=pd.DataFrame(data.loc[:,vlist].mean()) 713 | footnote='注: 样本N={}'.format(data.loc[:,vlist].count().max()) 714 | slide_data={'data':tmp,'slide_type':'chart','type':'COLUMN_CLUSTERED'} 715 | summary='{}占比最大为: {:.2f}%'.format(tmp.iloc[:,0].argmax(),tmp.iloc[:,0].max()) 716 | p.add_slide(data=slide_data,title=name+' 的分析',summary=summary,footnote=footnote) 717 | slides_data.append(slide_data) 718 | elif vtype == 'text_st': 719 | print('The field: {} may be id or need to be designed'.format(name)) 720 | else: 721 | print('unknown type: {}'.format(name)) 722 | 723 | if save_pptx: 724 | p.save(os.path.splitext(filename)[0]+'.pptx') 725 | if return_report: 726 | return p,slides_data 727 | 728 | 729 | 730 | def ClassifierReport(y_true,y_preds,y_probas,img_save=False): 731 | 
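# ---- illustrative usage sketch (editor's note, not part of the library) ----
# AnalysisReport() above is the one-call entry point: it detects variable
# types, adds summary-table slides (10 fields per page), then one slide per
# variable, and saves "<filename>.pptx"; the Excel file below is hypothetical.
#
# >>> import pandas as pd
# >>> df = pd.read_excel('survey.xlsx')                      # doctest: +SKIP
# >>> AnalysisReport(df, filename='survey_analysis')         # doctest: +SKIP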
'''二分类模型评估(后期可能会修改为多分类) 732 | 真实数据和预测数据之间的各种可视化和度量 733 | 734 | parameters: 735 | ----------- 736 | y_true: array_like 真实的标签,binary 737 | y_preds: dict or array_like. 预测的标签,binary,可以用 dict 存储多个模型的预测标签数据 738 | y_probas: dict or array_like. 预测的概率,0-1,可以用 dict 存储多个模型的预测标签数据 739 | img_save:Bool,是否直接将图片保存到本地 740 | 741 | return: 742 | --------- 743 | models_report: 各模型的各种评估数据 744 | conf_matrix: 各模型的混淆矩阵 745 | ''' 746 | 747 | 748 | #from sklearn import metrics 749 | assert type(y_preds) == type(y_probas) 750 | if not(isinstance(y_preds,dict)): 751 | y_preds={'clf':y_preds} 752 | y_probas={'clf':y_probas} 753 | models_report=pd.DataFrame() 754 | conf_matrix={} 755 | fig1,ax1=plt.subplots() 756 | fig2,ax2=plt.subplots() 757 | fig3,ax3=plt.subplots() 758 | for clf in y_preds: 759 | y_pred=y_preds[clf] 760 | y_proba=y_probas[clf] 761 | try: 762 | kl_div_score=entropyc.kl_div(y_proba[y_true==1],y_proba[y_true==0]) 763 | kl_div_score+=entropyc.kl_div(y_proba[y_true==0],y_proba[y_true==1]) 764 | except: 765 | kl_div_score=np.nan 766 | scores = pd.Series({'model' : clf, 767 | 'roc_auc_score' : metrics.roc_auc_score(y_true, y_proba), 768 | 'good_rate': y_true.value_counts()[0]/len(y_true), 769 | 'matthews_corrcoef': metrics.matthews_corrcoef(y_true, y_pred), 770 | 'accuracy_score': metrics.accuracy_score(y_true,y_pred), 771 | 'ks_score': np.nan, 772 | 'precision_score': metrics.precision_score(y_true, y_pred), 773 | 'recall_score': metrics.recall_score(y_true, y_pred), 774 | 'kl_div': kl_div_score, 775 | 'f1_score': metrics.f1_score(y_true, y_pred)}) 776 | models_report=models_report.append(scores,ignore_index = True) 777 | conf_matrix[clf]=pd.crosstab(y_true, y_pred, rownames=['True'], colnames= ['Predicted'], margins=False) 778 | #print('\n{} 模型的混淆矩阵:'.format(clf)) 779 | #print(conf_matrix[clf]) 780 | 781 | # ROC 曲线 782 | fpr, tpr, thresholds=metrics.roc_curve(y_true,y_proba,pos_label=1) 783 | auc_score=metrics.auc(fpr,tpr) 784 | w=tpr-fpr 785 | ks_score=w.max() 786 | models_report.loc[models_report['model']==clf,'ks_score']=ks_score 787 | ks_x=fpr[w.argmax()] 788 | ks_y=tpr[w.argmax()] 789 | #sc=thresholds[w.argmax()] 790 | #fig1,ax1=plt.subplots() 791 | ax1.set_title('ROC Curve') 792 | ax1.set_xlabel('False Positive Rate') 793 | ax1.set_ylabel('True Positive Rate') 794 | ax1.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6)) 795 | ax1.plot([ks_x,ks_x], [ks_x,ks_y], '--', color='red') 796 | ax1.text(ks_x,(ks_x+ks_y)/2,r' $S_c$=%.2f, KS=%.3f'%(thresholds[w.argmax()],ks_score)) 797 | ax1.plot(fpr,tpr,label='{}:AUC={:.5f}'.format(clf,auc_score)) 798 | ax1.legend() 799 | # PR 曲线 800 | precision, recall, thresholds=metrics.precision_recall_curve(y_true,y_proba,pos_label=1) 801 | #fig2,ax2=plt.subplots() 802 | ax2.plot(recall,precision,label=clf) 803 | ax2.set_title('P-R Curve') 804 | ax2.set_xlabel('Recall') 805 | ax2.set_ylabel('Precision') 806 | ax2.legend() 807 | #fig2.show() 808 | #密度函数和KL距离 809 | #fig3,ax3=plt.subplots() 810 | sns.kdeplot(y_proba[y_true==0],ax=ax3,shade=True,label='{}-0'.format(clf)) 811 | sns.kdeplot(y_proba[y_true==1],ax=ax3,shade=True,label='{}-1'.format(clf)) 812 | ax3.set_title('Density Curve') 813 | ax3.legend() 814 | ax3.autoscale() 815 | #fig3.show() 816 | 817 | 818 | if img_save: 819 | fig1.savefig('roc_curve_{}.png'.format(time.strftime('%Y%m%d%H%M', time.localtime())),dpi=400) 820 | fig2.savefig('pr_curve_{}.png'.format(time.strftime('%Y%m%d%H%M', time.localtime())),dpi=400) 821 | fig3.savefig('density_curve_{}.png'.format(time.strftime('%Y%m%d%H%M', 
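# ---- illustrative sketch (editor's note, not part of the library) ----
# The ks_score reported above is computed from the ROC curve as the maximum
# vertical gap between TPR and FPR; in isolation:
#
# >>> from sklearn import metrics
# >>> fpr, tpr, thresholds = metrics.roc_curve(y_true, y_proba, pos_label=1)  # doctest: +SKIP
# >>> ks_score = (tpr - fpr).max()                           # doctest: +SKIP
# >>> cutoff = thresholds[(tpr - fpr).argmax()]   # threshold marked as S_c in the plot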
time.localtime())),dpi=400) 822 | else: 823 | fig1.show() 824 | fig2.show() 825 | fig3.show() 826 | models_report=models_report.set_index('model') 827 | #print('模型的性能评估:') 828 | #print(models_report) 829 | return models_report,conf_matrix 830 | -------------------------------------------------------------------------------- /reportgen/report.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Nov 8 20:05:36 2016 4 | @author: JSong 5 | """ 6 | 7 | import os 8 | import time 9 | 10 | 11 | import pandas as pd 12 | import numpy as np 13 | pd.set_option('display.float_format', lambda x: '%.2f' % x) 14 | 15 | from . import config 16 | from .utils import Delaunay2D 17 | 18 | import matplotlib.image as mpimg 19 | import seaborn as sns 20 | 21 | from pptx import Presentation 22 | from pptx.chart.data import ChartData,XyChartData,BubbleChartData 23 | from pptx.enum.chart import XL_CHART_TYPE 24 | from pptx.util import Inches, Pt, Emu 25 | from pptx.enum.chart import XL_LEGEND_POSITION 26 | #from pptx.enum.chart import XL_LABEL_POSITION 27 | from pptx.dml.color import RGBColor 28 | 29 | _thisdir = os.path.split(__file__)[0] 30 | # default chinese font 31 | from matplotlib.font_manager import FontProperties 32 | font_path=config.font_path 33 | if font_path: 34 | myfont=FontProperties(fname=font_path) 35 | sns.set(font=myfont.get_name()) 36 | 37 | # default template of pptx report 38 | template_pptx=config.template_pptx 39 | 40 | 41 | 42 | __all__=['Report', 43 | 'df_to_table', 44 | 'df_to_chartdata', 45 | 'plot_table', 46 | 'plot_textbox', 47 | 'plot_chart', 48 | 'plot_picture', 49 | 'slides_data_gen', 50 | 'plot_cover', 51 | 'genwordcloud'] 52 | 53 | 54 | 55 | chart_list={\ 56 | "AREA":[1,"ChartData"],\ 57 | "AREA_STACKED":[76,"ChartData"],\ 58 | "AREA_STACKED_100":[77,"ChartData"],\ 59 | "THREE_D_AREA":[-4098,"ChartData"],\ 60 | "THREE_D_AREA_STACKED":[78,"ChartData"],\ 61 | "THREE_D_AREA_STACKED_100":[79,"ChartData"],\ 62 | "BAR_CLUSTERED":[57,"ChartData"],\ 63 | "BAR_TWO_WAY":[57,"ChartData"],\ 64 | "BAR_OF_PIE":[71,"ChartData"],\ 65 | "BAR_STACKED":[58,"ChartData"],\ 66 | "BAR_STACKED_100":[59,"ChartData"],\ 67 | "THREE_D_BAR_CLUSTERED":[60,"ChartData"],\ 68 | "THREE_D_BAR_STACKED":[61,"ChartData"],\ 69 | "THREE_D_BAR_STACKED_100":[62,"ChartData"],\ 70 | "BUBBLE":[15,"BubbleChartData"],\ 71 | "BUBBLE_THREE_D_EFFECT":[87,"BubbleChartData"],\ 72 | "COLUMN_CLUSTERED":[51,"ChartData"],\ 73 | "COLUMN_STACKED":[52,"ChartData"],\ 74 | "COLUMN_STACKED_100":[53,"ChartData"],\ 75 | "THREE_D_COLUMN":[-4100,"ChartData"],\ 76 | "THREE_D_COLUMN_CLUSTERED":[54,"ChartData"],\ 77 | "THREE_D_COLUMN_STACKED":[55,"ChartData"],\ 78 | "THREE_D_COLUMN_STACKED_100":[56,"ChartData"],\ 79 | "CYLINDER_BAR_CLUSTERED":[95,"ChartData"],\ 80 | "CYLINDER_BAR_STACKED":[96,"ChartData"],\ 81 | "CYLINDER_BAR_STACKED_100":[97,"ChartData"],\ 82 | "CYLINDER_COL":[98,"ChartData"],\ 83 | "CYLINDER_COL_CLUSTERED":[92,"ChartData"],\ 84 | "CYLINDER_COL_STACKED":[93,"ChartData"],\ 85 | "CYLINDER_COL_STACKED_100":[94,"ChartData"],\ 86 | "DOUGHNUT":[-4120,"ChartData"],\ 87 | "DOUGHNUT_EXPLODED":[80,"ChartData"],\ 88 | "LINE":[4,"ChartData"],\ 89 | "LINE_MARKERS":[65,"ChartData"],\ 90 | "LINE_MARKERS_STACKED":[66,"ChartData"],\ 91 | "LINE_MARKERS_STACKED_100":[67,"ChartData"],\ 92 | "LINE_STACKED":[63,"ChartData"],\ 93 | "LINE_STACKED_100":[64,"ChartData"],\ 94 | "THREE_D_LINE":[-4101,"ChartData"],\ 95 | "PIE":[5,"ChartData"],\ 96 | 
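# ---- illustrative usage sketch (editor's note, not part of the library) ----
# ClassifierReport() in analysis.py above accepts either single arrays or dicts
# keyed by model name, so several classifiers share the same ROC / P-R /
# density plots; the fitted models below are hypothetical.
#
# >>> report, conf = ClassifierReport(                       # doctest: +SKIP
# ...     y_test,
# ...     {'lr': lr.predict(X_test), 'gbdt': gbdt.predict(X_test)},
# ...     {'lr': lr.predict_proba(X_test)[:, 1],
# ...      'gbdt': gbdt.predict_proba(X_test)[:, 1]})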
"PIE_EXPLODED":[69,"ChartData"],\ 97 | "PIE_OF_PIE":[68,"ChartData"],\ 98 | "THREE_D_PIE":[-4102,"ChartData"],\ 99 | "THREE_D_PIE_EXPLODED":[70,"ChartData"],\ 100 | "PYRAMID_BAR_CLUSTERED":[109,"ChartData"],\ 101 | "PYRAMID_BAR_STACKED":[110,"ChartData"],\ 102 | "PYRAMID_BAR_STACKED_100":[111,"ChartData"],\ 103 | "PYRAMID_COL":[112,"ChartData"],\ 104 | "PYRAMID_COL_CLUSTERED":[106,"ChartData"],\ 105 | "PYRAMID_COL_STACKED":[107,"ChartData"],\ 106 | "PYRAMID_COL_STACKED_100":[108,"ChartData"],\ 107 | "RADAR":[-4151,"ChartData"],\ 108 | "RADAR_FILLED":[82,"ChartData"],\ 109 | "RADAR_MARKERS":[81,"ChartData"],\ 110 | "STOCK_HLC":[88,"ChartData"],\ 111 | "STOCK_OHLC":[89,"ChartData"],\ 112 | "STOCK_VHLC":[90,"ChartData"],\ 113 | "STOCK_VOHLC":[91,"ChartData"],\ 114 | "SURFACE":[83,"ChartData"],\ 115 | "SURFACE_TOP_VIEW":[85,"ChartData"],\ 116 | "SURFACE_TOP_VIEW_WIREFRAME":[86,"ChartData"],\ 117 | "SURFACE_WIREFRAME":[84,"ChartData"],\ 118 | "XY_SCATTER":[-4169,"XyChartData"],\ 119 | "XY_SCATTER_LINES":[74,"XyChartData"],\ 120 | "XY_SCATTER_LINES_NO_MARKERS":[75,"XyChartData"],\ 121 | "XY_SCATTER_SMOOTH":[72,"XyChartData"],\ 122 | "XY_SCATTER_SMOOTH_NO_MARKERS":[73,"XyChartData"]} 123 | 124 | 125 | 126 | 127 | 128 | 129 | def df_to_table(slide,df,left,top,width,height,index_names=False,columns_names=True): 130 | '''将pandas数据框添加到slide上,并生成pptx上的表格 131 | 输入: 132 | slide:PPT的一个页面,由pptx.Presentation().slides.add_slide()给定 133 | df:需要转换的数据框 134 | lef,top: 表格在slide中的位置 135 | width,height: 表格在slide中的大小 136 | index_names: Bool,是否需要显示行类别的名称 137 | columns_names: Bool,是否需要显示列类别的名称 138 | 返回: 139 | 返回带表格的slide 140 | ''' 141 | df=pd.DataFrame(df) 142 | rows, cols = df.shape 143 | res = slide.shapes.add_table(rows+columns_names, cols+index_names, left, top, width, height) 144 | # 固定表格的宽度 145 | ''' 146 | for c in range(cols+rownames): 147 | res.table.columns[c].width = colwidth 148 | res.table.rows[c].width = colwidth 149 | ''' 150 | # Insert the column names 151 | if columns_names: 152 | for col_index, col_name in enumerate(list(df.columns)): 153 | cell=res.table.cell(0,col_index+index_names) 154 | #cell.text_frame.fit_text(max_size=12) 155 | #cell.text_frame.text='%s'%(col_name) 156 | cell.text = '%s'%(col_name) 157 | if index_names: 158 | for col_index, col_name in enumerate(list(df.index)): 159 | cell=res.table.cell(col_index+columns_names,0) 160 | cell.text = '%s'%(col_name) 161 | #cell.text_frame.fit_text(max_size=12) 162 | m = df.as_matrix() 163 | for row in range(rows): 164 | for col in range(cols): 165 | cell=res.table.cell(row+columns_names, col+index_names) 166 | if isinstance(m[row, col],float): 167 | cell.text = '%.2f'%(m[row, col]) 168 | else: 169 | cell.text = '%s'%(m[row, col]) 170 | #cell.text_frame.fit_text(max_size=12) 171 | 172 | 173 | def df_to_chartdata(df,datatype,number_format=None): 174 | ''' 175 | 根据给定的图表数据类型生成相应的数据 176 | Chartdata:一般的数据 177 | XyChartData: 散点图数据 178 | BubbleChartData:气泡图数据 179 | ''' 180 | if isinstance(df,pd.Series): 181 | df=pd.DataFrame(df) 182 | df.fillna(0,inplace=True) 183 | datatype=datatype.lower() 184 | if datatype == 'chartdata': 185 | chart_data = ChartData() 186 | chart_data.categories = ['%s'%(c) for c in list(df.index)] 187 | for col_name in df.columns: 188 | chart_data.add_series('%s'%(col_name),list(df[col_name]),number_format) 189 | return chart_data 190 | if datatype == 'xychartdata': 191 | chart_data=XyChartData() 192 | if not isinstance(df,list): 193 | df=[df] 194 | for d in df: 195 | series_name='%s'%(d.columns[0])+' vs '+'%s'%(d.columns[1]) 196 | 
series_ = chart_data.add_series(series_name) 197 | for i in range(len(d)): 198 | series_.add_data_point(d.iloc[i,0], d.iloc[i,1]) 199 | return chart_data 200 | if datatype == 'bubblechartdata': 201 | chart_data=BubbleChartData() 202 | if not isinstance(df,list): 203 | df=[df] 204 | for d in df: 205 | series_name='%s'%(d.columns[0])+' vs '+'%s'%(d.columns[1]) 206 | series_ = chart_data.add_series(series_name) 207 | for i in range(len(d)): 208 | series_.add_data_point(d.iloc[i,0],d.iloc[i,1],d.iloc[i,2]) 209 | return chart_data 210 | 211 | 212 | 213 | def plot_table(prs,df,layouts=[0,5],title=u'我是标题',summary=u'我是简短的结论',footnote=''): 214 | '''根据给定的数据,在给定的prs上新增一页表格ppt 215 | 输入: 216 | prs: PPT文件接口 217 | df: 数据框 218 | layouts: [0]为PPT母版顺序,[1]为母版内的版式顺序 219 | 输出: 220 | 更新后的prs 221 | ''' 222 | df=pd.DataFrame(df) 223 | slide_width=prs.slide_width 224 | slide_height=prs.slide_height 225 | # 可能需要修改以适应更多的情形 226 | title_only_slide = prs.slide_masters[layouts[0]].slide_layouts[layouts[1]] 227 | slide = prs.slides.add_slide(title_only_slide) 228 | #title=u'这里是标题' 229 | slide.shapes.title.text = title 230 | left,top = Emu(0.05*slide_width), Emu(0.10*slide_height) 231 | width,height = Emu(0.7*slide_width), Emu(0.1*slide_height) 232 | txBox = slide.shapes.add_textbox(left, top, width, height) 233 | #summary=u'这里是一些简短的结论' 234 | txBox.text_frame.text=summary 235 | # 绘制表格 236 | '''添加自适应的表格大小 237 | 默认最大12*6,width=0.80,height=0.70 238 | left=0.1,top=0.25 239 | ''' 240 | R,C=df.shape 241 | width=max(0.5,min(1,C/6.0))*0.80 242 | height=max(0.5,min(1,R/12.0))*0.70 243 | left=0.5-width/2 244 | top=0.25 245 | left=Emu(left*slide_width) 246 | top=Emu(top*slide_height) 247 | width=Emu(width*slide_width) 248 | height=Emu(height*slide_height) 249 | df_to_table(slide,df,left,top,width,height,index_names=True) 250 | 251 | # 添加脚注 footnote=u'这里是脚注' 252 | if footnote: 253 | left,top = Emu(0.025*slide_width), Emu(0.95*slide_height) 254 | width,height = Emu(0.70*slide_width), Emu(0.10*slide_height) 255 | txBox = slide.shapes.add_textbox(left, top, width, height) 256 | #p = text_frame.paragraphs[0] 257 | p=txBox.text_frame.paragraphs[0] 258 | p.text=footnote 259 | p.font.size = Pt(10) 260 | p.font.language_id = 3076 261 | p.font.name='Microsoft YaHei UI' 262 | p.font.color.rgb=RGBColor(127,127,127) 263 | try: 264 | txBox.text_frame.fit_text(max_size=10) 265 | except: 266 | pass 267 | #print('cannot fit the size of font') 268 | return prs 269 | 270 | 271 | def plot_textbox(prs,texts,title=u'我是文本框页标题',summary=u'我是内容',footnote='',layouts=[0,0]): 272 | ''' 273 | 只绘制一个文本框,用于目录、小结等 274 | ''' 275 | slide_width=prs.slide_width 276 | slide_height=prs.slide_height 277 | # 可能需要修改以适应更多的情形 278 | title_only_slide = prs.slide_masters[layouts[0]].slide_layouts[layouts[1]] 279 | slide = prs.slides.add_slide(title_only_slide) 280 | #title=u'这里是标题' 281 | slide.shapes.title.text = title 282 | # 绘制副标题 283 | if summary: 284 | left,top = Emu(0.15*slide_width), Emu(0.10*slide_height) 285 | width,height = Emu(0.7*slide_width), Emu(0.1*slide_height) 286 | txBox = slide.shapes.add_textbox(left, top, width, height) 287 | txBox.text_frame.text=summary 288 | # 绘制主体 289 | left,top = Emu(0.15*slide_width), Emu(0.20*slide_height) 290 | width,height = Emu(0.7*slide_width), Emu(0.7*slide_height) 291 | txBox = slide.shapes.add_textbox(left, top, width, height) 292 | txBox.text_frame.text=texts 293 | 294 | # 添加脚注 footnote=u'这里是脚注' 295 | if footnote: 296 | left,top = Emu(0.025*slide_width), Emu(0.95*slide_height) 297 | width,height = Emu(0.70*slide_width), 
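# ---- illustrative sketch (editor's note, not part of the library) ----
# df_to_chartdata() above turns a DataFrame into the python-pptx data object
# matching the chart family: ChartData for category charts, XyChartData for
# scatter, BubbleChartData for bubbles. Roughly what the 'chartdata' branch does:
#
# >>> from pptx.chart.data import ChartData
# >>> cd = ChartData()
# >>> cd.categories = ['Q1', 'Q2', 'Q3']
# >>> cd.add_series('share', [35.2, 40.1, 24.7], '0.0"%"')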
Emu(0.10*slide_height) 298 | txBox = slide.shapes.add_textbox(left, top, width, height) 299 | #p = text_frame.paragraphs[0] 300 | p=txBox.text_frame.paragraphs[0] 301 | p.text=footnote 302 | p.font.size = Pt(10) 303 | p.font.language_id = 3076 304 | p.font.name='Microsoft YaHei UI' 305 | p.font.color.rgb=RGBColor(127,127,127) 306 | try: 307 | txBox.text_frame.fit_text(max_size=10) 308 | except: 309 | pass 310 | #print('cannot fit the size of font') 311 | return prs 312 | 313 | def plot_picture(prs,img_path,layouts=[0,0],title=u'我是文本框页标题',summary='',\ 314 | footnote=''): 315 | ''' 316 | 只插入一张图片,用于目录、小结等 317 | ''' 318 | slide_width=prs.slide_width 319 | slide_height=prs.slide_height 320 | # 可能需要修改以适应更多的情形 321 | title_only_slide = prs.slide_masters[layouts[0]].slide_layouts[layouts[1]] 322 | slide = prs.slides.add_slide(title_only_slide) 323 | #title=u'这里是标题' 324 | slide.shapes.title.text = title 325 | if summary: 326 | left,top = Emu(0.05*slide_width), Emu(0.10*slide_height) 327 | width,height = Emu(0.7*slide_width), Emu(0.1*slide_height) 328 | txBox = slide.shapes.add_textbox(left, top, width, height) 329 | txBox.text_frame.text=summary 330 | left,top = Emu(0.15*slide_width), Emu(0.2*slide_height) 331 | height=Emu(0.7*slide_height) 332 | slide.shapes.add_picture(img_path, left, top, height=height) 333 | # 添加脚注 footnote=u'这里是脚注' 334 | if footnote: 335 | left,top = Emu(0.025*slide_width), Emu(0.95*slide_height) 336 | width,height = Emu(0.70*slide_width), Emu(0.10*slide_height) 337 | txBox = slide.shapes.add_textbox(left, top, width, height) 338 | #p = text_frame.paragraphs[0] 339 | p=txBox.text_frame.paragraphs[0] 340 | p.text=footnote 341 | p.font.size = Pt(10) 342 | p.font.language_id = 3076 343 | p.font.name='Microsoft YaHei UI' 344 | p.font.color.rgb=RGBColor(127,127,127) 345 | try: 346 | txBox.text_frame.fit_text(max_size=10) 347 | except: 348 | pass 349 | #print('cannot fit the size of font') 350 | return prs 351 | 352 | 353 | 354 | def plot_chart(prs,df,chart_type,title=u'我是标题',summary=u'我是简短的结论',\ 355 | footnote=None,chart_format=None,layouts=[0,0],has_data_labels=True): 356 | ''' 357 | 直接将数据绘制到一张ppt上,且高度定制化 358 | 默认都有图例,且图例在下方 359 | 默认都有数据标签 360 | ''' 361 | 362 | slide_width=prs.slide_width 363 | slide_height=prs.slide_height 364 | # 可能需要修改以适应更多的情形 365 | # layouts[0]代表第几个母版,layouts[1]代表母版中的第几个版式 366 | title_only_slide = prs.slide_masters[layouts[0]].slide_layouts[layouts[1]] 367 | slide = prs.slides.add_slide(title_only_slide) 368 | # 添加标题 title=u'这里是标题' 369 | try: 370 | slide.shapes.title.text = title 371 | except: 372 | print('请检查模板,脚本没有找到合适的slide') 373 | return 374 | # 添加结论 summary=u'这里是一些简短的结论' 375 | #summary_loc=[0.10,0.14,0.80,0.15] 376 | left,top = Emu(config.summary_loc[0]*slide_width), Emu(config.summary_loc[1]*slide_height) 377 | width,height = Emu(config.summary_loc[2]*slide_width), Emu(config.summary_loc[3]*slide_height) 378 | txBox = slide.shapes.add_textbox(left, top, width, height) 379 | txBox.text_frame.text=summary 380 | txBox.text_frame.paragraphs[0].font.language_id = 3076 381 | try: 382 | txBox.text_frame.fit_text(max_size=12) 383 | except: 384 | pass 385 | #print('cannot fit the size of font') 386 | 387 | 388 | # 添加脚注 footnote=u'这里是脚注' 389 | if footnote: 390 | left,top = Emu(0.025*slide_width), Emu(0.95*slide_height) 391 | width,height = Emu(0.70*slide_width), Emu(0.10*slide_height) 392 | txBox = slide.shapes.add_textbox(left, top, width, height) 393 | #p = text_frame.paragraphs[0] 394 | p=txBox.text_frame.paragraphs[0] 395 | p.text=footnote 396 | p.font.size = 
Pt(10) 397 | p.font.language_id = 3076 398 | p.font.name='Microsoft YaHei UI' 399 | p.font.color.rgb=RGBColor(127,127,127) 400 | try: 401 | txBox.text_frame.fit_text(max_size=10) 402 | except: 403 | pass 404 | #print('cannot fit the size of font') 405 | 406 | 407 | # 插入图表 408 | chart_type_code=chart_list[chart_type][1] 409 | chart_data=df_to_chartdata(df,chart_type_code) 410 | #left, top = Emu(0.05*slide_width), Emu(0.20*slide_height) 411 | #width, height = Emu(0.85*slide_width), Emu(0.70*slide_height) 412 | #chart_loc=[0.10,0.30,0.80,0.60] 413 | left, top = Emu(config.chart_loc[0]*slide_width), Emu(config.chart_loc[1]*slide_height) 414 | width, height = Emu(config.chart_loc[2]*slide_width), Emu(config.chart_loc[3]*slide_height) 415 | 416 | chart=slide.shapes.add_chart(chart_list[chart_type.upper()][0], \ 417 | left, top, width, height, chart_data).chart 418 | 419 | if chart_type_code in [-4169,72,73,74,75]: 420 | return 421 | 422 | font_default_size=Pt(10) 423 | # 添加图例 424 | if (df.shape[1]>1) or (chart_type=='PIE'): 425 | chart.has_legend = True 426 | chart.legend.font.size=font_default_size 427 | chart.legend.position = XL_LEGEND_POSITION.BOTTOM 428 | chart.legend.include_in_layout = False 429 | 430 | try: 431 | chart.category_axis.tick_labels.font.size=font_default_size 432 | except: 433 | pass#暂时不知道怎么处理 434 | try: 435 | chart.value_axis.tick_labels.font.size=font_default_size 436 | except: 437 | pass 438 | # 添加数据标签 439 | 440 | non_available_list=['BUBBLE','BUBBLE_THREE_D_EFFECT','XY_SCATTER',\ 441 | 'XY_SCATTER_LINES','PIE'] 442 | # 大致检测是否采用百分比 443 | # 1、单选题每列的和肯定是100,顶多相差+-5 444 | # 2、多选题每一列的和大于100,但单个的小于100.此处可能会有误判,但暂时无解 445 | # 3、可能会有某一列全为0,此时单独考虑 446 | if ((df.sum()[df.sum()!=0]>90).all()) and ((df<=100).all().all()) and (u'总体' not in df.index): 447 | # 数据条的数据标签格式 448 | #number_format1='0.0"%"' 449 | number_format1=config.number_format_data 450 | # 坐标轴的数据标签格式 451 | #number_format2='0"%"' 452 | number_format2=config.number_format_tick 453 | else: 454 | number_format1='0.00' 455 | number_format2='0.0' 456 | 457 | if (chart_type not in non_available_list) or (chart_type == 'PIE'): 458 | plot = chart.plots[0] 459 | plot.has_data_labels = True 460 | plot.data_labels.font.size = font_default_size 461 | plot.data_labels.number_format = number_format1 462 | #plot.data_labels.number_format_is_linked=True 463 | #data_labels = plot.data_labels 464 | #plot.data_labels.position = XL_LABEL_POSITION.BEST_FIT 465 | if (chart_type not in non_available_list): 466 | #chart.value_axis.maximum_scale = 1 467 | if df.shape[1]==1: 468 | chart.value_axis.has_major_gridlines = False 469 | else: 470 | chart.value_axis.has_major_gridlines = True 471 | tick_labels = chart.value_axis.tick_labels 472 | tick_labels.number_format = number_format2 473 | tick_labels.font.size = font_default_size 474 | 475 | # 修改纵坐标格式 476 | ''' 477 | tick_labels = chart.value_axis.tick_labels 478 | tick_labels.number_format = '0"%"' 479 | tick_labels.font.bold = True 480 | tick_labels.font.size = Pt(10) 481 | ''' 482 | 483 | # 填充系列的颜色 484 | ''' 最好的方法还是修改母版文件中的主题颜色,这里只提供方法 485 | if df.shape[1]==1: 486 | chart.series[0].fill() 487 | ''' 488 | 489 | # 自定义format 490 | if chart_format: 491 | for k in chart_format: 492 | exec('chart.'+k+'='+'%s'%(chart_format[k])) 493 | 494 | return prs 495 | 496 | ''' 497 | if chart_type == 'BAR_TWO_WAY': 498 | chart 499 | ''' 500 | 501 | 502 | def plot_cover(prs,title=u'reportgen工具包封面',layouts=[0,0],xspace=8,yspace=6): 503 | 504 | slide_width=prs.slide_width 505 | slide_height=prs.slide_height 506 | # 
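# ---- illustrative sketch (editor's note, not part of the library) ----
# plot_chart() above guesses whether the data are percentages: every non-zero
# column must sum to roughly 100 (greater than 90) and no cell may exceed 100;
# only then are the percent number formats from config applied. The check:
#
# >>> import pandas as pd
# >>> df = pd.DataFrame({'share': [55.0, 30.0, 15.0]})
# >>> bool((df.sum()[df.sum() != 0] > 90).all() and (df <= 100).all().all())
# True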
可能需要修改以适应更多的情形 507 | title_only_slide = prs.slide_masters[layouts[0]].slide_layouts[layouts[1]] 508 | slide = prs.slides.add_slide(title_only_slide) 509 | 510 | ## 随机生成连接点 511 | seeds=np.round(np.dot(np.random.rand((xspace-1)*(yspace-1),2),np.diag([slide_width,slide_height]))) 512 | # 添加左边点 513 | tmp=np.linspace(0,slide_height,yspace) 514 | seeds=np.concatenate((seeds,np.array([[0]*len(tmp),tmp]).T)) 515 | # 添加上边点 516 | tmp=np.linspace(0,slide_width,xspace)[1:] 517 | seeds=np.concatenate((seeds,np.array([tmp,[0]*len(tmp)]).T)) 518 | # 添加右边点 519 | tmp=np.linspace(0,slide_height,yspace)[1:] 520 | seeds=np.concatenate((seeds,np.array([[slide_width]*len(tmp),tmp]).T)) 521 | # 添加下边点 522 | tmp=np.linspace(0,slide_width,xspace)[1:-1] 523 | seeds=np.concatenate((seeds,np.array([tmp,[slide_height]*len(tmp)]).T)) 524 | 525 | # 构造三角剖分,生成相应的三角形和平面图数据 526 | center = np.mean(seeds, axis=0) 527 | t=np.sqrt(slide_width**2+slide_height**2)/2 528 | dt = Delaunay2D(center, 2**(np.floor(np.log2(t))+1)) 529 | for s in seeds: 530 | dt.AddPoint(s) 531 | tri=dt.exportTriangles() 532 | graph=np.zeros((len(seeds),len(seeds))) 533 | for t in tri: 534 | graph[t[0],t[1]]=1 535 | graph[t[1],t[2]]=1 536 | graph[t[0],t[2]]=1 537 | graph[t[1],t[0]]=1 538 | graph[t[2],t[1]]=1 539 | graph[t[2],t[1]]=1 540 | 541 | 542 | from pptx.enum.shapes import MSO_CONNECTOR 543 | from pptx.enum.shapes import MSO_SHAPE 544 | shapes = slide.shapes 545 | # 添加连接线 546 | for i in range(len(seeds)): 547 | for j in range(len(seeds)): 548 | if (i1: 713 | tmp[i,j]=255 714 | mask=np.zeros((900,1200,4),dtype=np.uint8) 715 | mask[:,:,0]=tmp 716 | mask[:,:,1]=tmp 717 | mask[:,:,2]=tmp 718 | mask[:,:,3]=255 719 | else: 720 | mask=np.array(Image.open(mask)) 721 | wordcloud = WordCloud(background_color = background_color,font_path=font_path, mask = mask) 722 | wordcloud.generate(texts) 723 | img=wordcloud.to_image() 724 | return img 725 | 726 | 727 | 728 | 729 | class Report(): 730 | ''' 731 | 底层的类,负责一个 pptx 报告的相关接口 732 | parameters: 733 | ----------- 734 | filename: pptx 文件路径,若无则新建一个文件 735 | chart_type_default: 默认的图表类型 736 | layouts_default: 新建slide时默认使用的 pptx 模板 737 | title: 报告的名称 738 | author: 报告的作者 739 | 740 | example: 741 | --------- 742 | >>>r=Report(filename='') 743 | >>>r.add_cover(title='reportgen') 744 | >>>r.add_slides([]) 745 | >>>r.save() 746 | ''' 747 | def __init__(self,filename=None,chart_type_default='COLUMN_CLUSTERED',**kwargs): 748 | self.title=None 749 | self.author=None 750 | # self.filename = filename #导入一个存在的pptx文件 751 | self.chart_type_default=chart_type_default 752 | if filename is None: 753 | if os.path.exists('template.pptx'): 754 | prs=Presentation('template.pptx') 755 | elif template_pptx is not None: 756 | prs=Presentation(template_pptx) 757 | else: 758 | prs=Presentation() 759 | else : 760 | # 分离出路径中的文件名 761 | self.title=os.path.splitext(os.path.split(filename)[1])[0] 762 | prs=Presentation(filename) 763 | self.prs=prs 764 | title_only_slide=self._layouts() 765 | if title_only_slide: 766 | layouts=title_only_slide[0] 767 | else: 768 | layouts=[0,0] 769 | self.layouts_default=layouts 770 | for k in kwargs: 771 | setattr(self,k.lower(),kwargs[k]) 772 | 773 | 774 | def _layouts(self): 775 | '''给定pptx文件,自动识别标题等版式 776 | ''' 777 | slide_width=self.prs.slide_width 778 | slide_height=self.prs.slide_height 779 | title_only_slide=[] 780 | #blank_slide=[] 781 | for i in range(len(self.prs.slide_masters)): 782 | slides=self.prs.slide_masters[i] 783 | #print('第{}个有{}个版式'.format(i,len(slides.slide_layouts))) 784 | for j in 
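# ---- illustrative usage sketch (editor's note, not part of the library) ----
# Report() above resolves its pptx template in order: a 'template.pptx' in the
# working directory, then the packaged template from config, then a blank
# python-pptx Presentation; _layouts() below scans the masters for a
# "title only" layout to use as the default slide layout.
#
# >>> r = Report()                        # blank or packaged template
# >>> r = Report('last_quarter.pptx')     # hypothetical existing deck; its name becomes r.title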
range(len(slides.slide_layouts)): 785 | slide=slides.slide_layouts[j] 786 | title_slide=0 787 | placeholder_size=0 788 | for k in range(len(slide.shapes)): 789 | shape=slide.shapes[k] 790 | if shape.is_placeholder and shape.has_text_frame: 791 | left,top=shape.left/slide_width,shape.top/slide_height 792 | height=shape.height/slide_height 793 | if left<1 and top<1 and height<1 and left>0 and top>0 and height>0: 794 | placeholder_size+=1 795 | #print('left={:.2f},top={:.2f},height={:.2f}'.format(left,top,height)) 796 | if left<0.15 and top<0.15 and height <0.25: 797 | title_slide+=1 798 | #print('{}个文本占位符,{}个title'.format(placeholder_size,title_slide)) 799 | if placeholder_size==1 and title_slide==1: 800 | title_only_slide.append([i,j]) 801 | #if placeholder_size==0: 802 | #blank_slide.append((i,j))s 803 | return title_only_slide 804 | 805 | 806 | 807 | def get_texts(self): 808 | # one for each text run in presentation 809 | text_runs = [] 810 | 811 | for slide in self.prs.slides: 812 | for shape in slide.shapes: 813 | if not shape.has_text_frame: 814 | continue 815 | for paragraph in shape.text_frame.paragraphs: 816 | for run in paragraph.runs: 817 | text_runs.append(run.text) 818 | return text_runs 819 | 820 | def get_images(self): 821 | try: 822 | from PIL import Image as PIL_Image 823 | from io import BytesIO 824 | except: 825 | print('please install the PIL.') 826 | return 827 | if not os.path.exists('.\\images'): 828 | os.mkdir('.\\images') 829 | n_images=0 830 | for slide in self.prs.slides: 831 | for shape in slide.shapes: 832 | if 'Image' in str(type(shape)) or 'Picture' in str(type(shape)): 833 | n_images+=1 834 | shape_image=shape.image 835 | #filename='.\\images\\'+shape_image.filename 836 | #r=str(np.random.randint(99)).zfill(2) 837 | filename='.\\images\\image%d'%n_images+'.'+shape_image.ext 838 | p = PIL_Image.open(BytesIO(shape_image.blob)) 839 | p.save(filename) 840 | #print('save {}'.format(shape_image.filename)) 841 | 842 | 843 | 844 | 845 | def add_slides(self,slides_data,chart_type_default=None): 846 | '''!使用的接口和下方的add_slide不一样,建议使用add_slide 847 | slides_data: 每一页ppt所需要的元素[ 848 | {title:,#标题 849 | summary:,#结论 850 | data:,# DataFrame数据、文本数据、图片地址等 851 | slide_type:,#chart、table、text 852 | chart_type:图表类型 853 | data_config:,#字典格式,绘制data其他所需要的相关参数,保留字段,暂时不用 854 | footnote:,#脚注 855 | layouts:#该slide使用的ppt版式 856 | },] 857 | ''' 858 | if chart_type_default is None: 859 | chart_type_default=self.chart_type_default 860 | slides_data=slides_data_gen(slides_data,chart_type_default) 861 | for slide in slides_data: 862 | slide_type=slide['slide_type'] 863 | title=slide['title'] 864 | summary=slide['summary'] 865 | footnote=slide['footnote'] 866 | layouts=self.layouts_default if slide['layouts'] == 'auto' else slide['layouts'] 867 | data=slide['data'] 868 | chart_type=slide['chart_type'] if 'chart_type' in slide else None 869 | #data_config=slide['data_config']#暂时没有用该参数 870 | if (slide_type is None) or (not isinstance(slide_type,str)): 871 | continue 872 | if slide_type == 'chart': 873 | self.prs=plot_chart(self.prs,data,chart_type=chart_type,layouts=layouts,\ 874 | title=title,summary=summary,footnote=footnote); 875 | elif slide_type == 'table': 876 | self.prs=plot_table(self.prs,data,layouts=layouts,title=title,summary=summary,\ 877 | footnote=footnote); 878 | elif slide_type in ['textbox','text']: 879 | self.prs=plot_textbox(self.prs,data,layouts=layouts,title=title,summary=summary,\ 880 | footnote=footnote); 881 | elif slide_type in ['picture','figure']: 882 | 
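# ---- illustrative sketch (editor's note, not part of the library) ----
# add_slides() above consumes a list of per-slide dicts (title, summary, data,
# slide_type, chart_type, footnote, layouts) and dispatches to plot_chart /
# plot_table / plot_textbox / plot_picture; fields left out appear to be filled
# in by slides_data_gen(). A minimal hypothetical list:
#
# >>> slides = [
# ...     {'title': 'Gender share', 'data': gender_df,
# ...      'slide_type': 'chart', 'chart_type': 'PIE'},
# ...     {'title': 'Notes', 'data': 'Sample collected in 2017.',
# ...      'slide_type': 'text'},
# ... ]
# >>> r.add_slides(slides)                                   # doctest: +SKIP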
self.prs=plot_picture(self.prs,data,layouts=layouts,title=title,summary=summary,\ 883 | footnote=footnote); 884 | 885 | 886 | def add_cover(self,title='',author='',style='default',layouts='auto',size=[8,6]): 887 | if len(title) == 0: 888 | title = 'Analysis Report Powered by reportgen' if self.title is None else self.title 889 | if len(author) == 0: 890 | author='' if self.author is None else self.author 891 | title=title+'\n作者: '+author if len(author)>0 else title 892 | layouts=self.layouts_default if layouts == 'auto' else layouts 893 | if style == 'default': 894 | self.prs=plot_cover(self.prs,title=title,layouts=layouts,xspace=size[0],yspace=size[1]); 895 | 896 | 897 | 898 | def location_suggest(self,num=1,rate=0.78,data=None,summary=None): 899 | '''统一管理slides各个模块的位置 900 | parameter 901 | -------- 902 | num: 主体内容(如图、外链图片、文本框等)的个数,默认从左到右依次排列 903 | rate: 主体内容的宽度综合 904 | data: list,通过数据类型智能判断位置,如有,则 num 失效 905 | summary:如果summary为空,则非图表等位置都会上移动 906 | 907 | return 908 | ----- 909 | locations: dict格式. l代表left,t代表top,w代表width,h代表height 910 | ''' 911 | slide_width,slide_height=self.prs.slide_width,self.prs.slide_height 912 | if 'summary_loc' in config.__dict__: 913 | summary_loc=config.summary_loc 914 | else: 915 | summary_loc=[0.10,0.14,0.80,0.15] 916 | 917 | if 'footnote_loc' in config.__dict__: 918 | footnote_loc=config.footnote_loc 919 | else: 920 | footnote_loc=[0.025,0.95,0.70,0.06] 921 | 922 | if 'data_loc' in config.__dict__: 923 | data_loc=config.data_loc 924 | else: 925 | data_loc=[0.11,0.30,0.78,0.60] 926 | 927 | num=len(data) if isinstance(data,list) else num 928 | locations={} 929 | locations['summary']={'l':Emu(summary_loc[0]*slide_width),'t':Emu(summary_loc[1]*slide_height),\ 930 | 'w':Emu(summary_loc[2]*slide_width),'h':Emu(summary_loc[3]*slide_height)} 931 | 932 | locations['footnote']={'l':Emu(footnote_loc[0]*slide_width),'t':Emu(footnote_loc[1]*slide_height),\ 933 | 'w':Emu(footnote_loc[2]*slide_width),'h':Emu(footnote_loc[3]*slide_height)} 934 | # 主体部分只有一个的情形 935 | ''' 936 | 控制主体的宽度为78%,且居中显示。 937 | ''' 938 | if (summary is not None) and len(summary)==0: 939 | data_loc[1]=data_loc[1]*0.84 940 | if num>1: 941 | left=[(1-rate)*(i+1)/(float(num)+1)+rate*i/float(num) for i in range(num)] 942 | top=[data_loc[1]]*num 943 | width=[rate/float(num)]*num 944 | height=[data_loc[3]]*num 945 | locations['data']=[{'l':Emu(left[i]*slide_width),'t':Emu(top[i]*slide_height),\ 946 | 'w':Emu(width[i]*slide_width),'h':Emu(height[i]*slide_height)} for i in range(num)] 947 | else: 948 | # 暂时只修正单张图片常常不居中的问题,后期会修正多张图片 949 | if data[0]['slide_type'] == 'picture': 950 | imgdata=mpimg.imread(data[0]['data']) 951 | img_height,img_width=imgdata.shape[:2] 952 | img_width_in_pptx=data_loc[3]*slide_height*img_width/img_height/slide_width 953 | data_loc[0]=0.5-img_width_in_pptx/2 954 | 955 | locations['data']=[{'l':Emu(data_loc[0]*slide_width),'t':Emu(data_loc[1]*slide_height),\ 956 | 'w':Emu(data_loc[2]*slide_width),'h':Emu(data_loc[3]*slide_height)}] 957 | 958 | return locations 959 | 960 | def add_slide(self,data=[],title='',summary='',footnote='',layouts='auto',**kwarg): 961 | '''通用函数,添加一页幻灯片 962 | parameter 963 | --------- 964 | data=[{'data':,'slide_type':,'type':,},] # 三个是必须字段,其他根据slide_type不同而不同 965 | title: 标题 966 | summary: 小结论 967 | footnote: 脚注 968 | layouts: 使用的母版样式 969 | legend: bool,是否画网格线 970 | data_labels: bool,是否画数据标签 971 | number_format_data: 图的数据标签格式 972 | number_format_tick: 横纵坐标的数据标签格式 973 | ''' 974 | #slide_width=self.prs.slide_width 975 | #slide_height=self.prs.slide_height 976 | 
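# ---- illustrative sketch (editor's note, not part of the library) ----
# location_suggest() above positions `num` content blocks side by side: the
# blocks share a total width of `rate` (78% of the slide by default) and the
# remaining width is split evenly into gaps. For two blocks this gives:
#
# >>> num, rate = 2, 0.78
# >>> left = [(1 - rate) * (i + 1) / (num + 1) + rate * i / num for i in range(num)]
# >>> width = [rate / num] * num
# >>> [round(l, 3) for l in left], width
# ([0.073, 0.537], [0.39, 0.39])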
977 | # 标准化data格式 978 | if not(isinstance(data,list)): 979 | data=[data] 980 | for i,d in enumerate(data): 981 | if not(isinstance(d,dict)): 982 | if isinstance(d,(pd.core.frame.DataFrame,pd.core.frame.Series)): 983 | slide_type='chart' 984 | chart_type=self.chart_type_default 985 | d=pd.DataFrame(d) 986 | elif isinstance(d,str) and os.path.exists(d): 987 | slide_type='picture' 988 | chart_type='' 989 | elif isinstance(d,str) and not(os.path.exists(d)): 990 | slide_type='textbox' 991 | chart_type='' 992 | else: 993 | print('未知的数据格式,请检查数据') 994 | slide_type='' 995 | chart_type='' 996 | data[i]={'data':d,'slide_type':slide_type,'type':chart_type} 997 | 998 | # 各个模板的位置 999 | locations=self.location_suggest(data=data,summary=summary) 1000 | summary_loc=locations['summary'] 1001 | footnote_loc=locations['footnote'] 1002 | data_loc=locations['data'] 1003 | 1004 | # 选取的板式 1005 | if layouts == 'auto': 1006 | layouts=self.layouts_default 1007 | title_only_slide = self.prs.slide_masters[layouts[0]].slide_layouts[layouts[1]] 1008 | slide = self.prs.slides.add_slide(title_only_slide) 1009 | 1010 | #输出标题 1011 | slide.shapes.title.text = title 1012 | 1013 | # 输出副标题 summary 1014 | if summary: 1015 | txBox = slide.shapes.add_textbox(summary_loc['l'], summary_loc['t'], summary_loc['w'], summary_loc['h']) 1016 | txBox.text_frame.text=summary 1017 | txBox.text_frame.paragraphs[0].font.language_id = 3076 1018 | try: 1019 | txBox.text_frame.fit_text(max_size=12) 1020 | except: 1021 | pass 1022 | 1023 | 1024 | # 输出脚注 footnote 1025 | if footnote: 1026 | txBox = slide.shapes.add_textbox(footnote_loc['l'], footnote_loc['t'], footnote_loc['w'], footnote_loc['h']) 1027 | #p = text_frame.paragraphs[0] 1028 | p=txBox.text_frame.paragraphs[0] 1029 | p.text=footnote 1030 | p.font.size = Pt(10) 1031 | p.font.language_id = 3076 1032 | p.font.name='Microsoft YaHei UI' 1033 | p.font.color.rgb=RGBColor(127,127,127) 1034 | try: 1035 | txBox.text_frame.fit_text(max_size=10) 1036 | except: 1037 | pass 1038 | #print('cannot fit the size of font') 1039 | # 绘制主体部分 1040 | for i,dd in enumerate(data): 1041 | slide_type=dd['slide_type'] 1042 | left,top=data_loc[i]['l'],data_loc[i]['t'] 1043 | width,height=data_loc[i]['w'],data_loc[i]['h'] 1044 | chart_type=dd['type'] if 'type' in dd else self.chart_type_default 1045 | if slide_type in ['table']: 1046 | # 绘制表格 1047 | '''针对表格大小修正 1048 | R,C=dd['data'].shape 1049 | width=max(0.5,min(1,C/6.0))*width 1050 | height=max(0.5,min(1,R/12.0))*height 1051 | left=0.5-width/2 1052 | top=0.25 1053 | ''' 1054 | df_to_table(slide,dd['data'],left,top,width,height,index_names=True) 1055 | elif slide_type in ['textbox']: 1056 | # 输出文本框 1057 | txBox = slide.shapes.add_textbox(left, top, width, height) 1058 | txBox.text_frame.text=dd['data'] 1059 | txBox.text_frame.paragraphs[0].font.language_id = 3076 1060 | try: 1061 | txBox.text_frame.fit_text(max_size=12) 1062 | except: 1063 | pass 1064 | elif slide_type in ['picture','figure']: 1065 | slide.shapes.add_picture(dd['data'], left, top, height=height) 1066 | elif slide_type in ['chart']: 1067 | # 插入图表 1068 | chart_type_code=chart_list[chart_type][1] 1069 | if 'pptx.chart.data.ChartData' in str(type(dd['data'])): 1070 | chart_data=dd['data'] 1071 | else: 1072 | chart_data=df_to_chartdata(dd['data'],chart_type_code) 1073 | chart=slide.shapes.add_chart(chart_list[chart_type.upper()][0],left, top, width, height, chart_data).chart 1074 | 1075 | if chart_type_code in [-4169,72,73,74,75]: 1076 | continue 1077 | font_default_size=Pt(10) if 'font_default_size' not 
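# ---- illustrative sketch (editor's note, not part of the library) ----
# add_slide() above normalises each bare item in `data` before drawing it: a
# DataFrame/Series becomes a chart, an existing file path becomes a picture,
# any other string becomes a textbox; explicit dicts are passed through as-is.
# All three hypothetical calls below therefore work:
#
# >>> r.add_slide(data=sales_df, title='Sales')                        # chart
# >>> r.add_slide(data='kdeplot.png', title='Distribution')            # picture
# >>> r.add_slide(data={'data': sales_df, 'slide_type': 'chart',
# ...                   'type': 'COLUMN_STACKED'}, title='Sales')      # explicit dict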
in config.__dict__ else config.font_default_size 1078 | # 添加图例 1079 | has_legend=kwarg['legend'] if 'legend' in kwarg else True 1080 | if has_legend and ((dd['data'].shape[1]>1) or (chart_type=='PIE')): 1081 | chart.has_legend = has_legend 1082 | chart.legend.font.size=font_default_size 1083 | chart.legend.position = XL_LEGEND_POSITION.BOTTOM 1084 | chart.legend.include_in_layout = False 1085 | try: 1086 | chart.category_axis.tick_labels.font.size=font_default_size 1087 | except: 1088 | pass#暂时不知道怎么处理 1089 | try: 1090 | chart.value_axis.tick_labels.font.size=font_default_size 1091 | except: 1092 | pass 1093 | # 添加数据标签 1094 | 1095 | non_available_list=['BUBBLE','BUBBLE_THREE_D_EFFECT','XY_SCATTER','XY_SCATTER_LINES','PIE'] 1096 | 1097 | # 数据标签数值格式 1098 | # 大致检测是否采用百分比 1099 | # 1、单选题每列的和肯定是100,顶多相差+-5 1100 | # 2、多选题每一列的和大于100,但单个的小于100.此处可能会有误判,但暂时无解 1101 | # 3、可能会有某一列全为0,此时单独考虑 1102 | if isinstance(dd['data'],(pd.core.frame.DataFrame,pd.core.frame.Series)) and ((dd['data'].sum()[dd['data'].sum()!=0]>90).all()) and ((dd['data']<=100).all().all()): 1103 | # 数据条的数据标签格式 1104 | number_format1=config.number_format_data 1105 | # 坐标轴的数据标签格式 1106 | number_format2=config.number_format_tick 1107 | else: 1108 | number_format1='0.00' 1109 | number_format2='0.0' 1110 | if 'number_format_data' in dd: 1111 | number_format1=dd['number_format_data'] 1112 | if 'number_format_tick' in dd: 1113 | number_format2=dd['number_format_tick'] 1114 | 1115 | if 'number_format_data' in kwarg: 1116 | number_format1=kwarg['number_format_data'] 1117 | if 'number_format_tick' in kwarg: 1118 | number_format2=kwarg['number_format_tick'] 1119 | 1120 | if 'data_labels' in kwarg: 1121 | has_data_labels = kwarg['data_labels'] 1122 | else: 1123 | has_data_labels=True 1124 | 1125 | if (chart_type not in non_available_list) or (chart_type == 'PIE'): 1126 | plot = chart.plots[0] 1127 | plot.has_data_labels = has_data_labels 1128 | if has_data_labels: 1129 | plot.data_labels.font.size = font_default_size 1130 | plot.data_labels.number_format = number_format1 1131 | #data_labels = plot.data_labels 1132 | #plot.data_labels.position = XL_LABEL_POSITION.BEST_FIT 1133 | if (chart_type not in non_available_list): 1134 | #chart.value_axis.maximum_scale = 1 1135 | if dd['data'].shape[1]==1: 1136 | chart.value_axis.has_major_gridlines = False 1137 | else: 1138 | chart.value_axis.has_major_gridlines = True 1139 | tick_labels = chart.value_axis.tick_labels 1140 | tick_labels.number_format = number_format2 1141 | tick_labels.font.size = font_default_size 1142 | 1143 | 1144 | 1145 | def save(self,filename=None): 1146 | assert (filename is not None) or (self.title is not None) 1147 | filename=self.title+time.strftime('_%Y%m%d%H%M.pptx', time.localtime()) if filename is None else filename 1148 | filename=os.path.splitext(filename)[0]+'.pptx' 1149 | self.prs.save(filename) 1150 | --------------------------------------------------------------------------------
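# Minimal end-to-end sketch (editor's note, illustrative only): build a deck
# with a cover, one chart slide and one table slide, then save it. The data
# below is hypothetical; Report, add_cover, add_slide and save are the public
# methods defined in reportgen/report.py above.
import pandas as pd
from reportgen.report import Report

df = pd.DataFrame({'share': [55.0, 30.0, 15.0]}, index=['A', 'B', 'C'])

r = Report()                              # falls back to the packaged template
r.add_cover(title='Demo Report')
r.add_slide(data={'data': df, 'slide_type': 'chart', 'type': 'PIE'},
            title='Share by group', summary='Group A dominates',
            footnote='N=800 (hypothetical)')
r.add_slide(data={'data': df, 'slide_type': 'table'}, title='Raw numbers')
r.save('demo_report')                     # writes demo_report.pptx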