├── reportgen.egg-info ├── dependency_links.txt ├── top_level.txt ├── requires.txt ├── SOURCES.txt └── PKG-INFO ├── reportgen ├── associate │ ├── __init__.py │ ├── _fpgrowth.pyx │ └── fpgrowth.py ├── font │ ├── readme.txt │ └── DroidSansFallback.ttf ├── images │ └── logo.png ├── template │ └── template.pptx ├── utils │ ├── __init__.py │ ├── utils.py │ ├── preprocessing.py │ ├── delaunay.py │ └── metrics.py ├── questionnaire │ ├── __init__.py │ └── README.md ├── __init__.py ├── config.py ├── README.rst ├── analysis.py └── report.py ├── example ├── datasets │ ├── [问卷星数据]800_800_0.xls │ ├── [问卷星数据]800_800_2.xls │ └── LendingClub_Sample.xlsx ├── analysis_example.py └── questionnaire_example.py ├── MANIFEST.in ├── LICENSE.txt ├── setup.py └── README.rst /reportgen.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /reportgen.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | reportgen 2 | -------------------------------------------------------------------------------- /reportgen/associate/__init__.py: -------------------------------------------------------------------------------- 1 | from .fpgrowth import * 2 | del fpgrowth 3 | -------------------------------------------------------------------------------- /reportgen.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | seaborn 4 | python-pptx 5 | Pillow 6 | -------------------------------------------------------------------------------- /reportgen/font/readme.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gasongjian/reportgen/HEAD/reportgen/font/readme.txt -------------------------------------------------------------------------------- /reportgen/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gasongjian/reportgen/HEAD/reportgen/images/logo.png -------------------------------------------------------------------------------- /reportgen/template/template.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gasongjian/reportgen/HEAD/reportgen/template/template.pptx -------------------------------------------------------------------------------- /reportgen/font/DroidSansFallback.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gasongjian/reportgen/HEAD/reportgen/font/DroidSansFallback.ttf -------------------------------------------------------------------------------- /example/datasets/[问卷星数据]800_800_0.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gasongjian/reportgen/HEAD/example/datasets/[问卷星数据]800_800_0.xls -------------------------------------------------------------------------------- /example/datasets/[问卷星数据]800_800_2.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gasongjian/reportgen/HEAD/example/datasets/[问卷星数据]800_800_2.xls -------------------------------------------------------------------------------- /example/datasets/LendingClub_Sample.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gasongjian/reportgen/HEAD/example/datasets/LendingClub_Sample.xlsx -------------------------------------------------------------------------------- /reportgen/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 2 | ''' 3 | 存在一些工具 4 | ''' 5 | from .utils import iqr 6 | from .delaunay import Delaunay2D 7 | 8 | __all__=['iqr', 9 | 'Delaunay2D'] 10 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | #phonetic representation 2 | include reportgen/images/logo.png 3 | include reportgen/template/template.pptx 4 | include reportgen/font/readme.txt 5 | include reportgen/font/DroidSansFallback.ttf 6 | -------------------------------------------------------------------------------- /reportgen/questionnaire/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 2 | ''' 3 | ''' 4 | from __future__ import division 5 | 6 | from . import questionnaire 7 | 8 | from .questionnaire import * 9 | 10 | del questionnaire -------------------------------------------------------------------------------- /reportgen/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 2 | from . import report 3 | from .report import * 4 | from . import analysis 5 | from .analysis import * 6 | from reportgen.utils import preprocessing 7 | from reportgen.utils import metrics 8 | from reportgen import questionnaire 9 | from reportgen import utils 10 | from reportgen import associate 11 | 12 | del report 13 | del analysis 14 | 15 | __version__ = '0.1.8' 16 | -------------------------------------------------------------------------------- /reportgen.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | MANIFEST.in 2 | README.rst 3 | setup.py 4 | reportgen/__init__.py 5 | reportgen/analysis.py 6 | reportgen/config.py 7 | reportgen/report.py 8 | reportgen.egg-info/PKG-INFO 9 | reportgen.egg-info/SOURCES.txt 10 | reportgen.egg-info/dependency_links.txt 11 | reportgen.egg-info/requires.txt 12 | reportgen.egg-info/top_level.txt 13 | reportgen/associate/__init__.py 14 | reportgen/associate/fpgrowth.py 15 | reportgen/font/DroidSansFallback.ttf 16 | reportgen/font/readme.txt 17 | reportgen/images/logo.png 18 | reportgen/questionnaire/__init__.py 19 | reportgen/questionnaire/questionnaire.py 20 | reportgen/template/template.pptx 21 | reportgen/utils/__init__.py 22 | reportgen/utils/delaunay.py 23 | reportgen/utils/metrics.py 24 | reportgen/utils/preprocessing.py 25 | reportgen/utils/utils.py -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 The Python Packaging Authority (PyPA) 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice 
shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from setuptools import setup, find_packages 4 | 5 | setup( 6 | name='reportgen', 7 | version='0.1.8', 8 | description=( 9 | 'reportgen is a Python library for creating and updating analysis report.' 10 | ), 11 | long_description=open('README.rst').read(), 12 | author='JSong', 13 | author_email='gasongjian@126.com', 14 | maintainer='JSong', 15 | maintainer_email='gasongjian@126.com', 16 | license='BSD License', 17 | packages=find_packages(), 18 | include_package_data=True, 19 | # relative to the vfclust directory 20 | package_data={ 21 | 'images':[ 22 | 'logo.png'], 23 | 'template': 24 | ['template.pptx'], 25 | 'font':['readme.txt','DroidSansFallback.ttf'] 26 | }, 27 | platforms=["all"], 28 | url='https://github.com/gasongjian/reportgen', 29 | classifiers=[ 30 | 'Development Status :: 4 - Beta', 31 | 'Intended Audience :: Developers', 32 | 'License :: OSI Approved :: BSD License', 33 | 'Programming Language :: Python', 34 | 'Programming Language :: Python :: Implementation', 35 | 'Programming Language :: Python :: 3.4', 36 | 'Programming Language :: Python :: 3.5', 37 | 'Programming Language :: Python :: 3.6', 38 | 'Topic :: Software Development :: Libraries' 39 | ], 40 | install_requires=[ 41 | 'pandas', 42 | 'numpy', 43 | 'seaborn', 44 | 'python-pptx', 45 | 'Pillow' 46 | ] 47 | ) 48 | -------------------------------------------------------------------------------- /reportgen/associate/_fpgrowth.pyx: -------------------------------------------------------------------------------- 1 | #cython: boundscheck=False 2 | #cython: wraparound=False 3 | #cython: initializedcheck=False 4 | #cython: cdivision=True 5 | #cython: embedsignature=True 6 | #cython: language_level=3 7 | #cython: language=c++ 8 | 9 | from libcpp.set cimport set as cppset 10 | from libcpp.vector cimport vector 11 | from libcpp.utility cimport pair 12 | from libcpp.unordered_map cimport unordered_map as hashmap 13 | 14 | 15 | cdef int _BUCKETING_FEW_ITEMS = 16 16 | BUCKETING_FEW_ITEMS = _BUCKETING_FEW_ITEMS 17 | 18 | ctypedef cppset[int] itemset_t 19 | ctypedef vector[pair[itemset_t, int]] itemsets_t 20 | 21 | 22 | cpdef itemsets_t bucketing_count(list db, 23 | cppset[int] frequent_items, 24 | int min_support): 25 | """ The bucketing count operation. 
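Projects every transaction in ``db`` onto the frequent items as a bit mask, accumulates the transaction weights in ``2**k`` buckets (where ``k = len(frequent_items)`` is assumed to be at most ``BUCKETING_FEW_ITEMS``), aggregates the buckets so that each bucket holds the support of the itemset encoded by its index ([2], Figure 5), and returns every itemset whose support reaches ``min_support``.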
""" 26 | cdef: 27 | int i, j, k = frequent_items.size() 28 | 29 | vector[int] inv_map = vector[int]() 30 | hashmap[int, int] fwd_map = hashmap[int, int]() 31 | int index = 0 32 | 33 | vector[int] buckets = vector[int](2**k, 0) 34 | pair[int, vector[int]] transaction 35 | int tid = 0 36 | int item 37 | 38 | int count 39 | itemset_t result 40 | itemsets_t results = itemsets_t() 41 | 42 | # Forward and inverse mapping of frequent_items to [0, n_items) 43 | for item in frequent_items: 44 | inv_map.push_back(item) 45 | fwd_map[item] = index 46 | index += 1 47 | # Project transactions 48 | for transaction in db: 49 | tid = 0 50 | for item in transaction.second: 51 | if not frequent_items.count(item): continue 52 | tid |= 1 << fwd_map.at(item) 53 | buckets[tid] += transaction.first 54 | # Aggregate bucketing counts ([2], Figure 5) 55 | for i in range(0, k): 56 | i = 1 << i 57 | for j in range(1 << k): 58 | if j & i == 0: 59 | buckets[j] += buckets[j + i] 60 | # Count results 61 | for tid in range(1, buckets.size()): 62 | count = buckets[tid] 63 | if count >= min_support: 64 | result = itemset_t() 65 | for i in range(_BUCKETING_FEW_ITEMS): 66 | if tid & 1 << i: 67 | result.insert(inv_map[i]) 68 | results.push_back(pair[itemset_t, int](result, count)) 69 | return results 70 | -------------------------------------------------------------------------------- /reportgen/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Aug 14 09:39:10 2017 4 | 5 | @author: JSong 6 | """ 7 | 8 | import os 9 | import sys 10 | import pandas as pd 11 | 12 | 13 | _thisdir = os.path.realpath(os.path.split(__file__)[0]) 14 | 15 | __all__=['template_pptx', 16 | 'font_path', 17 | 'chart_type_list', 18 | 'number_format_data', 19 | 'number_format_tick', 20 | 'font_default_size', 21 | 'summary_loc', 22 | 'chart_loc'] 23 | 24 | def _get_element_path(dir_name,suffix=None): 25 | if not(os.path.exists(os.path.join(_thisdir,dir_name))): 26 | element_path=None 27 | return element_path 28 | element_path=None 29 | filelist=os.listdir(os.path.join(_thisdir,dir_name)) 30 | if isinstance(suffix,str): 31 | suffix=[suffix] 32 | elif suffix is not None: 33 | suffix=list(suffix) 34 | for f in filelist: 35 | if isinstance(suffix,list) and os.path.splitext(f)[1][1:] in suffix: 36 | element_path=os.path.join(_thisdir,dir_name,f) 37 | return element_path 38 | 39 | 40 | # default pptx template 41 | template_pptx=_get_element_path('template',suffix=['pptx']) 42 | #template='template.pptx' 43 | 44 | 45 | # default font of chinese 46 | font_path=_get_element_path('font',suffix=['ttf','ttc']) 47 | if font_path is None: 48 | if sys.platform.startswith('win'): 49 | #font_path='C:\\windows\\fonts\\msyh.ttc' 50 | fontlist=['calibri.ttf','simfang.ttf','simkai.ttf','simhei.ttf','simsun.ttc','msyh.ttf','MSYH.TTC','msyh.ttc'] 51 | for f in fontlist: 52 | if os.path.exists(os.path.join('C:\\windows\\fonts\\',f)): 53 | font_path=os.path.join('C:\\windows\\fonts\\',f) 54 | 55 | chart_type_list={\ 56 | "COLUMN_CLUSTERED":['柱状图','ChartData','pptx'],\ 57 | "BAR_CLUSTERED":['条形图','ChartData','pptx'], 58 | 'HIST':['分布图,KDE','XChartData','matplotlib']} 59 | chart_type_list=pd.DataFrame(chart_type_list) 60 | 61 | 62 | # PPT图表中的数字位数 63 | number_format_data='0"%"' 64 | 65 | # PPT图表中坐标轴的数字标签格式 66 | number_format_tick='0"%"' 67 | 68 | # 默认字体大小 69 | ''' 70 | Pt(8):101600, Pt(10):127000, Pt(12):152400, Pt(14):177800 71 | Pt(16):203200, Pt(18):228600, Pt(20):254000, Pt(22):279400 72 
| Pt(24):304800, Pt(26):330200 73 | ''' 74 | font_default_size=127000# Pt(10) 75 | 76 | 77 | # PPT中结论文本框所在的位置 78 | # 四个值依次为left、top、width、height 79 | summary_loc=[0.10,0.14,0.80,0.15] 80 | 81 | 82 | # PPT中结论文本框所在的位置 83 | # 四个值依次为left、top、width、height 84 | chart_loc=[0.10,0.30,0.80,0.60] 85 | -------------------------------------------------------------------------------- /example/analysis_example.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Mar 25 17:57:48 2018 4 | 5 | @author: gason 6 | """ 7 | import pandas as pd 8 | import numpy as np 9 | import reportgen as rpt 10 | from sklearn import preprocessing 11 | from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier 12 | from sklearn.linear_model import LogisticRegressionCV 13 | 14 | import warnings 15 | warnings.filterwarnings('ignore') #为了整洁,去除弹出的warnings 16 | pd.set_option('precision', 5) #设置精度 17 | pd.set_option('display.float_format', lambda x: '%.5f' % x) #为了直观的显示数字,不采用科学计数法 18 | pd.options.display.max_rows = 200 #最多显示200行 19 | 20 | 21 | 22 | 23 | # 数据导入 24 | data=pd.read_excel('.\\datasets\\LendingClub_Sample.xlsx') 25 | 26 | # 数据预览 27 | rpt.AnalysisReport(data.copy(),filename='LendingClub 数据预览'); 28 | 29 | # 机器学习相关函数补充 30 | 31 | # 只作工具包测试,所以不区分训练集和测试集 32 | y=data['target'] 33 | X=data.drop(['target'],axis=1) 34 | 35 | 36 | # convert into dummies 37 | categorical_var=list(set(X.columns[X.apply(pd.Series.nunique)<30])|set(X.select_dtypes(include=['O']).columns)) 38 | #categorical_var = ['collections_12_mths_ex_med', 'home_ownership', 'sub_grade',\ 39 | #'inq_last_6mths', 'initial_list_status', 'emp_length', 'application_type', \ 40 | #'acc_now_delinq', 'grade', 'purpose', 'verification_status', 'addr_state', 'term', 'pub_rec', 'delinq_2yrs'] 41 | 42 | continuous_var=list(set(X.columns)-set(categorical_var)) 43 | #continuous_var=['open_acc', 'total_rev_hi_lim', 'loan_amnt', 'tot_coll_amt', \ 44 | #'total_acc', 'tot_cur_bal', 'dti', 'annual_inc', 'earliest_cr_line', 'int_rate', 'installment'] 45 | 46 | # WOE 编码 47 | woe=rpt.preprocessing.WeightOfEvidence(categorical_features=categorical_var,encoder_na=False) 48 | X=woe.fit_transform(X,y) 49 | 50 | # 离散化 51 | #dis=rpt.preprocessing.Discretization(continous_features=continuous_var) 52 | #X2=dis.fit_transform(X,y) 53 | 54 | # 补缺和标准化 55 | X=X.fillna(-99) 56 | X[continuous_var]=preprocessing.MinMaxScaler().fit_transform(X[continuous_var]) 57 | 58 | 59 | clfs={'LogisticRegression':LogisticRegressionCV(),\ 60 | 'RandomForest':RandomForestClassifier(),'GradientBoosting':GradientBoostingClassifier()} 61 | y_preds,y_probas={},{} 62 | for clf in clfs: 63 | clfs[clf].fit(X, y) 64 | y_preds[clf] =clfs[clf].predict(X) 65 | y_probas[clf] = clfs[clf].predict_proba(X)[:,1] 66 | 67 | models_report,conf_matrix=rpt.ClassifierReport(y,y_preds,y_probas) 68 | print(models_report) 69 | 70 | 71 | # 信息论度量 72 | p=y_probas['LogisticRegression'][y==1] 73 | q=y_probas['LogisticRegression'][y==0] 74 | print(rpt.metrics.entropyc.kl_div(p,q)) 75 | 76 | 77 | def xiu(data): 78 | data.iloc[:,0]=1 79 | return 2 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /reportgen/utils/utils.py: -------------------------------------------------------------------------------- 1 | """Small plotting-related utility functions.""" 2 | from __future__ import print_function, division 3 | import colorsys 4 | import warnings 5 | import os 6 | 
7 | import numpy as np 8 | from scipy import stats 9 | import pandas as pd 10 | 11 | 12 | 13 | 14 | 15 | def _kde_support(data, bw, gridsize, cut, clip): 16 | """Establish support for a kernel density estimate.""" 17 | support_min = max(data.min() - bw * cut, clip[0]) 18 | support_max = min(data.max() + bw * cut, clip[1]) 19 | return np.linspace(support_min, support_max, gridsize) 20 | 21 | 22 | def percentiles(a, pcts, axis=None): 23 | """Like scoreatpercentile but can take and return array of percentiles. 24 | 25 | Parameters 26 | ---------- 27 | a : array 28 | data 29 | pcts : sequence of percentile values 30 | percentile or percentiles to find score at 31 | axis : int or None 32 | if not None, computes scores over this axis 33 | 34 | Returns 35 | ------- 36 | scores: array 37 | array of scores at requested percentiles 38 | first dimension is length of object passed to ``pcts`` 39 | 40 | """ 41 | scores = [] 42 | try: 43 | n = len(pcts) 44 | except TypeError: 45 | pcts = [pcts] 46 | n = 0 47 | for i, p in enumerate(pcts): 48 | if axis is None: 49 | score = stats.scoreatpercentile(a.ravel(), p) 50 | else: 51 | score = np.apply_along_axis(stats.scoreatpercentile, axis, a, p) 52 | scores.append(score) 53 | scores = np.asarray(scores) 54 | if not n: 55 | scores = scores.squeeze() 56 | return scores 57 | 58 | 59 | def ci(a, which=95, axis=None): 60 | """Return a percentile range from an array of values.""" 61 | p = 50 - which / 2, 50 + which / 2 62 | return percentiles(a, p, axis) 63 | 64 | 65 | def iqr(a): 66 | """Calculate the IQR for an array of numbers.""" 67 | a = np.asarray(a) 68 | q1 = stats.scoreatpercentile(a, 25) 69 | q3 = stats.scoreatpercentile(a, 75) 70 | return q3 - q1 71 | 72 | 73 | 74 | def categorical_order(values, order=None): 75 | """Return a list of unique data values. 76 | 77 | Determine an ordered list of levels in ``values``. 78 | 79 | Parameters 80 | ---------- 81 | values : list, array, Categorical, or Series 82 | Vector of "categorical" values 83 | order : list-like, optional 84 | Desired order of category levels to override the order determined 85 | from the ``values`` object. 86 | 87 | Returns 88 | ------- 89 | order : list 90 | Ordered list of category levels not including null values. 
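Numeric values are returned sorted; other values keep their order of appearance. Examples -------- >>> categorical_order(pd.Series([2, 1, 2])) [1, 2] >>> categorical_order(pd.Series(['b', 'a', 'b'])) ['b', 'a']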
91 | 92 | """ 93 | if order is None: 94 | if hasattr(values, "categories"): 95 | order = values.categories 96 | else: 97 | try: 98 | order = values.cat.categories 99 | except (TypeError, AttributeError): 100 | try: 101 | order = values.unique() 102 | except AttributeError: 103 | order = pd.unique(values) 104 | try: 105 | np.asarray(values).astype(np.float) 106 | order = np.sort(order) 107 | except (ValueError, TypeError): 108 | order = order 109 | order = filter(pd.notnull, order) 110 | return list(order) 111 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /example/questionnaire_example.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | # reportgen v 0.1.8 4 | ------------------- 5 | 6 | ## **问卷模块** :问卷类型的数据分析 7 | 8 | ------------------ 9 | 10 | 问卷数据涉及到各种题型,包括单选题、多选题、填空题、矩阵多选题、排序题等等。不管是频数统计还是交叉分析,单选题都很好处理。但其他题目就相对复杂的多,比如单选题和多选题之间的交叉统计,多选题和多选题之间的交叉统计。 11 | 12 | 为了区分题目类型和统计处理方法,本工具包统一使用新型的数据类型(或者说编码方式)。在这种类型中,每一份问卷都有两个文件,data 和 code ,它们的含义如下: 13 | 14 | - 1)、data:按序号编码的数据(csv、xlsx等都可以),示例如下: 15 | 16 | |Q1|Q2|Q3_A1|Q3_A2|Q3_A3|Q3_A4| 17 | |:----:|:---:|:----:|:----:|:---:|:----:| 18 | |1|1|1|0|1|0| 19 | |1|2|0|0|1|0| 20 | |1|1|1|0|0|1| 21 | |2|3|0|1|1|0| 22 | |1|2|1|0|1|0| 23 | |1|4|0|1|0|1| 24 | |2|2|1|0|1|0| 25 | |1|1|0|1|0|1| 26 | |2|2|1|0|1|0| 27 | 28 | - 2)、code:编码文件( json格式,就是 python中的字典类型), 给定每道题的题号、序号编码等内容, 29 | 每一个题目都有如下字段: 30 | 31 | - content: 题目内容 32 | - code:题目对应的编码 33 | - code_r: 题目对应的编码(矩阵单选题专有) 34 | - qtype:题目类型,单选题、多选题、矩阵单选题、排序题、填空题等 35 | - qlist:该题的索引,如多选题的 ['Q1_A1','Q1_A2',..] 36 | - code_order: 非必须,题目类别的顺序,用于PPT报告的生成[一般后期添加] 37 | - name: 非必须,特殊题型的标注 38 | - weight:非必须,dict,每个选项的权重,用于如月收入等的平均数统计 39 | 40 | 示例: 41 | 42 | ```json 43 | code={'Q1':{ 44 | 'content':'性别', 45 | 'code':{ 46 | 1:'男', 47 | 2:'女' 48 | } 49 | 'qtype':'单选题', 50 | 'qlist':['Q1'] 51 | }, 52 | 'Q2':{ 53 | 'content':'年龄', 54 | 'code':{ 55 | 1:'17岁以下', 56 | 2:'18-25岁', 57 | 3:'26-35岁', 58 | 4:'36-46岁' 59 | }, 60 | 'qtype':'单选题', 61 | 'qlist':['Q2'] 62 | }, 63 | 'Q3':{ 64 | 'content':'爱好', 65 | 'code':{ 66 | 'Q3_A1':'17岁以下', 67 | 'Q3_A2':'18-25岁', 68 | 'Q3_A3':'26-35岁', 69 | 'Q3_A4':'36-46岁' 70 | }, 71 | 'qtype':'多选题', 72 | 'qlist':['Q3_A1','Q3_A2','Q3_A3','Q3_A4'] 73 | } 74 | } 75 | 76 | ##该工具包包含如下函数: 77 | 78 | ### 文件 IO 79 | 80 | - `read_code`, 从本地读取code数据,支持excel文件和json文件 81 | - `save_code`, 将code 保存为 xlsx 或json数据 82 | - `load_data`, 支持打开文件窗口来选择问卷数据 83 | - `read_data`, 读取本地的数据,自适应xlsx、csv等 84 | - `save_data`, 将问卷数据(data和code)保存到本地 85 | - `wenjuanwang`, 编码问卷网平台的问卷数据,输入为问卷网上下载的三个文件 86 | - `wenjuanxing`, 编码问卷星平台的问卷数据,输入为问卷星网站上下载的两个xls文件(按选项序号和按选项文本) 87 | 88 | ### 数据处理 89 | - `spec_rcode`: 对问卷中的一些特殊题型进行处理,如将城市题分类成省份、城市、城市级别等 90 | - `dataText_to_code`: 91 | - `dataCode_to_text`: 92 | - `var_combine`: 见data_merge 93 | - `data_merge`: 合并两份问卷数据,常见于多个推动渠道的问卷合并 94 | - `clean_ftime`: 根据用户填写时间来筛选问卷,会根据填问卷累计时间曲线的拐点来给出剔除的时间点 95 | - `data_auto_code`: 96 | - `qdata_flatten`: 将问卷数据展平,便于将多份问卷数据存储在同一个数据库中 97 | 98 | ### 统计检验等 99 | - `sample_size_cal`: 样本量计算公式 100 | - `confidence_interval`: 置信区间计算公式 101 | - `gof_test`: 拟合优度检验 102 | - `chi2_test`: 卡方检验 103 | - `fisher_exact`: 卡方检验,适用于观察频数过少的情形 104 | - `anova`: 方差分析 105 | 106 | ### 数据分析 107 | - `mca`: 对应分析,目前只支持两个变量 108 | - `cluster`: 态度题的聚类分析,会根据轮廓系数自动选择最佳类别数 109 | - `association_rules`: 关联分析,用于多选题的进一步分析 110 | 111 | ### 统计 112 | - `contingency`: 列联表分析,统一给出列联表的各种数据,包含fo、fop、TGI等 113 | - `qtable`: 单个题目的统计分析和两个题目的交叉分析,给出频数表和频率表 114 | 115 | ### 可视化 116 | - 
`summary_chart`: 整体统计报告,针对每一道题,选择合适的图表进行展示,并输出为pptx文件 117 | - `cross_chart`: 交叉分析报告,如能将年龄与每一道题目进行交叉分析,并输出为pptx文件 118 | - `onekey_gen`: 综合上两个,一键生成 119 | - `scorpion`: 生成一个表格,内含每个题目的相关统计信息 120 | - `scatter`: 散点图绘制,不同于matplotlib的是,其能给每个点加文字标签 121 | - `sankey`: 桑基图绘制,不画图,只提供 R 需要的数据 122 | 123 | """ 124 | 125 | import reportgen.questionnaire as ques 126 | 127 | 128 | # 导入问卷星数据 129 | datapath=['.\\datasets\\[问卷星数据]800_800_0.xls','.\\datasets\\[问卷星数据]800_800_2.xls'] 130 | data,code=ques.wenjuanxing(datapath) 131 | 132 | # 导出 133 | ques.save_data(data,filename='data.xlsx') 134 | ques.save_data(data,filename='data.xlsx',code=code)# 会将选项编码替换成文本 135 | ques.save_code(code,filename='code.xlsx') 136 | 137 | 138 | # 对单变量进行统计分析 139 | result=ques.qtable(data,code,'Q1') 140 | print(result['fo']) 141 | 142 | # 两个变量的交叉分析 143 | result=ques.qtable(data,code,'Q1','Q2') 144 | print(result['fop']) 145 | 146 | # 聚类分析,会在原数据上添加一列,类别题 147 | #ques.cluster(data,code,'态度题') 148 | 149 | # 在.\\out\\下 生成 pptx文件 150 | ques.summary_chart(data,code,filename='整体统计报告'); 151 | ques.cross_chart(data,code,cross_class='Q4',filename='交叉分析报告_年龄'); 152 | ques.scorpion(data,code,filename='详细分析数据') 153 | ques.onekey_gen(data,code,filename='reportgen 自动生成报告'); 154 | -------------------------------------------------------------------------------- /reportgen/questionnaire/README.md: -------------------------------------------------------------------------------- 1 | # reportgen v 0.1.8 2 | ------------------- 3 | 4 | ## **问卷模块** :问卷类型的数据分析 5 | 6 | ------------------ 7 | 8 | 问卷数据涉及到各种题型,包括单选题、多选题、填空题、矩阵多选题、排序题等等。不管是频数统计还是交叉分析,单选题都很好处理。但其他题目就相对复杂的多,比如单选题和多选题之间的交叉统计,多选题和多选题之间的交叉统计。 9 | 10 | 为了区分题目类型和统计处理方法,本工具包统一使用新型的数据类型(或者说编码方式)。在这种类型中,每一份问卷都有两个文件,data 和 code ,它们的含义如下: 11 | 12 | - 1)、data:按序号编码的数据(csv、xlsx等都可以),示例如下: 13 | 14 | |Q1|Q2|Q3_A1|Q3_A2|Q3_A3|Q3_A4| 15 | |:----:|:---:|:----:|:----:|:---:|:----:| 16 | |1|1|1|0|1|0| 17 | |1|2|0|0|1|0| 18 | |1|1|1|0|0|1| 19 | |2|3|0|1|1|0| 20 | |1|2|1|0|1|0| 21 | |1|4|0|1|0|1| 22 | |2|2|1|0|1|0| 23 | |1|1|0|1|0|1| 24 | |2|2|1|0|1|0| 25 | 26 | - 2)、code:编码文件( json格式,就是 python中的字典类型), 给定每道题的题号、序号编码等内容, 27 | 每一个题目都有如下字段: 28 | 29 | - content: 题目内容 30 | - code:题目对应的编码 31 | - code_r: 题目对应的编码(矩阵单选题专有) 32 | - qtype:题目类型,单选题、多选题、矩阵单选题、排序题、填空题等 33 | - qlist:该题的索引,如多选题的 ['Q1_A1','Q1_A2',..] 
34 | - code_order: 非必须,题目类别的顺序,用于PPT报告的生成[一般后期添加] 35 | - name: 非必须,特殊题型的标注 36 | - weight:非必须,dict,每个选项的权重,用于如月收入等的平均数统计 37 | 38 | 示例: 39 | 40 | ```json 41 | code={'Q1':{ 42 | 'content':'性别', 43 | 'code':{ 44 | 1:'男', 45 | 2:'女' 46 | } 47 | 'qtype':'单选题', 48 | 'qlist':['Q1'] 49 | }, 50 | 'Q2':{ 51 | 'content':'年龄', 52 | 'code':{ 53 | 1:'17岁以下', 54 | 2:'18-25岁', 55 | 3:'26-35岁', 56 | 4:'36-46岁' 57 | }, 58 | 'qtype':'单选题', 59 | 'qlist':['Q2'] 60 | }, 61 | 'Q3':{ 62 | 'content':'爱好', 63 | 'code':{ 64 | 'Q3_A1':'17岁以下', 65 | 'Q3_A2':'18-25岁', 66 | 'Q3_A3':'26-35岁', 67 | 'Q3_A4':'36-46岁' 68 | }, 69 | 'qtype':'多选题', 70 | 'qlist':['Q3_A1','Q3_A2','Q3_A3','Q3_A4'] 71 | } 72 | } 73 | 74 | ##该工具包包含如下函数: 75 | 76 | ### 文件 IO 77 | 78 | - `read_code`, 从本地读取code数据,支持excel文件和json文件 79 | - `save_code`, 将code 保存为 xlsx 或json数据 80 | - `load_data`, 支持打开文件窗口来选择问卷数据 81 | - `read_data`, 读取本地的数据,自适应xlsx、csv等 82 | - `save_data`, 将问卷数据(data和code)保存到本地 83 | - `wenjuanwang`, 编码问卷网平台的问卷数据,输入为问卷网上下载的三个文件 84 | - `wenjuanxing`, 编码问卷星平台的问卷数据,输入为问卷星网站上下载的两个xls文件(按选项序号和按选项文本) 85 | 86 | ### 数据处理 87 | - `spec_rcode`: 对问卷中的一些特殊题型进行处理,如将城市题分类成省份、城市、城市级别等 88 | - `dataText_to_code`: 89 | - `dataCode_to_text`: 90 | - `var_combine`: 见data_merge 91 | - `data_merge`: 合并两份问卷数据,常见于多个推动渠道的问卷合并 92 | - `clean_ftime`: 根据用户填写时间来筛选问卷,会根据填问卷累计时间曲线的拐点来给出剔除的时间点 93 | - `data_auto_code`: 94 | - `qdata_flatten`: 将问卷数据展平,便于将多份问卷数据存储在同一个数据库中 95 | 96 | ### 统计检验等 97 | - `sample_size_cal`: 样本量计算公式 98 | - `confidence_interval`: 置信区间计算公式 99 | - `gof_test`: 拟合优度检验 100 | - `chi2_test`: 卡方检验 101 | - `fisher_exact`: 卡方检验,适用于观察频数过少的情形 102 | - `anova`: 方差分析 103 | 104 | ### 数据分析 105 | - `mca`: 对应分析,目前只支持两个变量 106 | - `cluster`: 态度题的聚类分析,会根据轮廓系数自动选择最佳类别数 107 | - `association_rules`: 关联分析,用于多选题的进一步分析 108 | 109 | ### 统计 110 | - `contingency`: 列联表分析,统一给出列联表的各种数据,包含fo、fop、TGI等 111 | - `qtable`: 单个题目的统计分析和两个题目的交叉分析,给出频数表和频率表 112 | 113 | ### 可视化 114 | - `summary_chart`: 整体统计报告,针对每一道题,选择合适的图表进行展示,并输出为pptx文件 115 | - `cross_chart`: 交叉分析报告,如能将年龄与每一道题目进行交叉分析,并输出为pptx文件 116 | - `onekey_gen`: 综合上两个,一键生成 117 | - `scorpion`: 生成一个表格,内含每个题目的相关统计信息 118 | - `scatter`: 散点图绘制,不同于matplotlib的是,其能给每个点加文字标签 119 | - `sankey`: 桑基图绘制,不画图,只提供 R 需要的数据 120 | """ 121 | 122 | 123 | ## 一些实践: 124 | 125 | 数据在 .\\example\\datasets\\ 126 | 127 | ```python 128 | import reportgen.questionnaire as ques 129 | 130 | 131 | # 导入问卷星数据 132 | datapath=['.\\datasets\\[问卷星数据]800_800_0.xls','.\\datasets\\[问卷星数据]800_800_2.xls'] 133 | data,code=ques.wenjuanxing(datapath) 134 | 135 | # 导出 136 | ques.save_data(data,filename='data.xlsx') 137 | ques.save_data(data,filename='data.xlsx',code=code)# 会将选项编码替换成文本 138 | ques.save_code(code,filename='code.xlsx') 139 | 140 | 141 | # 对单变量进行统计分析 142 | result=ques.qtable(data,code,'Q1') 143 | print(result['fo']) 144 | 145 | # 两个变量的交叉分析 146 | result=ques.qtable(data,code,'Q1','Q2') 147 | print(result['fop']) 148 | 149 | # 聚类分析,会在原数据上添加一列,类别题 150 | #ques.cluster(data,code,'态度题') 151 | 152 | # 在.\\out\\下 生成 pptx文件 153 | ques.summary_chart(data,code,filename='整体统计报告'); 154 | ques.cross_chart(data,code,cross_class='Q4',filename='交叉分析报告_年龄'); 155 | ques.scorpion(data,code,filename='详细分析数据') 156 | ques.onekey_gen(data,code,filename='reportgen 自动生成报告'); 157 | ``` 158 | -------------------------------------------------------------------------------- /reportgen/README.rst: -------------------------------------------------------------------------------- 1 | reportgen 2 | =========== 3 | 4 | Release v0.1.8 5 | 6 | *reportgen* is a Python library for creating and updating analysis report. 
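Installation ------------ A minimal sketch, assuming a checkout of this repository and the setuptools-based ``setup.py`` shipped with it: :: python setup.py install If the package has been published to PyPI under the same name, ``pip install reportgen`` should work as well; the dependencies declared in ``setup.py`` (pandas, numpy, seaborn, python-pptx, Pillow) are installed automatically.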
7 | 8 | Release History 9 | ------------------ 10 | 0.1.8(2018-03-28) 11 | 12 | - Add subpackages metrics and preprocessing which contain entropy, WOE, discretization, etc. 13 | - Add association analysis (FP-growth): frequent_itemsets and association_rules. 14 | - Add functions: ClassifierReport, type_of_var. 15 | - Fix the package logic. 16 | - Fix some bugs. 17 | 18 | 0.1.6(2017-12-06) 19 | 20 | - Add function rpt.plot(). 21 | - Support drawing on an existing matplotlib figure and Report file 22 | - Fix some bugs. 23 | 24 | 0.1.5(2017-11-29) 25 | 26 | - Add function AnalysisReport, which can plot general data to pptx files. 27 | - Fix some bugs. 28 | 29 | 0.1.0(2017-11-18) 30 | 31 | - Create. 32 | 33 | 34 | Feature Support 35 | ------------------ 36 | 37 | **reportgen** has the following capabilities, with many more on the roadmap: 38 | 39 | - get all the texts in a pptx file 40 | - get all the images in a pptx file 41 | - simply add one slide of charts/tables/images built from pandas data to a pptx file 42 | - simply add multiple slides of charts/tables/images built from pandas data to a pptx file 43 | 44 | Quick Start 45 | ------------ 46 | 47 | 1. Get texts or images in a pptx file. 48 | 49 | :: 50 | 51 | import reportgen as rpt 52 | # Open a pptx file 53 | p=rpt.Report('analysis.pptx') 54 | # We can get the texts and images simply. 55 | result=p.get_texts() 56 | print('\n'.join(result)) 57 | # All the images will be saved in the folder '.\\images\\'. 58 | p.get_images() 59 | 60 | 2. Create an analysis report. 61 | 62 | :: 63 | 64 | import reportgen as rpt 65 | import pandas as pd 66 | # Open a pptx file 67 | p=rpt.Report('template.pptx')# The parameter can be omitted 68 | # add a cover 69 | p.add_cover(title='An analysis report powered by reportgen') 70 | # add a chart slide 71 | data=pd.DataFrame({'Jack':[90,80,100],'David':[100,70,85]},index=['Math','English','Physics']) 72 | p.add_slide(data={'data':data,'slide_type':'chart','type':'COLUMN_CLUSTERED'},\ 73 | title='the scores report',summary='Our class got excellent results',footnote='This is a footnote.') 74 | # add a table slide 75 | data=pd.DataFrame({'Jack':[90,80,100],'David':[100,70,85]},index=['Math','English','Physics']) 76 | p.add_slide(data={'data':data,'slide_type':'table'},title='the scores report',summary='Our class got excellent results',footnote='This is a footnote.') 77 | # add a textbox slide 78 | data='This is a paragraph. \n'*4 79 | p.add_slide(data={'data':data,'slide_type':'textbox'},title='This is a textbox slide',summary='',footnote='') 80 | # add a picture slide 81 | data='.\\images\\images.png' 82 | p.add_slide(data={'data':data,'slide_type':'picture'},title='This is a picture slide') 83 | p.save('analysis report.pptx') 84 | 85 | 86 | 87 | 88 | In general, I divide a slide of an analysis report into four parts: title, summary, footnote and the body data. The body is one or more charts/textboxes/tables/pictures. 89 | 90 | The *add_slide* function, which is the most commonly used one, has the following parameters: 91 | 92 | :: 93 | 94 | add_slide(data=[{'data':,'slide_type':,'type':},],title='',summary='',footnote='',layouts='auto') 95 | 96 | For example, we can draw a chart on the left side, and insert a picture on the right.
97 | 98 | :: 99 | 100 | import reportgen as rpt 101 | import pandas as pd 102 | p=rpt.Report() 103 | scores=pd.DataFrame({'Jack':[90,80,100],'David':[100,70,85]},index=['Math','English','Physics']) 104 | data=[{'data':scores,'slide_type':'chart','type':'COLUMN_CLUSTERED'},\ 105 | {'data':'.\\images2.jpg','slide_type':'picture'}] 106 | p.add_slide(data=data) 107 | p.save('add_slide.pptx') 108 | 109 | As a lazy person, I also provide a solution with less code. 110 | 111 | :: 112 | 113 | import reportgen as rpt 114 | p=rpt.Report() 115 | imgs=['.\\images\\'+img for img in os.listdir('.\\images\\')] 116 | p.add_slides(data=imgs) 117 | # a more flexible way 118 | slides_data=[{'title':'ppt{}'.format(i),'data':data} for i in range(10)] 119 | p.add_slides(slides_data) 120 | p.save('add_slides.pptx') 121 | 122 | 123 | Now you can get a quick overview of any data. 124 | 125 | :: 126 | 127 | import pandas as pd 128 | import reportgen as rpt 129 | 130 | data=pd.read_excel('Scores.xlsx') 131 | rpt.AnalysisReport(data,filename='Analysis Report of Scores.pptx'); 132 | 133 | The script will create a pptx file that analyzes all the fields of the data in a visual way. 134 | 135 | TO DO 136 | ------- 137 | 138 | - support exporting analysis reports to html 139 | - make the chart_type recommendation more intelligent 140 | 141 | 142 | Contact 143 | -------- 144 | 145 | If you have any questions, you can email gasongjian AT 126.com. And if you have a WeChat account, you can follow my WeChat Official Account: gasongjian. 146 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | reportgen 2 | =========== 3 | 4 | Release v0.1.8 5 | 6 | *reportgen* is a Python library for creating and updating analysis report. 7 | 8 | Release History 9 | ------------------ 10 | 0.1.8(2018-03-28) 11 | 12 | - Add subpackages metrics and preprocessing which contain entropy, WOE, discretization, etc. 13 | - Add association analysis (FP-growth): frequent_itemsets and association_rules. 14 | - Add functions: ClassifierReport, type_of_var. 15 | - Fix the package logic. 16 | - Fix some bugs. 17 | 18 | 0.1.6(2017-12-06) 19 | 20 | - Add function rpt.plot(). 21 | - Support drawing on an existing matplotlib figure and Report file 22 | - Fix some bugs. 23 | 24 | 0.1.5(2017-11-29) 25 | 26 | - Add function AnalysisReport, which can plot general data to pptx files. 27 | - Fix some bugs. 28 | 29 | 0.1.0(2017-11-18) 30 | 31 | - Create. 32 | 33 | 34 | Feature Support 35 | ------------------ 36 | 37 | **reportgen** has the following capabilities, with many more on the roadmap: 38 | 39 | - get all the texts in a pptx file 40 | - get all the images in a pptx file 41 | - simply add one slide of charts/tables/images built from pandas data to a pptx file 42 | - simply add multiple slides of charts/tables/images built from pandas data to a pptx file 43 | 44 | Quick Start 45 | ------------ 46 | 47 | 1. Get texts or images in a pptx file. 48 | 49 | :: 50 | 51 | # import 52 | import reportgen as rpt 53 | # Open a pptx file 54 | p=rpt.Report('analysis.pptx') 55 | # We can get the texts and images simply. 56 | result=p.get_texts() 57 | print('\n'.join(result)) 58 | # All the images will be saved in the folder '.\\images\\'. 59 | p.get_images() 60 | 61 | 2. Create an analysis report.
62 | 63 | :: 64 | 65 | # 66 | import reportgen as rpt 67 | import pandas as pd 68 | # Open a pptx file 69 | p=rpt.Report('template.pptx')# The parameter can be omitted 70 | # add a cover 71 | p.add_cover(title='An analysis report powered by reportgen') 72 | # add a chart slide 73 | data=pd.DataFrame({'Jack':[90,80,100],'David':[100,70,85]},index=['Math','English','Physics']) 74 | p.add_slide(data={'data':data,'slide_type':'chart','type':'COLUMN_CLUSTERED'},\ 75 | title='the scores report',summary='Our class got excellent results',footnote='This is a footnote.') 76 | # add a table slide 77 | data=pd.DataFrame({'Jack':[90,80,100],'David':[100,70,85]},index=['Math','English','Physics']) 78 | p.add_slide(data={'data':data,'slide_type':'table'},title='the scores report',summary='Our class got excellent results',footnote='This is a footnote.') 79 | # add a textbox slide 80 | data='This is a paragraph. \n'*4 81 | p.add_slide(data={'data':data,'slide_type':'textbox'},title='This is a textbox slide',summary='',footnote='') 82 | # add a picture slide 83 | data='.\\images\\images.png' 84 | p.add_slide(data={'data':data,'slide_type':'picture'},title='This is a picture slide') 85 | p.save('analysis report.pptx') 86 | 87 | 88 | 89 | 90 | In general, I divide a slide of an analysis report into four parts: title, summary, footnote and the body data. The body is one or more charts/textboxes/tables/pictures. 91 | 92 | The *add_slide* function, which is the most commonly used one, has the following parameters: 93 | 94 | :: 95 | 96 | add_slide(data=[{'data':,'slide_type':,'type':},],title='',summary='',footnote='',layouts='auto') 97 | 98 | For example, we can draw a chart on the left side, and insert a picture on the right. 99 | 100 | :: 101 | 102 | import reportgen as rpt 103 | import pandas as pd 104 | p=rpt.Report() 105 | scores=pd.DataFrame({'Jack':[90,80,100],'David':[100,70,85]},index=['Math','English','Physics']) 106 | data=[{'data':scores,'slide_type':'chart','type':'COLUMN_CLUSTERED'},\ 107 | {'data':'.\\images2.jpg','slide_type':'picture'}] 108 | p.add_slide(data=data) 109 | p.save('add_slide.pptx') 110 | 111 | As a lazy person, I also provide a solution with less code. 112 | 113 | :: 114 | 115 | import reportgen as rpt 116 | p=rpt.Report() 117 | imgs=['.\\images\\'+img for img in os.listdir('.\\images\\')] 118 | p.add_slides(data=imgs) 119 | # a more flexible way 120 | slides_data=[{'title':'ppt{}'.format(i),'data':data} for i in range(10)] 121 | p.add_slides(slides_data) 122 | p.save('add_slides.pptx') 123 | 124 | 125 | Now you can get a quick overview of any data. 126 | 127 | :: 128 | 129 | import pandas as pd 130 | import reportgen as rpt 131 | 132 | data=pd.read_excel('Scores.xlsx') 133 | rpt.AnalysisReport(data,filename='Analysis Report of Scores.pptx'); 134 | 135 | The script will create a pptx file that analyzes all the fields of the data in a visual way. 136 | 137 | TO DO 138 | ------- 139 | 140 | - support exporting analysis reports to html 141 | - make the chart_type recommendation more intelligent 142 | 143 | 144 | Contact 145 | -------- 146 | 147 | If you have any questions, you can email gasongjian AT 126.com. And if you have a WeChat account, you can follow my WeChat Official Account: gasongjian.
148 | -------------------------------------------------------------------------------- /reportgen.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.1 2 | Name: reportgen 3 | Version: 0.1.8 4 | Summary: reportgen is a Python library for creating and updating analysis report. 5 | Home-page: https://github.com/gasongjian/reportgen 6 | Author: JSong 7 | Author-email: gasongjian@126.com 8 | License: BSD License 9 | Description-Content-Type: UNKNOWN 10 | Description: reportgen 11 | =========== 12 | 13 | Release v0.1.8 14 | 15 | *reportgen* is a Python library for creating and updating analysis report. 16 | 17 | Release History 18 | ------------------ 19 | 0.1.8(2018-03-28) 20 | 21 | - Add subpackages metrics and preprocessing which contain entropy, WOE, discretization, etc. 22 | - Add association analysis (FP-growth): frequent_itemsets and association_rules. 23 | - Add functions: ClassifierReport, type_of_var. 24 | - Fix the package logic. 25 | - Fix some bugs. 26 | 27 | 0.1.6(2017-12-06) 28 | 29 | - Add function rpt.plot(). 30 | - Support drawing on an existing matplotlib figure and Report file 31 | - Fix some bugs. 32 | 33 | 0.1.5(2017-11-29) 34 | 35 | - Add function AnalysisReport, which can plot general data to pptx files. 36 | - Fix some bugs. 37 | 38 | 0.1.0(2017-11-18) 39 | 40 | - Create. 41 | 42 | 43 | Feature Support 44 | ------------------ 45 | 46 | **reportgen** has the following capabilities, with many more on the roadmap: 47 | 48 | - get all the texts in a pptx file 49 | - get all the images in a pptx file 50 | - simply add one slide of charts/tables/images built from pandas data to a pptx file 51 | - simply add multiple slides of charts/tables/images built from pandas data to a pptx file 52 | 53 | Quick Start 54 | ------------ 55 | 56 | 1. Get texts or images in a pptx file. 57 | 58 | :: 59 | 60 | # import 61 | import reportgen as rpt 62 | # Open a pptx file 63 | p=rpt.Report('analysis.pptx') 64 | # We can get the texts and images simply. 65 | result=p.get_texts() 66 | print('\n'.join(result)) 67 | # All the images will be saved in the folder '.\\images\\'. 68 | p.get_images() 69 | 70 | 2. Create an analysis report. 71 | 72 | :: 73 | 74 | # 75 | import reportgen as rpt 76 | import pandas as pd 77 | # Open a pptx file 78 | p=rpt.Report('template.pptx')# The parameter can be omitted 79 | # add a cover 80 | p.add_cover(title='An analysis report powered by reportgen') 81 | # add a chart slide 82 | data=pd.DataFrame({'Jack':[90,80,100],'David':[100,70,85]},index=['Math','English','Physics']) 83 | p.add_slide(data={'data':data,'slide_type':'chart','type':'COLUMN_CLUSTERED'},\ 84 | title='the scores report',summary='Our class got excellent results',footnote='This is a footnote.') 85 | # add a table slide 86 | data=pd.DataFrame({'Jack':[90,80,100],'David':[100,70,85]},index=['Math','English','Physics']) 87 | p.add_slide(data={'data':data,'slide_type':'table'},title='the scores report',summary='Our class got excellent results',footnote='This is a footnote.') 88 | # add a textbox slide 89 | data='This is a paragraph. \n'*4 90 | p.add_slide(data={'data':data,'slide_type':'textbox'},title='This is a textbox slide',summary='',footnote='') 91 | # add a picture slide 92 | data='.\\images\\images.png' 93 | p.add_slide(data={'data':data,'slide_type':'picture'},title='This is a picture slide') 94 | p.save('analysis report.pptx') 95 | 96 | 97 | 98 | 99 | In general, I divide a slide of an analysis report into four parts: title, summary, footnote and the body data.
The body is one or more charts/textboxes/tables/pictures. 100 | 101 | The *add_slide* function, which is the most commonly used one, has the following parameters: 102 | 103 | :: 104 | 105 | add_slide(data=[{'data':,'slide_type':,'type':},],title='',summary='',footnote='',layouts='auto') 106 | 107 | For example, we can draw a chart on the left side, and insert a picture on the right. 108 | 109 | :: 110 | 111 | import reportgen as rpt 112 | import pandas as pd 113 | p=rpt.Report() 114 | scores=pd.DataFrame({'Jack':[90,80,100],'David':[100,70,85]},index=['Math','English','Physics']) 115 | data=[{'data':scores,'slide_type':'chart','type':'COLUMN_CLUSTERED'},\ 116 | {'data':'.\\images2.jpg','slide_type':'picture'}] 117 | p.add_slide(data=data) 118 | p.save('add_slide.pptx') 119 | 120 | As a lazy person, I also provide a solution with less code. 121 | 122 | :: 123 | 124 | import reportgen as rpt 125 | p=rpt.Report() 126 | imgs=['.\\images\\'+img for img in os.listdir('.\\images\\')] 127 | p.add_slides(data=imgs) 128 | # a more flexible way 129 | slides_data=[{'title':'ppt{}'.format(i),'data':data} for i in range(10)] 130 | p.add_slides(slides_data) 131 | p.save('add_slides.pptx') 132 | 133 | 134 | Now you can get a quick overview of any data. 135 | 136 | :: 137 | 138 | import pandas as pd 139 | import reportgen as rpt 140 | 141 | data=pd.read_excel('Scores.xlsx') 142 | rpt.AnalysisReport(data,filename='Analysis Report of Scores.pptx'); 143 | 144 | The script will create a pptx file that analyzes all the fields of the data in a visual way. 145 | 146 | TO DO 147 | ------- 148 | 149 | - support exporting analysis reports to html 150 | - make the chart_type recommendation more intelligent 151 | 152 | 153 | Contact 154 | -------- 155 | 156 | If you have any questions, you can email gasongjian AT 126.com. And if you have a WeChat account, you can follow my WeChat Official Account: gasongjian. 157 | 158 | Platform: all 159 | Classifier: Development Status :: 4 - Beta 160 | Classifier: Intended Audience :: Developers 161 | Classifier: License :: OSI Approved :: BSD License 162 | Classifier: Programming Language :: Python 163 | Classifier: Programming Language :: Python :: Implementation 164 | Classifier: Programming Language :: Python :: 3.4 165 | Classifier: Programming Language :: Python :: 3.5 166 | Classifier: Programming Language :: Python :: 3.6 167 | Classifier: Topic :: Software Development :: Libraries 168 | -------------------------------------------------------------------------------- /reportgen/utils/preprocessing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Mar 25 14:09:46 2018 4 | 5 | @author: JSong 6 | """ 7 | 8 | import pandas as pd 9 | import numpy as np 10 | from scipy import stats 11 | from sklearn.utils.multiclass import type_of_target 12 | 13 | 14 | __all__=['WeightOfEvidence', 15 | 'chimerge', 16 | 'Discretization'] 17 | 18 | 19 | def check_array(X,ensure_DataFrame=True,copy=True): 20 | '''Convert X to DataFrame 21 | ''' 22 | X=X.copy() 23 | if not(np.issubdtype(type(X),np.ndarray)): 24 | X=np.array(X) 25 | X=pd.DataFrame(X) 26 | return X 27 | 28 | 29 | def _features_selected(X, selected="all"): 30 | """Return the column names of the selected features 31 | 32 | Parameters 33 | ---------- 34 | X : {array-like}, shape [n_samples, n_features] 35 | 36 | selected: "all" or array of indices or mask 37 | Specify which features to apply the transform to.
38 | 39 | Returns 40 | ------- 41 | n_features_new : array 42 | """ 43 | 44 | X=check_array(X) 45 | if selected == "all": 46 | return np.array(X.columns) 47 | n_features = X.shape[1] 48 | sel = pd.Series(np.zeros(n_features, dtype=bool),index=X.columns) 49 | sel[np.asarray(selected)] = True 50 | return np.array(X.columns[sel]) 51 | 52 | 53 | class WeightOfEvidence(): 54 | """ WOE Encoder 55 | 56 | parameters: 57 | ----------- 58 | 59 | categorical_features : "all" or array of indices or mask 60 | Specify what features are treated as categorical. 61 | 62 | - 'all' (default): All features are treated as categorical. 63 | - array of indices: Array of categorical feature indices. 64 | - mask: Array of length n_features and with dtype=bool. 65 | encoder_na: default False, take nan as a single class of the features 66 | 67 | attribute: 68 | ----------- 69 | woe (Dict): - the woe of trained data 70 | iv (Dict): - info value of trained data 71 | """ 72 | 73 | def __init__(self,categorical_features='all',encoder_na=False,woe_min=-20, woe_max=20): 74 | self.woe = {} 75 | self.iv = {} 76 | self.encoder_na=encoder_na 77 | self.woe_min=woe_min 78 | self.woe_max=woe_max 79 | self.categorical_features=categorical_features 80 | 81 | def _posibility(self, x, tag, event=1): 82 | """计算触发概率 83 | Parameters: 84 | ---------- 85 | x (Sequence): - 离散特征序列 86 | tag (Sequence): - 用于训练的标签序列 87 | event (any): - True指代的触发事件 88 | Returns: 89 | ---------- 90 | Dict[str,Tuple[rate_T, rate_F]]: - 训练好后的好坏触发概率 91 | """ 92 | if type_of_target(tag) not in ['binary']: 93 | raise AttributeError("tag must be a binary array") 94 | #if type_of_target(x) in ['continuous']: 95 | # raise AttributeError("input array must not continuous") 96 | tag = np.array(tag) 97 | x = np.array(x) 98 | event_total = (tag == event).sum() 99 | non_event_total = tag.shape[-1] - event_total 100 | x_labels = pd.unique(x[pd.notnull(x)]) 101 | pos_dic = {} 102 | for x1 in x_labels: 103 | # 当 x1 是nan时,y1 也为空 104 | y1 = tag[np.where(x == x1)[0]] 105 | event_count = (y1 == event).sum() 106 | non_event_count = y1.shape[-1] - event_count 107 | rate_event = 1.0 * event_count / event_total 108 | rate_non_event = 1.0 * non_event_count / non_event_total 109 | pos_dic[x1] = (rate_event, rate_non_event) 110 | return pos_dic 111 | 112 | def fit(self, X, y, event=1): 113 | """训练对单独一项自变量(列,特征)的woe值. 
114 | WOE_k=log (该特征中正类占比/该特征中负类占比) 115 | Parameters: 116 | ----------- 117 | X : DataFrame, 训练数据 118 | y (Sequence): 标签 119 | event: - True指代的触发事件 120 | woe_min (munber): - woe的最小值,默认值为 -20 121 | woe_max (munber): - woe的最大值,默认值为 20 122 | """ 123 | X = check_array(X,ensure_DataFrame=True) 124 | y = np.array(y) 125 | if np.isnan(y).sum()>0: 126 | raise AttributeError("y contain NaN number!") 127 | feartures_new=_features_selected(X,self.categorical_features) 128 | if self.encoder_na: 129 | X[feartures_new]=X[feartures_new].fillna('np.nan') 130 | for v in feartures_new: 131 | woe_dict = {} 132 | iv = 0 133 | pos_dic = self._posibility(x=X[v], tag=y, event=event) 134 | for l, (rate_event, rate_non_event) in pos_dic.items(): 135 | if rate_event == 0: 136 | woe1 = self.woe_min 137 | elif rate_non_event == 0: 138 | woe1 = self.woe_max 139 | else: 140 | woe1 = np.log(rate_event / rate_non_event) # np.log就是ln 141 | iv += (rate_event - rate_non_event) * woe1 142 | woe_dict[l] = woe1 143 | self.woe[v] = woe_dict 144 | self.iv[v] = iv 145 | 146 | def transform(self, X): 147 | """将离散特征序列转换为woe值组成的序列 148 | Parameters: 149 | X : DataFrame, 训练数据 150 | Returns: 151 | DataFrame: - 替换特征序列枚举值为woe对应数值后的序列 152 | """ 153 | X=check_array(X) 154 | feartures_new=_features_selected(X,self.categorical_features) 155 | if self.encoder_na: 156 | X[feartures_new]=X[feartures_new].fillna('np.nan') 157 | for v in feartures_new: 158 | X[v]=X[v].replace(self.woe[v]) 159 | return X 160 | def fit_transform(self,X,y,event=1): 161 | self.fit(X, y, event=event) 162 | return self.transform(X) 163 | 164 | 165 | 166 | def _chisqure_fo(fo): 167 | if any(fo==0): 168 | fo=fo+1 169 | s=stats.chi2_contingency(fo) 170 | return s[0],s[1] 171 | 172 | 173 | def chimerge(x,y,max_intervals=30,threshold=5,sample=None): 174 | '''卡方分箱 175 | parameter 176 | --------- 177 | x: {array-like}, shape [n_samples, 1] 178 | y: target, connot contain nan 179 | max_intervals: 最大的区间数 180 | threshold:卡方阈值(两个变量) 181 | sample: int,当样本数过大时,对数据进行取样 182 | 183 | return 184 | ------ 185 | bins: 186 | 187 | ''' 188 | 189 | x=pd.Series(x) 190 | y=pd.Series(y) 191 | class_y=list(pd.unique(y[pd.notnull(y)])) 192 | value_max=x.max() 193 | #value_max=np.sort(x)[-1] 194 | value_min=x.min() 195 | # 随机取样,且确保取样后的y能包含class_y中的所有类别 196 | if isinstance(sample,int): 197 | sample=min(sample,len(x)) 198 | tmp=set() 199 | while tmp!=set(class_y): 200 | cc=np.random.choice([True,False],size=len(x),p=[sample/len(x),1-sample/len(x)]) 201 | tmp=set(np.unique(y[cc])) 202 | x=x[cc] 203 | y=y[cc] 204 | fo=pd.crosstab(x,y)# 列联表 205 | fo=fo.sort_index() 206 | 207 | while fo.shape[0] > max_intervals: 208 | chitest={} 209 | index=list(fo.index) 210 | for r in range(len(fo)-1): 211 | #chi2,_=stats.chi2_contingency(fo.iloc[[r,r+1],:]) 212 | chi2,_=_chisqure_fo(fo.iloc[[r,r+1],:]) 213 | if chi2 not in chitest: 214 | chitest[chi2]=[] 215 | chitest[chi2].append((r,r+1)) 216 | smallest = min(chitest.keys()) 217 | if smallest <= threshold: 218 | #print('最小的chi2值: {}'.format(smallest)) 219 | #print([(index[r[0]],index[r[1]]) for r in list(reversed(chitest[smallest]))]) 220 | for (lower,upper) in list(reversed(chitest[smallest])): 221 | fo.loc[index[lower],:]=fo.loc[index[lower],:]+fo.loc[index[upper],:] 222 | fo = fo.drop(index[upper],axis=0) 223 | #print('已经删除 {}'.format(index[upper])) 224 | else: 225 | break 226 | bins=list(fo.index)+[value_max] 227 | bins[0]=value_min 228 | # 如果bins都是数值,则最左和最右都扩大1%以囊括最小最大值 229 | if np.issubdtype(type(bins[0]),np.number): 230 | bins[0]=bins[0]*0.99 if bins[0]>0 
else bins[0]-0.01 231 | bins[-1]=bins[-1]*1.01 232 | return bins 233 | 234 | 235 | class Discretization(): 236 | """离散化连续数据.需要实例化以保存bins状态. 237 | parameter: 238 | bins (Sequence): - 用于分段的列表,第一位为下限,最后一位为上限 239 | method: 离散的方法 240 | """ 241 | 242 | def __init__(self, bins=None,method='auto',continous_features='all',**kwargs): 243 | self.bins = bins 244 | self.method=method 245 | self.continous_features=continous_features 246 | if 'max_intervals' in kwargs: 247 | self.max_intervals=kwargs['max_intervals'] 248 | else: 249 | self.max_intervals=10 250 | if 'threshold' in kwargs: 251 | self.threshold=kwargs['threshold'] 252 | else: 253 | self.threshold=5 254 | if 'sample' in kwargs: 255 | self.sample=kwargs['sample'] 256 | else: 257 | self.sample=None 258 | 259 | def fit(self,X,y=None): 260 | if self.method == 'auto': 261 | if y is not None: 262 | method='chimerge' 263 | elif self.bins is not None: 264 | method='' 265 | else: 266 | method='' 267 | else: 268 | method=self.method 269 | X=check_array(X) 270 | feartures_new=_features_selected(X,self.continous_features) 271 | if method.lower() in ['chimerge']: 272 | self.bins={} 273 | for v in feartures_new: 274 | bins=chimerge(X[v],y,max_intervals=self.max_intervals,threshold=self.threshold,sample=self.sample) 275 | self.bins[v]=bins 276 | 277 | def transform(self, X): 278 | X=check_array(X) 279 | feartures_new=_features_selected(X,self.continous_features) 280 | for v in feartures_new: 281 | bins=self.bins[v] 282 | labels=['[{},{})'.format(bins[i],bins[i+1]) for i in range(len(bins)-1)] 283 | X[v] = pd.cut(X[v], bins=bins,labels=labels,right=False) 284 | return X 285 | 286 | def fit_transform(self,X,y=None): 287 | self.fit(X,y) 288 | return self.transform(X) 289 | 290 | 291 | 292 | 293 | 294 | -------------------------------------------------------------------------------- /reportgen/utils/delaunay.py: -------------------------------------------------------------------------------- 1 | # -*- coding: ascii -*- 2 | """ 3 | Simple structured Delaunay triangulation in 2D with Bowyer-Watson algorithm. 4 | 5 | Written by Jose M. Espadero ( http://github.com/jmespadero/pyDelaunay2D ) 6 | Based on code from Ayron Catteau. Published at http://github.com/ayron/delaunay 7 | 8 | Just pretend to be simple and didactic. The only requisite is numpy. 9 | Robust checks disabled by default. May not work in degenerate set of points. 10 | """ 11 | 12 | import numpy as np 13 | from math import sqrt 14 | 15 | 16 | class Delaunay2D: 17 | """ 18 | Class to compute a Delaunay triangulation in 2D 19 | ref: http://en.wikipedia.org/wiki/Bowyer-Watson_algorithm 20 | ref: http://www.geom.uiuc.edu/~samuelp/del_project.html 21 | """ 22 | 23 | def __init__(self, center=(0, 0), radius=9999): 24 | """ Init and create a new frame to contain the triangulation 25 | center -- Optional position for the center of the frame. Default (0,0) 26 | radius -- Optional distance from corners to the center. 27 | """ 28 | center = np.asarray(center) 29 | # Create coordinates for the corners of the frame 30 | self.coords = [center+radius*np.array((-1, -1)), 31 | center+radius*np.array((+1, -1)), 32 | center+radius*np.array((+1, +1)), 33 | center+radius*np.array((-1, +1))] 34 | 35 | # Create two dicts to store triangle neighbours and circumcircles. 
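# self.triangles maps each triangle (a CCW tuple of three vertex indices) to the list of its three neighbouring triangles, and self.circles maps it to a (circumcenter, squared circumradius) pair.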
36 | self.triangles = {} 37 | self.circles = {} 38 | 39 | # Create two CCW triangles for the frame 40 | T1 = (0, 1, 3) 41 | T2 = (2, 3, 1) 42 | self.triangles[T1] = [T2, None, None] 43 | self.triangles[T2] = [T1, None, None] 44 | 45 | # Compute circumcenters and circumradius for each triangle 46 | for t in self.triangles: 47 | self.circles[t] = self.Circumcenter(t) 48 | 49 | def Circumcenter(self, tri): 50 | """Compute Circumcenter and circumradius of a triangle in 2D. 51 | Uses an extension of the method described here: 52 | http://www.ics.uci.edu/~eppstein/junkyard/circumcenter.html 53 | """ 54 | pts = np.asarray([self.coords[v] for v in tri]) 55 | pts2 = np.dot(pts, pts.T) 56 | A = np.bmat([[2 * pts2, [[1], 57 | [1], 58 | [1]]], 59 | [[[1, 1, 1, 0]]]]) 60 | 61 | b = np.hstack((np.sum(pts * pts, axis=1), [1])) 62 | x = np.linalg.solve(A, b) 63 | bary_coords = x[:-1] 64 | center = np.dot(bary_coords, pts) 65 | 66 | # radius = np.linalg.norm(pts[0] - center) # euclidean distance 67 | radius = np.sum(np.square(pts[0] - center)) # squared distance 68 | return (center, radius) 69 | 70 | def inCircleFast(self, tri, p): 71 | """Check if point p is inside of precomputed circumcircle of tri. 72 | """ 73 | center, radius = self.circles[tri] 74 | return np.sum(np.square(center - p)) <= radius 75 | 76 | def inCircleRobust(self, tri, p): 77 | """Check if point p is inside of circumcircle around the triangle tri. 78 | This is a robust predicate, slower than compare distance to centers 79 | ref: http://www.cs.cmu.edu/~quake/robust.html 80 | """ 81 | m1 = np.asarray([self.coords[v] - p for v in tri]) 82 | m2 = np.sum(np.square(m1), axis=1).reshape((3, 1)) 83 | m = np.hstack((m1, m2)) # The 3x3 matrix to check 84 | return np.linalg.det(m) <= 0 85 | 86 | def AddPoint(self, p): 87 | """Add a new point to the current DT, and refine it using Bowyer-Watson. 88 | """ 89 | p = np.asarray(p) 90 | idx = len(self.coords) 91 | # print("coords[", idx,"] ->",p) 92 | self.coords.append(p) 93 | 94 | # Search the triangle(s) whose circumcircle contains p 95 | bad_triangles = [] 96 | for T in self.triangles: 97 | # Choose one method: inCircleRobust(T, p) or inCircleFast(T, p) 98 | if self.inCircleFast(T, p): 99 | bad_triangles.append(T) 100 | 101 | # Find the CCW boundary (star shape) of the bad triangles, 102 | # expressed as a list of edges (point pairs) and the opposite 103 | # triangle to each edge. 104 | boundary = [] 105 | # Choose a "random" triangle and edge 106 | T = bad_triangles[0] 107 | edge = 0 108 | # get the opposite triangle of this edge 109 | while True: 110 | # Check if edge of triangle T is on the boundary... 
111 | # if opposite triangle of this edge is external to the list 112 | tri_op = self.triangles[T][edge] 113 | if tri_op not in bad_triangles: 114 | # Insert edge and external triangle into boundary list 115 | boundary.append((T[(edge+1) % 3], T[(edge-1) % 3], tri_op)) 116 | 117 | # Move to next CCW edge in this triangle 118 | edge = (edge + 1) % 3 119 | 120 | # Check if boundary is a closed loop 121 | if boundary[0][0] == boundary[-1][1]: 122 | break 123 | else: 124 | # Move to next CCW edge in opposite triangle 125 | edge = (self.triangles[tri_op].index(T) + 1) % 3 126 | T = tri_op 127 | 128 | # Remove triangles too near of point p of our solution 129 | for T in bad_triangles: 130 | del self.triangles[T] 131 | del self.circles[T] 132 | 133 | # Retriangle the hole left by bad_triangles 134 | new_triangles = [] 135 | for (e0, e1, tri_op) in boundary: 136 | # Create a new triangle using point p and edge extremes 137 | T = (idx, e0, e1) 138 | 139 | # Store circumcenter and circumradius of the triangle 140 | self.circles[T] = self.Circumcenter(T) 141 | 142 | # Set opposite triangle of the edge as neighbour of T 143 | self.triangles[T] = [tri_op, None, None] 144 | 145 | # Try to set T as neighbour of the opposite triangle 146 | if tri_op: 147 | # search the neighbour of tri_op that use edge (e1, e0) 148 | for i, neigh in enumerate(self.triangles[tri_op]): 149 | if neigh: 150 | if e1 in neigh and e0 in neigh: 151 | # change link to use our new triangle 152 | self.triangles[tri_op][i] = T 153 | 154 | # Add triangle to a temporal list 155 | new_triangles.append(T) 156 | 157 | # Link the new triangles each another 158 | N = len(new_triangles) 159 | for i, T in enumerate(new_triangles): 160 | self.triangles[T][1] = new_triangles[(i+1) % N] # next 161 | self.triangles[T][2] = new_triangles[(i-1) % N] # previous 162 | 163 | def exportTriangles(self): 164 | """Export the current list of Delaunay triangles 165 | """ 166 | # Filter out triangles with any vertex in the extended BBox 167 | return [(a-4, b-4, c-4) 168 | for (a, b, c) in self.triangles if a > 3 and b > 3 and c > 3] 169 | 170 | def exportCircles(self): 171 | """Export the circumcircles as a list of (center, radius) 172 | """ 173 | # Remember to compute circumcircles if not done before 174 | # for t in self.triangles: 175 | # self.circles[t] = self.Circumcenter(t) 176 | 177 | # Filter out triangles with any vertex in the extended BBox 178 | # Do sqrt of radius before of return 179 | return [(self.circles[(a, b, c)][0], sqrt(self.circles[(a, b, c)][1])) 180 | for (a, b, c) in self.triangles if a > 3 and b > 3 and c > 3] 181 | 182 | def exportDT(self): 183 | """Export the current set of Delaunay coordinates and triangles. 184 | """ 185 | # Filter out coordinates in the extended BBox 186 | coord = self.coords[4:] 187 | 188 | # Filter out triangles with any vertex in the extended BBox 189 | tris = [(a-4, b-4, c-4) 190 | for (a, b, c) in self.triangles if a > 3 and b > 3 and c > 3] 191 | return coord, tris 192 | 193 | def exportExtendedDT(self): 194 | """Export the Extended Delaunay Triangulation (with the frame vertex). 195 | """ 196 | return self.coords, list(self.triangles) 197 | 198 | def exportVoronoiRegions(self): 199 | """Export coordinates and regions of Voronoi diagram as indexed data. 
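As a minimal usage sketch (illustrative, not part of the original module), the class is driven point by point and the results are read back through the export helpers:

    import numpy as np
    from reportgen.utils.delaunay import Delaunay2D   # assumed import path

    np.random.seed(0)
    seeds = np.random.rand(10, 2)                 # 10 random points in the unit square

    dt = Delaunay2D(center=(0.5, 0.5), radius=50)
    for p in seeds:
        dt.AddPoint(p)

    coords, tris = dt.exportDT()                  # vertices and triangles without the frame
    vor_coords, regions = dt.exportVoronoiRegions()
    print(len(tris), 'triangles,', len(regions), 'Voronoi regions')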
200 | """ 201 | # Remember to compute circumcircles if not done before 202 | # for t in self.triangles: 203 | # self.circles[t] = self.Circumcenter(t) 204 | useVertex = {i:[] for i in range(len(self.coords))} 205 | vor_coors = [] 206 | index={} 207 | # Build a list of coordinates and a index per triangle/region 208 | for tidx, (a, b, c) in enumerate(self.triangles): 209 | vor_coors.append(self.circles[(a,b,c)][0]) 210 | # Insert triangle, rotating it so the key is the "last" vertex 211 | useVertex[a]+=[(b, c, a)] 212 | useVertex[b]+=[(c, a, b)] 213 | useVertex[c]+=[(a, b, c)] 214 | # Set tidx as the index to use with this triangles 215 | index[(a, b, c)] = tidx; 216 | index[(c, a, b)] = tidx; 217 | index[(b, c, a)] = tidx; 218 | 219 | # init regions per coordinate dictionary 220 | regions = {} 221 | # Sort each region in a coherent order, and substitude each triangle 222 | # by its index 223 | for i in range (4, len(self.coords)): 224 | v = useVertex[i][0][0] # Get a vertex of a triangle 225 | r=[] 226 | for _ in range(len(useVertex[i])): 227 | # Search the triangle beginning with vertex v 228 | t = [t for t in useVertex[i] if t[0] == v][0] 229 | r.append(index[t]) # Add the index of this triangle to region 230 | v = t[1] # Choose the next vertex to search 231 | regions[i-4]=r # Store region. 232 | 233 | return vor_coors, regions 234 | -------------------------------------------------------------------------------- /reportgen/utils/metrics.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import pandas as pd 4 | import numpy as np 5 | from scipy import stats 6 | import scipy.spatial as ss 7 | from scipy.special import digamma 8 | from math import log 9 | import numpy.random as nr 10 | import random 11 | 12 | #from collections import Iterable 13 | 14 | __all__=['entropy', 15 | 'entropyc', 16 | 'entropyd', 17 | 'chi2', 18 | 'info_value'] 19 | 20 | 21 | # 待定,还未修改好 22 | class feature_encoder(): 23 | ''' 24 | 用于单个特征对因变量的分析,如 25 | - 该特征中每个item的影响力 26 | - 对item重编码 27 | 28 | ''' 29 | 30 | def chi2(X,y): 31 | N=pd.Series(y).count() 32 | fo=pd.crosstab(X,y) 33 | fe=stats.contingency.expected_freq(fo) 34 | weight_chi2=(fo-fe)**2/fe/N/min(fo.shape[0],fo.shape[1]) 35 | weight_chi2=weight_chi2.sum(axis=1) 36 | return weight_chi2 37 | 38 | 39 | def woe(X,y): 40 | ctable=pd.crosstab(X,y) 41 | # 如果有0则每一项都加1 42 | ctable=ctable+1 if (ctable==0).any().any() else ctable 43 | if ctable.shape[1]==2: 44 | n_g,n_b=ctable.sum() 45 | ctable=(ctable/ctable.sum()).assign(woe=lambda x:np.log2(x.iloc[:,0]/x.iloc[:,1]))\ 46 | .assign(ivi=lambda x:(x.iloc[:,0]-x.iloc[:,1])*x['woe']) 47 | return ctable.loc[:,['woe','ivi']] 48 | else: 49 | woe_dict={} 50 | p=ctable.sum()/ctable.sum().sum() 51 | for cc in ctable.columns: 52 | ctable_bin=pd.DataFrame(index=ctable.index,columns=['one','rest']) 53 | ctable_bin['one']=ctable.loc[:,cc] 54 | ctable_bin['rest']=ctable.loc[:,~(ctable.columns==cc)].sum(axis=1) 55 | n_o,n_r=ctable_bin.sum() 56 | ctable_bin=ctable_bin/ctable_bin.sum() 57 | ctable_bin['woe']=np.log2(ctable_bin['one']/ctable_bin['rest']) 58 | ctable_bin['ivi']=(ctable_bin['one']-ctable_bin['rest'])*ctable_bin['woe'] 59 | woe_dict[cc]=ctable_bin.loc[:,['woe','ivi']] 60 | tmp=0 61 | for cc in ctable.columns: 62 | tmp+=woe_dict[cc]*p[cc] 63 | woe_dict['avg']=tmp 64 | return woe_dict 65 | 66 | 67 | 68 | def chi2(X,y): 69 | '''计算一组数据的卡方值,弥补sklearn中的chi2只支持2*2的缺憾 70 | parameter 71 | ---------- 72 | X:可以是单个特征,也可以是一组特征 73 | y:目标变量 74 | 75 | return 76 | ------ 
77 | chi2_value: np.array 数组 78 | chi2_pvalue:np.array 数组 79 | ''' 80 | X=np.asarray(X) 81 | if len(X.shape)==1: 82 | X=X.reshape((len(X),1)) 83 | X=pd.DataFrame(X) 84 | chi2_value=[] 85 | chi2_pvalue=[] 86 | for c in X.columns: 87 | fo=pd.crosstab(X[c],y) 88 | s=stats.chi2_contingency(fo) 89 | chi2_value.append(s[0]) 90 | chi2_pvalue.append(s[1]) 91 | return (np.array(chi2_value),np.array(chi2_pvalue)) 92 | 93 | 94 | 95 | # 待定 96 | def info_value(X,y,bins='auto'): 97 | '''计算连续变量的IV值 98 | 计算X和y之间的IV值 99 | IV=\sum (g_k/n_g-b_k/n_b)*log2(g_k*n_b/n_g/) 100 | ''' 101 | threshold=[] 102 | for q in [0.05,0.04,0.03,0.02,0.01,1e-7]: 103 | t_down=max([X[y==k].quantile(q) for k in y.dropna().unique()]) 104 | t_up=min([X[y==k].quantile(1-q) for k in y.dropna().unique()]) 105 | threshold.append((t_down,t_up)) 106 | 107 | if bins is not None: 108 | X=pd.cut(X,bins) 109 | ctable=pd.crosstab(X,y) 110 | p=ctable.sum()/ctable.sum().sum() 111 | if ctable.shape[1]==2: 112 | ctable=ctable/ctable.sum() 113 | IV=((ctable.iloc[:,0]-ctable.iloc[:,1])*np.log2(ctable.iloc[:,0]/ctable.iloc[:,1])).sum() 114 | return IV 115 | 116 | IV=0 117 | for cc in ctable.columns: 118 | ctable_bin=pd.concat([ctable[cc],ctable.loc[:,~(ctable.columns==cc)].sum(axis=1)],axis=1) 119 | ctable_bin=ctable_bin/ctable_bin.sum() 120 | IV_bin=((ctable_bin.iloc[:,0]-ctable_bin.iloc[:,1])*np.log2(ctable_bin.iloc[:,0]/ctable_bin.iloc[:,1])).sum() 121 | IV+=IV_bin*p[cc] 122 | return IV 123 | 124 | 125 | 126 | # 计算离散随机变量的熵 127 | class entropy: 128 | 129 | ''' 130 | 计算样本的熵以及相关的指标 131 | 函数的输入默认均为原始的样本集 132 | 133 | ''' 134 | def entropy(X): 135 | ''' 136 | 计算随机变量的信息熵 137 | H(X)=-\sum p_i log2(p_i) 138 | ''' 139 | X=pd.Series(X) 140 | p=X.value_counts(normalize=True) 141 | p=p[p>0] 142 | h=-(p*np.log2(p)).sum() 143 | return h 144 | 145 | 146 | def cond_entropy(x,y): 147 | ''' 148 | 计算随机变量的条件熵 149 | y必须是因子型变量 150 | H(X,y)=\sum p(y_i)H(X|y=y_i) 151 | ''' 152 | #h=entropy_combination(X,y)-entropy(y) 153 | y=pd.Series(y) 154 | p=y.value_counts(normalize=True) 155 | h=0 156 | for yi in y.dropna().unique(): 157 | h+=p[yi]*entropy.entropy(x[y==yi]) 158 | return h 159 | 160 | def comb_entropy(x,y): 161 | ''' 162 | 计算随机变量的联合熵 163 | H(X,y)=-\sum p(x_i,y_i)*log2(p(x_i,y_i))=H(X)+H(y|X) 164 | ''' 165 | ''' 166 | w=pd.crosstab(X,y) 167 | N=w.sum().sum() 168 | w=w/N 169 | w=w.values.flatten() 170 | w=w[w>0] 171 | h=-(w*np.log2(w)).sum() 172 | ''' 173 | h=entropy.entropy(y)+entropy.cond_entropy(x,y) 174 | return h 175 | 176 | def mutual_info(x,y): 177 | ''' 178 | 计算随机变量的互信息 179 | I(X;y)=H(X)-H(X|y)=H(y)-H(y|X) 180 | ''' 181 | h=entropy.entropy(x)-entropy.cond_entropy(x,y) 182 | return h 183 | 184 | def info_gain(x,y): 185 | ''' 186 | 计算随机变量的互信息 187 | I(X;y)=H(X)-H(X|y)=H(y)-H(y|X) 188 | ''' 189 | h=entropy.entropy(x)-entropy.cond_entropy(x,y) 190 | return h 191 | 192 | def info_gain_ratio(x,y): 193 | ''' 194 | 计算随机变量的信息增益比,此时X是总体,y是某个特征 195 | I(X;y)=H(X)-H(X|y)=H(y)-H(y|X) 196 | IG(X;y)=I(X;y)/H(y) 197 | ''' 198 | h=entropy.entropy(x)-entropy.cond_entropy(x,y) 199 | hy=entropy.entropy(y) 200 | h=h/hy if hy>0 else 0 201 | return h 202 | 203 | 204 | 205 | def cross_entropy(x,y): 206 | ''' 207 | 计算随机变量的交叉熵 208 | 要求X和y的测度空间相同,此时X和y的样本数量可以不一致 209 | 210 | H(p,q)=-\sum p(x)log2(q(x)) 211 | 212 | parameter 213 | -------- 214 | ''' 215 | X=pd.Series(x) 216 | y=pd.Series(y) 217 | p=X.value_counts(normalize=True) 218 | q=y.value_counts(normalize=True) 219 | h=-(p*np.log2(q)).sum() 220 | return h 221 | 222 | 223 | def relative_entropy(x,y): 224 | ''' 225 | 计算随机变量的相对熵 226 | 
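A small illustrative example for the chi2 helper above (the import path and column names are assumptions):

    import numpy as np
    import pandas as pd
    from reportgen.utils.metrics import chi2      # assumed import path

    np.random.seed(0)
    X = pd.DataFrame({'gender': np.random.choice(['M', 'F'], 300),
                      'grade':  np.random.choice(['A', 'B', 'C'], 300)})
    y = np.random.choice([0, 1], 300)

    chi2_value, chi2_pvalue = chi2(X, y)          # one statistic and p-value per column
    print(dict(zip(X.columns, np.round(chi2_pvalue, 3))))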
要求X和y的测度空间相同,此时X和y的样本数量可以不一致 227 | D=\sum p(x) log2(p(x)/q(x))=H(p,q)-H(p) 228 | 229 | parameter 230 | -------- 231 | dtype: X和y的数据类型,因子变量category和数值变量numeric,默认是category 232 | ''' 233 | 234 | X=pd.Series(x) 235 | y=pd.Series(y) 236 | p=X.value_counts(normalize=True) 237 | q=y.value_counts(normalize=True) 238 | #h=entropy.entropy_cross(p,q)-entropy.entropy(p) 239 | h=(p*np.log2(p/q)).sum() 240 | return h 241 | 242 | 243 | 244 | 245 | # 计算连续变量的熵(利用分布进行近似 CONTINUOUS ESTIMATORS) 246 | class entropyc: 247 | 248 | ''' 249 | 原作者:Greg Ver Steeg 250 | GitHub:https://github.com/gregversteeg/NPEET 251 | Or go to http://www.isi.edu/~gregv/npeet.html 252 | 253 | ref:Alexander Kraskov etc. Estimating mutual information. Phys. Rev. E, 69:066138, Jun 2004 254 | 255 | 连续分布的熵估计 256 | ''' 257 | 258 | def __reshape(x): 259 | x=np.asarray(x) 260 | if len(x.shape)==1: 261 | x=x.reshape((len(x),1)) 262 | return x 263 | 264 | def entropy(x, k=3, base=2): 265 | """ 266 | The classic K-L k-nearest neighbor continuous entropy estimator 267 | 268 | if x is a one-dimensional scalar and we have: 269 | H(X)=-\sum p_i log2(p_i) 270 | if we only have random sample (x1 . . . xN) of N realizations of X, 271 | we can estimator H(X): 272 | 273 | H(X) = −ψ(k) + ψ(N) + \log c_d + d/N \sum_{i=1}^{N} \log eps(i) 274 | 275 | where ψ(x) is digammer funciton,d is the dimention of x, 276 | c_d is the volume of the d-dimensional unit ball 277 | eps(i) is twice the distance from xi to its k-th neighbour 278 | 279 | parameter 280 | --------- 281 | x: 某个分布的抽样,且支持多维。 282 | k: k近邻的 283 | base:2 284 | 285 | return 286 | ------- 287 | entropy 288 | """ 289 | x=entropyc.__reshape(x) 290 | assert k <= len(x) - 1, "Set k smaller than num. samples - 1" 291 | d = len(x[0]) 292 | N = len(x) 293 | intens = 1e-10 # small noise to break degeneracy, see doc. 294 | x = [list(p + intens * nr.rand(len(x[0]))) for p in x] 295 | tree = ss.cKDTree(x) 296 | nn = [tree.query(point, k + 1, p=float('inf'))[0][k] for point in x] 297 | const = digamma(N) - digamma(k) + d * log(base) 298 | return (const + d * np.mean(list(map(log, nn)))) / log(base) 299 | 300 | def cond_entropy(x, y, k=3, base=2): 301 | """ The classic K-L k-nearest neighbor continuous entropy estimator for the 302 | entropy of X conditioned on Y. 303 | """ 304 | hxy = entropyc.entropy([xi + yi for (xi, yi) in zip(x, y)], k, base) 305 | hy = entropyc.entropy(y, k, base) 306 | return hxy - hy 307 | 308 | def __column(xs, i): 309 | return [[x[i]] for x in xs] 310 | 311 | def tc(xs, k=3, base=2): 312 | xis = [entropyc.entropy(entropyc.__column(xs, i), k, base) for i in range(0, len(xs[0]))] 313 | return np.sum(xis) - entropyc.entropy(xs, k, base) 314 | 315 | def ctc(xs, y, k=3, base=2): 316 | xis = [entropyc.cond_entropy(entropyc.__column(xs, i), y, k, base) for i in range(0, len(xs[0]))] 317 | return np.sum(xis) - entropyc.cond_entropy(xs, y, k, base) 318 | 319 | def corex(xs, ys, k=3, base=2): 320 | cxis = [entropyc.mutual_info(entropyc.__column(xs, i), ys, k, base) for i in range(0, len(xs[0]))] 321 | return np.sum(cxis) - entropyc.mutual_info(xs, ys, k, base) 322 | 323 | def mutual_info(x, y, k=3, base=2): 324 | """ Mutual information of x and y 325 | x, y should be a list of vectors, e.g. x = [[1.3], [3.7], [5.1], [2.4]] 326 | if x is a one-dimensional scalar and we have four samples 327 | """ 328 | x=entropyc.__reshape(x) 329 | y=entropyc.__reshape(y) 330 | assert len(x) == len(y), "Lists should have same length" 331 | assert k <= len(x) - 1, "Set k smaller than num. 
samples - 1" 332 | intens = 1e-10 # small noise to break degeneracy, see doc. 333 | x = [list(p + intens * nr.rand(len(x[0]))) for p in x] 334 | y = [list(p + intens * nr.rand(len(y[0]))) for p in y] 335 | points = zip2(x, y) 336 | # Find nearest neighbors in joint space, p=inf means max-norm 337 | tree = ss.cKDTree(points) 338 | dvec = [tree.query(point, k + 1, p=float('inf'))[0][k] for point in points] 339 | a, b, c, d = avgdigamma(x, dvec), avgdigamma(y, dvec), digamma(k), digamma(len(x)) 340 | return (-a - b + c + d) / log(base) 341 | 342 | 343 | def cond_mutual_info(x, y, z, k=3, base=2): 344 | """ Mutual information of x and y, conditioned on z 345 | x, y, z should be a list of vectors, e.g. x = [[1.3], [3.7], [5.1], [2.4]] 346 | if x is a one-dimensional scalar and we have four samples 347 | """ 348 | x=entropyc.__reshape(x) 349 | y=entropyc.__reshape(y) 350 | z=entropyc.__reshape(z) 351 | assert len(x) == len(y), "Lists should have same length" 352 | assert k <= len(x) - 1, "Set k smaller than num. samples - 1" 353 | intens = 1e-10 # small noise to break degeneracy, see doc. 354 | x = [list(p + intens * nr.rand(len(x[0]))) for p in x] 355 | y = [list(p + intens * nr.rand(len(y[0]))) for p in y] 356 | z = [list(p + intens * nr.rand(len(z[0]))) for p in z] 357 | points = zip2(x, y, z) 358 | # Find nearest neighbors in joint space, p=inf means max-norm 359 | tree = ss.cKDTree(points) 360 | dvec = [tree.query(point, k + 1, p=float('inf'))[0][k] for point in points] 361 | a, b, c, d = avgdigamma(zip2(x, z), dvec), avgdigamma(zip2(y, z), dvec), avgdigamma(z, dvec), digamma(k) 362 | return (-a - b + c + d) / log(base) 363 | 364 | 365 | def kl_div(x, xp, k=3, base=2): 366 | """ KL Divergence between p and q for x~p(x), xp~q(x) 367 | x, xp should be a list of vectors, e.g. x = [[1.3], [3.7], [5.1], [2.4]] 368 | if x is a one-dimensional scalar and we have four samples 369 | """ 370 | x=entropyc.__reshape(x) 371 | xp=entropyc.__reshape(xp) 372 | assert k <= len(x) - 1, "Set k smaller than num. samples - 1" 373 | assert k <= len(xp) - 1, "Set k smaller than num. samples - 1" 374 | assert len(x[0]) == len(xp[0]), "Two distributions must have same dim." 
375 | d = len(x[0]) 376 | n = len(x) 377 | m = len(xp) 378 | const = log(m) - log(n - 1) 379 | tree = ss.cKDTree(x) 380 | treep = ss.cKDTree(xp) 381 | nn = [tree.query(point, k + 1, p=float('inf'))[0][k] for point in x] 382 | nnp = [treep.query(point, k, p=float('inf'))[0][k - 1] for point in x] 383 | return (const + d * np.mean(list(map(log, nnp))) - d * np.mean(list(map(log, nn)))) / log(base) 384 | 385 | 386 | 387 | # 计算随机变量的熵(直接离散话估计 DISCRETE ESTIMATORS) 388 | class entropyd: 389 | 390 | def entropy(sx, base=2): 391 | """ Discrete entropy estimator 392 | Given a list of samples which can be any hashable object 393 | """ 394 | return entropyd.entropyfromprobs(entropyd.hist(sx), base=base) 395 | 396 | 397 | def mutual_info(x, y, base=2): 398 | """ Discrete mutual information estimator 399 | Given a list of samples which can be any hashable object 400 | """ 401 | return -entropyd.entropy(zip(x, y), base) + entropyd.entropy(x, base) + entropyd.entropy(y, base) 402 | 403 | def cond_mutual_info(x, y, z): 404 | """ Discrete mutual information estimator 405 | Given a list of samples which can be any hashable object 406 | """ 407 | return entropyd.entropy(zip(y, z))+entropyd.entropy(zip(x, z))-entropyd.entropy(zip(x, y, z))-entropyd.entropy(z) 408 | 409 | def cond_entropy(x, y, base=2): 410 | """ The classic K-L k-nearest neighbor continuous entropy estimator for the 411 | entropy of X conditioned on Y. 412 | """ 413 | return entropyd.entropy(zip(x, y), base) - entropyd.entropy(y, base) 414 | 415 | def tcd(xs, base=2): 416 | xis = [entropyd.entropy(entropyd._column(xs, i), base) for i in range(0, len(xs[0]))] 417 | hx = entropyd.entropy(xs, base) 418 | return np.sum(xis) - hx 419 | 420 | def ctcd(xs, y, base=2): 421 | xis = [entropyd.cond_entropy(entropyd._column(xs, i), y, base) for i in range(0, len(xs[0]))] 422 | return np.sum(xis) - entropyd.cond_entropy(xs, y, base) 423 | 424 | def corexd(xs, ys, base=2): 425 | cxis = [entropyd.mutual_infod(entropyd._column(xs, i), ys, base) for i in range(0, len(xs[0]))] 426 | return np.sum(cxis) - entropyd.mutual_info(xs, ys, base) 427 | 428 | def hist(sx): 429 | sx = discretize(sx) 430 | # Histogram from list of samples 431 | d = dict() 432 | for s in sx: 433 | if type(s) == list: 434 | s = tuple(s) 435 | d[s] = d.get(s, 0) + 1 436 | return map(lambda z: float(z) / len(sx), d.values()) 437 | 438 | 439 | def entropyfromprobs(probs, base=2): 440 | # Turn a normalized list of probabilities of discrete outcomes into entropy (base 2) 441 | return -sum(map(entropyd.elog, probs)) / log(base) 442 | 443 | def _column(xs, i): 444 | return [[x[i]] for x in xs] 445 | 446 | def elog(x): 447 | # for entropy, 0 log 0 = 0. but we get an error for putting log 0 448 | if x <= 0. or x >= 1.: 449 | return 0 450 | else: 451 | return x * log(x) 452 | 453 | 454 | 455 | 456 | 457 | # UTILITY FUNCTIONS 458 | def vectorize(scalarlist): 459 | """ Turn a list of scalars into a list of one-d vectors 460 | """ 461 | return [[x] for x in scalarlist] 462 | 463 | 464 | def shuffle_test(measure, x, y, z=False, ns=200, ci=0.95, **kwargs): 465 | """ Shuffle test 466 | Repeatedly shuffle the x-values and then estimate measure(x, y, [z]). 467 | Returns the mean and conf. interval ('ci=0.95' default) over 'ns' runs. 468 | 'measure' could me mi, cmi, e.g. Keyword arguments can be passed. 469 | Mutual information and CMI should have a mean near zero. 
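A small illustrative check (import path, variable names and sample sizes are assumptions): for correlated data the Kraskov estimator entropyc.mutual_info should come out clearly positive, while shuffle_test gives a null reference interval around zero:

    import numpy as np
    from reportgen.utils.metrics import entropyc, shuffle_test   # assumed import path

    np.random.seed(0)
    x = list(np.random.randn(500))
    y = list(0.8 * np.asarray(x) + 0.6 * np.random.randn(500))

    mi = entropyc.mutual_info(x, y, k=3)                          # > 0 for dependent variables
    null_mean, ci95 = shuffle_test(entropyc.mutual_info, x, y, ns=100, k=3)
    print(round(mi, 3), round(null_mean, 3), ci95)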
470 | """ 471 | xp = x[:] # A copy that we can shuffle 472 | outputs = [] 473 | for i in range(ns): 474 | random.shuffle(xp) 475 | if z: 476 | outputs.append(measure(xp, y, z, **kwargs)) 477 | else: 478 | outputs.append(measure(xp, y, **kwargs)) 479 | outputs.sort() 480 | return np.mean(outputs), (outputs[int((1. - ci) / 2 * ns)], outputs[int((1. + ci) / 2 * ns)]) 481 | 482 | def _freedman_diaconis_bins(a): 483 | """Calculate number of hist bins using Freedman-Diaconis rule.""" 484 | # From http://stats.stackexchange.com/questions/798/ 485 | a = np.asarray(a) 486 | iqr = stats.scoreatpercentile(a, 75)-stats.scoreatpercentile(a, 25) 487 | h = 2*iqr/(len(a)**(1/3)) 488 | bins=int(np.ceil((a.max()-a.min())/h)) if h!=0 else int(np.sqrt(a.size)) 489 | return bins 490 | 491 | # INTERNAL FUNCTIONS 492 | 493 | def avgdigamma(points, dvec): 494 | # This part finds number of neighbors in some radius in the marginal space 495 | # returns expectation value of 496 | N = len(points) 497 | tree = ss.cKDTree(points) 498 | avg = 0. 499 | for i in range(N): 500 | dist = dvec[i] 501 | # subtlety, we don't include the boundary point, 502 | # but we are implicitly adding 1 to kraskov def bc center point is included 503 | num_points = len(tree.query_ball_point(points[i], dist - 1e-15, p=float('inf'))) 504 | avg += digamma(num_points) / N 505 | return avg 506 | 507 | 508 | def zip2(*args): 509 | # zip2(x, y) takes the lists of vectors and makes it a list of vectors in a joint space 510 | # E.g. zip2([[1], [2], [3]], [[4], [5], [6]]) = [[1, 4], [2, 5], [3, 6]] 511 | return [sum(sublist, []) for sublist in zip(*args)] 512 | 513 | def discretize(xs): 514 | def discretize_one(x): 515 | if len(x) > 1: 516 | return tuple(x) 517 | else: 518 | return x[0] 519 | # discretize(xs) takes a list of vectors and makes it a list of tuples or scalars 520 | return [discretize_one(x) for x in xs] 521 | -------------------------------------------------------------------------------- /reportgen/associate/fpgrowth.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | JSong:直接从 python 包 orangecontrib 中 fork,未作修改 4 | 原文: 5 | This module implements FP-growth [1] frequent pattern mining algorithm with 6 | bucketing optimization [2] for conditional databases of few items. 7 | 8 | The entry points are :obj:`frequent_itemsets()`, :obj:`association_rules()`, and 9 | :obj:`rules_stats()` functions below. 10 | 11 | 12 | [1]: J. Han, J. Pei, Y. Yin, R. Mao. 13 | Mining Frequent Patterns without Candidate Generation: A 14 | Frequent-Pattern Tree Approach. 2004. 15 | https://www.cs.sfu.ca/~jpei/publications/dami03_fpgrowth.pdf 16 | 17 | [2]: R. Agrawal, C. Aggarwal, V. Prasad. 18 | Depth first generation of long patterns. 2000. 19 | http://www.cs.tau.ac.il/~fiat/dmsem03/Depth%20First%20Generation%20of%20Long%20Patterns%20-%202000.pdf 20 | 21 | [3]: R. Agrawal, et al. 22 | Fast Discovery of Association Rules. 1996. 23 | http://cs-people.bu.edu/evimaria/cs565/advances.pdf 24 | 25 | 26 | Examples 27 | -------- 28 | Here's an example from R. Agrawal's original Apriori article [3 § 12.2.2]. 29 | Given a database of transactions: 30 | 31 | >>> T = [[1, 3, 4 ], 32 | ... [ 2, 3, 5], 33 | ... [1, 2, 3, 5], 34 | ... 
[ 2, 5]] 35 | 36 | We can enumerate all frequent itemsets with support greater than two 37 | transactions: 38 | 39 | >>> from orangecontrib.associate.fpgrowth import * # doctest: +SKIP 40 | >>> itemsets = frequent_itemsets(T, 2) 41 | 42 | Note, functions in this module produce generators. 43 | The results space can explode quite quickly 44 | and can easily be too large to fit in your RAM. By using generators, you can 45 | filter the results to your liking `as you pass them`. 46 | 47 | >>> itemsets 48 | 49 | >>> list(itemsets) 50 | [(frozenset({1}), 2), 51 | (frozenset({2}), 3), 52 | (frozenset({3}), 3), 53 | (frozenset({1, 3}), 2), 54 | (frozenset({2, 3}), 2), 55 | (frozenset({5}), 3), 56 | (frozenset({2, 5}), 3), 57 | (frozenset({3, 5}), 2), 58 | (frozenset({2, 3, 5}), 2)] 59 | 60 | We can try it with a larger and more real-world database of categorical values: 61 | 62 | >>> import Orange 63 | >>> data = Orange.data.Table('zoo') 64 | >>> data 65 | [[1, 0, 0, 1, 0, ... | mammal] {aardvark}, 66 | [1, 0, 0, 1, 0, ... | mammal] {antelope}, 67 | [0, 0, 1, 0, 0, ... | fish] {bass}, 68 | [1, 0, 0, 1, 0, ... | mammal] {bear}, 69 | [1, 0, 0, 1, 0, ... | mammal] {boar}, 70 | ... 71 | ] 72 | 73 | We can't use table data directly; we first have to one-hot transform it: 74 | 75 | >>> X, mapping = OneHot.encode(data, include_class=True) 76 | 77 | We get a database we can use to find frequent itemsets, and a mapping we will 78 | use later to revert the transformation. 79 | 80 | >>> X 81 | array([[False, True, ..., True, False], 82 | [False, True, ..., True, False], 83 | [ True, False, ..., False, False], 84 | ..., 85 | [False, True, ..., True, False], 86 | [ True, False, ..., False, False], 87 | [ True, False, ..., False, False]], dtype=bool) 88 | >>> sorted(mapping.items()) 89 | [(0, (0, 0)), 90 | (1, (0, 1)), 91 | (2, (1, 0)), 92 | (3, (1, 1)), 93 | ... 94 | (40, (16, 4)), 95 | (41, (16, 5)), 96 | (42, (16, 6))] 97 | 98 | We want itemsets with >40% support: 99 | 100 | >>> itemsets = dict(frequent_itemsets(X, .4)) 101 | >>> len(itemsets) 102 | 520 103 | 104 | The transaction-coded items corresponding to class values are: 105 | 106 | >>> class_items = {item 107 | ... for item, var, _ in OneHot.decode(mapping, data, mapping) 108 | ... if var is data.domain.class_var} 109 | >>> sorted(class_items) 110 | [36, 37, 38, 39, 40, 41, 42] 111 | 112 | That makes sense as our class variable has seven values: 113 | 114 | >>> data.domain.class_var.values 115 | ['amphibian', 'bird', 'fish', 'insect', 'invertebrate', 'mammal', 'reptile'] 116 | 117 | Now we can generate all association rules that have consequent equal to one 118 | of the class values and >80% confidence (i.e. classification rules): 119 | 120 | >>> rules = [(P, Q, supp, conf) 121 | ... for P, Q, supp, conf in association_rules(itemsets, .8) 122 | ... if len(Q) == 1 and Q & class_items] 123 | >>> len(rules) 124 | 18 125 | >>> rules 126 | [(frozenset({17, 2, 19, 20, 7}), frozenset({41}), 41, 1.0), 127 | (frozenset({17, 2, 19, 7}), frozenset({41}), 41, 1.0), 128 | ... 129 | (frozenset({20, 7}), frozenset({41}), 41, 1.0), 130 | (frozenset({7}), frozenset({41}), 41, 1.0)] 131 | 132 | To make them more helpful, we can use ``mapping`` to transform the rules' items 133 | back into table domain values, e.g. for first five rules: 134 | 135 | >>> names = {item: '{}={}'.format(var.name, val) 136 | ... for item, var, val in OneHot.decode(mapping, data, mapping)} 137 | >>> for ante, cons, supp, conf in rules[:5]: 138 | ... 
print(', '.join(names[i] for i in ante), '-->', 139 | ... names[next(iter(cons))], 140 | ... '(supp: {}, conf: {})'.format(supp, conf)) 141 | backbone=1, feathers=0, breathes=1, venomous=0, milk=1 --> type=mammal (supp: 41, conf: 1.0) 142 | backbone=1, feathers=0, breathes=1, milk=1 --> type=mammal (supp: 41, conf: 1.0) 143 | backbone=1, breathes=1, venomous=0, milk=1 --> type=mammal (supp: 41, conf: 1.0) 144 | feathers=0, breathes=1, venomous=0, milk=1 --> type=mammal (supp: 41, conf: 1.0) 145 | backbone=1, feathers=0, breathes=1, venomous=0 --> type=mammal (supp: 41, conf: 0.87...) 146 | 147 | 148 | Reference with further examples below. 149 | """ 150 | 151 | # TODO: Consider FPClose from "Efficiently using prefix-trees in mining frequent itemsets" 152 | # TODO: Consider ExAnte: Anticipated data reduction in constrained pattern mining 153 | 154 | from collections import defaultdict, Iterator 155 | from itertools import combinations, chain 156 | from functools import reduce 157 | 158 | import numpy as np 159 | from scipy.sparse import issparse, spmatrix 160 | 161 | __all__=['frequent_itemsets', 'association_rules', 'rules_stats', 'OneHot', 'preprocess'] 162 | 163 | _FP_TREE_EMPTY = (None, []) 164 | _BUCKETING_FEW_ITEMS = 10 165 | 166 | 167 | class _Node(dict): 168 | def __init__(self, item=None, parent=None, count=None): 169 | self.item = item 170 | self.parent = parent 171 | self.count = count 172 | 173 | 174 | def _bucketing_count(db, frequent_items, min_support): 175 | """ 176 | Bucket counting (bucketing) optimization for databases where few items 177 | are frequent ([2] § 5). 178 | """ 179 | # Forward and inverse mapping of frequent_items to [0, n_items) 180 | inv_map = dict(enumerate(frequent_items)).__getitem__ 181 | fwd_map = {v: k for k, v in inv_map.__self__.items()}.__getitem__ 182 | # Project transactions 183 | k = len(frequent_items) 184 | buckets = [0] * 2**k 185 | for count, transaction in db: 186 | set_bits = (fwd_map(i) for i in frequent_items.intersection(transaction)) 187 | tid = reduce(lambda a, b: a | 1 << b, set_bits, 0) 188 | buckets[tid] += count 189 | # Aggregate bucketing counts ([2], Figure 5) 190 | for i in range(0, k): 191 | i = 2**i 192 | for j in range(2**k): 193 | if j & i == 0: 194 | buckets[j] += buckets[j + i] 195 | # Announce results 196 | buckets = enumerate(buckets) 197 | next(buckets) # Skip 000...0 198 | for tid, count in buckets: 199 | if count >= min_support: 200 | yield frozenset(inv_map(i) for i, b in enumerate(reversed(bin(tid))) if b == '1'), count 201 | 202 | 203 | # Replace above bucketing count with the one from C module 204 | try: 205 | from orangecontrib.associate._fpgrowth import bucketing_count as _bucketing_count, \ 206 | BUCKETING_FEW_ITEMS as _BUCKETING_FEW_ITEMS 207 | except ImportError: 208 | # The module may not have been compiled due to compiler missing (e.g. on WinDOS); 209 | # just use above Python code 210 | pass 211 | 212 | 213 | def _fp_tree_insert(item, T, node_links, count): 214 | """ Insert item into _Node-tree T and return the new node """ 215 | node = T.get(item) 216 | if node is None: 217 | node = T[item] = _Node(item, T, count) 218 | node_links[item].append(node) 219 | else: # Node for this item already in T, just inc its count 220 | node.count += count 221 | return node 222 | 223 | 224 | def _fp_tree(db, min_support): 225 | """ 226 | FP-tree construction ([1] § 2.1, Algorithm 1). 227 | 228 | If frequent items in db are determined to be less than threshold, 229 | "bucketing" [2] is used instead. 
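For illustration (an added sketch, not from the original docstring), db is an iterable of (count, transaction) pairs, and with only a handful of frequent items the call short-circuits into the bucketing branch:

    db = [(1, {1, 3, 4}), (1, {2, 3, 5}), (1, {1, 2, 3, 5}), (1, {2, 5})]
    tree, itemsets = _fp_tree(db, min_support=2)
    # tree is None here; itemsets is an iterable of (frozenset, support) pairs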
230 | 231 | Returns 232 | ------- 233 | tuple 234 | (FP-tree, None) or (None, list of frequent itemsets with support) 235 | """ 236 | if not isinstance(db, list): db = list(db) 237 | 238 | if not db: 239 | return _FP_TREE_EMPTY 240 | 241 | # Used to count item support so it can be reported when generating itemsets 242 | item_support = defaultdict(int) 243 | # Used for ordering transactions' items for "optimally" "compressed" tree 244 | node_support = defaultdict(int) 245 | for count, transaction in db: 246 | for item in transaction: 247 | item_support[item] += count 248 | node_support[item] += 1 249 | # Only ever consider items that have min_support 250 | frequent_items = {item 251 | for item, support in item_support.items() 252 | if support >= min_support} 253 | 254 | # Short-circuit, if possible 255 | n_items = len(frequent_items) 256 | if 0 == n_items: 257 | return _FP_TREE_EMPTY 258 | if 1 == n_items: 259 | item = frequent_items.pop() 260 | return None, ((frozenset({item}), item_support[item]),) 261 | if n_items <= _BUCKETING_FEW_ITEMS: 262 | return None, ((frozenset(itemset), support) 263 | for itemset, support in _bucketing_count(db, frequent_items, min_support)) 264 | 265 | # "The items [...] should be ordered in the frequency descending order of 266 | # node occurrence of each item instead of its support" ([1], p. 12, bottom) 267 | sort_index = {item: i 268 | for i, item in 269 | enumerate(sorted(frequent_items, 270 | key=node_support.__getitem__, 271 | reverse=True))}.__getitem__ 272 | # Only retain frequent items and sort them 273 | db = ((count, sorted(frequent_items.intersection(transaction), 274 | key=sort_index)) 275 | for count, transaction in db) 276 | 277 | root = _Node() 278 | node_links = defaultdict(list) 279 | for count, transaction in db: 280 | T = root 281 | for item in transaction: 282 | T = _fp_tree_insert(item, T, node_links, count) 283 | # Sorted support-descending (in reverse because popping from the back for efficiency) 284 | root.node_links = sorted(node_links.items(), key=lambda i: -sort_index(i[0])) 285 | return root, None 286 | 287 | 288 | def _powerset(lst): 289 | """ 290 | >>> list(_powerset([1, 2, 3])) 291 | [(1,), (2,), (3,), (1, 2), (1, 3), (2, 3), (1, 2, 3)] 292 | """ 293 | return chain.from_iterable(combinations(lst, r) 294 | for r in range(1, len(lst) + 1)) 295 | 296 | 297 | def _single_prefix_path(root): 298 | """ Return (single-prefix path, rest of tree with new root) """ 299 | path = [] 300 | tree = root 301 | node_links = root.node_links 302 | while len(tree) == 1: 303 | tree = next(iter(tree.values())) 304 | path.append((tree.item, tree.count)) 305 | node_links.pop() 306 | tree.parent, tree.item, tree.node_links = None, None, node_links 307 | return path, tree 308 | 309 | 310 | def _prefix_paths(tree, nodes): 311 | """ Generate all paths of tree leading to all item nodes """ 312 | for node in nodes: 313 | path = [] 314 | support = node.count 315 | node = node.parent 316 | while node.item is not None: 317 | path.append(node.item) 318 | node = node.parent 319 | if path: 320 | yield support, path 321 | 322 | 323 | def _freq_patterns_single(P, alpha, min_support): 324 | """ Yield subsets of P as (frequent itemset, support) """ 325 | for itemset in _powerset(P): 326 | yield alpha.union(i[0] for i in itemset), itemset[-1][1] 327 | 328 | 329 | def _freq_patterns_multi(Q, alpha, min_support): 330 | """ Mine multi-path FP-tree """ 331 | for item, nodes in reversed(Q.node_links): 332 | support = sum(n.count for n in nodes) 333 | beta = 
alpha.union({item}) 334 | yield beta, support 335 | tree, got_itemsets = _fp_tree(_prefix_paths(Q, nodes), min_support) 336 | if got_itemsets: 337 | for itemset, support in got_itemsets: 338 | yield beta.union(itemset), support 339 | elif tree is not None: 340 | yield from _fp_growth(tree, beta, min_support) 341 | 342 | 343 | def _fp_growth(tree, alpha, min_support): 344 | """ FP-growth ([1], § 3.3, Algorithm 2). """ 345 | # Single prefix path optimization ([1] § 3.1) 346 | P, Q = _single_prefix_path(tree) if len(tree) == 1 else ([], tree) 347 | # Return P×Q 348 | yield from _freq_patterns_single(P, alpha, min_support) 349 | for itemsetQ, supportQ in _freq_patterns_multi(Q, alpha, min_support): 350 | yield itemsetQ, supportQ 351 | for itemsetP, supportP in _freq_patterns_single(P, alpha, min_support): 352 | yield itemsetQ | itemsetP, supportQ 353 | 354 | 355 | def frequent_itemsets(X, min_support=.2): 356 | """ 357 | Generator yielding frequent itemsets from database X. 358 | 359 | Parameters 360 | ---------- 361 | X : list or numpy.ndarray or scipy.sparse.spmatrix or iterator 362 | The database of transactions where each transaction is a collection 363 | of integer items. If `numpy.ndarray`, the items are considered to be 364 | indices of non-zero columns. 365 | min_support : float or int 366 | If float in range (0, 1), percent of minimal support for itemset to 367 | be considered frequent. If int > 1, the absolute number of instances. 368 | For example, general iterators don't have defined length, so you need 369 | to pass the absolute minimal support as int. 370 | 371 | Yields 372 | ------ 373 | itemset: frozenset 374 | Iteratively yields all itemsets (as frozensets of item indices) with 375 | support greater or equal to specified `min_support`. 376 | support: int 377 | Itemset's support as number of instaances. 378 | 379 | Examples 380 | -------- 381 | Have a database of 50 transactions, 100 possible items: 382 | 383 | >>> import numpy as np 384 | >>> np.random.seed(0) 385 | >>> X = np.random.random((50, 100)) > .9 386 | 387 | Convert it to sparse so we show this type is supported: 388 | 389 | >>> from scipy.sparse import lil_matrix # other types would convert to LIL anyway 390 | >>> X = lil_matrix(X) 391 | 392 | Count the number of itemsets of at least two items with support greater 393 | than 4%: 394 | 395 | >>> sum(1 for itemset, support in frequent_itemsets(X, .05) 396 | ... if len(itemset) >= 2) 397 | 72 398 | 399 | Let's get all the itemsets with at least 20% support: 400 | 401 | >>> gen = frequent_itemsets(X, .2) 402 | >>> gen 403 | 404 | 405 | >>> itemsets = list(gen) 406 | >>> itemsets 407 | [(frozenset({4}), 11), (frozenset({25}), 10)] 408 | 409 | We get the same result by specifying the support as absolute number: 410 | 411 | >>> list(frequent_itemsets(X, 10)) == itemsets 412 | True 413 | 414 | So the items '4' and '25' (fifth and twenty sixth columns of X) are the 415 | only items (and itemsets) that appear 10 or more times. Let's check this: 416 | 417 | >>> (X.sum(axis=0) >= 10).nonzero()[1] 418 | array([ 4, 25]) 419 | 420 | Conclusion: Given databases of uniformly distributed random data, 421 | there's not much to work with. 
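A plain Python list of transactions works as well (an illustrative addition to the examples above); with an int, min_support is the absolute number of transactions:

    >>> T = [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
    >>> dict(frequent_itemsets(T, 2))[frozenset({2, 5})]
    3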
422 | """ 423 | if not isinstance(X, (np.ndarray, spmatrix, list, Iterator)): 424 | raise TypeError('X must be (sparse) array of boolean values, or' 425 | 'list of lists of hashable items, or iterator') 426 | if not (isinstance(min_support, int) and min_support > 0 or 427 | isinstance(min_support, float) and 0 < min_support <= 1): 428 | raise ValueError('min_support must be an integer number of instances,' 429 | 'or a percent fraction in (0, 1]') 430 | 431 | min_support *= (1 if isinstance(min_support, int) else 432 | len(X) if isinstance(X, list) else 433 | X.shape[0]) 434 | min_support = max(1, int(np.ceil(min_support))) 435 | 436 | if issparse(X): 437 | X = X.tolil().rows 438 | elif isinstance(X, np.ndarray): 439 | X = (t.nonzero()[-1] for t in X) 440 | 441 | db = ((1, transaction) for transaction in X) # 1 is initial item support 442 | tree, itemsets = _fp_tree(db, min_support) 443 | if itemsets: 444 | yield from itemsets 445 | if tree: 446 | yield from _fp_growth(tree, frozenset(), min_support) 447 | 448 | 449 | def _association_rules(left, right, last_item, support, min_confidence, itemsets): 450 | if not left: return 451 | confidence = support / itemsets[left] 452 | if confidence >= min_confidence: 453 | yield left, right, support, confidence 454 | for item in left: 455 | if item > last_item: continue # This ensures same rules aren't visited twice 456 | yield from _association_rules( 457 | left - {item}, right | {item}, 458 | item, support, min_confidence, itemsets) 459 | 460 | 461 | def association_rules(itemsets, min_confidence, itemset=None): 462 | """ 463 | Generate association rules ([3] § 12.3) from dict of itemsets' supports 464 | (from :obj:`frequent_itemsets()`). If `itemset` is provided, only generate 465 | its rules. 466 | 467 | Parameters 468 | ---------- 469 | itemsets: dict 470 | A `dict` mapping itemsets to their supports. Can be generated by 471 | feeding the output of `frequent_itemsets()` to `dict` constructor. 472 | min_confidence: float 473 | Confidence percent. Defined as `itemset_support / antecedent_support`. 474 | itemset: frozenset 475 | Itemset the association rules of which we are interested in. 476 | 477 | Yields 478 | ------ 479 | antecedent: frozenset 480 | The LHS of the association rule. 481 | consequent: frozenset 482 | The RHS of the association rule. 483 | support: int 484 | The number of instances supporting (containing) this rule. 485 | confidence: float 486 | ``total_support / lhs_support``. 487 | 488 | Examples 489 | -------- 490 | >>> np.random.seed(0) 491 | >>> N = 100 492 | >>> X = np.random.random((N, 100)) > .9 493 | 494 | Find all itemsets with at least 5% support: 495 | 496 | >>> itemsets = dict(frequent_itemsets(X, .05)) 497 | >>> len(itemsets) 498 | 116 499 | 500 | Generate all association rules from these itemsets with minimum 501 | 50% confidence: 502 | 503 | >>> rules = association_rules(itemsets, .5) 504 | >>> rules 505 | 506 | >>> rules = list(rules) 507 | >>> len(rules) 508 | 7 509 | >>> rules 510 | [(frozenset({36}), frozenset({25}), 5, 0.55...), 511 | (frozenset({63}), frozenset({58}), 5, 0.5), 512 | ... 
513 | (frozenset({30}), frozenset({32}), 5, 0.55...), 514 | (frozenset({75}), frozenset({98}), 5, 0.5)] 515 | 516 | Or only the rules for a particular itemset: 517 | 518 | >>> list(association_rules(itemsets, .3, frozenset({75, 98}))) 519 | [(frozenset({75}), frozenset({98}), 5, 0.5), 520 | (frozenset({98}), frozenset({75}), 5, 0.45...)] 521 | 522 | """ 523 | assert (isinstance(itemsets, dict) and 524 | isinstance(next(iter(itemsets), frozenset()), frozenset)) 525 | assert 0 < min_confidence <= 1 526 | from_itemsets = (itemset,) if itemset else sorted(itemsets, key=len, reverse=True) 527 | for itemset in from_itemsets: 528 | support = itemsets[itemset] 529 | for item in itemset: 530 | right = frozenset({item}) 531 | yield from _association_rules( 532 | itemset - right, right, 533 | item, support, min_confidence, itemsets) 534 | 535 | 536 | def rules_stats(rules, itemsets, n_examples): 537 | """ 538 | Generate additional stats for rules generated by :obj:`association_rules()`. 539 | 540 | Parameters 541 | ---------- 542 | rules: iterable 543 | Rules as output by `association_rules()`. 544 | itemsets: dict 545 | The itemsets as obtained by `dict(frequent_itemsets(...))`. 546 | n_examples: int 547 | The total number of instances (for calculating coverage, lift, 548 | and leverage). 549 | 550 | Yields 551 | ------ 552 | atecedent: frozenset 553 | The LHS of the association rule. 554 | consequent: frozenset 555 | The RHS of the association rule. 556 | support: int 557 | Support as an absolute number of instances. 558 | confidence: float 559 | The confidence percent, calculated as: ``total_support / lhs_rupport``. 560 | coverage: float 561 | Calculated as: ``lhs_support / n_examples`` 562 | strength: float 563 | Calculated as: ``rhs_support / lhs_examples`` 564 | lift: float 565 | Calculated as: ``n_examples * total_support / lhs_support / rhs_support`` 566 | leverage: float 567 | Calculated as: ``(total_support * n_examples - lhs_support * rhs_support) / n_examples**2`` 568 | 569 | Examples 570 | -------- 571 | >>> N = 30 572 | >>> X = np.random.random((N, 50)) > .9 573 | >>> itemsets = dict(frequent_itemsets(X, .1)) 574 | >>> rules = association_rules(itemsets, .6) 575 | >>> list(rules_stats(rules, itemsets, N)) 576 | [(frozenset({15}), frozenset({0}), 3, 0.75, 0.13..., 1.5, 3.75, 0.073...), 577 | (frozenset({47}), frozenset({22}), 3, 0.6, 0.16..., 1.4, 2.57..., 0.061...), 578 | (frozenset({27}), frozenset({22}), 4, 0.66..., 0.2, 1.16..., 2.85..., 0.086...), 579 | (frozenset({19}), frozenset({22}), 3, 0.6, 0.16..., 1.4, 2.57..., 0.061...)] 580 | 581 | """ 582 | assert (isinstance(itemsets, dict) and 583 | isinstance(next(iter(itemsets), frozenset()), frozenset)) 584 | assert n_examples > 0 585 | for left, right, support, confidence in rules: 586 | l_support, r_support = itemsets[left], itemsets[right] 587 | coverage = l_support / n_examples 588 | strength = r_support / l_support 589 | lift = n_examples * confidence / r_support 590 | leverage = (support*n_examples - l_support*r_support) / n_examples**2 591 | yield (left, right, support, confidence, 592 | coverage, strength, lift, leverage) 593 | 594 | 595 | def __fp_tree_count_nodes(tree): 596 | count = 1 if tree.item is not None else 0 597 | for t in tree.values(): 598 | count += __fp_tree_count_nodes(t) 599 | return count 600 | 601 | 602 | def __fp_tree_max_height(tree): 603 | if tree: 604 | return max((1 if tree.item is not None else 0) + 605 | __fp_tree_max_height(child) for child in tree.values()) 606 | return 1 if tree.item is not None 
else 0 607 | 608 | 609 | class OneHot: 610 | """ 611 | Encode discrete Orange.data.Table into a 2D array of binary attributes. 612 | """ 613 | @staticmethod 614 | def encode(table, include_class=False): 615 | """ 616 | Return a tuple of 617 | (bool (one hot) ndarray, {col: (variable_index, value_index)} mapping) 618 | 619 | If the input table is sparse, a list of nonzero column indices 620 | per row (LIL rows) is returned instead of the one-hot ndarray. 621 | """ 622 | X, encoded, mapping = table.X, [], {} 623 | if issparse(X): 624 | encoded = X.tolil().rows.tolist() 625 | for i, var in enumerate(table.domain.attributes): 626 | mapping[i] = i, 0 627 | else: 628 | for i, var in enumerate(table.domain.attributes): 629 | if not var.is_discrete: continue 630 | for j, val in enumerate(var.values): 631 | mapping[len(mapping)] = i, j 632 | encoded.append(X[:, i] == j) 633 | 634 | if include_class and table.domain.has_discrete_class: 635 | i, var = len(table.domain.attributes), table.domain.class_var 636 | for j, val in enumerate(var.values): 637 | mapping[len(mapping)] = i, j 638 | if issparse(X): 639 | for row in encoded: 640 | row.append(i + j) 641 | else: 642 | encoded.append(table.Y == j) 643 | 644 | if not issparse(X): 645 | encoded = np.column_stack(encoded) if encoded else None 646 | return encoded, mapping 647 | 648 | @staticmethod 649 | def decode(itemset, table, mapping): 650 | """Yield sorted (item, variable, value) tuples (one for each item)""" 651 | attributes = table.domain.attributes 652 | for item in itemset: 653 | ivar, ival = mapping[item] 654 | var = attributes[ivar] if ivar < len(attributes) else table.domain.class_var 655 | yield item, var, (var.values[ival] if var.is_discrete else 0) 656 | 657 | 658 | def preprocess(table): 659 | """ 660 | This function applies a one-hot transform to Orange data table, making it 661 | suitable as an `X` input into :obj:`frequent_itemsets()` above. 662 | 663 | For a more fine-grained control, use :obj:`OneHot` methods directly. 664 | 665 | Parameters 666 | ---------- 667 | table: Orange.data.Table 668 | The table to encode into `X` compatible with `frequent_itemsets()` 669 | above. 670 | 671 | Returns 672 | ------- 673 | X: numpy.ndarray 674 | The table's `X` with one-hot tranfsorm applied. 675 | 676 | 677 | Examples 678 | -------- 679 | For a more concrete example, i.e. using non-uniform data: 680 | 681 | >>> from Orange.data import Table 682 | >>> table = Table('voting') 683 | >>> table 684 | [[n, y, n, y, y, ... | republican], 685 | [n, y, n, y, y, ... | republican], 686 | [?, y, y, ?, y, ... | democrat], 687 | [n, y, y, n, ?, ... | democrat], 688 | [y, y, y, n, y, ... | democrat], 689 | ... 690 | ] 691 | 692 | Table, as-is, can't be used with :obj:`frequent_itemsets()` directly (it can, 693 | but it would produce garbage). We first need to one-hot transform it, i.e. 694 | make binary columns for each value of each of its discrete variables. 695 | 696 | >>> X = preprocess(table) 697 | >>> X 698 | array([[ True, False, False, ..., True, True, False], 699 | [ True, False, False, ..., False, True, False], 700 | ..., 701 | [ True, False, True, ..., True, True, False], 702 | [ True, False, False, ..., False, True, False]], dtype=bool) 703 | 704 | Now we `can` use it. 705 | 706 | Note: the transformation includes class if it's discrete. For a 707 | finer-grained control, including the variable values to columns mapping, 708 | use :obj:`OneHot` class directly. 
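Continuing the example above (an illustrative addition, not from the original docstring): the encoded array plugs straight into the mining pipeline defined in this module.

    >>> itemsets = dict(frequent_itemsets(X, .4))
    >>> rules = association_rules(itemsets, .8)
    >>> stats = list(rules_stats(rules, itemsets, len(table)))  # adds coverage, strength, lift, leverage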
709 | """ 710 | if table.domain.has_continuous_attributes(): 711 | raise ValueError('Frequent itemsets require all variables to be discrete') 712 | encoded, mapping = OneHot.encode(table, table.domain.has_discrete_class) 713 | return encoded 714 | 715 | 716 | if __name__ == '__main__': 717 | import doctest 718 | import __main__, builtins 719 | 720 | class Context(dict): 721 | # See http://bugs.python.org/issue26303 722 | def copy(self): return self 723 | def clear(self): pass 724 | 725 | globals = __main__.__dict__.copy() 726 | globals.update(builtins.__dict__) 727 | 728 | doctest.testmod(globs=Context(globals), 729 | optionflags=doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS) 730 | -------------------------------------------------------------------------------- /reportgen/analysis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Nov 23 21:53:32 2017 4 | 5 | @author: gason 6 | """ 7 | 8 | import pandas as pd 9 | import numpy as np 10 | import re 11 | import time 12 | import os 13 | from collections import Iterable 14 | 15 | from pandas.api.types import is_string_dtype 16 | from pandas.api.types import is_numeric_dtype 17 | from pandas.api.types import is_number 18 | from pandas.api.types import is_datetime64_any_dtype 19 | from pandas.api.types import is_categorical_dtype 20 | from scipy import stats 21 | from sklearn import metrics 22 | 23 | from . import report as _rpt 24 | from . import config 25 | from .report import genwordcloud 26 | from .utils.metrics import entropyc 27 | 28 | from .utils import iqr 29 | 30 | #from sklearn.neighbors import KernelDensity 31 | import matplotlib.pyplot as plt 32 | import seaborn as sns 33 | 34 | _thisdir = os.path.split(__file__)[0] 35 | # default chinese font 36 | from matplotlib.font_manager import FontProperties 37 | font_path=config.font_path 38 | if font_path: 39 | myfont=FontProperties(fname=font_path) 40 | sns.set(font=myfont.get_name()) 41 | 42 | 43 | __all__=['type_of_var', 44 | 'describe', 45 | 'plot', 46 | 'features_analysis', 47 | 'distributions', 48 | 'AnalysisReport', 49 | 'ClassifierReport'] 50 | 51 | 52 | def _freedman_diaconis_bins(a): 53 | """Calculate number of hist bins using Freedman-Diaconis rule.""" 54 | # From http://stats.stackexchange.com/questions/798/ 55 | a = np.asarray(a) 56 | assert len(a.shape)>0 57 | assert len(a)>0 58 | h = 2 * iqr(a) / (len(a) ** (1 / 3)) 59 | # fall back to sqrt(a) bins if iqr is 0 60 | if h == 0: 61 | return int(np.sqrt(a.size)) 62 | else: 63 | return int(np.ceil((a.max() - a.min()) / h)) 64 | 65 | 66 | 67 | def distributions(a,hist=True,bins=None,norm_hist=True,kde=False,grid=None,gridsize=100,clip=None): 68 | '''数组的分布信息 69 | hist=True,则返回分布直方图(counts,bins) 70 | kde=True,则返回核密度估计数组(grid,y) 71 | 72 | example 73 | ------- 74 | a=np.random.randint(1,50,size=(1000,1)) 75 | ''' 76 | a = np.asarray(a).squeeze() 77 | if hist: 78 | if bins is None: 79 | bins = min(_freedman_diaconis_bins(a), 50) 80 | counts,bins=np.histogram(a,bins=bins) 81 | if norm_hist: 82 | counts=counts/counts.sum() 83 | if kde: 84 | bw='scott' 85 | cut=3 86 | if clip is None: 87 | clip = (-np.inf, np.inf) 88 | try: 89 | kdemodel = stats.gaussian_kde(a, bw_method=bw) 90 | except TypeError: 91 | kdemodel = stats.gaussian_kde(a) 92 | bw = "scotts" if bw == "scott" else bw 93 | bw = getattr(kdemodel, "%s_factor" % bw)() * np.std(a) 94 | if grid is None: 95 | support_min = max(a.min() - bw * cut, clip[0]) 96 | support_max = min(a.max() + bw * cut, 
clip[1]) 97 | grid=np.linspace(support_min, support_max, gridsize) 98 | y = kdemodel(grid) 99 | if hist and not(kde): 100 | return counts,bins 101 | elif not(hist) and kde: 102 | return grid,y 103 | elif hist and kde: 104 | return ((counts,bins),(grid,y)) 105 | else: 106 | return None 107 | 108 | 109 | def dtype_detection(data,category_detection=True,StructureText_detection=True,\ 110 | datetime_to_category=True,criterion='sqrt',min_mean_counts=5,fix=False): 111 | '''检测数据中单个变量的数据类型 112 | 将数据类型分为以下4种 113 | 1. number,数值型 114 | 2. category,因子 115 | 3. datetime,时间类型 116 | 4. text,文本型 117 | 5. text_st,结构性文本,比如ID, 118 | 6. group_number,连续 119 | 120 | parameter 121 | --------- 122 | data: pd.Series 数据, 仅支持一维 123 | # 如果有data,则函数会改变原来data的数据类型 124 | category_detection: bool,根据 nunique 检测是否是因子类型 125 | StructureText_detection: bool, 结构化文本,如列中都有一个分隔符"-" 126 | datetime_to_category: 时间序列如果 nunique过少是否转化成因子变量 127 | criterion: string or int, optional (default="sqrt",即样本数的开根号) 128 | 支持:'sqrt':样本量的开根号, int: 绝对数, 0-1的float:样本数的百分多少 129 | 检测因子变量时,如果一个特征的nunique小于criterion,则判定为因子变量 130 | min_mean_counts: default 5,数值型判定为因子变量时,需要满足每个类别的平均频数要大于min_mean_counts 131 | fix: bool,是否返回修改好类型的数据 132 | 133 | 134 | return: 135 | result:dict{ 136 | 'name':列名, 137 | 'vtype':变量类型, 138 | 'ordered':是否是有序因子, 139 | 'categories':所有的因子} 140 | 141 | ''' 142 | 143 | assert len(data.shape)==1 144 | data=data.copy() 145 | data=pd.Series(data) 146 | dtype,name,n_sample=data.dtype,data.name,data.count() 147 | 148 | min_mean_counts=5 149 | if criterion=='sqrt': 150 | max_nuniques=np.sqrt(n_sample) 151 | elif isinstance(criterion,int): 152 | max_nuniques=criterion 153 | elif isinstance(criterion,float) and (0=min_mean_counts: 170 | data=data.astype('category') 171 | ordered=data.cat.ordered 172 | vtype='category' 173 | categories=list(data.dropna().cat.categories) 174 | result={'name':name,'vtype':vtype,'ordered':ordered,'categories':categories} 175 | elif is_string_dtype(dtype): 176 | # 处理时间类型 177 | tmp=data.map(lambda x: np.nan if '%s'%x == 'nan' else len('%s'%x)) 178 | tmp=tmp.dropna().astype(np.int64) 179 | if not(any(data.dropna().map(is_number))) and 70.2112 196 | if is_string_dtype(data.dtype) and not(is_categorical_dtype(data.dtype)) and all(data.str.contains('%')): 197 | data=data.str.strip('%').astype(np.float64)/100 198 | 199 | if is_categorical_dtype(data.dtype): 200 | vtype='category' 201 | categories=list(data.cat.categories) 202 | ordered=data.cat.ordered 203 | # 时间格式 204 | elif np.issubdtype(data.dtype,np.datetime64): 205 | vtype='datetime' 206 | # 是否是结构化数组 207 | elif StructureText_detection and tmp.dropna().std()==0: 208 | # 不可迭代,不是字符串 209 | if not(isinstance(data.dropna().iloc[0],Iterable)): 210 | vtype='text' 211 | else: 212 | k=set(list(data.dropna().iloc[0])) 213 | for x in data: 214 | if isinstance(x,str) and len(x)>0: 215 | k&=set(list(x)) 216 | if len(k)>0: 217 | vtype='text_st' 218 | else: 219 | vtype='text' 220 | elif is_numeric_dtype(data.dtype): 221 | vtype='number' 222 | ordered=False 223 | categories=[] 224 | else: 225 | vtype='text' 226 | result={'name':name,'vtype':vtype,'ordered':ordered,'categories':categories} 227 | elif is_datetime64_any_dtype(dtype): 228 | vtype='datetime' 229 | result={'name':name,'vtype':vtype,'ordered':ordered,'categories':categories} 230 | else: 231 | print('unknown dtype!') 232 | result=None 233 | 234 | if fix: 235 | return result,data 236 | else: 237 | return result 238 | 239 | 240 | 241 | def type_of_var(data,category_detection=True,criterion='sqrt',min_mean_counts=5,copy=True): 
242 | '''返回各个变量的类型 243 | 将数据类型分为以下4种 244 | 1. number,数值型 245 | 2. category,因子 246 | 3. datetime,时间类型 247 | 4. text,文本型 248 | 5. text_st,结构性文本,比如ID, 249 | 250 | parameters 251 | ---------- 252 | data: pd.DataFrame类型 253 | category_detection: bool,根据 nunique 检测是否是因子类型 254 | criterion: string or int, optional (default="sqrt",即样本数的开根号) 255 | 支持:'sqrt':样本量的开根号, int: 绝对数, 0-1的float:样本数的百分多少 256 | 检测因子变量时,如果一个特征的nunique小于criterion,则判定为因子变量 257 | min_mean_counts: default 5,数值型判定为因子变量时,需要满足每个类别的平均频数要大于min_mean_counts 258 | copy: bool, 是否更改数据类型,如时间格式、因子变量等 259 | 260 | return: 261 | -------- 262 | var_type:dict{ 263 | ColumnName:type,} 264 | 265 | ''' 266 | assert isinstance(data,pd.core.frame.DataFrame) 267 | var_type={} 268 | for c in data.columns: 269 | #print('type_of_var : ',c) 270 | if copy: 271 | data=data.copy() 272 | result=dtype_detection(data[c],category_detection=category_detection,\ 273 | criterion=criterion,min_mean_counts=min_mean_counts,datetime_to_category=False,fix=False) 274 | if result is not None: 275 | var_type[c]=result['vtype'] 276 | else: 277 | var_type[c]='unknown' 278 | else: 279 | result,tmp=dtype_detection(data[c],category_detection=category_detection,\ 280 | criterion=criterion,min_mean_counts=min_mean_counts,datetime_to_category=False,fix=True) 281 | data[c]=tmp 282 | if result is not None: 283 | var_type[c]=result['vtype'] 284 | else: 285 | var_type[c]='unknown' 286 | return var_type 287 | 288 | 289 | 290 | def var_detection(data,combine=True): 291 | '''检测整个数据的变量类型,内部使用,外部请用type_of_var 292 | parameter 293 | --------- 294 | data: 数据,DataFrame格式 295 | combine: 检测变量中是否有类似的变量,有的话则会合并。 296 | 297 | return 298 | ------ 299 | var_list:[{'name':,'vtype':,'vlist':,'ordered':,'categories':,},] 300 | 301 | ''' 302 | var_list=[] 303 | for c in data.columns: 304 | result,tmp=dtype_detection(data[c],fix=True) 305 | data[c]=tmp 306 | if result is not None: 307 | result['vlist']=[c] 308 | var_list.append(result) 309 | if not(combine): 310 | return var_list,data 311 | var_group=[] 312 | i=0 313 | pattern=re.compile(r'(.*?)(\d+)') 314 | while i < len(var_list)-1: 315 | v=var_list[i] 316 | vnext=var_list[i+1] 317 | if v['vtype']!='number' or vnext['vtype']!='number': 318 | i+=1 319 | continue 320 | tmp1=[] 321 | for vv in var_list[i:]: 322 | if vv['vtype']!='number': 323 | break 324 | w=re.findall(pattern,'%s'%vv['name']) 325 | if len(w)==0 or (len(w)>0 and len(w[0])<2): 326 | break 327 | tmp1.append((w[0][0],w[0][1])) 328 | if len(tmp1)<2: 329 | i+=1 330 | continue 331 | flag1=len(set([t[0] for t in tmp1]))==1 332 | flag2=np.diff([int(t[1]) for t in tmp1]).sum()==len(tmp1)-1 333 | if flag1 and flag2: 334 | var_group.append(list(range(i,i+len(tmp1)))) 335 | i+=len(tmp1) 336 | var_group_new={} 337 | var_group_total=[]#将所有的分组ind加起来 338 | for vi in var_group: 339 | var_group_total+=vi 340 | name='{}-->{}'.format(var_list[vi[0]]['name'],var_list[vi[-1]]['name']) 341 | vlist=[var_list[v]['name'] for v in vi] 342 | vtype='group_number' 343 | tmp={'name':name,'vtype':vtype,'vlist':vlist,'ordered':True,'categories':vlist} 344 | var_group_new[vi[0]]=tmp 345 | var_list_new=[] 346 | var_list_have=[] 347 | for i,v in enumerate(var_list): 348 | if i not in var_group_total: 349 | v['vlist']=[v['name']] 350 | var_list_new.append(v) 351 | var_list_have+=v['vlist'] 352 | elif i in var_group_total and v['name'] not in var_list_have: 353 | var_list_new.append(var_group_new[i]) 354 | var_list_have+=var_group_new[i]['vlist'] 355 | return var_list_new,data 356 | 357 | def describe(data): 358 | ''' 359 | 
对每个变量生成统计指标特征 360 | 对于每一个变量,生成如下字段: 361 | 数据类型: 362 | 最大值/频数最大的那个: 363 | 最小值/频数最小的那个: 364 | 均值/频数中间的那个: 365 | 缺失率: 366 | 范围/唯一数: 367 | ''' 368 | 369 | data=pd.DataFrame(data) 370 | n_sample=len(data) 371 | var_type=type_of_var(data,copy=True) 372 | summary=pd.DataFrame(columns=data.columns,index=['dtype','max','min','mean','missing_pct','std/nuniue']) 373 | for c in data.columns: 374 | missing_pct=1-data[c].count()/n_sample 375 | if var_type[c] == 'number': 376 | max_value,min_value,mean_value=data[c].max(),data[c].min(),data[c].mean() 377 | std_value=data[c].std() 378 | summary.loc[:,c]=[var_type[c],max_value,min_value,mean_value,missing_pct,std_value] 379 | elif var_type[c] == 'category' or is_categorical_dtype(data[c].dtype): 380 | tmp=data[c].value_counts() 381 | max_value,min_value=tmp.argmax(),tmp.argmin() 382 | mean_value_index=tmp[tmp==tmp.median()].index 383 | mean_value=mean_value_index[0] if len(mean_value_index)>0 else np.nan 384 | summary.loc[:,c]=[var_type[c],max_value,min_value,mean_value,missing_pct,len(tmp)] 385 | elif var_type[c] == 'datetime': 386 | max_value,min_value=data[c].max(),data[c].min() 387 | summary.loc[:,c]=[var_type[c],max_value,min_value,np.nan,missing_pct,np.nan] 388 | else: 389 | summary.loc[:,c]=[var_type[c],np.nan,np.nan,np.nan,missing_pct,np.nan] 390 | return summary 391 | 392 | 393 | 394 | def plot(data,figure_type='auto',chart_type='auto',vertical=False,ax=None): 395 | '''auto choose the best chart type to draw the data 【还没完全写好】 396 | paremeter 397 | ----------- 398 | figure_type: 'mpl' or 'pptx' or 'html' 399 | chart_type: 'hist' or 'dist' or 'kde' or 'bar' ...... 400 | 401 | return 402 | ------- 403 | chart:dict format. 404 | .type: equal to figure_type 405 | .fig: only return if type == 'mpl' 406 | .ax: 407 | .chart_data: 408 | 409 | ''' 410 | 411 | # 判别部分 412 | 413 | # 绘制部分 414 | data=pd.DataFrame(data) 415 | assert len(data.dropna())>0 416 | chart={} 417 | if figure_type in ['mpl','matplotlib']: 418 | chart['type']='mpl' 419 | if ax is None: 420 | fig,ax=plt.subplots() 421 | if chart_type in ['hist','kde']: 422 | for c in data.columns: 423 | if len(data[c].dropna())>10: 424 | sns.kdeplot(data[c].dropna(),shade=True,ax=ax) 425 | else: 426 | print('reportgen.plot:: ',c,'have no valid data!') 427 | legend_label=ax.get_legend_handles_labels() 428 | if len(legend_label)>0 and len(legend_label[0])>1: 429 | ax.legend() 430 | else: 431 | try: 432 | ax.legend_.remove() 433 | except: 434 | pass 435 | ax.axis('auto') 436 | elif chart_type in ['dist']: 437 | for c in data.columns: 438 | if len(data[c].dropna())>10: 439 | sns.distplot(data[c].dropna(),ax=ax) 440 | else: 441 | print('reportgen.plot:: ',c,'have no valid data!') 442 | legend_label=ax.get_legend_handles_labels() 443 | if len(legend_label)>0 and len(legend_label[0])>1: 444 | ax.legend() 445 | else: 446 | try: 447 | ax.legend_.remove() 448 | except: 449 | pass 450 | ax.axis('auto') 451 | elif chart_type in ['scatter']: 452 | ax.xaxis.set_ticks_position('none') 453 | ax.yaxis.set_ticks_position('none') 454 | ax.axhline(y=0, linestyle='-', linewidth=1.2, alpha=0.6) 455 | ax.axvline(x=0, linestyle='-', linewidth=1.2, alpha=0.6) 456 | color=['blue','red','green','dark'] 457 | if not isinstance(data,list): 458 | data=[data] 459 | for i,dd in enumerate(data): 460 | if '%s'%dd.iloc[:,0] != 'nan' or '%s'%dd.iloc[:,1] != 'nan': 461 | ax.scatter(dd.iloc[:,0], dd.iloc[:,1], c=color[i], s=50, 462 | label=dd.columns[1]) 463 | for _, row in dd.iterrows(): 464 | ax.annotate(row.name, (row.iloc[0], row.iloc[1]), 
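# ---- illustrative usage sketch (editor's note, not part of the library) ----
# The 'mpl' branch of plot() above draws seaborn kde/dist curves onto a
# matplotlib axis and returns figure and axis in the chart dict; the DataFrame
# below is hypothetical.
#
# >>> import numpy as np, pandas as pd
# >>> df = pd.DataFrame({'income': np.random.lognormal(10, 1, 500)})
# >>> chart = plot(df, figure_type='mpl', chart_type='kde')  # doctest: +SKIP
# >>> chart['fig'].savefig('income_kde.png', dpi=200)        # doctest: +SKIP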
color=color[i],fontproperties=myfont,fontsize=10) 465 | ax.axis('equal') 466 | legend_label=ax.get_legend_handles_labels() 467 | if len(legend_label)>0 and len(legend_label[0])>0: 468 | ax.legend() 469 | try: 470 | chart['fig']=fig 471 | except: 472 | pass 473 | chart['ax']=ax 474 | return chart 475 | if figure_type in ['pptx']: 476 | chart['type']='pptx' 477 | count,bins=distributions(data.iloc[:,0].dropna(),kde=False) 478 | if all(pd.Series(bins).astype(int)==bins): 479 | decimals_format='{:.0f}~' 480 | else: 481 | decimals_format='{:.2f}~' 482 | bins_index=[decimals_format.format(b) for b in bins[:-1]] 483 | decimals_format=decimals_format[:-1] 484 | bins_index[-1]=bins_index[-1]+decimals_format.format(bins[-1]) 485 | 486 | chart_data=pd.DataFrame({'frequency':count*100},index=bins_index) 487 | chart['chart_data']=chart_data 488 | if isinstance(ax,_rpt.Report): 489 | slide_data={'data':chart_data,'slide_type':'chart'} 490 | ax.add_slide(data=slide_data,title='',summary='',footnote='') 491 | # 暂时空缺,后期会将ax修改为Report接口 492 | chart['ax']=ax 493 | return chart 494 | 495 | 496 | # 仅测试用 497 | def features_analysis(X,y=None,out_file=None,categorical_features=[],number_features=[],\ 498 | max_leafs=5): 499 | ''' 500 | categorical_features=None 501 | number_features=None 502 | categorical_features=[] if categorical_features is None else categorical_features 503 | number_features=[] if number_features is None else number_features 504 | X=data 505 | ''' 506 | from graphviz import Digraph 507 | import pydotplus 508 | N=len(X) 509 | X=X.copy() 510 | if len(categorical_features)==0: 511 | var_type=type_of_var(X) 512 | categorical_features=[k for k in var_type if var_type[k]=='category'] 513 | 514 | #categorical_features=['grade','target','term'] 515 | #number_features=['tot_cur_bal','annual_inc'] 516 | X['_count_']=range(len(X)) 517 | # 根据唯一值个数的不同从小到大排列特征的顺序 518 | nunique=X[categorical_features].apply(pd.Series.nunique).sort_values() 519 | categorical_features=list(nunique.index) 520 | for k in nunique[nunique>5].index: 521 | topitems=X[k].value_counts().sort_values(ascending=False) 522 | X[k]=X[k].replace(dict(zip(topitems.index[(max_leafs-1):],['others']*(len(topitems)-max_leafs+1)))) 523 | tmp=X.groupby(categorical_features) 524 | 525 | # 针对因子变量计数,针对数值变量,计算分组均值 526 | aggfun={'_count_':'count'} 527 | for k in number_features: 528 | aggfun.update({k:'mean'}) 529 | count_data=tmp.agg(aggfun) 530 | 531 | # 每一个节点,定义一些属性1,父节点, 特征名称, value, 532 | 533 | # 生成节点的索引表格 534 | names=count_data.index.names 535 | levels=count_data.index.levels 536 | labels=pd.DataFrame(count_data.index.labels).T 537 | labels.columns=names 538 | for i in range(len(names)): 539 | labels[names[i]]=labels[names[i]].replace(dict(zip(range(len(levels[i])),levels[i]))) 540 | labels_node=pd.DataFrame(index=labels.index,columns=labels.columns) 541 | #labels_prenode=pd.DataFrame(index=labels.index,columns=labels.columns) 542 | dot=Digraph() 543 | nodes=[{'id':0,'column':'start','value':None}] 544 | dot.node(str(nodes[-1]['id']),'Total\n{} , 100%'.format(N),shape="diamond") 545 | 546 | for c in range(len(labels.columns)): 547 | if c==len(labels.columns)-1: 548 | count_data_tmp=count_data.copy() 549 | else: 550 | count_data_tmp=X.groupby(names[:c+1]).agg(aggfun) 551 | for i in range(len(labels.index)): 552 | value=labels.iloc[i,c] 553 | if value!=nodes[-1]['value'] and c!=nodes[-1]['column']: 554 | # 增加一个新节点 555 | addnode={'id':nodes[-1]['id']+1,'column':names[c],'value':value} 556 | nodes.append(addnode) 557 | node_id=str(nodes[-1]['id']) 558 
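# ---- illustrative sketch (editor's note, not part of the library) ----
# The 'pptx' branch of plot() above bins the first column with distributions(),
# labels each bin as "lower~" (the last bin also gets its upper edge) and
# stores the frequencies, scaled by 100, in chart['chart_data']. Roughly:
#
# >>> import numpy as np, pandas as pd
# >>> values = pd.Series([1.2, 3.4, 2.2, 5.1, 4.8, 2.9])
# >>> counts, bins = np.histogram(values, bins=3)
# >>> index = ['{:.2f}~'.format(b) for b in bins[:-1]]
# >>> pd.DataFrame({'frequency': counts / counts.sum() * 100}, index=index)  # doctest: +SKIP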
| #cond=labels.iloc[i,:c+1] 559 | #n=_cal_count(X,labels.iloc[i,:c+1]) 560 | if len(count_data_tmp.index.names)==1: 561 | n=count_data_tmp.loc[labels.iloc[i,c],'_count_'] 562 | else: 563 | n=count_data_tmp.xs(list(labels.iloc[i,:c+1]))['_count_'] 564 | label='{} = {}\ncount:{:.0f} , {:.2f}%'.format(names[c],value,n,n*100/N) 565 | for k in number_features: 566 | if len(count_data_tmp.index.names)==1: 567 | vmean=count_data_tmp.loc[labels.iloc[i,c],k] 568 | else: 569 | vmean=count_data_tmp.xs(list(labels.iloc[i,:c+1]))[k] 570 | label=label+'\n{}: {:.1f}'.format(k,vmean) 571 | dot.node(node_id,label) 572 | if c==0: 573 | pre_node_id='0' 574 | else: 575 | pre_node_id=labels_node.iloc[i,c-1] 576 | dot.edge(pre_node_id,node_id) 577 | #print('---创建节点{},节点信息如下'.format(node_id)) 578 | #print(label) 579 | #print('{} 连接节点{}'.format(node_id,pre_node_id)) 580 | #labels_prenode.iloc[i,c]=pre_node_id 581 | labels_node.iloc[i,c]=str(nodes[-1]['id']) 582 | if out_file is not None: 583 | graph=pydotplus.graphviz.graph_from_dot_data(dot.source) 584 | graph.write(out_file,format=os.path.splitext(out_file)[1][1:]) 585 | #graph.write_png(out_file) 586 | else: 587 | dot.view() 588 | return dot 589 | 590 | 591 | 592 | def AnalysisReport(data,filename=None,var_list=None,save_pptx=True,return_report=False,combine=False): 593 | ''' 594 | 直接生成报告 595 | ''' 596 | if var_list is None: 597 | var_list,data=var_detection(data,combine=combine) 598 | #print(var_list) 599 | #print('============') 600 | 601 | slides_data=[] 602 | 603 | if filename is None: 604 | filename='AnalysisReport'+time.strftime('_%Y%m%d%H%M', time.localtime()) 605 | p=_rpt.Report() 606 | p.add_cover(title=os.path.splitext(filename)[0]) 607 | elif isinstance(filename,str): 608 | p=_rpt.Report() 609 | p.add_cover(title=os.path.splitext(filename)[0]) 610 | elif isinstance(filename,_rpt.Report): 611 | p=filename 612 | filename='AnalysisReport'+time.strftime('_%Y%m%d%H%M', time.localtime()) 613 | else: 614 | print('reportgen.AnalysisReport::cannot understand the filename') 615 | return None 616 | 617 | summary=describe(data) 618 | f_cut=10# 每一页展示的最大字段数 619 | n_cut=round(summary.shape[1]/f_cut) 620 | n_cut=1 if n_cut==0 else n_cut 621 | for i in range(n_cut): 622 | if i!=n_cut-1: 623 | summary_tmp=summary.iloc[:,f_cut*i:f_cut*i+f_cut] 624 | else: 625 | summary_tmp=summary.iloc[:,f_cut*i:] 626 | slide_data={'data':summary_tmp,'slide_type':'table'} 627 | title='数据字段描述{}-{}'.format(i*f_cut+1,min(summary.shape[1],i*f_cut+f_cut)) 628 | p.add_slide(data=slide_data,title=title) 629 | 630 | for v in var_list: 631 | vtype=v['vtype'] 632 | name=v['name'] 633 | vlist=v['vlist'] 634 | #print(name,':',vtype) 635 | if len(data.loc[:,vlist].dropna())==0: 636 | print('the field: ',name,'have no valid data!') 637 | continue 638 | # 之前的方案,暂时留着测试用,后期稳定后删除 639 | if vtype == 'number_test': 640 | chart=plot(data[name],figure_type='mpl',chart_type='kde') 641 | chart['fig'].savefig('kdeplot1.png',dpi=200) 642 | chart['fig'].clf() 643 | del chart 644 | chart=plot(data[name],figure_type='mpl',chart_type='dist') 645 | chart['fig'].savefig('kdeplot2.png',dpi=200) 646 | chart['fig'].clf() 647 | del chart 648 | summary='''平均数为:{:.2f},标准差为:{:.2f},最大为:{}'''\ 649 | .format(data[name].mean(),data[name].std(),data[name].max()) 650 | footnote='注: 样本N={}'.format(data[name].count()) 651 | slide_data=[{'data':'kdeplot1.png','slide_type':'picture'},{'data':'kdeplot2.png','slide_type':'picture'}] 652 | p.add_slide(data=slide_data,title=name+' 的分析',summary=summary,footnote=footnote) 653 | 
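# ---- illustrative usage sketch (editor's note, not part of the library) ----
# features_analysis() above builds a graphviz Digraph that splits the sample by
# each categorical feature in turn (capping rare levels at max_leafs) and
# annotates every node with counts and the means of number_features. It needs
# the optional graphviz and pydotplus packages; the DataFrame and file name
# below are hypothetical.
#
# >>> dot = features_analysis(df, categorical_features=['grade', 'term'],
# ...                         number_features=['annual_inc'],
# ...                         out_file='segments.png')       # doctest: +SKIP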
slides_data.append(slide_data) 654 | os.remove('kdeplot1.png') 655 | os.remove('kdeplot2.png') 656 | 657 | if vtype == 'number': 658 | if len(data[name].dropna())==1: 659 | print('the fiele ',name,' of number type must have more than two items.') 660 | continue 661 | chart=plot(data[name],figure_type='mpl',chart_type='kde') 662 | chart['fig'].savefig('kdeplot.png',dpi=200) 663 | chart['fig'].clf() 664 | del chart 665 | chart=plot(data[name],figure_type='pptx',chart_type='bar') 666 | summary='''MIN: {}, MAX: {}, MEAN: {:.2f}, STD: {:.2f}'''\ 667 | .format(data[name].min(),data[name].max(),data[name].mean(),data[name].std()) 668 | footnote='注: 样本N={}'.format(data[name].count()) 669 | slide_data=[{'data':chart['chart_data'],'slide_type':'chart'},{'data':'kdeplot.png','slide_type':'picture'}] 670 | p.add_slide(data=slide_data,title=name+' 的分析',summary=summary,footnote=footnote) 671 | slides_data.append(slide_data) 672 | os.remove('kdeplot.png') 673 | elif vtype == 'category': 674 | tmp=pd.DataFrame(data[name].value_counts()) 675 | tmp=tmp*100/tmp.sum()#转换成百分数 676 | if ('ordered' in v) and v['ordered']: 677 | tmp=pd.DataFrame(tmp,index=v['categories']) 678 | footnote='注: 样本N={}'.format(data[name].count()) 679 | slide_data={'data':tmp,'slide_type':'chart','type':'COLUMN_CLUSTERED'} 680 | summary='{}占比最大为: {:.2f}%'.format(tmp.iloc[:,0].argmax(),tmp.iloc[:,0].max()) 681 | p.add_slide(data=slide_data,title=name+' 的分析',summary=summary,footnote=footnote) 682 | slides_data.append(slide_data) 683 | elif vtype == 'datetime': 684 | if data[name].value_counts().max()==1: 685 | print('the dtype of {} column is datetime, continue...') 686 | continue 687 | tmp=pd.DataFrame(data[name].astype('object').value_counts()) 688 | tmp=tmp*100/tmp.sum()#转换成百分数 689 | tmp=tmp.sort_index()#排序 690 | if ('ordered' in v) and v['ordered']: 691 | tmp=pd.DataFrame(tmp,index=v['categories']) 692 | footnote='注: 样本N={}'.format(data[name].count()) 693 | slide_data={'data':tmp,'slide_type':'chart','type':'COLUMN_CLUSTERED'} 694 | summary='{}占比最大为: {:.2f}%'.format(tmp.iloc[:,0].argmax(),tmp.iloc[:,0].max()) 695 | p.add_slide(data=slide_data,title=name+' 的分析',summary=summary,footnote=footnote) 696 | slides_data.append(slide_data) 697 | elif vtype == 'text': 698 | try: 699 | tmp=','.join(data[name].dropna()) 700 | if len(tmp)>1: 701 | img=genwordcloud(tmp,font_path=font_path) 702 | img.save('tmp.png') 703 | footnote='注: 样本N={}'.format(data[name].count()) 704 | slide_data={'data':'tmp.png','slide_type':'picture'} 705 | p.add_slide(data=slide_data,title=name+' 的词云分析',footnote=footnote) 706 | slides_data.append(slide_data) 707 | os.remove('tmp.png') 708 | except: 709 | print('cannot understand the field: {}'.format(name)) 710 | pass 711 | elif vtype == 'group_number': 712 | tmp=pd.DataFrame(data.loc[:,vlist].mean()) 713 | footnote='注: 样本N={}'.format(data.loc[:,vlist].count().max()) 714 | slide_data={'data':tmp,'slide_type':'chart','type':'COLUMN_CLUSTERED'} 715 | summary='{}占比最大为: {:.2f}%'.format(tmp.iloc[:,0].argmax(),tmp.iloc[:,0].max()) 716 | p.add_slide(data=slide_data,title=name+' 的分析',summary=summary,footnote=footnote) 717 | slides_data.append(slide_data) 718 | elif vtype == 'text_st': 719 | print('The field: {} may be id or need to be designed'.format(name)) 720 | else: 721 | print('unknown type: {}'.format(name)) 722 | 723 | if save_pptx: 724 | p.save(os.path.splitext(filename)[0]+'.pptx') 725 | if return_report: 726 | return p,slides_data 727 | 728 | 729 | 730 | def ClassifierReport(y_true,y_preds,y_probas,img_save=False): 731 | 
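# ---- illustrative usage sketch (editor's note, not part of the library) ----
# AnalysisReport() above is the one-call entry point: it detects variable
# types, adds summary-table slides (10 fields per page), then one slide per
# variable, and saves "<filename>.pptx"; the Excel file below is hypothetical.
#
# >>> import pandas as pd
# >>> df = pd.read_excel('survey.xlsx')                      # doctest: +SKIP
# >>> AnalysisReport(df, filename='survey_analysis')         # doctest: +SKIP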
'''二分类模型评估(后期可能会修改为多分类) 732 | 真实数据和预测数据之间的各种可视化和度量 733 | 734 | parameters: 735 | ----------- 736 | y_true: array_like 真实的标签,binary 737 | y_preds: dict or array_like. 预测的标签,binary,可以用 dict 存储多个模型的预测标签数据 738 | y_probas: dict or array_like. 预测的概率,0-1,可以用 dict 存储多个模型的预测标签数据 739 | img_save:Bool,是否直接将图片保存到本地 740 | 741 | return: 742 | --------- 743 | models_report: 各模型的各种评估数据 744 | conf_matrix: 各模型的混淆矩阵 745 | ''' 746 | 747 | 748 | #from sklearn import metrics 749 | assert type(y_preds) == type(y_probas) 750 | if not(isinstance(y_preds,dict)): 751 | y_preds={'clf':y_preds} 752 | y_probas={'clf':y_probas} 753 | models_report=pd.DataFrame() 754 | conf_matrix={} 755 | fig1,ax1=plt.subplots() 756 | fig2,ax2=plt.subplots() 757 | fig3,ax3=plt.subplots() 758 | for clf in y_preds: 759 | y_pred=y_preds[clf] 760 | y_proba=y_probas[clf] 761 | try: 762 | kl_div_score=entropyc.kl_div(y_proba[y_true==1],y_proba[y_true==0]) 763 | kl_div_score+=entropyc.kl_div(y_proba[y_true==0],y_proba[y_true==1]) 764 | except: 765 | kl_div_score=np.nan 766 | scores = pd.Series({'model' : clf, 767 | 'roc_auc_score' : metrics.roc_auc_score(y_true, y_proba), 768 | 'good_rate': y_true.value_counts()[0]/len(y_true), 769 | 'matthews_corrcoef': metrics.matthews_corrcoef(y_true, y_pred), 770 | 'accuracy_score': metrics.accuracy_score(y_true,y_pred), 771 | 'ks_score': np.nan, 772 | 'precision_score': metrics.precision_score(y_true, y_pred), 773 | 'recall_score': metrics.recall_score(y_true, y_pred), 774 | 'kl_div': kl_div_score, 775 | 'f1_score': metrics.f1_score(y_true, y_pred)}) 776 | models_report=models_report.append(scores,ignore_index = True) 777 | conf_matrix[clf]=pd.crosstab(y_true, y_pred, rownames=['True'], colnames= ['Predicted'], margins=False) 778 | #print('\n{} 模型的混淆矩阵:'.format(clf)) 779 | #print(conf_matrix[clf]) 780 | 781 | # ROC 曲线 782 | fpr, tpr, thresholds=metrics.roc_curve(y_true,y_proba,pos_label=1) 783 | auc_score=metrics.auc(fpr,tpr) 784 | w=tpr-fpr 785 | ks_score=w.max() 786 | models_report.loc[models_report['model']==clf,'ks_score']=ks_score 787 | ks_x=fpr[w.argmax()] 788 | ks_y=tpr[w.argmax()] 789 | #sc=thresholds[w.argmax()] 790 | #fig1,ax1=plt.subplots() 791 | ax1.set_title('ROC Curve') 792 | ax1.set_xlabel('False Positive Rate') 793 | ax1.set_ylabel('True Positive Rate') 794 | ax1.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6)) 795 | ax1.plot([ks_x,ks_x], [ks_x,ks_y], '--', color='red') 796 | ax1.text(ks_x,(ks_x+ks_y)/2,r' $S_c$=%.2f, KS=%.3f'%(thresholds[w.argmax()],ks_score)) 797 | ax1.plot(fpr,tpr,label='{}:AUC={:.5f}'.format(clf,auc_score)) 798 | ax1.legend() 799 | # PR 曲线 800 | precision, recall, thresholds=metrics.precision_recall_curve(y_true,y_proba,pos_label=1) 801 | #fig2,ax2=plt.subplots() 802 | ax2.plot(recall,precision,label=clf) 803 | ax2.set_title('P-R Curve') 804 | ax2.set_xlabel('Recall') 805 | ax2.set_ylabel('Precision') 806 | ax2.legend() 807 | #fig2.show() 808 | #密度函数和KL距离 809 | #fig3,ax3=plt.subplots() 810 | sns.kdeplot(y_proba[y_true==0],ax=ax3,shade=True,label='{}-0'.format(clf)) 811 | sns.kdeplot(y_proba[y_true==1],ax=ax3,shade=True,label='{}-1'.format(clf)) 812 | ax3.set_title('Density Curve') 813 | ax3.legend() 814 | ax3.autoscale() 815 | #fig3.show() 816 | 817 | 818 | if img_save: 819 | fig1.savefig('roc_curve_{}.png'.format(time.strftime('%Y%m%d%H%M', time.localtime())),dpi=400) 820 | fig2.savefig('pr_curve_{}.png'.format(time.strftime('%Y%m%d%H%M', time.localtime())),dpi=400) 821 | fig3.savefig('density_curve_{}.png'.format(time.strftime('%Y%m%d%H%M', 
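# ---- illustrative sketch (editor's note, not part of the library) ----
# The ks_score reported above is computed from the ROC curve as the maximum
# vertical gap between TPR and FPR; in isolation:
#
# >>> from sklearn import metrics
# >>> fpr, tpr, thresholds = metrics.roc_curve(y_true, y_proba, pos_label=1)  # doctest: +SKIP
# >>> ks_score = (tpr - fpr).max()                           # doctest: +SKIP
# >>> cutoff = thresholds[(tpr - fpr).argmax()]   # threshold marked as S_c in the plot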
time.localtime())),dpi=400) 822 | else: 823 | fig1.show() 824 | fig2.show() 825 | fig3.show() 826 | models_report=models_report.set_index('model') 827 | #print('模型的性能评估:') 828 | #print(models_report) 829 | return models_report,conf_matrix 830 | -------------------------------------------------------------------------------- /reportgen/report.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Nov 8 20:05:36 2016 4 | @author: JSong 5 | """ 6 | 7 | import os 8 | import time 9 | 10 | 11 | import pandas as pd 12 | import numpy as np 13 | pd.set_option('display.float_format', lambda x: '%.2f' % x) 14 | 15 | from . import config 16 | from .utils import Delaunay2D 17 | 18 | import matplotlib.image as mpimg 19 | import seaborn as sns 20 | 21 | from pptx import Presentation 22 | from pptx.chart.data import ChartData,XyChartData,BubbleChartData 23 | from pptx.enum.chart import XL_CHART_TYPE 24 | from pptx.util import Inches, Pt, Emu 25 | from pptx.enum.chart import XL_LEGEND_POSITION 26 | #from pptx.enum.chart import XL_LABEL_POSITION 27 | from pptx.dml.color import RGBColor 28 | 29 | _thisdir = os.path.split(__file__)[0] 30 | # default chinese font 31 | from matplotlib.font_manager import FontProperties 32 | font_path=config.font_path 33 | if font_path: 34 | myfont=FontProperties(fname=font_path) 35 | sns.set(font=myfont.get_name()) 36 | 37 | # default template of pptx report 38 | template_pptx=config.template_pptx 39 | 40 | 41 | 42 | __all__=['Report', 43 | 'df_to_table', 44 | 'df_to_chartdata', 45 | 'plot_table', 46 | 'plot_textbox', 47 | 'plot_chart', 48 | 'plot_picture', 49 | 'slides_data_gen', 50 | 'plot_cover', 51 | 'genwordcloud'] 52 | 53 | 54 | 55 | chart_list={\ 56 | "AREA":[1,"ChartData"],\ 57 | "AREA_STACKED":[76,"ChartData"],\ 58 | "AREA_STACKED_100":[77,"ChartData"],\ 59 | "THREE_D_AREA":[-4098,"ChartData"],\ 60 | "THREE_D_AREA_STACKED":[78,"ChartData"],\ 61 | "THREE_D_AREA_STACKED_100":[79,"ChartData"],\ 62 | "BAR_CLUSTERED":[57,"ChartData"],\ 63 | "BAR_TWO_WAY":[57,"ChartData"],\ 64 | "BAR_OF_PIE":[71,"ChartData"],\ 65 | "BAR_STACKED":[58,"ChartData"],\ 66 | "BAR_STACKED_100":[59,"ChartData"],\ 67 | "THREE_D_BAR_CLUSTERED":[60,"ChartData"],\ 68 | "THREE_D_BAR_STACKED":[61,"ChartData"],\ 69 | "THREE_D_BAR_STACKED_100":[62,"ChartData"],\ 70 | "BUBBLE":[15,"BubbleChartData"],\ 71 | "BUBBLE_THREE_D_EFFECT":[87,"BubbleChartData"],\ 72 | "COLUMN_CLUSTERED":[51,"ChartData"],\ 73 | "COLUMN_STACKED":[52,"ChartData"],\ 74 | "COLUMN_STACKED_100":[53,"ChartData"],\ 75 | "THREE_D_COLUMN":[-4100,"ChartData"],\ 76 | "THREE_D_COLUMN_CLUSTERED":[54,"ChartData"],\ 77 | "THREE_D_COLUMN_STACKED":[55,"ChartData"],\ 78 | "THREE_D_COLUMN_STACKED_100":[56,"ChartData"],\ 79 | "CYLINDER_BAR_CLUSTERED":[95,"ChartData"],\ 80 | "CYLINDER_BAR_STACKED":[96,"ChartData"],\ 81 | "CYLINDER_BAR_STACKED_100":[97,"ChartData"],\ 82 | "CYLINDER_COL":[98,"ChartData"],\ 83 | "CYLINDER_COL_CLUSTERED":[92,"ChartData"],\ 84 | "CYLINDER_COL_STACKED":[93,"ChartData"],\ 85 | "CYLINDER_COL_STACKED_100":[94,"ChartData"],\ 86 | "DOUGHNUT":[-4120,"ChartData"],\ 87 | "DOUGHNUT_EXPLODED":[80,"ChartData"],\ 88 | "LINE":[4,"ChartData"],\ 89 | "LINE_MARKERS":[65,"ChartData"],\ 90 | "LINE_MARKERS_STACKED":[66,"ChartData"],\ 91 | "LINE_MARKERS_STACKED_100":[67,"ChartData"],\ 92 | "LINE_STACKED":[63,"ChartData"],\ 93 | "LINE_STACKED_100":[64,"ChartData"],\ 94 | "THREE_D_LINE":[-4101,"ChartData"],\ 95 | "PIE":[5,"ChartData"],\ 96 | 
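# ---- illustrative usage sketch (editor's note, not part of the library) ----
# ClassifierReport() in analysis.py above accepts either single arrays or dicts
# keyed by model name, so several classifiers share the same ROC / P-R /
# density plots; the fitted models below are hypothetical.
#
# >>> report, conf = ClassifierReport(                       # doctest: +SKIP
# ...     y_test,
# ...     {'lr': lr.predict(X_test), 'gbdt': gbdt.predict(X_test)},
# ...     {'lr': lr.predict_proba(X_test)[:, 1],
# ...      'gbdt': gbdt.predict_proba(X_test)[:, 1]})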
"PIE_EXPLODED":[69,"ChartData"],\ 97 | "PIE_OF_PIE":[68,"ChartData"],\ 98 | "THREE_D_PIE":[-4102,"ChartData"],\ 99 | "THREE_D_PIE_EXPLODED":[70,"ChartData"],\ 100 | "PYRAMID_BAR_CLUSTERED":[109,"ChartData"],\ 101 | "PYRAMID_BAR_STACKED":[110,"ChartData"],\ 102 | "PYRAMID_BAR_STACKED_100":[111,"ChartData"],\ 103 | "PYRAMID_COL":[112,"ChartData"],\ 104 | "PYRAMID_COL_CLUSTERED":[106,"ChartData"],\ 105 | "PYRAMID_COL_STACKED":[107,"ChartData"],\ 106 | "PYRAMID_COL_STACKED_100":[108,"ChartData"],\ 107 | "RADAR":[-4151,"ChartData"],\ 108 | "RADAR_FILLED":[82,"ChartData"],\ 109 | "RADAR_MARKERS":[81,"ChartData"],\ 110 | "STOCK_HLC":[88,"ChartData"],\ 111 | "STOCK_OHLC":[89,"ChartData"],\ 112 | "STOCK_VHLC":[90,"ChartData"],\ 113 | "STOCK_VOHLC":[91,"ChartData"],\ 114 | "SURFACE":[83,"ChartData"],\ 115 | "SURFACE_TOP_VIEW":[85,"ChartData"],\ 116 | "SURFACE_TOP_VIEW_WIREFRAME":[86,"ChartData"],\ 117 | "SURFACE_WIREFRAME":[84,"ChartData"],\ 118 | "XY_SCATTER":[-4169,"XyChartData"],\ 119 | "XY_SCATTER_LINES":[74,"XyChartData"],\ 120 | "XY_SCATTER_LINES_NO_MARKERS":[75,"XyChartData"],\ 121 | "XY_SCATTER_SMOOTH":[72,"XyChartData"],\ 122 | "XY_SCATTER_SMOOTH_NO_MARKERS":[73,"XyChartData"]} 123 | 124 | 125 | 126 | 127 | 128 | 129 | def df_to_table(slide,df,left,top,width,height,index_names=False,columns_names=True): 130 | '''将pandas数据框添加到slide上,并生成pptx上的表格 131 | 输入: 132 | slide:PPT的一个页面,由pptx.Presentation().slides.add_slide()给定 133 | df:需要转换的数据框 134 | lef,top: 表格在slide中的位置 135 | width,height: 表格在slide中的大小 136 | index_names: Bool,是否需要显示行类别的名称 137 | columns_names: Bool,是否需要显示列类别的名称 138 | 返回: 139 | 返回带表格的slide 140 | ''' 141 | df=pd.DataFrame(df) 142 | rows, cols = df.shape 143 | res = slide.shapes.add_table(rows+columns_names, cols+index_names, left, top, width, height) 144 | # 固定表格的宽度 145 | ''' 146 | for c in range(cols+rownames): 147 | res.table.columns[c].width = colwidth 148 | res.table.rows[c].width = colwidth 149 | ''' 150 | # Insert the column names 151 | if columns_names: 152 | for col_index, col_name in enumerate(list(df.columns)): 153 | cell=res.table.cell(0,col_index+index_names) 154 | #cell.text_frame.fit_text(max_size=12) 155 | #cell.text_frame.text='%s'%(col_name) 156 | cell.text = '%s'%(col_name) 157 | if index_names: 158 | for col_index, col_name in enumerate(list(df.index)): 159 | cell=res.table.cell(col_index+columns_names,0) 160 | cell.text = '%s'%(col_name) 161 | #cell.text_frame.fit_text(max_size=12) 162 | m = df.as_matrix() 163 | for row in range(rows): 164 | for col in range(cols): 165 | cell=res.table.cell(row+columns_names, col+index_names) 166 | if isinstance(m[row, col],float): 167 | cell.text = '%.2f'%(m[row, col]) 168 | else: 169 | cell.text = '%s'%(m[row, col]) 170 | #cell.text_frame.fit_text(max_size=12) 171 | 172 | 173 | def df_to_chartdata(df,datatype,number_format=None): 174 | ''' 175 | 根据给定的图表数据类型生成相应的数据 176 | Chartdata:一般的数据 177 | XyChartData: 散点图数据 178 | BubbleChartData:气泡图数据 179 | ''' 180 | if isinstance(df,pd.Series): 181 | df=pd.DataFrame(df) 182 | df.fillna(0,inplace=True) 183 | datatype=datatype.lower() 184 | if datatype == 'chartdata': 185 | chart_data = ChartData() 186 | chart_data.categories = ['%s'%(c) for c in list(df.index)] 187 | for col_name in df.columns: 188 | chart_data.add_series('%s'%(col_name),list(df[col_name]),number_format) 189 | return chart_data 190 | if datatype == 'xychartdata': 191 | chart_data=XyChartData() 192 | if not isinstance(df,list): 193 | df=[df] 194 | for d in df: 195 | series_name='%s'%(d.columns[0])+' vs '+'%s'%(d.columns[1]) 196 | 
series_ = chart_data.add_series(series_name) 197 | for i in range(len(d)): 198 | series_.add_data_point(d.iloc[i,0], d.iloc[i,1]) 199 | return chart_data 200 | if datatype == 'bubblechartdata': 201 | chart_data=BubbleChartData() 202 | if not isinstance(df,list): 203 | df=[df] 204 | for d in df: 205 | series_name='%s'%(d.columns[0])+' vs '+'%s'%(d.columns[1]) 206 | series_ = chart_data.add_series(series_name) 207 | for i in range(len(d)): 208 | series_.add_data_point(d.iloc[i,0],d.iloc[i,1],d.iloc[i,2]) 209 | return chart_data 210 | 211 | 212 | 213 | def plot_table(prs,df,layouts=[0,5],title=u'我是标题',summary=u'我是简短的结论',footnote=''): 214 | '''根据给定的数据,在给定的prs上新增一页表格ppt 215 | 输入: 216 | prs: PPT文件接口 217 | df: 数据框 218 | layouts: [0]为PPT母版顺序,[1]为母版内的版式顺序 219 | 输出: 220 | 更新后的prs 221 | ''' 222 | df=pd.DataFrame(df) 223 | slide_width=prs.slide_width 224 | slide_height=prs.slide_height 225 | # 可能需要修改以适应更多的情形 226 | title_only_slide = prs.slide_masters[layouts[0]].slide_layouts[layouts[1]] 227 | slide = prs.slides.add_slide(title_only_slide) 228 | #title=u'这里是标题' 229 | slide.shapes.title.text = title 230 | left,top = Emu(0.05*slide_width), Emu(0.10*slide_height) 231 | width,height = Emu(0.7*slide_width), Emu(0.1*slide_height) 232 | txBox = slide.shapes.add_textbox(left, top, width, height) 233 | #summary=u'这里是一些简短的结论' 234 | txBox.text_frame.text=summary 235 | # 绘制表格 236 | '''添加自适应的表格大小 237 | 默认最大12*6,width=0.80,height=0.70 238 | left=0.1,top=0.25 239 | ''' 240 | R,C=df.shape 241 | width=max(0.5,min(1,C/6.0))*0.80 242 | height=max(0.5,min(1,R/12.0))*0.70 243 | left=0.5-width/2 244 | top=0.25 245 | left=Emu(left*slide_width) 246 | top=Emu(top*slide_height) 247 | width=Emu(width*slide_width) 248 | height=Emu(height*slide_height) 249 | df_to_table(slide,df,left,top,width,height,index_names=True) 250 | 251 | # 添加脚注 footnote=u'这里是脚注' 252 | if footnote: 253 | left,top = Emu(0.025*slide_width), Emu(0.95*slide_height) 254 | width,height = Emu(0.70*slide_width), Emu(0.10*slide_height) 255 | txBox = slide.shapes.add_textbox(left, top, width, height) 256 | #p = text_frame.paragraphs[0] 257 | p=txBox.text_frame.paragraphs[0] 258 | p.text=footnote 259 | p.font.size = Pt(10) 260 | p.font.language_id = 3076 261 | p.font.name='Microsoft YaHei UI' 262 | p.font.color.rgb=RGBColor(127,127,127) 263 | try: 264 | txBox.text_frame.fit_text(max_size=10) 265 | except: 266 | pass 267 | #print('cannot fit the size of font') 268 | return prs 269 | 270 | 271 | def plot_textbox(prs,texts,title=u'我是文本框页标题',summary=u'我是内容',footnote='',layouts=[0,0]): 272 | ''' 273 | 只绘制一个文本框,用于目录、小结等 274 | ''' 275 | slide_width=prs.slide_width 276 | slide_height=prs.slide_height 277 | # 可能需要修改以适应更多的情形 278 | title_only_slide = prs.slide_masters[layouts[0]].slide_layouts[layouts[1]] 279 | slide = prs.slides.add_slide(title_only_slide) 280 | #title=u'这里是标题' 281 | slide.shapes.title.text = title 282 | # 绘制副标题 283 | if summary: 284 | left,top = Emu(0.15*slide_width), Emu(0.10*slide_height) 285 | width,height = Emu(0.7*slide_width), Emu(0.1*slide_height) 286 | txBox = slide.shapes.add_textbox(left, top, width, height) 287 | txBox.text_frame.text=summary 288 | # 绘制主体 289 | left,top = Emu(0.15*slide_width), Emu(0.20*slide_height) 290 | width,height = Emu(0.7*slide_width), Emu(0.7*slide_height) 291 | txBox = slide.shapes.add_textbox(left, top, width, height) 292 | txBox.text_frame.text=texts 293 | 294 | # 添加脚注 footnote=u'这里是脚注' 295 | if footnote: 296 | left,top = Emu(0.025*slide_width), Emu(0.95*slide_height) 297 | width,height = Emu(0.70*slide_width), 
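# ---- illustrative sketch (editor's note, not part of the library) ----
# df_to_chartdata() above turns a DataFrame into the python-pptx data object
# matching the chart family: ChartData for category charts, XyChartData for
# scatter, BubbleChartData for bubbles. Roughly what the 'chartdata' branch does:
#
# >>> from pptx.chart.data import ChartData
# >>> cd = ChartData()
# >>> cd.categories = ['Q1', 'Q2', 'Q3']
# >>> cd.add_series('share', [35.2, 40.1, 24.7], '0.0"%"')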
Emu(0.10*slide_height) 298 | txBox = slide.shapes.add_textbox(left, top, width, height) 299 | #p = text_frame.paragraphs[0] 300 | p=txBox.text_frame.paragraphs[0] 301 | p.text=footnote 302 | p.font.size = Pt(10) 303 | p.font.language_id = 3076 304 | p.font.name='Microsoft YaHei UI' 305 | p.font.color.rgb=RGBColor(127,127,127) 306 | try: 307 | txBox.text_frame.fit_text(max_size=10) 308 | except: 309 | pass 310 | #print('cannot fit the size of font') 311 | return prs 312 | 313 | def plot_picture(prs,img_path,layouts=[0,0],title=u'我是文本框页标题',summary='',\ 314 | footnote=''): 315 | ''' 316 | 只插入一张图片,用于目录、小结等 317 | ''' 318 | slide_width=prs.slide_width 319 | slide_height=prs.slide_height 320 | # 可能需要修改以适应更多的情形 321 | title_only_slide = prs.slide_masters[layouts[0]].slide_layouts[layouts[1]] 322 | slide = prs.slides.add_slide(title_only_slide) 323 | #title=u'这里是标题' 324 | slide.shapes.title.text = title 325 | if summary: 326 | left,top = Emu(0.05*slide_width), Emu(0.10*slide_height) 327 | width,height = Emu(0.7*slide_width), Emu(0.1*slide_height) 328 | txBox = slide.shapes.add_textbox(left, top, width, height) 329 | txBox.text_frame.text=summary 330 | left,top = Emu(0.15*slide_width), Emu(0.2*slide_height) 331 | height=Emu(0.7*slide_height) 332 | slide.shapes.add_picture(img_path, left, top, height=height) 333 | # 添加脚注 footnote=u'这里是脚注' 334 | if footnote: 335 | left,top = Emu(0.025*slide_width), Emu(0.95*slide_height) 336 | width,height = Emu(0.70*slide_width), Emu(0.10*slide_height) 337 | txBox = slide.shapes.add_textbox(left, top, width, height) 338 | #p = text_frame.paragraphs[0] 339 | p=txBox.text_frame.paragraphs[0] 340 | p.text=footnote 341 | p.font.size = Pt(10) 342 | p.font.language_id = 3076 343 | p.font.name='Microsoft YaHei UI' 344 | p.font.color.rgb=RGBColor(127,127,127) 345 | try: 346 | txBox.text_frame.fit_text(max_size=10) 347 | except: 348 | pass 349 | #print('cannot fit the size of font') 350 | return prs 351 | 352 | 353 | 354 | def plot_chart(prs,df,chart_type,title=u'我是标题',summary=u'我是简短的结论',\ 355 | footnote=None,chart_format=None,layouts=[0,0],has_data_labels=True): 356 | ''' 357 | 直接将数据绘制到一张ppt上,且高度定制化 358 | 默认都有图例,且图例在下方 359 | 默认都有数据标签 360 | ''' 361 | 362 | slide_width=prs.slide_width 363 | slide_height=prs.slide_height 364 | # 可能需要修改以适应更多的情形 365 | # layouts[0]代表第几个母版,layouts[1]代表母版中的第几个版式 366 | title_only_slide = prs.slide_masters[layouts[0]].slide_layouts[layouts[1]] 367 | slide = prs.slides.add_slide(title_only_slide) 368 | # 添加标题 title=u'这里是标题' 369 | try: 370 | slide.shapes.title.text = title 371 | except: 372 | print('请检查模板,脚本没有找到合适的slide') 373 | return 374 | # 添加结论 summary=u'这里是一些简短的结论' 375 | #summary_loc=[0.10,0.14,0.80,0.15] 376 | left,top = Emu(config.summary_loc[0]*slide_width), Emu(config.summary_loc[1]*slide_height) 377 | width,height = Emu(config.summary_loc[2]*slide_width), Emu(config.summary_loc[3]*slide_height) 378 | txBox = slide.shapes.add_textbox(left, top, width, height) 379 | txBox.text_frame.text=summary 380 | txBox.text_frame.paragraphs[0].font.language_id = 3076 381 | try: 382 | txBox.text_frame.fit_text(max_size=12) 383 | except: 384 | pass 385 | #print('cannot fit the size of font') 386 | 387 | 388 | # 添加脚注 footnote=u'这里是脚注' 389 | if footnote: 390 | left,top = Emu(0.025*slide_width), Emu(0.95*slide_height) 391 | width,height = Emu(0.70*slide_width), Emu(0.10*slide_height) 392 | txBox = slide.shapes.add_textbox(left, top, width, height) 393 | #p = text_frame.paragraphs[0] 394 | p=txBox.text_frame.paragraphs[0] 395 | p.text=footnote 396 | p.font.size = 
Pt(10) 397 | p.font.language_id = 3076 398 | p.font.name='Microsoft YaHei UI' 399 | p.font.color.rgb=RGBColor(127,127,127) 400 | try: 401 | txBox.text_frame.fit_text(max_size=10) 402 | except: 403 | pass 404 | #print('cannot fit the size of font') 405 | 406 | 407 | # 插入图表 408 | chart_type_code=chart_list[chart_type][1] 409 | chart_data=df_to_chartdata(df,chart_type_code) 410 | #left, top = Emu(0.05*slide_width), Emu(0.20*slide_height) 411 | #width, height = Emu(0.85*slide_width), Emu(0.70*slide_height) 412 | #chart_loc=[0.10,0.30,0.80,0.60] 413 | left, top = Emu(config.chart_loc[0]*slide_width), Emu(config.chart_loc[1]*slide_height) 414 | width, height = Emu(config.chart_loc[2]*slide_width), Emu(config.chart_loc[3]*slide_height) 415 | 416 | chart=slide.shapes.add_chart(chart_list[chart_type.upper()][0], \ 417 | left, top, width, height, chart_data).chart 418 | 419 | if chart_type_code in [-4169,72,73,74,75]: 420 | return 421 | 422 | font_default_size=Pt(10) 423 | # 添加图例 424 | if (df.shape[1]>1) or (chart_type=='PIE'): 425 | chart.has_legend = True 426 | chart.legend.font.size=font_default_size 427 | chart.legend.position = XL_LEGEND_POSITION.BOTTOM 428 | chart.legend.include_in_layout = False 429 | 430 | try: 431 | chart.category_axis.tick_labels.font.size=font_default_size 432 | except: 433 | pass#暂时不知道怎么处理 434 | try: 435 | chart.value_axis.tick_labels.font.size=font_default_size 436 | except: 437 | pass 438 | # 添加数据标签 439 | 440 | non_available_list=['BUBBLE','BUBBLE_THREE_D_EFFECT','XY_SCATTER',\ 441 | 'XY_SCATTER_LINES','PIE'] 442 | # 大致检测是否采用百分比 443 | # 1、单选题每列的和肯定是100,顶多相差+-5 444 | # 2、多选题每一列的和大于100,但单个的小于100.此处可能会有误判,但暂时无解 445 | # 3、可能会有某一列全为0,此时单独考虑 446 | if ((df.sum()[df.sum()!=0]>90).all()) and ((df<=100).all().all()) and (u'总体' not in df.index): 447 | # 数据条的数据标签格式 448 | #number_format1='0.0"%"' 449 | number_format1=config.number_format_data 450 | # 坐标轴的数据标签格式 451 | #number_format2='0"%"' 452 | number_format2=config.number_format_tick 453 | else: 454 | number_format1='0.00' 455 | number_format2='0.0' 456 | 457 | if (chart_type not in non_available_list) or (chart_type == 'PIE'): 458 | plot = chart.plots[0] 459 | plot.has_data_labels = True 460 | plot.data_labels.font.size = font_default_size 461 | plot.data_labels.number_format = number_format1 462 | #plot.data_labels.number_format_is_linked=True 463 | #data_labels = plot.data_labels 464 | #plot.data_labels.position = XL_LABEL_POSITION.BEST_FIT 465 | if (chart_type not in non_available_list): 466 | #chart.value_axis.maximum_scale = 1 467 | if df.shape[1]==1: 468 | chart.value_axis.has_major_gridlines = False 469 | else: 470 | chart.value_axis.has_major_gridlines = True 471 | tick_labels = chart.value_axis.tick_labels 472 | tick_labels.number_format = number_format2 473 | tick_labels.font.size = font_default_size 474 | 475 | # 修改纵坐标格式 476 | ''' 477 | tick_labels = chart.value_axis.tick_labels 478 | tick_labels.number_format = '0"%"' 479 | tick_labels.font.bold = True 480 | tick_labels.font.size = Pt(10) 481 | ''' 482 | 483 | # 填充系列的颜色 484 | ''' 最好的方法还是修改母版文件中的主题颜色,这里只提供方法 485 | if df.shape[1]==1: 486 | chart.series[0].fill() 487 | ''' 488 | 489 | # 自定义format 490 | if chart_format: 491 | for k in chart_format: 492 | exec('chart.'+k+'='+'%s'%(chart_format[k])) 493 | 494 | return prs 495 | 496 | ''' 497 | if chart_type == 'BAR_TWO_WAY': 498 | chart 499 | ''' 500 | 501 | 502 | def plot_cover(prs,title=u'reportgen工具包封面',layouts=[0,0],xspace=8,yspace=6): 503 | 504 | slide_width=prs.slide_width 505 | slide_height=prs.slide_height 506 | # 
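# ---- illustrative sketch (editor's note, not part of the library) ----
# plot_chart() above guesses whether the data are percentages: every non-zero
# column must sum to roughly 100 (greater than 90) and no cell may exceed 100;
# only then are the percent number formats from config applied. The check:
#
# >>> import pandas as pd
# >>> df = pd.DataFrame({'share': [55.0, 30.0, 15.0]})
# >>> bool((df.sum()[df.sum() != 0] > 90).all() and (df <= 100).all().all())
# True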
可能需要修改以适应更多的情形 507 | title_only_slide = prs.slide_masters[layouts[0]].slide_layouts[layouts[1]] 508 | slide = prs.slides.add_slide(title_only_slide) 509 | 510 | ## 随机生成连接点 511 | seeds=np.round(np.dot(np.random.rand((xspace-1)*(yspace-1),2),np.diag([slide_width,slide_height]))) 512 | # 添加左边点 513 | tmp=np.linspace(0,slide_height,yspace) 514 | seeds=np.concatenate((seeds,np.array([[0]*len(tmp),tmp]).T)) 515 | # 添加上边点 516 | tmp=np.linspace(0,slide_width,xspace)[1:] 517 | seeds=np.concatenate((seeds,np.array([tmp,[0]*len(tmp)]).T)) 518 | # 添加右边点 519 | tmp=np.linspace(0,slide_height,yspace)[1:] 520 | seeds=np.concatenate((seeds,np.array([[slide_width]*len(tmp),tmp]).T)) 521 | # 添加下边点 522 | tmp=np.linspace(0,slide_width,xspace)[1:-1] 523 | seeds=np.concatenate((seeds,np.array([tmp,[slide_height]*len(tmp)]).T)) 524 | 525 | # 构造三角剖分,生成相应的三角形和平面图数据 526 | center = np.mean(seeds, axis=0) 527 | t=np.sqrt(slide_width**2+slide_height**2)/2 528 | dt = Delaunay2D(center, 2**(np.floor(np.log2(t))+1)) 529 | for s in seeds: 530 | dt.AddPoint(s) 531 | tri=dt.exportTriangles() 532 | graph=np.zeros((len(seeds),len(seeds))) 533 | for t in tri: 534 | graph[t[0],t[1]]=1 535 | graph[t[1],t[2]]=1 536 | graph[t[0],t[2]]=1 537 | graph[t[1],t[0]]=1 538 | graph[t[2],t[1]]=1 539 | graph[t[2],t[1]]=1 540 | 541 | 542 | from pptx.enum.shapes import MSO_CONNECTOR 543 | from pptx.enum.shapes import MSO_SHAPE 544 | shapes = slide.shapes 545 | # 添加连接线 546 | for i in range(len(seeds)): 547 | for j in range(len(seeds)): 548 | if (i1: 713 | tmp[i,j]=255 714 | mask=np.zeros((900,1200,4),dtype=np.uint8) 715 | mask[:,:,0]=tmp 716 | mask[:,:,1]=tmp 717 | mask[:,:,2]=tmp 718 | mask[:,:,3]=255 719 | else: 720 | mask=np.array(Image.open(mask)) 721 | wordcloud = WordCloud(background_color = background_color,font_path=font_path, mask = mask) 722 | wordcloud.generate(texts) 723 | img=wordcloud.to_image() 724 | return img 725 | 726 | 727 | 728 | 729 | class Report(): 730 | ''' 731 | 底层的类,负责一个 pptx 报告的相关接口 732 | parameters: 733 | ----------- 734 | filename: pptx 文件路径,若无则新建一个文件 735 | chart_type_default: 默认的图表类型 736 | layouts_default: 新建slide时默认使用的 pptx 模板 737 | title: 报告的名称 738 | author: 报告的作者 739 | 740 | example: 741 | --------- 742 | >>>r=Report(filename='') 743 | >>>r.add_cover(title='reportgen') 744 | >>>r.add_slides([]) 745 | >>>r.save() 746 | ''' 747 | def __init__(self,filename=None,chart_type_default='COLUMN_CLUSTERED',**kwargs): 748 | self.title=None 749 | self.author=None 750 | # self.filename = filename #导入一个存在的pptx文件 751 | self.chart_type_default=chart_type_default 752 | if filename is None: 753 | if os.path.exists('template.pptx'): 754 | prs=Presentation('template.pptx') 755 | elif template_pptx is not None: 756 | prs=Presentation(template_pptx) 757 | else: 758 | prs=Presentation() 759 | else : 760 | # 分离出路径中的文件名 761 | self.title=os.path.splitext(os.path.split(filename)[1])[0] 762 | prs=Presentation(filename) 763 | self.prs=prs 764 | title_only_slide=self._layouts() 765 | if title_only_slide: 766 | layouts=title_only_slide[0] 767 | else: 768 | layouts=[0,0] 769 | self.layouts_default=layouts 770 | for k in kwargs: 771 | setattr(self,k.lower(),kwargs[k]) 772 | 773 | 774 | def _layouts(self): 775 | '''给定pptx文件,自动识别标题等版式 776 | ''' 777 | slide_width=self.prs.slide_width 778 | slide_height=self.prs.slide_height 779 | title_only_slide=[] 780 | #blank_slide=[] 781 | for i in range(len(self.prs.slide_masters)): 782 | slides=self.prs.slide_masters[i] 783 | #print('第{}个有{}个版式'.format(i,len(slides.slide_layouts))) 784 | for j in 
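# ---- illustrative usage sketch (editor's note, not part of the library) ----
# Report() above resolves its pptx template in order: a 'template.pptx' in the
# working directory, then the packaged template from config, then a blank
# python-pptx Presentation; _layouts() below scans the masters for a
# "title only" layout to use as the default slide layout.
#
# >>> r = Report()                        # blank or packaged template
# >>> r = Report('last_quarter.pptx')     # hypothetical existing deck; its name becomes r.title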
range(len(slides.slide_layouts)): 785 | slide=slides.slide_layouts[j] 786 | title_slide=0 787 | placeholder_size=0 788 | for k in range(len(slide.shapes)): 789 | shape=slide.shapes[k] 790 | if shape.is_placeholder and shape.has_text_frame: 791 | left,top=shape.left/slide_width,shape.top/slide_height 792 | height=shape.height/slide_height 793 | if left<1 and top<1 and height<1 and left>0 and top>0 and height>0: 794 | placeholder_size+=1 795 | #print('left={:.2f},top={:.2f},height={:.2f}'.format(left,top,height)) 796 | if left<0.15 and top<0.15 and height <0.25: 797 | title_slide+=1 798 | #print('{}个文本占位符,{}个title'.format(placeholder_size,title_slide)) 799 | if placeholder_size==1 and title_slide==1: 800 | title_only_slide.append([i,j]) 801 | #if placeholder_size==0: 802 | #blank_slide.append((i,j))s 803 | return title_only_slide 804 | 805 | 806 | 807 | def get_texts(self): 808 | # one for each text run in presentation 809 | text_runs = [] 810 | 811 | for slide in self.prs.slides: 812 | for shape in slide.shapes: 813 | if not shape.has_text_frame: 814 | continue 815 | for paragraph in shape.text_frame.paragraphs: 816 | for run in paragraph.runs: 817 | text_runs.append(run.text) 818 | return text_runs 819 | 820 | def get_images(self): 821 | try: 822 | from PIL import Image as PIL_Image 823 | from io import BytesIO 824 | except: 825 | print('please install the PIL.') 826 | return 827 | if not os.path.exists('.\\images'): 828 | os.mkdir('.\\images') 829 | n_images=0 830 | for slide in self.prs.slides: 831 | for shape in slide.shapes: 832 | if 'Image' in str(type(shape)) or 'Picture' in str(type(shape)): 833 | n_images+=1 834 | shape_image=shape.image 835 | #filename='.\\images\\'+shape_image.filename 836 | #r=str(np.random.randint(99)).zfill(2) 837 | filename='.\\images\\image%d'%n_images+'.'+shape_image.ext 838 | p = PIL_Image.open(BytesIO(shape_image.blob)) 839 | p.save(filename) 840 | #print('save {}'.format(shape_image.filename)) 841 | 842 | 843 | 844 | 845 | def add_slides(self,slides_data,chart_type_default=None): 846 | '''!使用的接口和下方的add_slide不一样,建议使用add_slide 847 | slides_data: 每一页ppt所需要的元素[ 848 | {title:,#标题 849 | summary:,#结论 850 | data:,# DataFrame数据、文本数据、图片地址等 851 | slide_type:,#chart、table、text 852 | chart_type:图表类型 853 | data_config:,#字典格式,绘制data其他所需要的相关参数,保留字段,暂时不用 854 | footnote:,#脚注 855 | layouts:#该slide使用的ppt版式 856 | },] 857 | ''' 858 | if chart_type_default is None: 859 | chart_type_default=self.chart_type_default 860 | slides_data=slides_data_gen(slides_data,chart_type_default) 861 | for slide in slides_data: 862 | slide_type=slide['slide_type'] 863 | title=slide['title'] 864 | summary=slide['summary'] 865 | footnote=slide['footnote'] 866 | layouts=self.layouts_default if slide['layouts'] == 'auto' else slide['layouts'] 867 | data=slide['data'] 868 | chart_type=slide['chart_type'] if 'chart_type' in slide else None 869 | #data_config=slide['data_config']#暂时没有用该参数 870 | if (slide_type is None) or (not isinstance(slide_type,str)): 871 | continue 872 | if slide_type == 'chart': 873 | self.prs=plot_chart(self.prs,data,chart_type=chart_type,layouts=layouts,\ 874 | title=title,summary=summary,footnote=footnote); 875 | elif slide_type == 'table': 876 | self.prs=plot_table(self.prs,data,layouts=layouts,title=title,summary=summary,\ 877 | footnote=footnote); 878 | elif slide_type in ['textbox','text']: 879 | self.prs=plot_textbox(self.prs,data,layouts=layouts,title=title,summary=summary,\ 880 | footnote=footnote); 881 | elif slide_type in ['picture','figure']: 882 | 
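# ---- illustrative sketch (editor's note, not part of the library) ----
# add_slides() above consumes a list of per-slide dicts (title, summary, data,
# slide_type, chart_type, footnote, layouts) and dispatches to plot_chart /
# plot_table / plot_textbox / plot_picture; fields left out appear to be filled
# in by slides_data_gen(). A minimal hypothetical list:
#
# >>> slides = [
# ...     {'title': 'Gender share', 'data': gender_df,
# ...      'slide_type': 'chart', 'chart_type': 'PIE'},
# ...     {'title': 'Notes', 'data': 'Sample collected in 2017.',
# ...      'slide_type': 'text'},
# ... ]
# >>> r.add_slides(slides)                                   # doctest: +SKIP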
self.prs=plot_picture(self.prs,data,layouts=layouts,title=title,summary=summary,\ 883 | footnote=footnote); 884 | 885 | 886 | def add_cover(self,title='',author='',style='default',layouts='auto',size=[8,6]): 887 | if len(title) == 0: 888 | title = 'Analysis Report Powered by reportgen' if self.title is None else self.title 889 | if len(author) == 0: 890 | author='' if self.author is None else self.author 891 | title=title+'\n作者: '+author if len(author)>0 else title 892 | layouts=self.layouts_default if layouts == 'auto' else layouts 893 | if style == 'default': 894 | self.prs=plot_cover(self.prs,title=title,layouts=layouts,xspace=size[0],yspace=size[1]); 895 | 896 | 897 | 898 | def location_suggest(self,num=1,rate=0.78,data=None,summary=None): 899 | '''统一管理slides各个模块的位置 900 | parameter 901 | -------- 902 | num: 主体内容(如图、外链图片、文本框等)的个数,默认从左到右依次排列 903 | rate: 主体内容的宽度综合 904 | data: list,通过数据类型智能判断位置,如有,则 num 失效 905 | summary:如果summary为空,则非图表等位置都会上移动 906 | 907 | return 908 | ----- 909 | locations: dict格式. l代表left,t代表top,w代表width,h代表height 910 | ''' 911 | slide_width,slide_height=self.prs.slide_width,self.prs.slide_height 912 | if 'summary_loc' in config.__dict__: 913 | summary_loc=config.summary_loc 914 | else: 915 | summary_loc=[0.10,0.14,0.80,0.15] 916 | 917 | if 'footnote_loc' in config.__dict__: 918 | footnote_loc=config.footnote_loc 919 | else: 920 | footnote_loc=[0.025,0.95,0.70,0.06] 921 | 922 | if 'data_loc' in config.__dict__: 923 | data_loc=config.data_loc 924 | else: 925 | data_loc=[0.11,0.30,0.78,0.60] 926 | 927 | num=len(data) if isinstance(data,list) else num 928 | locations={} 929 | locations['summary']={'l':Emu(summary_loc[0]*slide_width),'t':Emu(summary_loc[1]*slide_height),\ 930 | 'w':Emu(summary_loc[2]*slide_width),'h':Emu(summary_loc[3]*slide_height)} 931 | 932 | locations['footnote']={'l':Emu(footnote_loc[0]*slide_width),'t':Emu(footnote_loc[1]*slide_height),\ 933 | 'w':Emu(footnote_loc[2]*slide_width),'h':Emu(footnote_loc[3]*slide_height)} 934 | # 主体部分只有一个的情形 935 | ''' 936 | 控制主体的宽度为78%,且居中显示。 937 | ''' 938 | if (summary is not None) and len(summary)==0: 939 | data_loc[1]=data_loc[1]*0.84 940 | if num>1: 941 | left=[(1-rate)*(i+1)/(float(num)+1)+rate*i/float(num) for i in range(num)] 942 | top=[data_loc[1]]*num 943 | width=[rate/float(num)]*num 944 | height=[data_loc[3]]*num 945 | locations['data']=[{'l':Emu(left[i]*slide_width),'t':Emu(top[i]*slide_height),\ 946 | 'w':Emu(width[i]*slide_width),'h':Emu(height[i]*slide_height)} for i in range(num)] 947 | else: 948 | # 暂时只修正单张图片常常不居中的问题,后期会修正多张图片 949 | if data[0]['slide_type'] == 'picture': 950 | imgdata=mpimg.imread(data[0]['data']) 951 | img_height,img_width=imgdata.shape[:2] 952 | img_width_in_pptx=data_loc[3]*slide_height*img_width/img_height/slide_width 953 | data_loc[0]=0.5-img_width_in_pptx/2 954 | 955 | locations['data']=[{'l':Emu(data_loc[0]*slide_width),'t':Emu(data_loc[1]*slide_height),\ 956 | 'w':Emu(data_loc[2]*slide_width),'h':Emu(data_loc[3]*slide_height)}] 957 | 958 | return locations 959 | 960 | def add_slide(self,data=[],title='',summary='',footnote='',layouts='auto',**kwarg): 961 | '''通用函数,添加一页幻灯片 962 | parameter 963 | --------- 964 | data=[{'data':,'slide_type':,'type':,},] # 三个是必须字段,其他根据slide_type不同而不同 965 | title: 标题 966 | summary: 小结论 967 | footnote: 脚注 968 | layouts: 使用的母版样式 969 | legend: bool,是否画网格线 970 | data_labels: bool,是否画数据标签 971 | number_format_data: 图的数据标签格式 972 | number_format_tick: 横纵坐标的数据标签格式 973 | ''' 974 | #slide_width=self.prs.slide_width 975 | #slide_height=self.prs.slide_height 976 | 
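# ---- illustrative sketch (editor's note, not part of the library) ----
# location_suggest() above positions `num` content blocks side by side: the
# blocks share a total width of `rate` (78% of the slide by default) and the
# remaining width is split evenly into gaps. For two blocks this gives:
#
# >>> num, rate = 2, 0.78
# >>> left = [(1 - rate) * (i + 1) / (num + 1) + rate * i / num for i in range(num)]
# >>> width = [rate / num] * num
# >>> [round(l, 3) for l in left], width
# ([0.073, 0.537], [0.39, 0.39])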
977 | # 标准化data格式 978 | if not(isinstance(data,list)): 979 | data=[data] 980 | for i,d in enumerate(data): 981 | if not(isinstance(d,dict)): 982 | if isinstance(d,(pd.core.frame.DataFrame,pd.core.frame.Series)): 983 | slide_type='chart' 984 | chart_type=self.chart_type_default 985 | d=pd.DataFrame(d) 986 | elif isinstance(d,str) and os.path.exists(d): 987 | slide_type='picture' 988 | chart_type='' 989 | elif isinstance(d,str) and not(os.path.exists(d)): 990 | slide_type='textbox' 991 | chart_type='' 992 | else: 993 | print('未知的数据格式,请检查数据') 994 | slide_type='' 995 | chart_type='' 996 | data[i]={'data':d,'slide_type':slide_type,'type':chart_type} 997 | 998 | # 各个模板的位置 999 | locations=self.location_suggest(data=data,summary=summary) 1000 | summary_loc=locations['summary'] 1001 | footnote_loc=locations['footnote'] 1002 | data_loc=locations['data'] 1003 | 1004 | # 选取的板式 1005 | if layouts == 'auto': 1006 | layouts=self.layouts_default 1007 | title_only_slide = self.prs.slide_masters[layouts[0]].slide_layouts[layouts[1]] 1008 | slide = self.prs.slides.add_slide(title_only_slide) 1009 | 1010 | #输出标题 1011 | slide.shapes.title.text = title 1012 | 1013 | # 输出副标题 summary 1014 | if summary: 1015 | txBox = slide.shapes.add_textbox(summary_loc['l'], summary_loc['t'], summary_loc['w'], summary_loc['h']) 1016 | txBox.text_frame.text=summary 1017 | txBox.text_frame.paragraphs[0].font.language_id = 3076 1018 | try: 1019 | txBox.text_frame.fit_text(max_size=12) 1020 | except: 1021 | pass 1022 | 1023 | 1024 | # 输出脚注 footnote 1025 | if footnote: 1026 | txBox = slide.shapes.add_textbox(footnote_loc['l'], footnote_loc['t'], footnote_loc['w'], footnote_loc['h']) 1027 | #p = text_frame.paragraphs[0] 1028 | p=txBox.text_frame.paragraphs[0] 1029 | p.text=footnote 1030 | p.font.size = Pt(10) 1031 | p.font.language_id = 3076 1032 | p.font.name='Microsoft YaHei UI' 1033 | p.font.color.rgb=RGBColor(127,127,127) 1034 | try: 1035 | txBox.text_frame.fit_text(max_size=10) 1036 | except: 1037 | pass 1038 | #print('cannot fit the size of font') 1039 | # 绘制主体部分 1040 | for i,dd in enumerate(data): 1041 | slide_type=dd['slide_type'] 1042 | left,top=data_loc[i]['l'],data_loc[i]['t'] 1043 | width,height=data_loc[i]['w'],data_loc[i]['h'] 1044 | chart_type=dd['type'] if 'type' in dd else self.chart_type_default 1045 | if slide_type in ['table']: 1046 | # 绘制表格 1047 | '''针对表格大小修正 1048 | R,C=dd['data'].shape 1049 | width=max(0.5,min(1,C/6.0))*width 1050 | height=max(0.5,min(1,R/12.0))*height 1051 | left=0.5-width/2 1052 | top=0.25 1053 | ''' 1054 | df_to_table(slide,dd['data'],left,top,width,height,index_names=True) 1055 | elif slide_type in ['textbox']: 1056 | # 输出文本框 1057 | txBox = slide.shapes.add_textbox(left, top, width, height) 1058 | txBox.text_frame.text=dd['data'] 1059 | txBox.text_frame.paragraphs[0].font.language_id = 3076 1060 | try: 1061 | txBox.text_frame.fit_text(max_size=12) 1062 | except: 1063 | pass 1064 | elif slide_type in ['picture','figure']: 1065 | slide.shapes.add_picture(dd['data'], left, top, height=height) 1066 | elif slide_type in ['chart']: 1067 | # 插入图表 1068 | chart_type_code=chart_list[chart_type][1] 1069 | if 'pptx.chart.data.ChartData' in str(type(dd['data'])): 1070 | chart_data=dd['data'] 1071 | else: 1072 | chart_data=df_to_chartdata(dd['data'],chart_type_code) 1073 | chart=slide.shapes.add_chart(chart_list[chart_type.upper()][0],left, top, width, height, chart_data).chart 1074 | 1075 | if chart_type_code in [-4169,72,73,74,75]: 1076 | continue 1077 | font_default_size=Pt(10) if 'font_default_size' not 
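# ---- illustrative sketch (editor's note, not part of the library) ----
# add_slide() above normalises each bare item in `data` before drawing it: a
# DataFrame/Series becomes a chart, an existing file path becomes a picture,
# any other string becomes a textbox; explicit dicts are passed through as-is.
# All three hypothetical calls below therefore work:
#
# >>> r.add_slide(data=sales_df, title='Sales')                        # chart
# >>> r.add_slide(data='kdeplot.png', title='Distribution')            # picture
# >>> r.add_slide(data={'data': sales_df, 'slide_type': 'chart',
# ...                   'type': 'COLUMN_STACKED'}, title='Sales')      # explicit dict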
in config.__dict__ else config.font_default_size 1078 | # 添加图例 1079 | has_legend=kwarg['legend'] if 'legend' in kwarg else True 1080 | if has_legend and ((dd['data'].shape[1]>1) or (chart_type=='PIE')): 1081 | chart.has_legend = has_legend 1082 | chart.legend.font.size=font_default_size 1083 | chart.legend.position = XL_LEGEND_POSITION.BOTTOM 1084 | chart.legend.include_in_layout = False 1085 | try: 1086 | chart.category_axis.tick_labels.font.size=font_default_size 1087 | except: 1088 | pass#暂时不知道怎么处理 1089 | try: 1090 | chart.value_axis.tick_labels.font.size=font_default_size 1091 | except: 1092 | pass 1093 | # 添加数据标签 1094 | 1095 | non_available_list=['BUBBLE','BUBBLE_THREE_D_EFFECT','XY_SCATTER','XY_SCATTER_LINES','PIE'] 1096 | 1097 | # 数据标签数值格式 1098 | # 大致检测是否采用百分比 1099 | # 1、单选题每列的和肯定是100,顶多相差+-5 1100 | # 2、多选题每一列的和大于100,但单个的小于100.此处可能会有误判,但暂时无解 1101 | # 3、可能会有某一列全为0,此时单独考虑 1102 | if isinstance(dd['data'],(pd.core.frame.DataFrame,pd.core.frame.Series)) and ((dd['data'].sum()[dd['data'].sum()!=0]>90).all()) and ((dd['data']<=100).all().all()): 1103 | # 数据条的数据标签格式 1104 | number_format1=config.number_format_data 1105 | # 坐标轴的数据标签格式 1106 | number_format2=config.number_format_tick 1107 | else: 1108 | number_format1='0.00' 1109 | number_format2='0.0' 1110 | if 'number_format_data' in dd: 1111 | number_format1=dd['number_format_data'] 1112 | if 'number_format_tick' in dd: 1113 | number_format2=dd['number_format_tick'] 1114 | 1115 | if 'number_format_data' in kwarg: 1116 | number_format1=kwarg['number_format_data'] 1117 | if 'number_format_tick' in kwarg: 1118 | number_format2=kwarg['number_format_tick'] 1119 | 1120 | if 'data_labels' in kwarg: 1121 | has_data_labels = kwarg['data_labels'] 1122 | else: 1123 | has_data_labels=True 1124 | 1125 | if (chart_type not in non_available_list) or (chart_type == 'PIE'): 1126 | plot = chart.plots[0] 1127 | plot.has_data_labels = has_data_labels 1128 | if has_data_labels: 1129 | plot.data_labels.font.size = font_default_size 1130 | plot.data_labels.number_format = number_format1 1131 | #data_labels = plot.data_labels 1132 | #plot.data_labels.position = XL_LABEL_POSITION.BEST_FIT 1133 | if (chart_type not in non_available_list): 1134 | #chart.value_axis.maximum_scale = 1 1135 | if dd['data'].shape[1]==1: 1136 | chart.value_axis.has_major_gridlines = False 1137 | else: 1138 | chart.value_axis.has_major_gridlines = True 1139 | tick_labels = chart.value_axis.tick_labels 1140 | tick_labels.number_format = number_format2 1141 | tick_labels.font.size = font_default_size 1142 | 1143 | 1144 | 1145 | def save(self,filename=None): 1146 | assert (filename is not None) or (self.title is not None) 1147 | filename=self.title+time.strftime('_%Y%m%d%H%M.pptx', time.localtime()) if filename is None else filename 1148 | filename=os.path.splitext(filename)[0]+'.pptx' 1149 | self.prs.save(filename) 1150 | --------------------------------------------------------------------------------
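# Minimal end-to-end sketch (editor's note, illustrative only): build a deck
# with a cover, one chart slide and one table slide, then save it. The data
# below is hypothetical; Report, add_cover, add_slide and save are the public
# methods defined in reportgen/report.py above.
import pandas as pd
from reportgen.report import Report

df = pd.DataFrame({'share': [55.0, 30.0, 15.0]}, index=['A', 'B', 'C'])

r = Report()                              # falls back to the packaged template
r.add_cover(title='Demo Report')
r.add_slide(data={'data': df, 'slide_type': 'chart', 'type': 'PIE'},
            title='Share by group', summary='Group A dominates',
            footnote='N=800 (hypothetical)')
r.add_slide(data={'data': df, 'slide_type': 'table'}, title='Raw numbers')
r.save('demo_report')                     # writes demo_report.pptx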