├── PyClustering.py
├── README.md
├── SSE_k.py
├── community_louvain.py
├── community_status.py
├── cooperation_actor.py
├── k_keams_actortype.py
├── lesmiserables.gml
├── mysql_actor_type.py
├── reference_keams.py
├── s.py
└── ssss.py

/PyClustering.py:
--------------------------------------------------------------------------------
# Choose the clustering method: BIRCH.
from pyclustering.cluster.birch import birch
# pyclustering also provides k-means; see https://codedocs.xyz/annoviko/pyclustering/
# from pyclustering.cluster.kmeans import kmeans
# Visualisation helper.
from pyclustering.cluster import cluster_visualizer
from numpy import genfromtxt

# Load the (actor, actor, co-operation count) triples as numeric samples.
actoridata = genfromtxt('C:\\Users\\26087\\PycharmProjects\\untitled\\venv\\coo_times_arr.csv',
                        encoding='utf-8', delimiter=',', usecols=(0, 1, 2))
print(actoridata)
sample = actoridata.tolist()
# Instantiate the BIRCH clusterer, asking for 128 clusters.
birch_instance = birch(sample, 128)
# Run the clustering.
birch_instance.process()
# Fetch the result: one list of sample indices per cluster.
clusters = birch_instance.get_clusters()
# Visualise the clusters.
visualizer = cluster_visualizer()
visualizer.append_clusters(clusters, sample)
visualizer.show()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Cboa
Chinese-box-office-analysis
# Project overview
Uses Python to analyse crawled Chinese box-office data through clustering and visualisation.
# Keywords
Keywords: big-data visualisation, cluster analysis
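# Pipeline
The scripts' inputs and outputs suggest the following order: mysql_actor_type.py builds the actor-by-genre matrix data_notype.csv; SSE_k.py plots SSE against k on that matrix (elbow method); k_keams_actortype.py runs the final k-means and writes first.csv. Separately, cooperation_actor.py builds the actor co-operation network and a Louvain partition of it, while s.py and ssss.py plot the network's degree distribution and community structure.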
--------------------------------------------------------------------------------
/SSE_k.py:
--------------------------------------------------------------------------------
import matplotlib.pyplot as plt
from numpy import genfromtxt
from sklearn.cluster import KMeans

# Column 0 of data_notype.csv is the ActorID; columns 1-69 hold the genre counts.
fff = open('data_notype.csv', encoding='utf-8')
actoridata = genfromtxt('data_notype.csv', delimiter=',', usecols=(0,), dtype=str)
data = genfromtxt(fff, delimiter=',', usecols=tuple(range(1, 70)))
SSE = []
index = []

# For each k, average the inertia (within-cluster sum of squared errors)
# over ten k-means runs.
for k in range(1, 69):
    for i in range(10):
        kk = KMeans(n_clusters=k, max_iter=300, n_init=10).fit(data)
        index.append(kk.inertia_)
    SSE.append(sum(index) / len(index))
    index = []

X = range(1, 69)
plt.xlabel('k')
plt.ylabel('SSE')
plt.plot(X, SSE, 'o-')
plt.show()

print("ok")
# SSE-versus-k (elbow) plot
--------------------------------------------------------------------------------
/community_louvain.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | This module implements community detection.
4 | """
5 | from __future__ import print_function
6 | 
7 | import array
8 | import random
9 | 
10 | import networkx as nx
11 | 
12 | from community_status import Status
13 | 
14 | __author__ = """Thomas Aynaud (thomas.aynaud@lip6.fr)"""
15 | # Copyright (C) 2009 by
16 | # Thomas Aynaud
17 | # All rights reserved.
18 | # BSD license.
19 | 
20 | __PASS_MAX = -1
21 | __MIN = 0.0000001
22 | 
23 | 
24 | def partition_at_level(dendrogram, level):
25 |     """Return the partition of the nodes at the given level
26 | 
27 |     A dendrogram is a tree and each level is a partition of the graph nodes.
28 |     Level 0 is the first partition, which contains the smallest communities,
29 |     and the best is len(dendrogram) - 1.
30 |     The higher the level is, the bigger are the communities
31 | 
32 |     Parameters
33 |     ----------
34 |     dendrogram : list of dict
35 |         a list of partitions, i.e. dictionaries where the keys of level i+1
36 |         are the values of level i.
37 |     level : int
38 |         the level which belongs to [0..len(dendrogram)-1]
39 | 
40 |     Returns
41 |     -------
42 |     partition : dictionary
43 |         A dictionary where keys are the nodes and the values are the
44 |         community it belongs to
45 | 
46 |     Raises
47 |     ------
48 |     KeyError
49 |         If the dendrogram is not well formed or the level is too high
50 | 
51 |     See Also
52 |     --------
53 |     best_partition which directly combines partition_at_level and
54 |     generate_dendrogram to obtain the partition of highest modularity
55 | 
56 |     Examples
57 |     --------
58 |     >>> G=nx.erdos_renyi_graph(100, 0.01)
59 |     >>> dendrogram = generate_dendrogram(G)
60 |     >>> for level in range(len(dendrogram) - 1) :
61 |     >>>     print("partition at level", level, "is", partition_at_level(dendrogram, level))  # NOQA
62 |     """
63 |     partition = dendrogram[0].copy()
64 |     for index in range(1, level + 1):
65 |         for node, community in partition.items():
66 |             partition[node] = dendrogram[index][community]
67 |     return partition
68 | 
69 | 
70 | def modularity(partition, graph, weight='weight'):
71 |     """Compute the modularity of a partition of a graph
72 | 
73 |     Parameters
74 |     ----------
75 |     partition : dict
76 |         the partition of the nodes, i.e. a dictionary where keys are the nodes
77 |         and values the communities
78 |     graph : networkx.Graph
79 |         the networkx graph which is decomposed
80 |     weight : str, optional
81 |         the key in graph to use as weight. Default to 'weight'
82 | 
83 | 
84 |     Returns
85 |     -------
86 |     modularity : float
87 |         The modularity
88 | 
89 |     Raises
90 |     ------
91 |     KeyError
92 |         If the partition is not a partition of all graph nodes
93 |     ValueError
94 |         If the graph has no link
95 |     TypeError
96 |         If graph is not a networkx.Graph
97 | 
98 |     References
99 |     ----------
100 |     .. 1. Newman, M.E.J. & Girvan, M. Finding and evaluating community
101 |     structure in networks. Physical Review E 69, 26113(2004).
102 | 103 | Examples 104 | -------- 105 | >>> G=nx.erdos_renyi_graph(100, 0.01) 106 | >>> part = best_partition(G) 107 | >>> modularity(part, G) 108 | """ 109 | if graph.is_directed(): 110 | raise TypeError("Bad graph type, use only non directed graph") 111 | 112 | inc = dict([]) 113 | deg = dict([]) 114 | links = graph.size(weight=weight) 115 | if links == 0: 116 | raise ValueError("A graph without link has an undefined modularity") 117 | 118 | for node in graph: 119 | com = partition[node] 120 | deg[com] = deg.get(com, 0.) + graph.degree(node, weight=weight) 121 | for neighbor, datas in graph[node].items(): 122 | edge_weight = datas.get(weight, 1) 123 | if partition[neighbor] == com: 124 | if neighbor == node: 125 | inc[com] = inc.get(com, 0.) + float(edge_weight) 126 | else: 127 | inc[com] = inc.get(com, 0.) + float(edge_weight) / 2. 128 | 129 | res = 0. 130 | for com in set(partition.values()): 131 | res += (inc.get(com, 0.) / links) - \ 132 | (deg.get(com, 0.) / (2. * links)) ** 2 133 | return res 134 | 135 | 136 | def best_partition(graph, partition=None, 137 | weight='weight', resolution=1., randomize=False): 138 | """Compute the partition of the graph nodes which maximises the modularity 139 | (or try..) using the Louvain heuristices 140 | 141 | This is the partition of highest modularity, i.e. the highest partition 142 | of the dendrogram generated by the Louvain algorithm. 143 | 144 | Parameters 145 | ---------- 146 | graph : networkx.Graph 147 | the networkx graph which is decomposed 148 | partition : dict, optional 149 | the algorithm will start using this partition of the nodes. 150 | It's a dictionary where keys are their nodes and values the communities 151 | weight : str, optional 152 | the key in graph to use as weight. Default to 'weight' 153 | resolution : double, optional 154 | Will change the size of the communities, default to 1. 155 | represents the time described in 156 | "Laplacian Dynamics and Multiscale Modular Structure in Networks", 157 | R. Lambiotte, J.-C. Delvenne, M. Barahona 158 | randomize : boolean, optional 159 | Will randomize the node evaluation order and the community evaluation 160 | order to get different partitions at each call 161 | 162 | Returns 163 | ------- 164 | partition : dictionnary 165 | The partition, with communities numbered from 0 to number of communities 166 | 167 | Raises 168 | ------ 169 | NetworkXError 170 | If the graph is not Eulerian. 171 | 172 | See Also 173 | -------- 174 | generate_dendrogram to obtain all the decompositions levels 175 | 176 | Notes 177 | ----- 178 | Uses Louvain algorithm 179 | 180 | References 181 | ---------- 182 | .. 1. Blondel, V.D. et al. Fast unfolding of communities in 183 | large networks. J. Stat. Mech 10008, 1-12(2008). 184 | 185 | Examples 186 | -------- 187 | >>> #Basic usage 188 | >>> G=nx.erdos_renyi_graph(100, 0.01) 189 | >>> part = best_partition(G) 190 | 191 | >>> #other example to display a graph with its community : 192 | >>> #better with karate_graph() as defined in networkx examples 193 | >>> #erdos renyi don't have true community structure 194 | >>> G = nx.erdos_renyi_graph(30, 0.05) 195 | >>> #first compute the best partition 196 | >>> partition = community.best_partition(G) 197 | >>> #drawing 198 | >>> size = float(len(set(partition.values()))) 199 | >>> pos = nx.spring_layout(G) 200 | >>> count = 0. 201 | >>> for com in set(partition.values()) : 202 | >>> count += 1. 
203 | >>> list_nodes = [nodes for nodes in partition.keys() 204 | >>> if partition[nodes] == com] 205 | >>> nx.draw_networkx_nodes(G, pos, list_nodes, node_size = 20, 206 | node_color = str(count / size)) 207 | >>> nx.draw_networkx_edges(G, pos, alpha=0.5) 208 | >>> plt.show() 209 | """ 210 | dendo = generate_dendrogram(graph, 211 | partition, 212 | weight, 213 | resolution, 214 | randomize) 215 | return partition_at_level(dendo, len(dendo) - 1) 216 | 217 | 218 | def generate_dendrogram(graph, 219 | part_init=None, 220 | weight='weight', 221 | resolution=1., 222 | randomize=False): 223 | """Find communities in the graph and return the associated dendrogram 224 | 225 | A dendrogram is a tree and each level is a partition of the graph nodes. 226 | Level 0 is the first partition, which contains the smallest communities, 227 | and the best is len(dendrogram) - 1. The higher the level is, the bigger 228 | are the communities 229 | 230 | 231 | Parameters 232 | ---------- 233 | graph : networkx.Graph 234 | the networkx graph which will be decomposed 235 | part_init : dict, optional 236 | the algorithm will start using this partition of the nodes. It's a 237 | dictionary where keys are their nodes and values the communities 238 | weight : str, optional 239 | the key in graph to use as weight. Default to 'weight' 240 | resolution : double, optional 241 | Will change the size of the communities, default to 1. 242 | represents the time described in 243 | "Laplacian Dynamics and Multiscale Modular Structure in Networks", 244 | R. Lambiotte, J.-C. Delvenne, M. Barahona 245 | 246 | Returns 247 | ------- 248 | dendrogram : list of dictionaries 249 | a list of partitions, ie dictionnaries where keys of the i+1 are the 250 | values of the i. and where keys of the first are the nodes of graph 251 | 252 | Raises 253 | ------ 254 | TypeError 255 | If the graph is not a networkx.Graph 256 | 257 | See Also 258 | -------- 259 | best_partition 260 | 261 | Notes 262 | ----- 263 | Uses Louvain algorithm 264 | 265 | References 266 | ---------- 267 | .. 1. Blondel, V.D. et al. Fast unfolding of communities in large 268 | networks. J. Stat. Mech 10008, 1-12(2008). 
269 | 270 | Examples 271 | -------- 272 | >>> G=nx.erdos_renyi_graph(100, 0.01) 273 | >>> dendo = generate_dendrogram(G) 274 | >>> for level in range(len(dendo) - 1) : 275 | >>> print("partition at level", level, 276 | >>> "is", partition_at_level(dendo, level)) 277 | :param weight: 278 | :type weight: 279 | """ 280 | if graph.is_directed(): 281 | raise TypeError("Bad graph type, use only non directed graph") 282 | 283 | # special case, when there is no link 284 | # the best partition is everyone in its community 285 | if graph.number_of_edges() == 0: 286 | part = dict([]) 287 | for node in graph.nodes(): 288 | part[node] = node 289 | return [part] 290 | 291 | current_graph = graph.copy() 292 | status = Status() 293 | status.init(current_graph, weight, part_init) 294 | status_list = list() 295 | __one_level(current_graph, status, weight, resolution, randomize) 296 | new_mod = __modularity(status) 297 | partition = __renumber(status.node2com) 298 | status_list.append(partition) 299 | mod = new_mod 300 | current_graph = induced_graph(partition, current_graph, weight) 301 | status.init(current_graph, weight) 302 | 303 | while True: 304 | __one_level(current_graph, status, weight, resolution, randomize) 305 | new_mod = __modularity(status) 306 | if new_mod - mod < __MIN: 307 | break 308 | partition = __renumber(status.node2com) 309 | status_list.append(partition) 310 | mod = new_mod 311 | current_graph = induced_graph(partition, current_graph, weight) 312 | status.init(current_graph, weight) 313 | return status_list[:] 314 | 315 | 316 | def induced_graph(partition, graph, weight="weight"): 317 | """Produce the graph where nodes are the communities 318 | 319 | there is a link of weight w between communities if the sum of the weights 320 | of the links between their elements is w 321 | 322 | Parameters 323 | ---------- 324 | partition : dict 325 | a dictionary where keys are graph nodes and values the part the node 326 | belongs to 327 | graph : networkx.Graph 328 | the initial graph 329 | weight : str, optional 330 | the key in graph to use as weight. 
Default to 'weight' 331 | 332 | 333 | Returns 334 | ------- 335 | g : networkx.Graph 336 | a networkx graph where nodes are the parts 337 | 338 | Examples 339 | -------- 340 | >>> n = 5 341 | >>> g = nx.complete_graph(2*n) 342 | >>> part = dict([]) 343 | >>> for node in g.nodes() : 344 | >>> part[node] = node % 2 345 | >>> ind = induced_graph(part, g) 346 | >>> goal = nx.Graph() 347 | >>> goal.add_weighted_edges_from([(0,1,n*n),(0,0,n*(n-1)/2), (1, 1, n*(n-1)/2)]) # NOQA 348 | >>> nx.is_isomorphic(int, goal) 349 | True 350 | """ 351 | ret = nx.Graph() 352 | ret.add_nodes_from(partition.values()) 353 | 354 | for node1, node2, datas in graph.edges(data=True): 355 | edge_weight = datas.get(weight, 1) 356 | com1 = partition[node1] 357 | com2 = partition[node2] 358 | w_prec = ret.get_edge_data(com1, com2, {weight: 0}).get(weight, 1) 359 | ret.add_edge(com1, com2, **{weight: w_prec + edge_weight}) 360 | 361 | return ret 362 | 363 | 364 | def __renumber(dictionary): 365 | """Renumber the values of the dictionary from 0 to n 366 | """ 367 | count = 0 368 | ret = dictionary.copy() 369 | new_values = dict([]) 370 | 371 | for key in dictionary.keys(): 372 | value = dictionary[key] 373 | new_value = new_values.get(value, -1) 374 | if new_value == -1: 375 | new_values[value] = count 376 | new_value = count 377 | count += 1 378 | ret[key] = new_value 379 | 380 | return ret 381 | 382 | 383 | def load_binary(data): 384 | """Load binary graph as used by the cpp implementation of this algorithm 385 | """ 386 | data = open(data, "rb") 387 | 388 | reader = array.array("I") 389 | reader.fromfile(data, 1) 390 | num_nodes = reader.pop() 391 | reader = array.array("I") 392 | reader.fromfile(data, num_nodes) 393 | cum_deg = reader.tolist() 394 | num_links = reader.pop() 395 | reader = array.array("I") 396 | reader.fromfile(data, num_links) 397 | links = reader.tolist() 398 | graph = nx.Graph() 399 | graph.add_nodes_from(range(num_nodes)) 400 | prec_deg = 0 401 | 402 | for index in range(num_nodes): 403 | last_deg = cum_deg[index] 404 | neighbors = links[prec_deg:last_deg] 405 | graph.add_edges_from([(index, int(neigh)) for neigh in neighbors]) 406 | prec_deg = last_deg 407 | 408 | return graph 409 | 410 | 411 | def __randomly(seq, randomize): 412 | """ Convert sequence or iterable to an iterable in random order if 413 | randomize """ 414 | if randomize: 415 | shuffled = list(seq) 416 | random.shuffle(shuffled) 417 | return iter(shuffled) 418 | return seq 419 | 420 | 421 | def __one_level(graph, status, weight_key, resolution, randomize): 422 | """Compute one level of communities 423 | """ 424 | modified = True 425 | nb_pass_done = 0 426 | cur_mod = __modularity(status) 427 | new_mod = cur_mod 428 | 429 | while modified and nb_pass_done != __PASS_MAX: 430 | cur_mod = new_mod 431 | modified = False 432 | nb_pass_done += 1 433 | 434 | for node in __randomly(graph.nodes(), randomize): 435 | com_node = status.node2com[node] 436 | degc_totw = status.gdegrees.get(node, 0.) / (status.total_weight * 2.) # NOQA 437 | neigh_communities = __neighcom(node, graph, status, weight_key) 438 | remove_cost = - resolution * neigh_communities.get(com_node,0) + \ 439 | (status.degrees.get(com_node, 0.) - status.gdegrees.get(node, 0.)) * degc_totw 440 | __remove(node, com_node, 441 | neigh_communities.get(com_node, 0.), status) 442 | best_com = com_node 443 | best_increase = 0 444 | for com, dnc in __randomly(neigh_communities.items(), 445 | randomize): 446 | incr = remove_cost + resolution * dnc - \ 447 | status.degrees.get(com, 0.) 
* degc_totw 448 | if incr > best_increase: 449 | best_increase = incr 450 | best_com = com 451 | __insert(node, best_com, 452 | neigh_communities.get(best_com, 0.), status) 453 | if best_com != com_node: 454 | modified = True 455 | new_mod = __modularity(status) 456 | if new_mod - cur_mod < __MIN: 457 | break 458 | 459 | 460 | def __neighcom(node, graph, status, weight_key): 461 | """ 462 | Compute the communities in the neighborhood of node in the graph given 463 | with the decomposition node2com 464 | """ 465 | weights = {} 466 | for neighbor, datas in graph[node].items(): 467 | if neighbor != node: 468 | edge_weight = datas.get(weight_key, 1) 469 | neighborcom = status.node2com[neighbor] 470 | weights[neighborcom] = weights.get(neighborcom, 0) + edge_weight 471 | 472 | return weights 473 | 474 | 475 | def __remove(node, com, weight, status): 476 | """ Remove node from community com and modify status""" 477 | status.degrees[com] = (status.degrees.get(com, 0.) 478 | - status.gdegrees.get(node, 0.)) 479 | status.internals[com] = float(status.internals.get(com, 0.) - 480 | weight - status.loops.get(node, 0.)) 481 | status.node2com[node] = -1 482 | 483 | 484 | def __insert(node, com, weight, status): 485 | """ Insert node into community and modify status""" 486 | status.node2com[node] = com 487 | status.degrees[com] = (status.degrees.get(com, 0.) + 488 | status.gdegrees.get(node, 0.)) 489 | status.internals[com] = float(status.internals.get(com, 0.) + 490 | weight + status.loops.get(node, 0.)) 491 | 492 | 493 | def __modularity(status): 494 | """ 495 | Fast compute the modularity of the partition of the graph using 496 | status precomputed 497 | """ 498 | links = float(status.total_weight) 499 | result = 0. 500 | for community in set(status.node2com.values()): 501 | in_degree = status.internals.get(community, 0.) 502 | degree = status.degrees.get(community, 0.) 503 | if links > 0: 504 | result += in_degree / links - ((degree / (2. * links)) ** 2) 505 | return result 506 | -------------------------------------------------------------------------------- /community_status.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | 4 | class Status(object): 5 | """ 6 | To handle several data in one struct. 
    Could be replaced by a namedtuple, but we don't want to depend on Python 2.6.
    """
    node2com = {}
    total_weight = 0
    internals = {}
    degrees = {}
    gdegrees = {}

    def __init__(self):
        self.node2com = dict([])
        self.total_weight = 0
        self.degrees = dict([])
        self.gdegrees = dict([])
        self.internals = dict([])
        self.loops = dict([])

    def __str__(self):
        return ("node2com : " + str(self.node2com) + " degrees : "
                + str(self.degrees) + " internals : " + str(self.internals)
                + " total_weight : " + str(self.total_weight))

    def copy(self):
        """Perform a deep copy of status"""
        new_status = Status()
        new_status.node2com = self.node2com.copy()
        new_status.internals = self.internals.copy()
        new_status.degrees = self.degrees.copy()
        new_status.gdegrees = self.gdegrees.copy()
        new_status.loops = self.loops.copy()
        new_status.total_weight = self.total_weight
        return new_status
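
    # init() precomputes the aggregates that __modularity() in
    # community_louvain.py reads: per-community total degree (degrees),
    # per-community internal weight (internals), per-node weighted degree
    # (gdegrees) and self-loop weight (loops); with m = total_weight,
    # modularity is then Q = sum_c internals[c]/m - (degrees[c]/(2m))**2.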

    def init(self, graph, weight, part=None):
        """Initialize the status of a graph with every node in one community"""
        count = 0
        self.node2com = dict([])
        self.total_weight = 0
        self.degrees = dict([])
        self.gdegrees = dict([])
        self.internals = dict([])
        self.total_weight = graph.size(weight=weight)
        if part is None:
            for node in graph.nodes():
                self.node2com[node] = count
                deg = float(graph.degree(node, weight=weight))
                if deg < 0:
                    error = "Bad node degree ({})".format(deg)
                    raise ValueError(error)
                self.degrees[count] = deg
                self.gdegrees[node] = deg
                edge_data = graph.get_edge_data(node, node, default={weight: 0})
                self.loops[node] = float(edge_data.get(weight, 1))
                self.internals[count] = self.loops[node]
                count += 1
        else:
            for node in graph.nodes():
                com = part[node]
                self.node2com[node] = com
                deg = float(graph.degree(node, weight=weight))
                self.degrees[com] = self.degrees.get(com, 0) + deg
                self.gdegrees[node] = deg
                inc = 0.
                for neighbor, datas in graph[node].items():
                    edge_weight = datas.get(weight, 1)
                    if edge_weight <= 0:
                        error = "Bad graph type ({})".format(type(graph))
                        raise ValueError(error)
                    if part[neighbor] == com:
                        if neighbor == node:
                            inc += float(edge_weight)
                        else:
                            inc += float(edge_weight) / 2.
                self.internals[com] = self.internals.get(com, 0) + inc
--------------------------------------------------------------------------------
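The bundled lesmiserables.gml (Mark Newman's Les Misérables co-appearance network) is a handy smoke test for the two modules above; a minimal sketch, assuming networkx is available (the GML's edge 'value' attributes are simply ignored, since the default weight key is 'weight'):

import networkx as nx
import community_louvain

G = nx.read_gml('lesmiserables.gml')
partition = community_louvain.best_partition(G)    # node -> community id
print(len(set(partition.values())), 'communities')
print(community_louvain.modularity(partition, G))  # quality of the partition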
/cooperation_actor.py:
--------------------------------------------------------------------------------
import re
from itertools import combinations

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import pymysql as pm

import community_louvain

# Connect to the database.
cnx = pm.connect(user='root', password='260877289', host='127.0.0.1', database='moviedata')

cur1 = cnx.cursor()
cur1.execute("select MovieID from movie")

cur2 = cnx.cursor()
cur2.execute("select ActorID from actor")

cur3 = cnx.cursor()

cur4 = cnx.cursor()
cur4.execute("select MovieSumBoxOffice from movie")

movieid = cur1.fetchone()
actorid = cur2.fetchone()
movieboxoffice = cur4.fetchone()

# 15040 actors in the database; strip non-digits from each ID with a regex.
actor_one = [0 for i in range(15040)]
for i in range(len(actor_one)):
    actorid = cur2.fetchone()
    if actorid is not None:
        actor_one[i] = int(re.sub(r"\D", "", list(actorid)[0]))
print(actor_one)

# Upper-triangular co-operation matrices: number of shared movies and
# summed box office for every actor pair.
actor_tow = [[0 for i in range(15040)] for i in range(15040)]
actor_tow_pf = [[0 for i in range(15040)] for i in range(15040)]

movieid = cur1.fetchone()
movieboxoffice = cur4.fetchone()
while movieboxoffice is not None and movieid is not None:
    if re.sub(r"\D", "", list(movieboxoffice)[0]) != "":
        movie_int_boxoffice = int(re.sub(r"\D", "", list(movieboxoffice)[0]))
    else:
        movie_int_boxoffice = 0
    movie_int_ID = int(re.sub(r"\D", "", list(movieid)[0]))
    sqlstring1 = "select ActorID from movie_actor where MovieID ='" + str(movie_int_ID) + "'"
    cur3.execute(sqlstring1)
    actorid2 = cur3.fetchone()
    actor_cooperation_id = []
    while actorid2 is not None:
        actor_cooperation_id.append(int(re.sub(r"\D", "", list(actorid2)[0])))
        actorid2 = cur3.fetchone()
    # Every unordered pair of actors in this movie's cast.
    actor_coo_combinations = list(combinations(actor_cooperation_id, 2))
    for i in range(len(actor_coo_combinations)):
        x = actor_one.index(actor_coo_combinations[i][0])
        y = actor_one.index(actor_coo_combinations[i][1])
        actor_tow[min(x, y)][max(x, y)] += 1
        actor_tow_pf[min(x, y)][max(x, y)] += movie_int_boxoffice
    movieid = cur1.fetchone()
    movieboxoffice = cur4.fetchone()
print(actor_tow_pf[1])
print(actor_tow[1])

data1 = pd.DataFrame(actor_tow)
data1.to_csv('coo_times.csv', na_rep='NA', header=0, index=0)

data2 = pd.DataFrame(actor_tow_pf)
data2.to_csv('coo_boxoffice.csv', na_rep='NA', header=0, index=0)

print("k")
# Flatten the non-zero matrix entries into (actor, actor, value) triples;
# 201494 is the number of non-zero pairs.
row_num = 0
actor_coo_times_arr = [[0 for i in range(3)] for i in range(201494)]
actor_coo_office_arr = [[0 for i in range(3)] for i in range(201494)]
for i in range(len(actor_tow)):
    for j in range(len(actor_tow) - i):
        if actor_tow[i][i + j] != 0:
            actor_coo_times = actor_tow[i][i + j]
            actor_coo_office = actor_tow_pf[i][i + j]
            actor_coo_times_arr[row_num][0] = actor_one[i]
            actor_coo_office_arr[row_num][0] = actor_one[i]
            actor_coo_times_arr[row_num][1] = actor_one[i + j]
            actor_coo_office_arr[row_num][1] = actor_one[i + j]
            actor_coo_times_arr[row_num][2] = actor_coo_times
            actor_coo_office_arr[row_num][2] = actor_coo_office
            row_num += 1
print(row_num)
print(actor_coo_office_arr[0])
print(actor_coo_times_arr[0])

G = nx.Graph()                                  # undirected graph
G.add_weighted_edges_from(actor_coo_office_arr) # add the weighted edges

# First compute the best partition.
partition = community_louvain.best_partition(G)
"""
# drawing
size = float(len(set(partition.values())))
pos = nx.spring_layout(G)
count = 0.
for com in set(partition.values()):
    count = count + 1.
    list_nodes = [nodes for nodes in partition.keys()
                  if partition[nodes] == com]
    nx.draw_networkx_nodes(G, pos, list_nodes, node_size=20,
                           node_color=str(count / size))
nx.draw_networkx_edges(G, pos, alpha=0.5)
plt.show()

data3 = pd.DataFrame(actor_coo_times_arr)
data3.to_csv('coo_times_arr.csv', na_rep='NA', header=0, index=0)
data4 = pd.DataFrame(actor_coo_office_arr)
data4.to_csv('coo_office_arr.csv', na_rep='NA', header=0, index=0)
print("ok")
"""
print(partition)
"""
import json
import datetime
import numpy as np

class JsonEncoder(json.JSONEncoder):

    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, datetime.datetime):
            return obj.__str__()
        else:
            return super(JsonEncoder, self).default(obj)

def save_dict(filename, dic):
    '''save dict into json file'''
    with open(filename, 'w') as json_file:
        json.dump(dic, json_file, ensure_ascii=False, cls=JsonEncoder)

save_dict("partition", partition)

def load_dict(filename):
    '''load dict from json file'''
    with open(filename, "r") as json_file:
        dic = json.load(json_file)
    return dic
"""

list1 = list(partition.keys())
list2 = list(partition.values())
z = list(zip(list1, list2))
data5 = pd.DataFrame(z)
data5.to_csv('partition_arr_office.csv', na_rep='NA', header=0, index=0)
--------------------------------------------------------------------------------
/k_keams_actortype.py:
--------------------------------------------------------------------------------
from numpy import genfromtxt
import pandas as pd
from sklearn.cluster import KMeans

fff = open('data_notype.csv', encoding='utf-8')
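# data_notype.csv comes from mysql_actor_type.py: column 0 is the ActorID,
# columns 1-69 count how many of that actor's movies carry each of the 69
# genre tags; the k-means below clusters the actors on those counts.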
actoridata=genfromtxt('data_notype.csv',delimiter=',',usecols=(0),dtype=str) 18 | data = genfromtxt(fff,delimiter=',',usecols=(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69)) 19 | kk=KMeans(n_clusters=69, max_iter=300, n_init=10).fit(data) 20 | labelPred = kk.labels_ 21 | tt = [[0 for i in range(2)] for i in range(15040)] 22 | for i in range(len(actoridata)): 23 | tt[i][0]=int(actoridata[i]) 24 | tt[i][1]=labelPred[i] 25 | print(tt) 26 | 27 | data1 = pd.DataFrame(tt) 28 | data1.to_csv('first.csv',na_rep='NA',header=0,index=0) 29 | 30 | 31 | print("ok") 32 | #69=k -------------------------------------------------------------------------------- /lesmiserables.gml: -------------------------------------------------------------------------------- 1 | Creator "Mark Newman on Fri Jul 21 12:44:53 2006" 2 | graph 3 | [ 4 | node 5 | [ 6 | id 0 7 | label "Myriel" 8 | ] 9 | node 10 | [ 11 | id 1 12 | label "Napoleon" 13 | ] 14 | node 15 | [ 16 | id 2 17 | label "MlleBaptistine" 18 | ] 19 | node 20 | [ 21 | id 3 22 | label "MmeMagloire" 23 | ] 24 | node 25 | [ 26 | id 4 27 | label "CountessDeLo" 28 | ] 29 | node 30 | [ 31 | id 5 32 | label "Geborand" 33 | ] 34 | node 35 | [ 36 | id 6 37 | label "Champtercier" 38 | ] 39 | node 40 | [ 41 | id 7 42 | label "Cravatte" 43 | ] 44 | node 45 | [ 46 | id 8 47 | label "Count" 48 | ] 49 | node 50 | [ 51 | id 9 52 | label "OldMan" 53 | ] 54 | node 55 | [ 56 | id 10 57 | label "Labarre" 58 | ] 59 | node 60 | [ 61 | id 11 62 | label "Valjean" 63 | ] 64 | node 65 | [ 66 | id 12 67 | label "Marguerite" 68 | ] 69 | node 70 | [ 71 | id 13 72 | label "MmeDeR" 73 | ] 74 | node 75 | [ 76 | id 14 77 | label "Isabeau" 78 | ] 79 | node 80 | [ 81 | id 15 82 | label "Gervais" 83 | ] 84 | node 85 | [ 86 | id 16 87 | label "Tholomyes" 88 | ] 89 | node 90 | [ 91 | id 17 92 | label "Listolier" 93 | ] 94 | node 95 | [ 96 | id 18 97 | label "Fameuil" 98 | ] 99 | node 100 | [ 101 | id 19 102 | label "Blacheville" 103 | ] 104 | node 105 | [ 106 | id 20 107 | label "Favourite" 108 | ] 109 | node 110 | [ 111 | id 21 112 | label "Dahlia" 113 | ] 114 | node 115 | [ 116 | id 22 117 | label "Zephine" 118 | ] 119 | node 120 | [ 121 | id 23 122 | label "Fantine" 123 | ] 124 | node 125 | [ 126 | id 24 127 | label "MmeThenardier" 128 | ] 129 | node 130 | [ 131 | id 25 132 | label "Thenardier" 133 | ] 134 | node 135 | [ 136 | id 26 137 | label "Cosette" 138 | ] 139 | node 140 | [ 141 | id 27 142 | label "Javert" 143 | ] 144 | node 145 | [ 146 | id 28 147 | label "Fauchelevent" 148 | ] 149 | node 150 | [ 151 | id 29 152 | label "Bamatabois" 153 | ] 154 | node 155 | [ 156 | id 30 157 | label "Perpetue" 158 | ] 159 | node 160 | [ 161 | id 31 162 | label "Simplice" 163 | ] 164 | node 165 | [ 166 | id 32 167 | label "Scaufflaire" 168 | ] 169 | node 170 | [ 171 | id 33 172 | label "Woman1" 173 | ] 174 | node 175 | [ 176 | id 34 177 | label "Judge" 178 | ] 179 | node 180 | [ 181 | id 35 182 | label "Champmathieu" 183 | ] 184 | node 185 | [ 186 | id 36 187 | label "Brevet" 188 | ] 189 | node 190 | [ 191 | id 37 192 | label "Chenildieu" 193 | ] 194 | node 195 | [ 196 | id 38 197 | label "Cochepaille" 198 | ] 199 | node 200 | [ 201 | id 39 202 | label "Pontmercy" 203 | ] 204 | node 205 | [ 206 | id 40 207 | label "Boulatruelle" 208 | ] 209 | node 210 | [ 211 | id 41 212 | label "Eponine" 213 | ] 214 | node 215 | [ 216 | id 42 217 | label "Anzelma" 218 | ] 
219 | node 220 | [ 221 | id 43 222 | label "Woman2" 223 | ] 224 | node 225 | [ 226 | id 44 227 | label "MotherInnocent" 228 | ] 229 | node 230 | [ 231 | id 45 232 | label "Gribier" 233 | ] 234 | node 235 | [ 236 | id 46 237 | label "Jondrette" 238 | ] 239 | node 240 | [ 241 | id 47 242 | label "MmeBurgon" 243 | ] 244 | node 245 | [ 246 | id 48 247 | label "Gavroche" 248 | ] 249 | node 250 | [ 251 | id 49 252 | label "Gillenormand" 253 | ] 254 | node 255 | [ 256 | id 50 257 | label "Magnon" 258 | ] 259 | node 260 | [ 261 | id 51 262 | label "MlleGillenormand" 263 | ] 264 | node 265 | [ 266 | id 52 267 | label "MmePontmercy" 268 | ] 269 | node 270 | [ 271 | id 53 272 | label "MlleVaubois" 273 | ] 274 | node 275 | [ 276 | id 54 277 | label "LtGillenormand" 278 | ] 279 | node 280 | [ 281 | id 55 282 | label "Marius" 283 | ] 284 | node 285 | [ 286 | id 56 287 | label "BaronessT" 288 | ] 289 | node 290 | [ 291 | id 57 292 | label "Mabeuf" 293 | ] 294 | node 295 | [ 296 | id 58 297 | label "Enjolras" 298 | ] 299 | node 300 | [ 301 | id 59 302 | label "Combeferre" 303 | ] 304 | node 305 | [ 306 | id 60 307 | label "Prouvaire" 308 | ] 309 | node 310 | [ 311 | id 61 312 | label "Feuilly" 313 | ] 314 | node 315 | [ 316 | id 62 317 | label "Courfeyrac" 318 | ] 319 | node 320 | [ 321 | id 63 322 | label "Bahorel" 323 | ] 324 | node 325 | [ 326 | id 64 327 | label "Bossuet" 328 | ] 329 | node 330 | [ 331 | id 65 332 | label "Joly" 333 | ] 334 | node 335 | [ 336 | id 66 337 | label "Grantaire" 338 | ] 339 | node 340 | [ 341 | id 67 342 | label "MotherPlutarch" 343 | ] 344 | node 345 | [ 346 | id 68 347 | label "Gueulemer" 348 | ] 349 | node 350 | [ 351 | id 69 352 | label "Babet" 353 | ] 354 | node 355 | [ 356 | id 70 357 | label "Claquesous" 358 | ] 359 | node 360 | [ 361 | id 71 362 | label "Montparnasse" 363 | ] 364 | node 365 | [ 366 | id 72 367 | label "Toussaint" 368 | ] 369 | node 370 | [ 371 | id 73 372 | label "Child1" 373 | ] 374 | node 375 | [ 376 | id 74 377 | label "Child2" 378 | ] 379 | node 380 | [ 381 | id 75 382 | label "Brujon" 383 | ] 384 | node 385 | [ 386 | id 76 387 | label "MmeHucheloup" 388 | ] 389 | edge 390 | [ 391 | source 1 392 | target 0 393 | value 1 394 | ] 395 | edge 396 | [ 397 | source 2 398 | target 0 399 | value 8 400 | ] 401 | edge 402 | [ 403 | source 3 404 | target 0 405 | value 10 406 | ] 407 | edge 408 | [ 409 | source 3 410 | target 2 411 | value 6 412 | ] 413 | edge 414 | [ 415 | source 4 416 | target 0 417 | value 1 418 | ] 419 | edge 420 | [ 421 | source 5 422 | target 0 423 | value 1 424 | ] 425 | edge 426 | [ 427 | source 6 428 | target 0 429 | value 1 430 | ] 431 | edge 432 | [ 433 | source 7 434 | target 0 435 | value 1 436 | ] 437 | edge 438 | [ 439 | source 8 440 | target 0 441 | value 2 442 | ] 443 | edge 444 | [ 445 | source 9 446 | target 0 447 | value 1 448 | ] 449 | edge 450 | [ 451 | source 11 452 | target 10 453 | value 1 454 | ] 455 | edge 456 | [ 457 | source 11 458 | target 3 459 | value 3 460 | ] 461 | edge 462 | [ 463 | source 11 464 | target 2 465 | value 3 466 | ] 467 | edge 468 | [ 469 | source 11 470 | target 0 471 | value 5 472 | ] 473 | edge 474 | [ 475 | source 12 476 | target 11 477 | value 1 478 | ] 479 | edge 480 | [ 481 | source 13 482 | target 11 483 | value 1 484 | ] 485 | edge 486 | [ 487 | source 14 488 | target 11 489 | value 1 490 | ] 491 | edge 492 | [ 493 | source 15 494 | target 11 495 | value 1 496 | ] 497 | edge 498 | [ 499 | source 17 500 | target 16 501 | value 4 502 | ] 503 | edge 504 | [ 505 | source 18 506 | target 
16 507 | value 4 508 | ] 509 | edge 510 | [ 511 | source 18 512 | target 17 513 | value 4 514 | ] 515 | edge 516 | [ 517 | source 19 518 | target 16 519 | value 4 520 | ] 521 | edge 522 | [ 523 | source 19 524 | target 17 525 | value 4 526 | ] 527 | edge 528 | [ 529 | source 19 530 | target 18 531 | value 4 532 | ] 533 | edge 534 | [ 535 | source 20 536 | target 16 537 | value 3 538 | ] 539 | edge 540 | [ 541 | source 20 542 | target 17 543 | value 3 544 | ] 545 | edge 546 | [ 547 | source 20 548 | target 18 549 | value 3 550 | ] 551 | edge 552 | [ 553 | source 20 554 | target 19 555 | value 4 556 | ] 557 | edge 558 | [ 559 | source 21 560 | target 16 561 | value 3 562 | ] 563 | edge 564 | [ 565 | source 21 566 | target 17 567 | value 3 568 | ] 569 | edge 570 | [ 571 | source 21 572 | target 18 573 | value 3 574 | ] 575 | edge 576 | [ 577 | source 21 578 | target 19 579 | value 3 580 | ] 581 | edge 582 | [ 583 | source 21 584 | target 20 585 | value 5 586 | ] 587 | edge 588 | [ 589 | source 22 590 | target 16 591 | value 3 592 | ] 593 | edge 594 | [ 595 | source 22 596 | target 17 597 | value 3 598 | ] 599 | edge 600 | [ 601 | source 22 602 | target 18 603 | value 3 604 | ] 605 | edge 606 | [ 607 | source 22 608 | target 19 609 | value 3 610 | ] 611 | edge 612 | [ 613 | source 22 614 | target 20 615 | value 4 616 | ] 617 | edge 618 | [ 619 | source 22 620 | target 21 621 | value 4 622 | ] 623 | edge 624 | [ 625 | source 23 626 | target 16 627 | value 3 628 | ] 629 | edge 630 | [ 631 | source 23 632 | target 17 633 | value 3 634 | ] 635 | edge 636 | [ 637 | source 23 638 | target 18 639 | value 3 640 | ] 641 | edge 642 | [ 643 | source 23 644 | target 19 645 | value 3 646 | ] 647 | edge 648 | [ 649 | source 23 650 | target 20 651 | value 4 652 | ] 653 | edge 654 | [ 655 | source 23 656 | target 21 657 | value 4 658 | ] 659 | edge 660 | [ 661 | source 23 662 | target 22 663 | value 4 664 | ] 665 | edge 666 | [ 667 | source 23 668 | target 12 669 | value 2 670 | ] 671 | edge 672 | [ 673 | source 23 674 | target 11 675 | value 9 676 | ] 677 | edge 678 | [ 679 | source 24 680 | target 23 681 | value 2 682 | ] 683 | edge 684 | [ 685 | source 24 686 | target 11 687 | value 7 688 | ] 689 | edge 690 | [ 691 | source 25 692 | target 24 693 | value 13 694 | ] 695 | edge 696 | [ 697 | source 25 698 | target 23 699 | value 1 700 | ] 701 | edge 702 | [ 703 | source 25 704 | target 11 705 | value 12 706 | ] 707 | edge 708 | [ 709 | source 26 710 | target 24 711 | value 4 712 | ] 713 | edge 714 | [ 715 | source 26 716 | target 11 717 | value 31 718 | ] 719 | edge 720 | [ 721 | source 26 722 | target 16 723 | value 1 724 | ] 725 | edge 726 | [ 727 | source 26 728 | target 25 729 | value 1 730 | ] 731 | edge 732 | [ 733 | source 27 734 | target 11 735 | value 17 736 | ] 737 | edge 738 | [ 739 | source 27 740 | target 23 741 | value 5 742 | ] 743 | edge 744 | [ 745 | source 27 746 | target 25 747 | value 5 748 | ] 749 | edge 750 | [ 751 | source 27 752 | target 24 753 | value 1 754 | ] 755 | edge 756 | [ 757 | source 27 758 | target 26 759 | value 1 760 | ] 761 | edge 762 | [ 763 | source 28 764 | target 11 765 | value 8 766 | ] 767 | edge 768 | [ 769 | source 28 770 | target 27 771 | value 1 772 | ] 773 | edge 774 | [ 775 | source 29 776 | target 23 777 | value 1 778 | ] 779 | edge 780 | [ 781 | source 29 782 | target 27 783 | value 1 784 | ] 785 | edge 786 | [ 787 | source 29 788 | target 11 789 | value 2 790 | ] 791 | edge 792 | [ 793 | source 30 794 | target 23 795 | value 1 796 | ] 797 | edge 798 | [ 
799 | source 31 800 | target 30 801 | value 2 802 | ] 803 | edge 804 | [ 805 | source 31 806 | target 11 807 | value 3 808 | ] 809 | edge 810 | [ 811 | source 31 812 | target 23 813 | value 2 814 | ] 815 | edge 816 | [ 817 | source 31 818 | target 27 819 | value 1 820 | ] 821 | edge 822 | [ 823 | source 32 824 | target 11 825 | value 1 826 | ] 827 | edge 828 | [ 829 | source 33 830 | target 11 831 | value 2 832 | ] 833 | edge 834 | [ 835 | source 33 836 | target 27 837 | value 1 838 | ] 839 | edge 840 | [ 841 | source 34 842 | target 11 843 | value 3 844 | ] 845 | edge 846 | [ 847 | source 34 848 | target 29 849 | value 2 850 | ] 851 | edge 852 | [ 853 | source 35 854 | target 11 855 | value 3 856 | ] 857 | edge 858 | [ 859 | source 35 860 | target 34 861 | value 3 862 | ] 863 | edge 864 | [ 865 | source 35 866 | target 29 867 | value 2 868 | ] 869 | edge 870 | [ 871 | source 36 872 | target 34 873 | value 2 874 | ] 875 | edge 876 | [ 877 | source 36 878 | target 35 879 | value 2 880 | ] 881 | edge 882 | [ 883 | source 36 884 | target 11 885 | value 2 886 | ] 887 | edge 888 | [ 889 | source 36 890 | target 29 891 | value 1 892 | ] 893 | edge 894 | [ 895 | source 37 896 | target 34 897 | value 2 898 | ] 899 | edge 900 | [ 901 | source 37 902 | target 35 903 | value 2 904 | ] 905 | edge 906 | [ 907 | source 37 908 | target 36 909 | value 2 910 | ] 911 | edge 912 | [ 913 | source 37 914 | target 11 915 | value 2 916 | ] 917 | edge 918 | [ 919 | source 37 920 | target 29 921 | value 1 922 | ] 923 | edge 924 | [ 925 | source 38 926 | target 34 927 | value 2 928 | ] 929 | edge 930 | [ 931 | source 38 932 | target 35 933 | value 2 934 | ] 935 | edge 936 | [ 937 | source 38 938 | target 36 939 | value 2 940 | ] 941 | edge 942 | [ 943 | source 38 944 | target 37 945 | value 2 946 | ] 947 | edge 948 | [ 949 | source 38 950 | target 11 951 | value 2 952 | ] 953 | edge 954 | [ 955 | source 38 956 | target 29 957 | value 1 958 | ] 959 | edge 960 | [ 961 | source 39 962 | target 25 963 | value 1 964 | ] 965 | edge 966 | [ 967 | source 40 968 | target 25 969 | value 1 970 | ] 971 | edge 972 | [ 973 | source 41 974 | target 24 975 | value 2 976 | ] 977 | edge 978 | [ 979 | source 41 980 | target 25 981 | value 3 982 | ] 983 | edge 984 | [ 985 | source 42 986 | target 41 987 | value 2 988 | ] 989 | edge 990 | [ 991 | source 42 992 | target 25 993 | value 2 994 | ] 995 | edge 996 | [ 997 | source 42 998 | target 24 999 | value 1 1000 | ] 1001 | edge 1002 | [ 1003 | source 43 1004 | target 11 1005 | value 3 1006 | ] 1007 | edge 1008 | [ 1009 | source 43 1010 | target 26 1011 | value 1 1012 | ] 1013 | edge 1014 | [ 1015 | source 43 1016 | target 27 1017 | value 1 1018 | ] 1019 | edge 1020 | [ 1021 | source 44 1022 | target 28 1023 | value 3 1024 | ] 1025 | edge 1026 | [ 1027 | source 44 1028 | target 11 1029 | value 1 1030 | ] 1031 | edge 1032 | [ 1033 | source 45 1034 | target 28 1035 | value 2 1036 | ] 1037 | edge 1038 | [ 1039 | source 47 1040 | target 46 1041 | value 1 1042 | ] 1043 | edge 1044 | [ 1045 | source 48 1046 | target 47 1047 | value 2 1048 | ] 1049 | edge 1050 | [ 1051 | source 48 1052 | target 25 1053 | value 1 1054 | ] 1055 | edge 1056 | [ 1057 | source 48 1058 | target 27 1059 | value 1 1060 | ] 1061 | edge 1062 | [ 1063 | source 48 1064 | target 11 1065 | value 1 1066 | ] 1067 | edge 1068 | [ 1069 | source 49 1070 | target 26 1071 | value 3 1072 | ] 1073 | edge 1074 | [ 1075 | source 49 1076 | target 11 1077 | value 2 1078 | ] 1079 | edge 1080 | [ 1081 | source 50 1082 | target 49 1083 | 
value 1 1084 | ] 1085 | edge 1086 | [ 1087 | source 50 1088 | target 24 1089 | value 1 1090 | ] 1091 | edge 1092 | [ 1093 | source 51 1094 | target 49 1095 | value 9 1096 | ] 1097 | edge 1098 | [ 1099 | source 51 1100 | target 26 1101 | value 2 1102 | ] 1103 | edge 1104 | [ 1105 | source 51 1106 | target 11 1107 | value 2 1108 | ] 1109 | edge 1110 | [ 1111 | source 52 1112 | target 51 1113 | value 1 1114 | ] 1115 | edge 1116 | [ 1117 | source 52 1118 | target 39 1119 | value 1 1120 | ] 1121 | edge 1122 | [ 1123 | source 53 1124 | target 51 1125 | value 1 1126 | ] 1127 | edge 1128 | [ 1129 | source 54 1130 | target 51 1131 | value 2 1132 | ] 1133 | edge 1134 | [ 1135 | source 54 1136 | target 49 1137 | value 1 1138 | ] 1139 | edge 1140 | [ 1141 | source 54 1142 | target 26 1143 | value 1 1144 | ] 1145 | edge 1146 | [ 1147 | source 55 1148 | target 51 1149 | value 6 1150 | ] 1151 | edge 1152 | [ 1153 | source 55 1154 | target 49 1155 | value 12 1156 | ] 1157 | edge 1158 | [ 1159 | source 55 1160 | target 39 1161 | value 1 1162 | ] 1163 | edge 1164 | [ 1165 | source 55 1166 | target 54 1167 | value 1 1168 | ] 1169 | edge 1170 | [ 1171 | source 55 1172 | target 26 1173 | value 21 1174 | ] 1175 | edge 1176 | [ 1177 | source 55 1178 | target 11 1179 | value 19 1180 | ] 1181 | edge 1182 | [ 1183 | source 55 1184 | target 16 1185 | value 1 1186 | ] 1187 | edge 1188 | [ 1189 | source 55 1190 | target 25 1191 | value 2 1192 | ] 1193 | edge 1194 | [ 1195 | source 55 1196 | target 41 1197 | value 5 1198 | ] 1199 | edge 1200 | [ 1201 | source 55 1202 | target 48 1203 | value 4 1204 | ] 1205 | edge 1206 | [ 1207 | source 56 1208 | target 49 1209 | value 1 1210 | ] 1211 | edge 1212 | [ 1213 | source 56 1214 | target 55 1215 | value 1 1216 | ] 1217 | edge 1218 | [ 1219 | source 57 1220 | target 55 1221 | value 1 1222 | ] 1223 | edge 1224 | [ 1225 | source 57 1226 | target 41 1227 | value 1 1228 | ] 1229 | edge 1230 | [ 1231 | source 57 1232 | target 48 1233 | value 1 1234 | ] 1235 | edge 1236 | [ 1237 | source 58 1238 | target 55 1239 | value 7 1240 | ] 1241 | edge 1242 | [ 1243 | source 58 1244 | target 48 1245 | value 7 1246 | ] 1247 | edge 1248 | [ 1249 | source 58 1250 | target 27 1251 | value 6 1252 | ] 1253 | edge 1254 | [ 1255 | source 58 1256 | target 57 1257 | value 1 1258 | ] 1259 | edge 1260 | [ 1261 | source 58 1262 | target 11 1263 | value 4 1264 | ] 1265 | edge 1266 | [ 1267 | source 59 1268 | target 58 1269 | value 15 1270 | ] 1271 | edge 1272 | [ 1273 | source 59 1274 | target 55 1275 | value 5 1276 | ] 1277 | edge 1278 | [ 1279 | source 59 1280 | target 48 1281 | value 6 1282 | ] 1283 | edge 1284 | [ 1285 | source 59 1286 | target 57 1287 | value 2 1288 | ] 1289 | edge 1290 | [ 1291 | source 60 1292 | target 48 1293 | value 1 1294 | ] 1295 | edge 1296 | [ 1297 | source 60 1298 | target 58 1299 | value 4 1300 | ] 1301 | edge 1302 | [ 1303 | source 60 1304 | target 59 1305 | value 2 1306 | ] 1307 | edge 1308 | [ 1309 | source 61 1310 | target 48 1311 | value 2 1312 | ] 1313 | edge 1314 | [ 1315 | source 61 1316 | target 58 1317 | value 6 1318 | ] 1319 | edge 1320 | [ 1321 | source 61 1322 | target 60 1323 | value 2 1324 | ] 1325 | edge 1326 | [ 1327 | source 61 1328 | target 59 1329 | value 5 1330 | ] 1331 | edge 1332 | [ 1333 | source 61 1334 | target 57 1335 | value 1 1336 | ] 1337 | edge 1338 | [ 1339 | source 61 1340 | target 55 1341 | value 1 1342 | ] 1343 | edge 1344 | [ 1345 | source 62 1346 | target 55 1347 | value 9 1348 | ] 1349 | edge 1350 | [ 1351 | source 62 1352 | target 58 
1353 | value 17 1354 | ] 1355 | edge 1356 | [ 1357 | source 62 1358 | target 59 1359 | value 13 1360 | ] 1361 | edge 1362 | [ 1363 | source 62 1364 | target 48 1365 | value 7 1366 | ] 1367 | edge 1368 | [ 1369 | source 62 1370 | target 57 1371 | value 2 1372 | ] 1373 | edge 1374 | [ 1375 | source 62 1376 | target 41 1377 | value 1 1378 | ] 1379 | edge 1380 | [ 1381 | source 62 1382 | target 61 1383 | value 6 1384 | ] 1385 | edge 1386 | [ 1387 | source 62 1388 | target 60 1389 | value 3 1390 | ] 1391 | edge 1392 | [ 1393 | source 63 1394 | target 59 1395 | value 5 1396 | ] 1397 | edge 1398 | [ 1399 | source 63 1400 | target 48 1401 | value 5 1402 | ] 1403 | edge 1404 | [ 1405 | source 63 1406 | target 62 1407 | value 6 1408 | ] 1409 | edge 1410 | [ 1411 | source 63 1412 | target 57 1413 | value 2 1414 | ] 1415 | edge 1416 | [ 1417 | source 63 1418 | target 58 1419 | value 4 1420 | ] 1421 | edge 1422 | [ 1423 | source 63 1424 | target 61 1425 | value 3 1426 | ] 1427 | edge 1428 | [ 1429 | source 63 1430 | target 60 1431 | value 2 1432 | ] 1433 | edge 1434 | [ 1435 | source 63 1436 | target 55 1437 | value 1 1438 | ] 1439 | edge 1440 | [ 1441 | source 64 1442 | target 55 1443 | value 5 1444 | ] 1445 | edge 1446 | [ 1447 | source 64 1448 | target 62 1449 | value 12 1450 | ] 1451 | edge 1452 | [ 1453 | source 64 1454 | target 48 1455 | value 5 1456 | ] 1457 | edge 1458 | [ 1459 | source 64 1460 | target 63 1461 | value 4 1462 | ] 1463 | edge 1464 | [ 1465 | source 64 1466 | target 58 1467 | value 10 1468 | ] 1469 | edge 1470 | [ 1471 | source 64 1472 | target 61 1473 | value 6 1474 | ] 1475 | edge 1476 | [ 1477 | source 64 1478 | target 60 1479 | value 2 1480 | ] 1481 | edge 1482 | [ 1483 | source 64 1484 | target 59 1485 | value 9 1486 | ] 1487 | edge 1488 | [ 1489 | source 64 1490 | target 57 1491 | value 1 1492 | ] 1493 | edge 1494 | [ 1495 | source 64 1496 | target 11 1497 | value 1 1498 | ] 1499 | edge 1500 | [ 1501 | source 65 1502 | target 63 1503 | value 5 1504 | ] 1505 | edge 1506 | [ 1507 | source 65 1508 | target 64 1509 | value 7 1510 | ] 1511 | edge 1512 | [ 1513 | source 65 1514 | target 48 1515 | value 3 1516 | ] 1517 | edge 1518 | [ 1519 | source 65 1520 | target 62 1521 | value 5 1522 | ] 1523 | edge 1524 | [ 1525 | source 65 1526 | target 58 1527 | value 5 1528 | ] 1529 | edge 1530 | [ 1531 | source 65 1532 | target 61 1533 | value 5 1534 | ] 1535 | edge 1536 | [ 1537 | source 65 1538 | target 60 1539 | value 2 1540 | ] 1541 | edge 1542 | [ 1543 | source 65 1544 | target 59 1545 | value 5 1546 | ] 1547 | edge 1548 | [ 1549 | source 65 1550 | target 57 1551 | value 1 1552 | ] 1553 | edge 1554 | [ 1555 | source 65 1556 | target 55 1557 | value 2 1558 | ] 1559 | edge 1560 | [ 1561 | source 66 1562 | target 64 1563 | value 3 1564 | ] 1565 | edge 1566 | [ 1567 | source 66 1568 | target 58 1569 | value 3 1570 | ] 1571 | edge 1572 | [ 1573 | source 66 1574 | target 59 1575 | value 1 1576 | ] 1577 | edge 1578 | [ 1579 | source 66 1580 | target 62 1581 | value 2 1582 | ] 1583 | edge 1584 | [ 1585 | source 66 1586 | target 65 1587 | value 2 1588 | ] 1589 | edge 1590 | [ 1591 | source 66 1592 | target 48 1593 | value 1 1594 | ] 1595 | edge 1596 | [ 1597 | source 66 1598 | target 63 1599 | value 1 1600 | ] 1601 | edge 1602 | [ 1603 | source 66 1604 | target 61 1605 | value 1 1606 | ] 1607 | edge 1608 | [ 1609 | source 66 1610 | target 60 1611 | value 1 1612 | ] 1613 | edge 1614 | [ 1615 | source 67 1616 | target 57 1617 | value 3 1618 | ] 1619 | edge 1620 | [ 1621 | source 68 1622 | 
target 25 1623 | value 5 1624 | ] 1625 | edge 1626 | [ 1627 | source 68 1628 | target 11 1629 | value 1 1630 | ] 1631 | edge 1632 | [ 1633 | source 68 1634 | target 24 1635 | value 1 1636 | ] 1637 | edge 1638 | [ 1639 | source 68 1640 | target 27 1641 | value 1 1642 | ] 1643 | edge 1644 | [ 1645 | source 68 1646 | target 48 1647 | value 1 1648 | ] 1649 | edge 1650 | [ 1651 | source 68 1652 | target 41 1653 | value 1 1654 | ] 1655 | edge 1656 | [ 1657 | source 69 1658 | target 25 1659 | value 6 1660 | ] 1661 | edge 1662 | [ 1663 | source 69 1664 | target 68 1665 | value 6 1666 | ] 1667 | edge 1668 | [ 1669 | source 69 1670 | target 11 1671 | value 1 1672 | ] 1673 | edge 1674 | [ 1675 | source 69 1676 | target 24 1677 | value 1 1678 | ] 1679 | edge 1680 | [ 1681 | source 69 1682 | target 27 1683 | value 2 1684 | ] 1685 | edge 1686 | [ 1687 | source 69 1688 | target 48 1689 | value 1 1690 | ] 1691 | edge 1692 | [ 1693 | source 69 1694 | target 41 1695 | value 1 1696 | ] 1697 | edge 1698 | [ 1699 | source 70 1700 | target 25 1701 | value 4 1702 | ] 1703 | edge 1704 | [ 1705 | source 70 1706 | target 69 1707 | value 4 1708 | ] 1709 | edge 1710 | [ 1711 | source 70 1712 | target 68 1713 | value 4 1714 | ] 1715 | edge 1716 | [ 1717 | source 70 1718 | target 11 1719 | value 1 1720 | ] 1721 | edge 1722 | [ 1723 | source 70 1724 | target 24 1725 | value 1 1726 | ] 1727 | edge 1728 | [ 1729 | source 70 1730 | target 27 1731 | value 1 1732 | ] 1733 | edge 1734 | [ 1735 | source 70 1736 | target 41 1737 | value 1 1738 | ] 1739 | edge 1740 | [ 1741 | source 70 1742 | target 58 1743 | value 1 1744 | ] 1745 | edge 1746 | [ 1747 | source 71 1748 | target 27 1749 | value 1 1750 | ] 1751 | edge 1752 | [ 1753 | source 71 1754 | target 69 1755 | value 2 1756 | ] 1757 | edge 1758 | [ 1759 | source 71 1760 | target 68 1761 | value 2 1762 | ] 1763 | edge 1764 | [ 1765 | source 71 1766 | target 70 1767 | value 2 1768 | ] 1769 | edge 1770 | [ 1771 | source 71 1772 | target 11 1773 | value 1 1774 | ] 1775 | edge 1776 | [ 1777 | source 71 1778 | target 48 1779 | value 1 1780 | ] 1781 | edge 1782 | [ 1783 | source 71 1784 | target 41 1785 | value 1 1786 | ] 1787 | edge 1788 | [ 1789 | source 71 1790 | target 25 1791 | value 1 1792 | ] 1793 | edge 1794 | [ 1795 | source 72 1796 | target 26 1797 | value 2 1798 | ] 1799 | edge 1800 | [ 1801 | source 72 1802 | target 27 1803 | value 1 1804 | ] 1805 | edge 1806 | [ 1807 | source 72 1808 | target 11 1809 | value 1 1810 | ] 1811 | edge 1812 | [ 1813 | source 73 1814 | target 48 1815 | value 2 1816 | ] 1817 | edge 1818 | [ 1819 | source 74 1820 | target 48 1821 | value 2 1822 | ] 1823 | edge 1824 | [ 1825 | source 74 1826 | target 73 1827 | value 3 1828 | ] 1829 | edge 1830 | [ 1831 | source 75 1832 | target 69 1833 | value 3 1834 | ] 1835 | edge 1836 | [ 1837 | source 75 1838 | target 68 1839 | value 3 1840 | ] 1841 | edge 1842 | [ 1843 | source 75 1844 | target 25 1845 | value 3 1846 | ] 1847 | edge 1848 | [ 1849 | source 75 1850 | target 48 1851 | value 1 1852 | ] 1853 | edge 1854 | [ 1855 | source 75 1856 | target 41 1857 | value 1 1858 | ] 1859 | edge 1860 | [ 1861 | source 75 1862 | target 70 1863 | value 1 1864 | ] 1865 | edge 1866 | [ 1867 | source 75 1868 | target 71 1869 | value 1 1870 | ] 1871 | edge 1872 | [ 1873 | source 76 1874 | target 64 1875 | value 1 1876 | ] 1877 | edge 1878 | [ 1879 | source 76 1880 | target 65 1881 | value 1 1882 | ] 1883 | edge 1884 | [ 1885 | source 76 1886 | target 66 1887 | value 1 1888 | ] 1889 | edge 1890 | [ 1891 | source 76 1892 | 
target 63 1893 | value 1 1894 | ] 1895 | edge 1896 | [ 1897 | source 76 1898 | target 62 1899 | value 1 1900 | ] 1901 | edge 1902 | [ 1903 | source 76 1904 | target 48 1905 | value 1 1906 | ] 1907 | edge 1908 | [ 1909 | source 76 1910 | target 58 1911 | value 1 1912 | ] 1913 | ] 1914 | 
--------------------------------------------------------------------------------
/mysql_actor_type.py:
--------------------------------------------------------------------------------
import re

import pymysql as pm

# 15040 actors; column 0 holds the ActorID, columns 1-69 the genre counts.
at = [[0 for i in range(70)] for i in range(15040)]

# Connect to the database.
cnx = pm.connect(user='root', password='260877289', host='127.0.0.1', database='moviedata')

# Database queries.
cur1 = cnx.cursor()
cur1.execute("select * from movie")

cur2 = cnx.cursor()
cur2.execute("select ActorID from actor")

data = cur1.fetchone()
actorinfo = cur2.fetchone()
print(actorinfo)

# Strip non-digits from the actor IDs and store them in column 0.
for ij in range(len(at)):
    actorinfo = cur2.fetchone()
    if actorinfo is not None:
        at[ij][0] = int(re.sub(r"\D", "", list(actorinfo)[0]))
print(at[0][0])
print(at[10][0])
print(at[10][10])

# The 69 genre tags; tag i maps to feature column i (1-based).
GENRES = ['爱情', '歌舞', '剧情', '纪录片', '传记', '动作', '冒险', '犯罪', '青春',
          '武侠', '古装', '奇幻', '动画', '科幻', '惊悚', '战争', '悬疑', '喜剧',
          '运动', '亲情', '穿越', '灾难', '侦探', '神秘', '动物', '恐怖', '功夫',
          '励志', '公路', '西部', '都市', '时尚', '职场', '警匪', '舞台艺术片',
          '儿童', '军事', '音乐', '怀旧', '玄幻', '革命', '军旅', '友情', '家庭',
          '历史', '农村', '商战', '同性', '涉案', '贺岁', '抢险', '灵异', '主旋律',
          '女性', '文艺', '黑色', '心理', '戏曲', '民族', '反腐', '真人秀', '综艺',
          '大电影', '枪战', '黑帮', '科教片', '神话', '幽默', '竞技']

while data is not None:
    t = [0 for i in range(70)]
    data = cur1.fetchone()
    if data is not None:
        movieID = data[0]
        movietype = data[3]
        # 0/1 genre vector for this movie.
        for idx, genre in enumerate(GENRES, start=1):
            if genre in movietype:
                t[idx] = 1
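        # t is now a 0/1 indicator over the 69 genre tags for this movie;
        # below it is added into the row of every cast member, so at[i][k]
        # ends up counting how many of actor i's movies carry tag k
        # (t[0] is always 0, which leaves the ActorID in column 0 intact).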
--------------------------------------------------------------------------------
/reference_keams.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-
from sklearn.cluster import KMeans
# from sklearn.cluster import k_means  # written first; it differs from KMeans only in
# taking the data set as a parameter, but KMeans is still the recommended interface
import numpy as np
from sklearn.datasets import load_iris


def loadData(filePath):
    """Read a comma-separated numeric file into a numpy matrix."""
    dataSet = []
    file = open(filePath, 'r')
    for lines in file.readlines():
        row = []
        # curLine = lines.strip().split()  # for whitespace-separated 2-D data
        curLine = lines.strip().split(',')
        for line in curLine:
            row.append(float(line))
        dataSet.append(row)
    file.close()
    return np.mat(dataSet)


# filePath = '../data/training_4k2_far.txt'
filePath = 'C:/Users/26087/PycharmProjects/untitled/data_notype.csv'
dataSet = loadData(filePath)
print(dataSet)

'''alternatively, use a data set bundled with sklearn directly'''
# dataSet = load_iris().data

# Build the clusterer; fit() trains on the data and returns the fitted model.
# fit() can equally be called as a separate statement: estimator.fit(dataSet)
estimator = KMeans(n_clusters=4, max_iter=300, n_init=10).fit(dataSet)

# The three main fitted attributes:
'''cluster label assigned to each sample'''
labelPred = estimator.labels_
'''coordinates of the cluster centroids'''
centroids = estimator.cluster_centers_
'''can be read as the loss: the sum of squared distances from each sample to its nearest centroid'''
inertia = estimator.inertia_

print(labelPred)
print(centroids)
print(inertia)

# Convenience methods wrapped by the library:
'''predict which cluster each sample belongs to'''
print(estimator.fit_predict(dataSet))
print(estimator.predict(dataSet))
'''transform returns each sample's distance to every cluster centroid'''
print(estimator.fit_transform(dataSet))
print(estimator.transform(dataSet))
'''score behaves much like the (negative) loss and can be used to judge clustering quality'''
print(estimator.score(dataSet))
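Because inertia_ always decreases as n_clusters grows, it is worth pairing with a second
criterion. A short sketch (assuming the data_notype.csv produced above, with the actor ID
in column 0 and the 69 genre counts in columns 1-69) that reports both inertia and
silhouette score for a few candidate values of k:

# Sketch only: compare inertia with silhouette score across candidate k values.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

data = np.genfromtxt('data_notype.csv', delimiter=',', usecols=range(1, 70))
for k in (2, 3, 4, 5, 6):
    km = KMeans(n_clusters=k, max_iter=300, n_init=10).fit(data)
    print(k, km.inertia_, silhouette_score(data, km.labels_))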
--------------------------------------------------------------------------------
/s.py:
--------------------------------------------------------------------------------
# -*- coding: UTF-8 -*-
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

actor_edge = pd.read_csv('network_actor_rankavgbox.csv')  # load the actor-network edge list as a DataFrame
print(actor_edge.head(3))  # show the first 3 rows

weight_edge = []
for _, row in actor_edge.iterrows():  # collect edges and weights as (node, node, weight) tuples
    weight_edge.append((row['ActorID_1'], row['ActorID_2'], row['rankaveragebox']))

AW = nx.Graph()  # initialize an undirected graph
AW.add_weighted_edges_from(weight_edge)  # add the weighted edges to the graph

degree_hist = nx.degree_histogram(AW)  # degree distribution over all nodes in the graph

x = range(len(degree_hist))  # x-axis: degree values
y = [z / float(sum(degree_hist)) for z in degree_hist]  # y-axis: convert counts to frequencies
plt.loglog(x, y, color="blue", linewidth=2)  # plot the degree distribution on log-log axes
plt.title('Degree Distribution Actor')  # chart title
plt.xlabel('Degree')  # x-axis label
plt.ylabel('Probability')  # y-axis label
plt.savefig('Degree Distribution Actor.png')  # save the figure
plt.show()  # display the chart
--------------------------------------------------------------------------------
/ssss.py:
--------------------------------------------------------------------------------
import community_louvain
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd

actor_edge = pd.read_csv('2.csv')  # load the actor-network edge list as a DataFrame
print(actor_edge.head(3))  # show the first 3 rows

weight_edge = []
for _, row in actor_edge.iterrows():  # collect edges and weights as (node, node, weight) tuples
    weight_edge.append((row['A'], row['B'], row['rankaveragebox']))

G = nx.Graph()  # initialize an undirected graph
G.add_weighted_edges_from(weight_edge)  # add the weighted edges to the graph

# first compute the best partition
partition = community_louvain.best_partition(G)

# drawing: shade each community differently
size = float(len(set(partition.values())))
pos = nx.spring_layout(G)

count = 0.
for com in set(partition.values()):
    count = count + 1.
    list_nodes = [node for node in partition.keys()
                  if partition[node] == com]
    nx.draw_networkx_nodes(G, pos, list_nodes, node_size=20,
                           node_color=str(count / size))

nx.draw_networkx_edges(G, pos, alpha=0.5)
plt.show()
--------------------------------------------------------------------------------
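The Louvain partition can also be scored numerically. A short usage sketch (assuming the
same '2.csv' edge list as ssss.py) that reports the community count and the modularity of
the split, using the modularity() helper from the bundled community_louvain module:

# Sketch only: quantify the Louvain result from ssss.py.
import pandas as pd
import networkx as nx
import community_louvain

edges = pd.read_csv('2.csv')
G = nx.Graph()
G.add_weighted_edges_from(
    (row['A'], row['B'], row['rankaveragebox']) for _, row in edges.iterrows())

partition = community_louvain.best_partition(G)
print("communities:", len(set(partition.values())))
print("modularity:", community_louvain.modularity(partition, G))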