├── .travis.yml ├── LICENSE ├── README.md ├── demo.py ├── page_clustering ├── __init__.py ├── clustering.py └── features.py ├── requirements.txt ├── setup.py ├── test ├── stackoverflow │ ├── Jobs - 1 - Stack Overflow.html │ ├── Jobs - 10 - Stack Overflow.html │ ├── Jobs - 11 - Stack Overflow.html │ ├── Jobs - 12 - Stack Overflow.html │ ├── Jobs - 13 - Stack Overflow.html │ ├── Jobs - 14 - Stack Overflow.html │ ├── Jobs - 15 - Stack Overflow.html │ ├── Jobs - 2 - Stack Overflow.html │ ├── Jobs - 3- Stack Overflow.html │ ├── Jobs - 4 - Stack Overflow.html │ ├── Jobs - 5 - Stack Overflow.html │ ├── Jobs - 6 - Stack Overflow.html │ ├── Jobs - 7 - Stack Overflow.html │ ├── Jobs - 8 - Stack Overflow.html │ ├── Jobs - 9 - Stack Overflow.html │ ├── Newest Questions - Page 1 - Stack Overflow.html │ ├── Newest Questions - Page 10 - Stack Overflow.html │ ├── Newest Questions - Page 11 - Stack Overflow.html │ ├── Newest Questions - Page 12 - Stack Overflow.html │ ├── Newest Questions - Page 13 - Stack Overflow.html │ ├── Newest Questions - Page 14 - Stack Overflow.html │ ├── Newest Questions - Page 15 - Stack Overflow.html │ ├── Newest Questions - Page 2 - Stack Overflow.html │ ├── Newest Questions - Page 3 - Stack Overflow.html │ ├── Newest Questions - Page 4 - Stack Overflow.html │ ├── Newest Questions - Page 5 - Stack Overflow.html │ ├── Newest Questions - Page 6 - Stack Overflow.html │ ├── Newest Questions - Page 7 - Stack Overflow.html │ ├── Newest Questions - Page 8 - Stack Overflow.html │ ├── Newest Questions - Page 9 - Stack Overflow.html │ ├── Question - 1 - Stack Overflow.html │ ├── Question - 10 - Stack Overflow.html │ ├── Question - 11 - Stack Overflow.html │ ├── Question - 12 - Stack Overflow.html │ ├── Question - 13 - Stack Overflow.html │ ├── Question - 14 - Stack Overflow.html │ ├── Question - 15 - Stack Overflow.html │ ├── Question - 2 - Stack Overflow.html │ ├── Question - 3 - Stack Overflow.html │ ├── Question - 4 - Stack Overflow.html │ ├── Question - 5 - Stack Overflow.html │ ├── Question - 6 - Stack Overflow.html │ ├── Question - 7 - Stack Overflow.html │ ├── Question - 8 - Stack Overflow.html │ ├── Question - 9 - Stack Overflow.html │ ├── Tags - 1 - Stack Overflow.html │ ├── Tags - 10 - Stack Overflow.html │ ├── Tags - 11 - Stack Overflow.html │ ├── Tags - 12 - Stack Overflow.html │ ├── Tags - 13 - Stack Overflow.html │ ├── Tags - 14 - Stack Overflow.html │ ├── Tags - 15 - Stack Overflow.html │ ├── Tags - 2 - Stack Overflow.html │ ├── Tags - 3 - Stack Overflow.html │ ├── Tags - 4 - Stack Overflow.html │ ├── Tags - 5 - Stack Overflow.html │ ├── Tags - 6 - Stack Overflow.html │ ├── Tags - 7 - Stack Overflow.html │ ├── Tags - 8 - Stack Overflow.html │ ├── Tags - 9 - Stack Overflow.html │ ├── Users - 1 - Stack Overflow.html │ ├── Users - 10 - Stack Overflow.html │ ├── Users - 11 - Stack Overflow.html │ ├── Users - 12 - Stack Overflow.html │ ├── Users - 13 - Stack Overflow.html │ ├── Users - 14 - Stack Overflow.html │ ├── Users - 15 - Stack Overflow.html │ ├── Users - 2 - Stack Overflow.html │ ├── Users - 3 - Stack Overflow.html │ ├── Users - 4 - Stack Overflow.html │ ├── Users - 5 - Stack Overflow.html │ ├── Users - 6 - Stack Overflow.html │ ├── Users - 8 - Stack Overflow.html │ └── Users - 9 - Stack Overflow.html └── test_stackoverflow.py └── tox.ini /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 2.7 3 | 4 | install: 5 | - pip install -U tox 6 | 7 | script: 8 | - travis_wait tox -vvv 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) page_clustering developers. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of page_clustering nor the names of its contributors may 15 | be used to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Description [![Build Status](https://travis-ci.org/scrapinghub/page_clustering.svg?branch=master)](https://travis-ci.org/scrapinghub/page_clustering) 2 | A simple algorithm for clustering web pages. 3 | A wrapper around [KMeans](http://scikit-learn.org/stable/modules/generated/sklearn.cluster.MiniBatchKMeans.html#sklearn.cluster.MiniBatchKMeans). 4 | Web pages are converted to vectors, where each vector entry is just the count of a given tag and class attribute. 5 | The dimension of the vectors will change as new pages with new tags or class attributes arrive. 6 | Also a simple outlier detection is available and enabled by default. This allows for rejecting web pages 7 | that are highly improbable to belong to any cluster. 8 | 9 | # Install 10 | pip install page_clustering 11 | 12 | # Usage 13 | import page_clustering 14 | 15 | clt = page_clustering.OnlineKMeans(n_clusters=5) 16 | # `pages` must have been obtained somehow 17 | for page in pages: 18 | clt.add_page(page) 19 | y = clt.classify(new_page) 20 | for page in more_pages: 21 | clt.add_page(page) 22 | y = clt.classify(yet_another_page) 23 | 24 | # Demo 25 | wget -r --quota=5M https://news.ycombinator.com 26 | python demo.py news.ycombinator.com 27 | 28 | # Tests 29 | cd tests 30 | py.test 31 | 32 | # Algorithm 33 | 34 | The first part, vectorization, transforms the web page to a vector. For example, 35 | take the following page: 36 | 37 | ```html 38 | 39 | 40 | 44 | 48 | 49 | 50 | ``` 51 | 52 | Each non-closing (tag, class) pair is mapped to a vector position and the number 53 | of times it appears in the document is the value of the vector at that position. 54 | 55 | | tag, class | position | count | 56 | |------------|----------|-------| 57 | | html | 0 | 1 | 58 | | body | 1 | 1 | 59 | | ul, list1 | 2 | 1 | 60 | | li | 3 | 4 | 61 | | ul, list2 | 4 | 1 | 62 | 63 | The vector is therefore `[1, 1, 1, 4, 1]`. This vector is normalized so that 64 | it's elements sum up to 1 and the final frequency vector is: 65 | `[0.125, 0.125, 0.125, 0.5, 0.125]` 66 | 67 | When a new page arrives it can be possible that new (tag, class) pairs appear. 68 | For example imagine that this new page arrives: 69 | 70 | ```html 71 | 72 | 73 |

Another page with a paragraph tag

74 | 75 | 76 | ``` 77 | 78 | The new page would be mapped according to this table: 79 | 80 | | tag, class | position | count | 81 | |------------|----------|-------| 82 | | html | 0 | 1 | 83 | | body | 1 | 1 | 84 | | ul, list1 | 2 | 0 | 85 | | li | 3 | 0 | 86 | | ul, list2 | 4 | 0 | 87 | | p | 5 | 1 | 88 | 89 | The vector for this page would be `[1, 1, 0, 0, 0, 1]`, and with normalization: 90 | `[0.33, 0.33, 0, 0, 0, 0.33]`. 91 | 92 | The new vector has 6 dimensions, this means that the previous page vector needs 93 | to be extended accordingly with zeros to the right: `[0.125, 0.125, 0.125, 0.5, 0.125, 0]`. 94 | 95 | Once all needed pages are vectorized, KMeans is applied. 96 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | 5 | import scrapely.htmlpage as hp 6 | 7 | import clustering 8 | 9 | 10 | if __name__ == '__main__': 11 | if len(sys.argv) > 1: 12 | path = sys.argv[1] 13 | else: 14 | path = 'news.ycombinator.com' 15 | for dirpath, dnames, fnames in os.walk(path): 16 | pages = [] 17 | for fname in fnames: 18 | full_path = os.path.abspath(os.path.join(dirpath, fname)) 19 | with open(full_path, 'r') as inp: 20 | try: 21 | body = inp.read().decode('utf-8') 22 | except UnicodeDecodeError: 23 | continue 24 | pages.append( 25 | hp.HtmlPage(url='file://' + full_path, body=body)) 26 | print 'Total pages: {0}'.format(len(pages)) 27 | 28 | t1 = time.clock() 29 | kmeans = clustering.OnlineKMeans() 30 | for page in pages: 31 | kmeans.add_page(page) 32 | t2 = time.clock() 33 | print 'Clustering in {0} seconds'.format(t2 - t1) 34 | print ' per page: {0} ms'.format((t2 - t1)/len(pages)*1000) 35 | 36 | for page in pages: 37 | print page.url, kmeans.classify(page) 38 | -------------------------------------------------------------------------------- /page_clustering/__init__.py: -------------------------------------------------------------------------------- 1 | from clustering import (OnlineKMeans, kmeans_from_samples) 2 | -------------------------------------------------------------------------------- /page_clustering/clustering.py: -------------------------------------------------------------------------------- 1 | import scrapely.htmlpage as hp 2 | from sklearn.cluster import MiniBatchKMeans 3 | import numpy as np 4 | 5 | from features import TagFrequency 6 | 7 | 8 | def reshape_cols(X, n): 9 | """Append with zero columns X until it has n columns""" 10 | Y = np.zeros((X.shape[0], n)) 11 | Y[:, :X.shape[1]] = X 12 | return Y 13 | 14 | 15 | class OnlineKMeans(object): 16 | def __init__(self, n_clusters=8, batch_size=None, 17 | max_std_dev=5.0, min_cluster_points=30, 18 | vectorizer=TagFrequency(), centers=None): 19 | """ 20 | Parameters 21 | ---------- 22 | n_clusters : int 23 | Number of clusters to use for KMeans. 24 | batch_size : int 25 | Update clusters only when the batch reaches this size. 26 | Default: batch_size = 10*n_clusters. 27 | max_std_dev : float 28 | For outlier detection, do not cluster measures whose 29 | distance exceeds 5 times the std dev of the cluster. 30 | Set to None to disable outlier detection. 31 | This value can be updated at any point after object creation. 32 | min_cluster_points : int 33 | Do not perform outlier detection on clusters that have less than 34 | this amount of points. 35 | vectorizer : callable 36 | Takes an scrapely.htmlpage.HtmlPage and returns a numpy array. 37 | Different pages can return different array sizes. If the vector 38 | is missing values they are assumed to be zero. 39 | """ 40 | self.vectorizer = vectorizer 41 | self.dimension = vectorizer.dimension 42 | 43 | if batch_size is None: 44 | self.batch_size = 10*n_clusters 45 | else: 46 | self.batch_size = batch_size 47 | self.batch = [] 48 | self.n_clusters = n_clusters if centers is None else centers.shape[0] 49 | self.kmeans = MiniBatchKMeans( 50 | self.n_clusters, 51 | init=centers if centers is not None else 'k-means++') 52 | 53 | # outlier_detection parameters 54 | self._sum_sqr_dist = np.zeros((self.n_clusters,)) 55 | self.max_std_dev = max_std_dev 56 | self.min_cluster_points = min_cluster_points 57 | 58 | @property 59 | def outlier_detection(self): 60 | """True if outlier detection active""" 61 | return self.max_std_dev is not None 62 | 63 | @property 64 | def is_fit(self): 65 | """True if cluster centers available""" 66 | return hasattr(self.kmeans, 'cluster_centers_') 67 | 68 | @property 69 | def _cluster_variance(self): 70 | """An array with the cluster variance for each cluster""" 71 | return self._sum_sqr_dist/self.kmeans.counts_ 72 | 73 | def _sqr_distance_to_center(self, X, y=None): 74 | """Compute the distance of X to cluster center y""" 75 | if y is None: 76 | y = self.kmeans.predict(X) 77 | return np.sum((X - self.kmeans.cluster_centers_[y])**2, axis=1) 78 | 79 | def _find_outliers(self, X, y=None): 80 | """Return a boolean array of size X.shape[0] where a True entry means 81 | that the point is an outlier""" 82 | if not self.is_fit: 83 | return np.zeros((X.shape[0],)) 84 | if y is None: 85 | y = self.kmeans.predict(X) 86 | return np.logical_and( 87 | self.kmeans.counts_[y]>self.min_cluster_points, 88 | (self._sqr_distance_to_center(X, y)/ 89 | self._cluster_variance[y]) > self.max_std_dev**2) 90 | 91 | def add_page(self, page): 92 | """Update cluster centers with new page""" 93 | x = self.vectorizer(page) 94 | self.batch.append(x) 95 | if len(self.batch) >= self.batch_size: 96 | # load batch data 97 | dimension_new = len(x) 98 | X_batch = np.zeros((self.batch_size, dimension_new)) 99 | for i, x_batch in enumerate(self.batch): 100 | X_batch[i, :len(x_batch)] = x_batch 101 | self.batch = [] 102 | # update dimension of cluster centers 103 | if dimension_new > self.dimension: 104 | if self.is_fit: 105 | self.kmeans.cluster_centers_ = reshape_cols( 106 | self.kmeans.cluster_centers_, dimension_new) 107 | elif isinstance(self.kmeans.init, np.ndarray): 108 | self.kmeans.init = reshape_cols( 109 | self.kmeans.init, dimension_new) 110 | self.dimension = dimension_new 111 | # filter out outliers 112 | if self.outlier_detection: 113 | X_batch = X_batch[np.logical_not(self._find_outliers(X_batch))] 114 | # fit data 115 | self.kmeans.partial_fit(X_batch) 116 | # update cluster variance 117 | y_batch = self.kmeans.predict(X_batch) 118 | D_batch = self._sqr_distance_to_center(X_batch, y_batch) 119 | for (y, d) in zip(y_batch, D_batch): 120 | self._sum_sqr_dist[y] += d 121 | 122 | def classify(self, page): 123 | """Return cluster index or -1 if outlier and outlier detection is active. 124 | 125 | page : scrapely.htmlpage.HtmlPage 126 | """ 127 | X = self.vectorizer(page)[:self.dimension].reshape(1, -1) 128 | y = self.kmeans.predict(X) 129 | if self.outlier_detection and self._find_outliers(X, y)[0]: 130 | return -1 131 | return y[0] 132 | 133 | 134 | def kmeans_from_samples(samples): 135 | """Initializes and returns the clustering using the provided samples. 136 | 137 | samples : Iterable[sample] 138 | A sample can be either: 139 | - a dict with `url` and `original_body` keys. 140 | - an string with the page body 141 | 142 | Returns : OnlineKMeans 143 | """ 144 | def build_htmlpage(sample): 145 | if isinstance(sample, hp.HtmlPage): 146 | return sample 147 | if isinstance(sample, (str, unicode)): 148 | url = '' 149 | body = sample 150 | else: 151 | url = sample.get('url') 152 | body = sample.get('original_body', sample.get('annotated_body')) 153 | return hp.HtmlPage(url=url, body=body) 154 | pages = map(build_htmlpage, samples) 155 | n_clusters = len(pages) 156 | vectorizer = TagFrequency() 157 | centers = map(vectorizer, pages) 158 | X = np.zeros((len(centers), vectorizer.dimension)) 159 | for i, c in enumerate(centers): 160 | X[i, :len(c)] = c 161 | return OnlineKMeans( 162 | n_clusters=n_clusters, 163 | centers=X, 164 | vectorizer=vectorizer) 165 | -------------------------------------------------------------------------------- /page_clustering/features.py: -------------------------------------------------------------------------------- 1 | import scrapely.htmlpage as hp 2 | import numpy as np 3 | 4 | 5 | def is_tag(fragment): 6 | """Check if a fragment is also an HTML tag""" 7 | return isinstance(fragment, hp.HtmlTag) 8 | 9 | 10 | def is_closing(fragment): 11 | return fragment.tag_type == hp.HtmlTagType.CLOSE_TAG 12 | 13 | 14 | def is_non_closing_tag(fragment): 15 | return is_tag(fragment) and not is_closing(fragment) 16 | 17 | 18 | def get_class(fragment): 19 | """Return a set with class attributes for a given fragment""" 20 | if is_tag(fragment): 21 | return frozenset((fragment.attributes.get('class') or '').split()) 22 | else: 23 | return frozenset() 24 | 25 | 26 | def tag_to_token(fragment): 27 | return (fragment.tag, get_class(fragment)) 28 | 29 | 30 | class TagFrequency(object): 31 | def __init__(self): 32 | self.dictionary = {} 33 | self.dimension = 0 34 | 35 | def __call__(self, page): 36 | to_index = [] 37 | for fragment in filter(is_non_closing_tag, page.parsed_body): 38 | token = tag_to_token(fragment) 39 | index = self.dictionary.get(token) 40 | if index is not None: 41 | to_index.append(index) 42 | else: 43 | to_index.append(self.dimension) 44 | self.dictionary[token] = self.dimension 45 | self.dimension += 1 46 | vector = np.zeros((len(self.dictionary),)) 47 | for index in to_index: 48 | vector[index] += 1 49 | return vector/np.sum(vector) 50 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.10.4 2 | scipy>=0.16.1 3 | scikit-learn>=0.17 4 | scrapely>=0.12.0 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | 4 | setup( 5 | name = 'page_clustering', 6 | version = '0.0.1', 7 | packages = ['page_clustering'], 8 | install_requires = [ 9 | 'numpy', 10 | 'scikit-learn', 11 | 'scrapely', 12 | ], 13 | url = 'https://github.com/scrapinghub/page_clustering', 14 | maintainer = 'Pedro Lopez-Adeva Fernandez-Layos', 15 | maintainer_email = 'pedro@scrapinghub.com', 16 | keywords = ['crawler', 'scrapy', 'scrapely', 'web'], 17 | description = 'Online k-means clustering of web pages', 18 | classifiers = [ 19 | #'Framework :: Crawl Frontier', 20 | 'Development Status :: 4 - Beta', 21 | 'Environment :: Console', 22 | 'Intended Audience :: Developers', 23 | 'License :: OSI Approved :: BSD License', 24 | 'Operating System :: POSIX', 25 | 'Programming Language :: Python', 26 | 'Programming Language :: Python :: 2', 27 | 'Programming Language :: Python :: 2.7', 28 | 'Topic :: Internet :: WWW/HTTP', 29 | 'Topic :: Software Development :: Libraries :: Python Modules', 30 | ], 31 | ) 32 | -------------------------------------------------------------------------------- /test/stackoverflow/Tags - 11 - Stack Overflow.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Tags - Stack Overflow 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 26 | 27 | 28 | 29 | 30 | 78 | 79 | 80 | 87 | 88 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 |
108 |
109 |
110 | 111 | 112 | 113 | 114 | 115 |
116 |
117 | 118 |
119 | 120 | 121 |
122 |
123 |

current community

124 |
125 | 171 |
172 |

173 | your communities

174 | 175 |
176 | 182 | 186 | 189 |
190 |
191 | 192 | 209 | 210 | 279 |
280 |
281 | 284 | 285 |
286 | 312 | 313 | 314 | 315 | 316 |
317 | 318 | 319 |
320 |
321 |

Tags

322 |
323 | 324 | popular 325 | 326 | name 327 | 328 | new 329 |
330 |
331 |
332 |

333 | A tag is a keyword or label that categorizes your question 334 | with other, similar questions. Using the right tags makes it easier for 335 | others to find and answer your question. 336 |

337 | 338 | 339 | 340 | 341 | 342 |
Type to find tags:
343 |
344 |
345 | 346 | 347 | 348 | 349 | 357 | 367 | 378 | 386 | 387 | 388 | 396 | 404 | 415 | 426 | 427 | 428 | 436 | 446 | 454 | 465 | 466 | 467 | 479 | 489 | 500 | 508 | 509 | 510 | 518 | 526 | 537 | 548 | 549 | 550 | 561 | 572 | 582 | 592 | 593 | 594 | 606 | 614 | 622 | 633 | 634 | 635 | 644 | 655 | 666 | 674 | 675 | 676 | 684 | 694 | 704 | 712 | 713 | 714 |
350 | × 12458 351 |
Refers generally to the process of moving data from an external source into one's platform, program, or data set.
352 |
353 | 354 |
355 |
356 |
358 | × 12444 359 |
a modern Java and Scala web 360 | application open-source framework that provides a clean alternative to 361 | bloated Enterprise Java stacks.
362 |
363 | 364 |
365 |
366 |
368 | × 12414 369 |
an actively maintained 370 | open-source chart-drawing package for R, written by Hadley Wickham, 371 | based upon the principles of "Grammar of Graphics". It partially 372 | replaces R's basic plot and the …
373 |
374 | 375 |
376 |
377 |
379 | × 12388 380 |
a control that has the functionality to trigger a user defined action at regular intervals as configured by the user.
381 |
382 | 383 |
384 |
385 |
389 | × 12356 390 |
a technique for mapping object-oriented systems to relational databases.
391 |
392 | 393 |
394 |
395 |
397 | × 12334 398 |
Custom add-ons and plugins for the jQuery library. jQuery functions and features not included in the standard jQuery library.
399 |
400 | 401 |
402 |
403 |
405 | × 12283 406 |
a property of systems in 407 | which multiple computations can be performed in overlapping time 408 | periods. The computations may be executing on multiple cores in the same 409 | c…
410 |
411 | 412 |
413 |
414 |
416 | × 12258 417 |
the management of changes 418 | to documents, programs, and other information stored as computer files. 419 | Use this tag to mark general questions about usage and applicability of 420 | version con…
421 |
422 | 423 |
424 |
425 |
429 | × 12246 430 |
Programming language constructs designed to handle errors signaled by error codes, exceptions or other language specific means.
431 |
432 | 433 |
434 |
435 |
437 | × 12161 438 |
Docker provides a 439 | high-level API to containerize unix processes and applications with some 440 | degree of isolation and repeatability across servers.
441 |
442 | 443 |
444 |
445 |
447 | × 12150 448 |
a web microframework for Python based on "Werkzeug, Jinja 2 and good intentions". BSD-licensed.
449 |
450 | 451 |
452 |
453 |
455 | × 12141 456 |
a PHP (5.3.2+) ORM. While 457 | Doctrine 1.2 uses the Active Record pattern, Doctrine 2 uses the Data 458 | Mapper pattern. The Doctrine project is a collection of open source 459 | libraries and tools …
460 |
461 | 462 |
463 |
464 |
468 | × 12124 469 |
iOS 5 was released by Apple on Oct 13, 2011. 470 | 471 | it runs on iPhone 3GS, iPhone 4, iPhone 4S, iPod Touch 3rd and 4th generation, as well as all iPad models. 472 | 473 | it has been succeeded by iOS 6.
474 |
475 | 476 |
477 |
478 |
480 | × 12120 481 |
A design pattern to reduce 482 | coupling between components, by dynamically injecting into a software 483 | component dependencies that it needs to function.
484 |
485 | 486 |
487 |
488 |
490 | × 12073 491 |
means different things in 492 | different contexts; consider using less ambiguous tags instead or in 493 | addition. Common meanings include: Dependency-Injection and Data 494 | Binding to Objects and Binding…
495 |
496 | 497 |
498 |
499 |
501 | × 12058 502 |
for questions about representing or manipulating colors in a programming language.
503 |
504 | 505 |
506 |
507 |
511 | × 12025 512 |
a type of variable used in a subroutine to refer to the data provided as input to the subroutine.
513 |
514 | 515 |
516 |
517 |
519 | × 12010 520 |
A special type of subroutine called at the creation of an object.
521 |
522 | 523 |
524 |
525 |
527 | × 12006 528 |
Combobox allows to select 529 | one option out of several (similar to a listbox), however, only the 530 | selected one is displayed by default (in the interest of screen real 531 | estate).
532 |
533 | 534 |
535 |
536 |
538 | × 11989 539 |
a tabular data structure. 540 | Usually, it contains data where rows are observations and columns are 541 | variables of various types. While "data frame" or "dataframe" is the 542 | term used for this …
543 |
544 | 545 |
546 |
547 |
551 | × 11950 552 |
A property, in some 553 | object-oriented programming languages, is a special sort of class 554 | member, intermediate between a field (or data member) and a method. 555 | Properties are read and written like fields, b…
556 |
557 | 558 |
559 |
560 |
562 | × 11917 563 |
an Object-relational 564 | Database Management System (ORDBMS) created by Oracle Corporation. 565 | Specific releases of the product are known as Oracle9i, Oracle10g, 566 | Oracle 11g and Oracle 12c, where "i…
567 |
568 | 569 |
570 |
571 |
573 | × 11858 574 |
about operating system 575 | processes. It may also refer to a specific construct on a given 576 | platform, e.g., the System.Diagnostics.Process class for .NET
577 |
578 | 579 |
580 |
581 |
583 | × 11816 584 |
the Microsoft supplied 585 | command line interpreter on OS/2, Windows CE, and Windows NT operating 586 | systems (including Windows 2000, XP, Vista, 7, Server 2003, an…
587 |
588 | 589 |
590 |
591 |
595 | × 11693 596 |
An interface refers to the 597 | point of interaction between components. Interfaces are applicable at 598 | both the hardware and software level. 599 | 600 | In general, an interface exposes a contract without exposing …
601 |
602 | 603 |
604 |
605 |
607 | × 11691 608 |
A data type consisting of a set of named values called elements, members or enumerators of the type.
609 |
610 | 611 |
612 |
613 |
615 | × 11681 616 |
Subset of the OpenGL 3D graphics API designed for embedded devices such as mobile phones.
617 |
618 | 619 |
620 |
621 |
623 | × 11680 624 |
The more general term 625 | pixmap refers to a map of pixels, where each one may store more than two 626 | colors, thus using more than one bit per pixel. Often bitmap is used 627 | for this as well. In some contexts, …
628 |
629 | 630 |
631 |
632 |
636 | × 11673 637 |
the standard Python interface to the "Tk" graphical user interface toolkit. 638 | In Python 3, the name of the module changed from Tkinter to tkinter.
639 |
640 | 641 |
642 |
643 |
645 | × 11665 646 |
a modern Lisp dialect for 647 | the Java Virtual Machine (with versions for the CLR and JavaScript). 648 | More than merely an implementation of Lisp in Java, Clojure provides 649 | access to Java's classes …
650 |
651 | 652 |
653 |
654 |
656 | × 11567 657 |
ambiguous. In .NET, it's a 658 | class that represents a table of in-memory data. In component based MVC 659 | frameworks like JSF and Wicket, it's an UI component that dynamically 660 | renders…
661 |
662 | 663 |
664 |
665 |
667 | × 11539 668 |
a process where an object type is explicitly converted into another type if the conversion is allowed.
669 |
670 | 671 |
672 |
673 |
677 | × 11536 678 |
The process of converting source code files into standalone software artifact(s) that can be run on a computer
679 |
680 | 681 |
682 |
683 |
685 | × 11502 686 |
a control structure used in 687 | many programming languages to loop over a set of instructions as long 688 | as a particular condition is met.
689 |
690 | 691 |
692 |
693 |
695 | × 11479 696 |
a free web analytics 697 | solution provided by Google, featuring several client-side APIs, as well 698 | as REST APIs for data export and for management.
699 |
700 | 701 |
702 |
703 |
705 | × 11457 706 |
A graphical user interface element that presents a tabular view of data.
707 |
708 | 709 |
710 |
711 |
715 |
716 | 717 | 718 | 719 | 720 | 721 | 722 | 1 723 | 9 724 | 10 725 | 11 12 726 | 13 727 | 1330 728 | 729 | 730 |
731 | 732 | 733 |
734 | 735 |
736 | 737 | 738 | tag synonyms 739 | 740 |
741 | 742 | 770 |
771 |
772 |
773 | 923 | 926 | 927 | 928 | 953 | 954 | 955 | 956 | -------------------------------------------------------------------------------- /test/stackoverflow/Tags - 12 - Stack Overflow.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Tags - Stack Overflow 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 26 | 27 | 28 | 29 | 30 | 78 | 79 | 80 | 87 | 88 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 |
108 |
109 |
110 | 111 | 112 | 113 | 114 | 115 |
116 |
117 | 118 |
119 | 120 | 121 |
122 |
123 |

current community

124 |
125 | 171 |
172 |

173 | your communities

174 | 175 |
176 | 182 | 186 | 189 |
190 |
191 | 192 | 209 | 210 | 279 |
280 |
281 | 284 | 285 |
286 | 312 | 313 | 314 | 315 | 316 |
317 | 318 | 319 |
320 |
321 |

Tags

322 |
323 | 324 | popular 325 | 326 | name 327 | 328 | new 329 |
330 |
331 |
332 |

333 | A tag is a keyword or label that categorizes your question 334 | with other, similar questions. Using the right tags makes it easier for 335 | others to find and answer your question. 336 |

337 | 338 | 339 | 340 | 341 | 342 |
Type to find tags:
343 |
344 |
345 | 346 | 347 | 348 | 349 | 360 | 371 | 379 | 387 | 388 | 389 | 400 | 411 | 419 | 429 | 430 | 431 | 441 | 452 | 460 | 468 | 469 | 470 | 480 | 491 | 499 | 507 | 508 | 509 | 517 | 528 | 539 | 550 | 551 | 552 | 563 | 571 | 582 | 590 | 591 | 592 | 603 | 611 | 622 | 630 | 631 | 632 | 642 | 653 | 663 | 673 | 674 | 675 | 685 | 693 | 704 | 712 | 713 | 714 |
350 | × 11447 351 |
a user interface mechanism 352 | that provides the user a means to view and execute application 353 | operations. (Generic; please use platform and/or language tags when 354 | possible instead)
355 |
356 | 357 |
358 |
359 |
361 | × 11356 362 |
a framework to build HTML 363 | UI. It delivers everything needed for client-side, jQuery-powered 364 | development in one integrated, compact package, complete with AngularJS 365 | integration.
366 |
367 | 368 |
369 |
370 |
372 | × 11334 373 |
an authentication gem for Ruby-on-Rails. It supersedes previous solutions such as Restful Authentication or Authlogic.
374 |
375 | 376 |
377 |
378 |
380 | × 11330 381 |
the process of specifying the settings used for a system or application
382 |
383 | 384 |
385 |
386 |
390 | × 11296 391 |
a programming language 392 | construct or computer hardware mechanism designed to handle the 393 | occurrence of exceptions, special conditions that change the normal flow 394 | of program executi…
395 |
396 | 397 |
398 |
399 |
401 | × 11279 402 |
AsyncTask enables proper 403 | and easy use of the UI thread. This class allows to perform background 404 | operations and publish results on the UI thread without having to 405 | manipulate threads and/or handlers. As…
406 |
407 | 408 |
409 |
410 |
412 | × 11271 413 |
a free software/open-source Java EE-based, Cross-platform Application Server.
414 |
415 | 416 |
417 |
418 |
420 | × 11227 421 |
a MongoDB object modeling 422 | tool, or ODM (Object Document Mapper), written in JavaScript and 423 | designed to work in an asynchronous environment.
424 |
425 | 426 |
427 |
428 |
432 | × 11139 433 |
the current version of the 434 | open-source PHP web development MVC framework created by Taylor Otwell. 435 | Laravel helps you create applications using simple, expressive syntax.
436 |
437 | 438 |
439 |
440 |
442 | × 11109 443 |
an approach to Web design 444 | and development that aims at crafting sites to provide optimal 445 | experiences across a wide range of devices based on screen size, 446 | platform, and o…
447 |
448 | 449 |
450 |
451 |
453 | × 11075 454 |
Receiving data to a local system from a remote system, or to initiate such a data transfer.
455 |
456 | 457 |
458 |
459 |
461 | × 11073 462 |
The 2010-2011 version of the iPhone/iPad/iPod Touch/Apple TV operating system family, made by Apple.
463 |
464 | 465 |
466 |
467 |
471 | × 11068 472 |
a UI feature provided by 473 | applications, where the program predicts a word or phrase that the user 474 | wants to type without the user actually typing it completely.
475 |
476 | 477 |
478 |
479 |
481 | × 11057 482 |
In computing, input/output, 483 | or I/O, refers to the communication between an information processing 484 | system (such as a computer), and the outside world, possibly a human, or 485 | another information processin…
486 |
487 | 488 |
489 |
490 |
492 | × 11055 493 |
A logical subdivision of a larger, more complex system.
494 |
495 | 496 |
497 |
498 |
500 | × 11034 501 |
rules, that when they evaluate to true perform one or more actions.
502 |
503 | 504 |
505 |
506 |
510 | × 10943 511 |
Architecture encompasses the process, artifacts and high-level structure of a solution.
512 |
513 | 514 |
515 |
516 |
518 | × 10921 519 |
SSIS (Microsoft SQL Server 520 | Integration Services) is a platform for building enterprise-level data 521 | integration and data transformations solutions. SQL Server Integration 522 | Services (SSIS) is a tool that …
523 |
524 | 525 |
526 |
527 |
529 | × 10917 530 |
a typed superset of 531 | JavaScript created by Microsoft that adds optional types, classes, 532 | interfaces and modules and compiles to plain JavaScript. TypeScript 533 | includes advanced features such…
534 |
535 | 536 |
537 |
538 |
540 | × 10879 541 |
software that modifies a 542 | web URL's appearance (URL rewriting). Rewritten URLs are used to provide 543 | shorter and more relevant-looking links to web pages, to route users 544 | from obso…
545 |
546 | 547 |
548 |
549 |
553 | × 10877 554 |
a generic term for 555 | combining two or more related sets of data. It is commonly associated 556 | with revision control systems when reconciling multiple changes made to a 557 | revision-controlled collec…
558 |
559 | 560 |
561 |
562 |
564 | × 10860 565 |
Part of the MVC pattern, the Model manages the behaviour and data of the application.
566 |
567 | 568 |
569 |
570 |
572 | × 10786 573 |
a device or program that 574 | stands between two or more interconnected programs or devices. Use 575 | [dynamic-proxy] for the Java class and [proxy-pattern] for the design 576 | pattern.
577 |
578 | 579 |
580 |
581 |
583 | × 10771 584 |
a software library that implements a self-contained, serverless, zero-configuration, transactional SQL database engine.
585 |
586 | 587 |
588 |
589 |
593 | × 10671 594 |
a server-side rapid 595 | application development platform, implementing the dynamic general 596 | purpose CFML programming language. Please include CFML version, OS & 597 | web server in questions.
598 |
599 | 600 |
601 |
602 |
604 | × 10598 605 |
a video-sharing website on which users can upload, share, and view videos.
606 |
607 | 608 |
609 |
610 |
612 | × 10576 613 |
an enclosing context where 614 | values and expressions are associated. Use this tag for questions about 615 | different types of scope as well for questions where scope may be 616 | unclear.
617 |
618 | 619 |
620 |
621 |
623 | × 10542 624 |
for questions about separating a item (e.g. a string) into parts, often by a delimiter.
625 |
626 | 627 |
628 |
629 |
633 | × 10500 634 |
a value that enables a 635 | program to indirectly access a particular datum, such as a variable or a 636 | record, in the computer's memory or in some other storage device.
637 |
638 | 639 |
640 |
641 |
643 | × 10478 644 |
an open-source, 645 | transactional graph database well suited to connected data. You can use 646 | it for a variety of use-cases directly from all JVM languages or via 647 | other language drivers via the HTT…
648 |
649 | 650 |
651 |
652 |
654 | × 10460 655 |
a term used in some 656 | programming languages to define a function or data storage area (field) 657 | that is not bound to any specific object instance.
658 |
659 | 660 |
661 |
662 |
664 | × 10454 665 |
a text-only interface for 666 | interacting with an operating system or a piece of software. A user 667 | typically types commands into the terminal to perform specific tas…
668 |
669 | 670 |
671 |
672 |
676 | × 10453 677 |
a C++ library of generic containers, iterators, algorithms, and function objects. 678 | 679 | When C++ was standardised, large parts of the STL were adopted into the S…
680 |
681 | 682 |
683 |
684 |
686 | × 10452 687 |
Objective-C version for iOS & OS X of Cocos2d. Cocos2d is a framework for building 2D games and graphical applications.
688 |
689 | 690 |
691 |
692 |
694 | × 10443 695 |
a powerful, fast, 696 | lightweight, embeddable scripting language. It's dynamically typed, runs 697 | by interpreting bytecode and has automatic garbage collection. Its 698 | speed is one of the main reasons It…
699 |
700 | 701 |
702 |
703 |
705 | × 10441 706 |
The successor to Microsoft Windows 7 that focuses on a new interface style for touch-based devices and tablets.
707 |
708 | 709 |
710 |
711 |
715 |
716 | 717 | 718 | 719 | 720 | 721 | 722 | 1 723 | 10 724 | 11 725 | 12 13 726 | 14 727 | 1330 728 | 729 | 730 |
731 | 732 | 733 |
734 | 735 |
736 | 737 | 738 | tag synonyms 739 | 740 |
741 | 742 | 770 |
771 |
772 |
773 | 923 | 926 | 927 | 928 | 953 | 954 | 955 | 956 | -------------------------------------------------------------------------------- /test/stackoverflow/Tags - 14 - Stack Overflow.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Tags - Stack Overflow 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 26 | 27 | 28 | 29 | 30 | 78 | 79 | 80 | 87 | 88 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 |
108 |
109 |
110 | 111 | 112 | 113 | 114 | 115 |
116 |
117 | 118 |
119 | 120 | 121 |
122 |
123 |

current community

124 |
125 | 171 |
172 |

173 | your communities

174 | 175 |
176 | 182 | 186 | 189 |
190 |
191 | 192 | 209 | 210 | 279 |
280 |
281 | 284 | 285 |
286 | 312 | 313 | 314 | 315 | 316 |
317 | 318 | 319 |
320 |
321 |

Tags

322 |
323 | 324 | popular 325 | 326 | name 327 | 328 | new 329 |
330 |
331 |
332 |

333 | A tag is a keyword or label that categorizes your question 334 | with other, similar questions. Using the right tags makes it easier for 335 | others to find and answer your question. 336 |

337 | 338 | 339 | 340 | 341 | 342 |
Type to find tags:
343 |
344 |
345 | 346 | 347 | 348 | 349 | 357 | 367 | 378 | 388 | 389 | 390 | 398 | 406 | 417 | 427 | 428 | 429 | 439 | 450 | 461 | 471 | 472 | 473 | 482 | 490 | 500 | 508 | 509 | 510 | 523 | 531 | 542 | 552 | 553 | 554 | 562 | 570 | 579 | 589 | 590 | 591 | 602 | 613 | 623 | 634 | 635 | 636 | 648 | 659 | 670 | 680 | 681 | 682 | 690 | 698 | 706 | 714 | 715 | 716 |
350 | × 9637 351 |
the fifth major version of the ASP.NET Model-View-Controller platform for web applications.
352 |
353 | 354 |
355 |
356 |
358 | × 9633 359 |
should be applied to 360 | questions concerning the programming of compilers or for questions about 361 | the detailed inner workings of compilers.
362 |
363 | 364 |
365 |
366 |
368 | × 9626 369 |
the eighth version of 370 | Apple's iOS mobile operating system. It was announced at Apple's 371 | Worldwide Developers Conference (WWDC) on June 2, 2014 and was later 372 | released to the public on September…
373 |
374 | 375 |
376 |
377 |
379 | × 9592 380 |
a hierarchical naming 381 | system built on a distributed database for computers, services, or any 382 | resource connected to the Internet or a private network.
383 |
384 | 385 |
386 |
387 |
391 | × 9591 392 |
display web content from within an iOS application.
393 |
394 | 395 |
396 |
397 |
399 | × 9586 400 |
a widely-used data structure that emulates a hierarchical tree-like structure with a set of linked nodes.
401 |
402 | 403 |
404 |
405 |
407 | × 9575 408 |
one of many request methods 409 | supported by the HTTP protocol. The GET request method is used when the 410 | client needs to get data from the server as part of the request-URI. 411 |
412 |
413 | 414 |
415 |
416 |
418 | × 9571 419 |
a succinct, expressive and 420 | efficient functional and object-oriented language for .NET which helps 421 | you write simple code to solve complex problems.
422 |
423 | 424 |
425 |
426 |
430 | × 9553 431 |
an open-source, web-based 432 | MySQL administration tool written in PHP. Use this tag for problems 433 | related to using this tool and not for general query problems
434 |
435 | 436 |
437 |
438 |
440 | × 9553 441 |
a data structure in which 442 | the elements contain references to the next (and optionally the 443 | previous) element. Linked lists offer O(1) insert after and removal of 444 | any element with known…
445 |
446 | 447 |
448 |
449 |
451 | × 9528 452 |
Spring Boot makes it easy 453 | to create Spring-powered, production-grade applications and services 454 | with absolute minimum fuss. It takes an opinionated view of the Spring 455 | platform so that new and existing …
456 |
457 | 458 |
459 |
460 |
462 | × 9518 463 |
a data structure relating keys to values. 464 | For questions about mapping functions use [map-function] instead. 465 | For questions about geography, use [maps] instead.
466 |
467 |
468 |
469 |
470 |
474 | × 9511 475 |
Null means *nothing* or *unknown*, depending on context. 476 |
477 |
478 | 479 |
480 |
481 |
483 | × 9500 484 |
The Microsoft Foundation Class Library (MFC) is a C++ framework for Windows GUI programming.
485 |
486 | 487 |
488 |
489 |
491 | × 9484 492 |
the latest version of 493 | Microsoft's Visual Studio product suite. Do not use this tag unless you 494 | have a specific question about Visual Studio -- not just a coding issue.
495 |
496 | 497 |
498 |
499 |
501 | × 9475 502 |
a commercial word processor designed by Microsoft.
503 |
504 | 505 |
506 |
507 |
511 | × 9463 512 |
for questions about the 513 | internals of the Linux kernel itself - particularly about writing code 514 | that runs within the context of the kernel (like kernel modules or 515 | drivers). 516 | 517 | Questions abo…
518 |
519 | 520 |
521 |
522 |
524 | × 9456 525 |
a highly scalable, eventually consistent, distributed, structured row/column store.
526 |
527 | 528 |
529 |
530 |
532 | × 9400 533 |
An XML parser goes through 534 | text documents containing XML trees and allows the information in the 535 | hierarchy to be used. Use this tag for problems implementing an XML 536 | parser or generated by the use of a…
537 |
538 | 539 |
540 |
541 |
543 | × 9392 544 |
a device that takes 545 | physical or digital photos. In the virtual world, it is used to aim at 546 | virtual objects and or move through a virtual scene.
547 |
548 | 549 |
550 |
551 |
555 | × 9392 556 |
for questions specific to the 2008 R2 version of Microsoft's SQL Server.
557 |
558 | 559 |
560 |
561 |
563 | × 9391 564 |
a graphical user interface element which allows for simple input text, usually only a single line of text.
565 |
566 | 567 |
568 |
569 |
571 | × 9368 572 |
a component technology from Microsoft, featuring remoting, language independence and interface-based programming. 573 | For questions about the COM serial port, you should u…
574 |
575 | 576 |
577 |
578 |
580 | × 9287 581 |
the process of selecting paths in a network along which to send network traffic. 582 | 583 | For the process of associating URLs to content, use the tag url-routing instead.
584 |
585 | 586 |
587 |
588 |
592 | × 9266 593 |
a parallel computing 594 | platform and programming model for Nvidia GPUs (Graphics Processing 595 | Units). CUDA provides an interface to Nvidia GPUs through a variety of 596 | programming languages, libraries…
597 |
598 | 599 |
600 |
601 |
603 | × 9229 604 |
the process of adapting a 605 | product or service to a particular language, culture, and desired local 606 | "look-and-feel. Localization can be referred to by the numeronym L10N or 607 | l10n (as in: …
608 |
609 | 610 |
611 |
612 |
614 | × 9220 615 |
a companion tool to the 616 | Android SDK that lets you build performance-critical portions of your 617 | apps in native code or port existing libraries in C/C++ to And…
618 |
619 | 620 |
621 |
622 |
624 | × 9194 625 |
a personal information 626 | manager from Microsoft (most notably used for handling e-mail), 627 | available both as a separate application as well as a part of the 628 | Microsoft Office suite.
629 |
630 | 631 |
632 |
633 |
637 | × 9162 638 |
a report writer that 639 | operates as a stand-alone report designer, an integrated part of Visual 640 | Studio, or part of SAP's Business Objects Enterprise suite. 641 | 642 |
643 |
644 | 645 |
646 |
647 |
649 | × 9127 650 |
IIS (Internet Information 651 | Services) Version 7 – is a web server application and a set of feature 652 | extension modules created by Microsoft for use with Microsoft Windows. 653 | Released with Window Server 2008…
654 |
655 | 656 |
657 |
658 |
660 | × 9121 661 |
An open source BSD-licensed 662 | in-memory data structure store used as database, cache and message 663 | broker. Supports data structures such as strings, hashes, lists, sets, 664 | sorted sets with range queries, bi…
665 |
666 | 667 |
668 |
669 |
671 | × 9107 672 |
alerts, badges, or sounds which are pushed to a mobile device from a remote server. 673 | 674 | Apple delivers push notifications via the Apple Push Notification Service (APNS). Android …
675 |
676 | 677 |
678 |
679 |
683 | × 9104 684 |
an algorithm for processing huge datasets on certain kinds of distributable problems using a large number of nodes
685 |
686 | 687 |
688 |
689 |
691 | × 9047 692 |
an OSGi extensible platform for tool integration. The extensions to Eclipse are Java plug-ins.
693 |
694 | 695 |
696 |
697 |
699 | × 8977 700 |
A notification informs users of events that are unrelated to the current user activity.
701 |
702 | 703 |
704 |
705 |
707 | × 8940 708 |
Microsoft Build Engine, also known as MSBuild, is a build platform for managed code and was part of .NET Framework.
709 |
710 | 711 |
712 |
713 |
717 |
718 | 719 | 720 | 721 | 722 | 723 | 724 | 1 725 | 12 726 | 13 727 | 14 15 728 | 16 729 | 1330 730 | 731 | 732 |
733 | 734 | 735 |
736 | 737 |
738 | 739 | 740 | tag synonyms 741 | 742 |
743 | 744 | 772 |
773 |
774 |
775 | 925 | 928 | 929 | 930 | 955 | 956 | 957 | 958 | -------------------------------------------------------------------------------- /test/test_stackoverflow.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import scrapely.htmlpage as hp 4 | 5 | import page_clustering 6 | 7 | 8 | JOBS = [ 9 | 'Jobs - 1 - Stack Overflow.html', 10 | 'Jobs - 2 - Stack Overflow.html', 11 | 'Jobs - 3- Stack Overflow.html', 12 | 'Jobs - 4 - Stack Overflow.html', 13 | 'Jobs - 5 - Stack Overflow.html', 14 | 'Jobs - 6 - Stack Overflow.html', 15 | 'Jobs - 7 - Stack Overflow.html', 16 | 'Jobs - 8 - Stack Overflow.html', 17 | 'Jobs - 9 - Stack Overflow.html', 18 | 'Jobs - 10 - Stack Overflow.html', 19 | 'Jobs - 11 - Stack Overflow.html', 20 | 'Jobs - 12 - Stack Overflow.html', 21 | 'Jobs - 13 - Stack Overflow.html', 22 | 'Jobs - 14 - Stack Overflow.html', 23 | 'Jobs - 15 - Stack Overflow.html', 24 | ] 25 | 26 | QUESTION_LIST = [ 27 | 'Newest Questions - Page 1 - Stack Overflow.html', 28 | 'Newest Questions - Page 2 - Stack Overflow.html', 29 | 'Newest Questions - Page 3 - Stack Overflow.html', 30 | 'Newest Questions - Page 4 - Stack Overflow.html', 31 | 'Newest Questions - Page 5 - Stack Overflow.html', 32 | 'Newest Questions - Page 6 - Stack Overflow.html', 33 | 'Newest Questions - Page 7 - Stack Overflow.html', 34 | 'Newest Questions - Page 8 - Stack Overflow.html', 35 | 'Newest Questions - Page 9 - Stack Overflow.html', 36 | 'Newest Questions - Page 10 - Stack Overflow.html', 37 | 'Newest Questions - Page 11 - Stack Overflow.html', 38 | 'Newest Questions - Page 12 - Stack Overflow.html', 39 | 'Newest Questions - Page 13 - Stack Overflow.html', 40 | 'Newest Questions - Page 14 - Stack Overflow.html', 41 | 'Newest Questions - Page 15 - Stack Overflow.html', 42 | ] 43 | 44 | QUESTION_DETAIL = [ 45 | 'Question - 1 - Stack Overflow.html', 46 | 'Question - 2 - Stack Overflow.html', 47 | 'Question - 3 - Stack Overflow.html', 48 | 'Question - 4 - Stack Overflow.html', 49 | 'Question - 5 - Stack Overflow.html', 50 | 'Question - 6 - Stack Overflow.html', 51 | 'Question - 7 - Stack Overflow.html', 52 | 'Question - 8 - Stack Overflow.html', 53 | 'Question - 9 - Stack Overflow.html', 54 | 'Question - 10 - Stack Overflow.html', 55 | 'Question - 11 - Stack Overflow.html', 56 | 'Question - 12 - Stack Overflow.html', 57 | 'Question - 13 - Stack Overflow.html', 58 | 'Question - 14 - Stack Overflow.html', 59 | 'Question - 15 - Stack Overflow.html', 60 | ] 61 | 62 | TAGS = [ 63 | 'Tags - 1 - Stack Overflow.html', 64 | 'Tags - 2 - Stack Overflow.html', 65 | 'Tags - 3 - Stack Overflow.html', 66 | 'Tags - 4 - Stack Overflow.html', 67 | 'Tags - 5 - Stack Overflow.html', 68 | 'Tags - 6 - Stack Overflow.html', 69 | 'Tags - 7 - Stack Overflow.html', 70 | 'Tags - 8 - Stack Overflow.html', 71 | 'Tags - 9 - Stack Overflow.html', 72 | 'Tags - 10 - Stack Overflow.html', 73 | 'Tags - 11 - Stack Overflow.html', 74 | 'Tags - 12 - Stack Overflow.html', 75 | 'Tags - 13 - Stack Overflow.html', 76 | 'Tags - 14 - Stack Overflow.html', 77 | 'Tags - 15 - Stack Overflow.html', 78 | ] 79 | 80 | USERS = [ 81 | 'Users - 1 - Stack Overflow.html', 82 | 'Users - 2 - Stack Overflow.html', 83 | 'Users - 3 - Stack Overflow.html', 84 | 'Users - 4 - Stack Overflow.html', 85 | 'Users - 5 - Stack Overflow.html', 86 | 'Users - 6 - Stack Overflow.html', 87 | 'Users - 8 - Stack Overflow.html', 88 | 'Users - 9 - Stack Overflow.html', 89 | 'Users - 10 - Stack Overflow.html', 90 | 'Users - 11 - Stack Overflow.html', 91 | 'Users - 12 - Stack Overflow.html', 92 | 'Users - 13 - Stack Overflow.html', 93 | 'Users - 14 - Stack Overflow.html', 94 | 'Users - 15 - Stack Overflow.html', 95 | ] 96 | 97 | ALL = [ 98 | JOBS, 99 | QUESTION_LIST, 100 | QUESTION_DETAIL, 101 | TAGS, 102 | USERS 103 | ] 104 | 105 | 106 | def load_page(name, path='stackoverflow'): 107 | datadir = os.environ.get('DATAPATH', '.') 108 | with open(os.path.join(datadir, path, name), 'r') as f: 109 | body = f.read() 110 | return hp.HtmlPage(url=name, body=body.decode('utf-8')) 111 | 112 | 113 | def test_clustering(): 114 | clt = page_clustering.kmeans_from_samples( 115 | load_page(group[0]) for group in ALL) 116 | for group in ALL: 117 | for name in group[1:11]: 118 | clt.add_page(load_page(name)) 119 | for i, group in enumerate(ALL): 120 | for name in group: 121 | assert(clt.classify(load_page(name)) == i) 122 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27 3 | 4 | [testenv] 5 | sitepackages=True 6 | deps = 7 | pytest 8 | -r{toxinidir}/requirements.txt 9 | 10 | commands=py.test 11 | 12 | setenv = 13 | DATAPATH={toxinidir}/test --------------------------------------------------------------------------------