├── requirements.txt
├── internal_html.png
├── sf_extraction.png
├── README.md
├── LICENSE
└── sf_shingling.py

/requirements.txt:
--------------------------------------------------------------------------------
mmh3
nltk
numpy
pandas
tqdm
--------------------------------------------------------------------------------

/internal_html.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jroakes/screaming-frog-shingling/HEAD/internal_html.png
--------------------------------------------------------------------------------

/sf_extraction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jroakes/screaming-frog-shingling/HEAD/sf_extraction.png
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# screaming-frog-shingling
Uses Screaming Frog's Internal HTML export, together with custom text extraction and a MinHash shingling algorithm, to measure content duplication across the pages of a crawled site. The output CSV adds two columns to the crawl data: `Sim Score` (the highest similarity found for each page, from 0.0 to 1.0) and `Sim Match` (the URL of the most similar other page).

## Example Usage

1. `pip install -r requirements.txt`

1. Run Screaming Frog and use Extraction to pull the content out of a specific DOM element.
![Screaming Frog Extraction](https://raw.githubusercontent.com/jroakes/screaming-frog-shingling/master/sf_extraction.png "Screaming Frog Extraction Example")

1. Export the Internal HTML to a CSV file.
![Export Internal HTML](https://raw.githubusercontent.com/jroakes/screaming-frog-shingling/master/internal_html.png "Screaming Frog Internal HTML Export")

1. Run the script using the following arguments.

```
Example Usage:
-i : Input filename
-o : Output filename
-c : Column from Screaming Frog that contains your extracted content.
Example invocation:
python sf_shingling.py -i internal_html_ap.csv -o output_html_ap.csv -c "BodyContent 1"
```
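The comparison itself is exposed by the `ShingledText` class, so you can sanity-check two snippets of text directly. A minimal sketch, assuming `sf_shingling.py` is in the working directory (the sample texts are made up):

```
from sf_shingling import ShingledText

a = ShingledText("the quick brown fox jumps over the lazy dog today")
b = ShingledText("the quick brown fox jumps over the lazy cat today")

# Jaccard similarity of the two MinHash signatures, between 0.0 and 1.0.
print(a.similarity(b))
```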
--------------------------------------------------------------------------------

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 JR Oakes

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------

/sf_shingling.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding: utf-8

import warnings
# Workaround to silence noisy third-party deprecation warnings.
warnings.warn = lambda *args, **kwargs: None

import argparse
import random

import mmh3
import pandas
from nltk import ngrams
from tqdm import tqdm

# Functions and Classes

def generate_random_seeds(n, seed=5):
    """Return n distinct hash seeds, reproducible for a given seed."""
    random.seed(seed)
    return random.sample(range(1, n + 1), n)

def jaccard_similarity(set_a, set_b):
    """Jaccard similarity of two sets: |intersection| / |union|."""
    return len(set_a.intersection(set_b)) / len(set_a.union(set_b))

class ShingledText:
    """MinHash signature of a text, built from word n-gram shingles."""

    def __init__(self, text, random_seed=5, shingle_length=5, minhash_size=200):
        split_text = text.split()
        if len(split_text) < shingle_length:
            raise ValueError('input text is too short for specified shingle length of {}'.format(shingle_length))

        # Materialize the shingles once; ngrams() returns a one-shot generator.
        self.shingles = [' '.join(shingle) for shingle in ngrams(split_text, shingle_length)]
        self.minhash = []

        # For each seed, keep the minimum hash value over all shingles.
        for hash_seed in generate_random_seeds(minhash_size, random_seed):
            min_value = float('inf')
            for shingle in self.shingles:
                value = mmh3.hash(shingle, hash_seed)
                min_value = min(min_value, value)
            self.minhash.append(min_value)

    def similarity(self, other_shingled_text):
        return jaccard_similarity(set(self.minhash),
                                  set(other_shingled_text.minhash))

def apply_shingled(row, urls, shingles):
    """Find the most similar other page for this row's URL and record it."""
    url = row['address']
    urli = urls.index(url)
    urlsh = shingles[urli]
    high = 0.0
    match = ""

    if not urlsh:
        row['Sim Score'] = 0.0
        row['Sim Match'] = ""
        return row

    for i, sh in enumerate(shingles):
        if i != urli and sh:
            sim = jaccard_similarity(set(urlsh), set(sh))
            if sim > high:
                high = sim
                match = urls[i]

    row['Sim Score'] = high
    row['Sim Match'] = match

    return row

def main(args):

    print('Loading file: {}'.format(args.in_file))
    df = pandas.read_csv(args.in_file)

    # Some Screaming Frog exports prepend a report-title row; skip it.
    if df.columns[0] == 'Internal - HTML':
        df = pandas.read_csv(args.in_file, skiprows=1)

    df.columns = [c.lower() for c in df.columns]

    content_col = args.content_column.lower()

    # Drop rows with no extracted content.
    df = df.dropna(subset=[content_col])
    df.reset_index(drop=True, inplace=True)

    urls = []
    shingles = []

    print('Building content shingles.')
    # Build one MinHash signature per crawled page.
    for i, row in tqdm(df.iterrows(), total=df.shape[0]):

        text = row[content_col]
        url = row['address']
        default = "Maecenas vestibulum euismod dui id scelerisque."
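        # Pages whose content is missing or shorter than six words fall back
        # to the fixed placeholder above, so they all get identical MinHash
        # signatures and will report each other as perfect matches. Treat
        # those scores as "insufficient content", not real duplication.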
        if isinstance(text, str) and len(text.split()) > 5:
            urls.append(url)
            shingles.append(ShingledText(text).minhash)
        else:
            urls.append(url)
            shingles.append(ShingledText(default).minhash)

    print('Applying scores to data.')
    df_comp = df.apply(apply_shingled, args=(urls, shingles), axis=1)

    print('Saving to file: {}'.format(args.out_file))
    df_comp.to_csv(args.out_file, encoding='utf-8', index=False)


'''
Example Usage:

-i : Input filename
-o : Output filename
-c : Column from Screaming Frog that contains your extracted content.

Example invocation:
python sf_shingling.py -i internal_html_ap.csv -o output_html_ap.csv -c "BodyContent 1"

'''

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--in_file', type=str, required=True, help='Input Screaming Frog CSV filename')
    parser.add_argument('-o', '--out_file', type=str, required=True, help='Output CSV filename')
    parser.add_argument('-c', '--content_column', type=str, required=True, help='The name of the column holding the extracted content.')

    args = parser.parse_args()

    main(args)
--------------------------------------------------------------------------------
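A quick way to review the results once the script has run is to load the output CSV and sort by score. This is a hypothetical post-processing snippet, not part of the repository; it assumes the output filename from the example invocation above:

```
import pandas

# Rank pages by their highest duplicate score to surface likely duplicates.
df = pandas.read_csv("output_html_ap.csv")
top = df.sort_values("Sim Score", ascending=False)
print(top[["address", "Sim Score", "Sim Match"]].head(10))
```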