├── requirements.txt
├── internal_html.png
├── sf_extraction.png
├── README.md
├── LICENSE
└── sf_shingling.py

/requirements.txt:
--------------------------------------------------------------------------------
mmh3
nltk
numpy
pandas
tqdm
--------------------------------------------------------------------------------

/internal_html.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jroakes/screaming-frog-shingling/HEAD/internal_html.png
--------------------------------------------------------------------------------

/sf_extraction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jroakes/screaming-frog-shingling/HEAD/sf_extraction.png
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# screaming-frog-shingling
Uses Screaming Frog's Internal HTML export, together with custom text extraction and a MinHash shingling algorithm, to measure content duplication across the pages of a crawled site. The output CSV adds two columns to the crawl data: `Sim Score` (the highest similarity found for each page, from 0.0 to 1.0) and `Sim Match` (the URL of the most similar other page).

## Example Usage

1. `pip install -r requirements.txt`

1. Run Screaming Frog and use Extraction to pull the content out of a specific DOM element.
![Screaming Frog Extraction](https://raw.githubusercontent.com/jroakes/screaming-frog-shingling/master/sf_extraction.png "Screaming Frog Extraction Example")

1. Export the Internal HTML to a CSV file.
![Export Internal HTML](https://raw.githubusercontent.com/jroakes/screaming-frog-shingling/master/internal_html.png "Screaming Frog Internal HTML Export")

1. Run the script using the following arguments.

```
Example Usage:
-i : Input filename
-o : Output filename
-c : Column from Screaming Frog that contains your extracted content.
Example invocation:
python sf_shingling.py -i internal_html_ap.csv -o output_html_ap.csv -c "BodyContent 1"
```
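The comparison itself is exposed by the `ShingledText` class, so you can sanity-check two snippets of text directly. A minimal sketch, assuming `sf_shingling.py` is in the working directory (the sample texts are made up):

```
from sf_shingling import ShingledText

a = ShingledText("the quick brown fox jumps over the lazy dog today")
b = ShingledText("the quick brown fox jumps over the lazy cat today")

# Jaccard similarity of the two MinHash signatures, between 0.0 and 1.0.
print(a.similarity(b))
```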
--------------------------------------------------------------------------------

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 JR Oakes

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------

/sf_shingling.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding: utf-8

import warnings
# Workaround to silence noisy third-party deprecation warnings.
warnings.warn = lambda *args, **kwargs: None

import argparse
import random

import mmh3
import pandas
from nltk import ngrams
from tqdm import tqdm

# Functions and Classes

def generate_random_seeds(n, seed=5):
    """Return n distinct hash seeds, reproducible for a given seed."""
    random.seed(seed)
    return random.sample(range(1, n + 1), n)

def jaccard_similarity(set_a, set_b):
    """Jaccard similarity of two sets: |intersection| / |union|."""
    return len(set_a.intersection(set_b)) / len(set_a.union(set_b))

class ShingledText:
    """MinHash signature of a text, built from word n-gram shingles."""

    def __init__(self, text, random_seed=5, shingle_length=5, minhash_size=200):
        split_text = text.split()
        if len(split_text) < shingle_length:
            raise ValueError('input text is too short for specified shingle length of {}'.format(shingle_length))

        # Materialize the shingles once; ngrams() returns a one-shot generator.
        self.shingles = [' '.join(shingle) for shingle in ngrams(split_text, shingle_length)]
        self.minhash = []

        # For each seed, keep the minimum hash value over all shingles.
        for hash_seed in generate_random_seeds(minhash_size, random_seed):
            min_value = float('inf')
            for shingle in self.shingles:
                value = mmh3.hash(shingle, hash_seed)
                min_value = min(min_value, value)
            self.minhash.append(min_value)

    def similarity(self, other_shingled_text):
        return jaccard_similarity(set(self.minhash),
                                  set(other_shingled_text.minhash))

def apply_shingled(row, urls, shingles):
    """Find the most similar other page for this row's URL and record it."""
    url = row['address']
    urli = urls.index(url)
    urlsh = shingles[urli]
    high = 0.0
    match = ""

    if not urlsh:
        row['Sim Score'] = 0.0
        row['Sim Match'] = ""
        return row

    for i, sh in enumerate(shingles):
        if i != urli and sh:
            sim = jaccard_similarity(set(urlsh), set(sh))
            if sim > high:
                high = sim
                match = urls[i]

    row['Sim Score'] = high
    row['Sim Match'] = match

    return row

def main(args):

    print('Loading file: {}'.format(args.in_file))
    df = pandas.read_csv(args.in_file)

    # Some Screaming Frog exports prepend a report-title row; skip it.
    if df.columns[0] == 'Internal - HTML':
        df = pandas.read_csv(args.in_file, skiprows=1)

    df.columns = [c.lower() for c in df.columns]

    content_col = args.content_column.lower()

    # Drop rows with no extracted content.
    df = df.dropna(subset=[content_col])
    df.reset_index(drop=True, inplace=True)

    urls = []
    shingles = []

    print('Building content shingles.')
    # Build one MinHash signature per crawled page.
    for i, row in tqdm(df.iterrows(), total=df.shape[0]):

        text = row[content_col]
        url = row['address']
        default = "Maecenas vestibulum euismod dui id scelerisque."
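        # Pages whose content is missing or shorter than six words fall back
        # to the fixed placeholder above, so they all get identical MinHash
        # signatures and will report each other as perfect matches. Treat
        # those scores as "insufficient content", not real duplication.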
        if isinstance(text, str) and len(text.split()) > 5:
            urls.append(url)
            shingles.append(ShingledText(text).minhash)
        else:
            urls.append(url)
            shingles.append(ShingledText(default).minhash)

    print('Applying scores to data.')
    df_comp = df.apply(apply_shingled, args=(urls, shingles), axis=1)

    print('Saving to file: {}'.format(args.out_file))
    df_comp.to_csv(args.out_file, encoding='utf-8', index=False)


'''
Example Usage:

-i : Input filename
-o : Output filename
-c : Column from Screaming Frog that contains your extracted content.

Example invocation:
python sf_shingling.py -i internal_html_ap.csv -o output_html_ap.csv -c "BodyContent 1"

'''

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--in_file', type=str, required=True, help='Input Screaming Frog CSV filename')
    parser.add_argument('-o', '--out_file', type=str, required=True, help='Output CSV filename')
    parser.add_argument('-c', '--content_column', type=str, required=True, help='The name of the column holding the extracted content.')

    args = parser.parse_args()

    main(args)
--------------------------------------------------------------------------------
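A quick way to review the results once the script has run is to load the output CSV and sort by score. This is a hypothetical post-processing snippet, not part of the repository; it assumes the output filename from the example invocation above:

```
import pandas

# Rank pages by their highest duplicate score to surface likely duplicates.
df = pandas.read_csv("output_html_ap.csv")
top = df.sort_values("Sim Score", ascending=False)
print(top[["address", "Sim Score", "Sim Match"]].head(10))
```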