├── requirements.txt
├── internal_html.png
├── sf_extraction.png
├── README.md
├── LICENSE
└── sf_shingling.py
/requirements.txt:
--------------------------------------------------------------------------------
mmh3
nltk
numpy
pandas
tqdm
--------------------------------------------------------------------------------
/internal_html.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jroakes/screaming-frog-shingling/HEAD/internal_html.png
--------------------------------------------------------------------------------
/sf_extraction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jroakes/screaming-frog-shingling/HEAD/sf_extraction.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# screaming-frog-shingling

Uses a Screaming Frog Internal HTML crawl, with custom extraction of page content, and a shingling (MinHash) algorithm to measure content duplication across the pages of a crawled site.

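Under the hood, each page's extracted text is split into overlapping five-word shingles, each shingle is hashed with `mmh3` under many different seeds to build a MinHash signature, and signatures are compared pairwise with Jaccard similarity. The `ShingledText` class in `sf_shingling.py` can also be used directly; a minimal sketch with made-up texts:

```
from sf_shingling import ShingledText

a = ShingledText("the quick brown fox jumps over the lazy dog every day")
b = ShingledText("the quick brown fox leaps over the lazy dog every day")

# Jaccard similarity of the two minhash signatures, between 0.0 and 1.0.
print(a.similarity(b))
```
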
## Example Usage

1. `pip install -r requirements.txt`

1. Run Screaming Frog and use Extraction to pull the content out of a specific DOM element.

   ![Screaming Frog extraction settings](sf_extraction.png)

1. Export the Internal HTML to a CSV file.

   ![Internal HTML export](internal_html.png)

1. Run the script with the following arguments.

```
-i : Input filename
-o : Output filename
-c : Column from Screaming Frog that contains your extracted content

Example invocation:

python sf_shingling.py -i internal_html_ap.csv -o output_html_ap.csv -c "BodyContent 1"
```

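The output is a copy of the crawl data with two added columns: `Sim Score`, the highest Jaccard similarity found between that page's MinHash signature and any other crawled page (0.0 to 1.0), and `Sim Match`, the URL of that most similar page.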
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 JR Oakes

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/sf_shingling.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding: utf-8

# Required libraries
import warnings
# Blunt workaround to silence deprecation warnings from imported libraries.
warnings.warn = lambda *a, **b: None
import mmh3
from nltk import ngrams
import pandas
import random
import argparse
from tqdm import tqdm

# Functions and Classes

def generate_random_seeds(n, seed=5):
    """Return n distinct hash seeds, reproducibly derived from `seed`."""
    random.seed(seed)
    return random.sample(range(1, n + 1), n)

def jaccard_similarity(set_a, set_b):
    """Jaccard similarity of two sets: |intersection| / |union|."""
    return len(set_a.intersection(set_b)) / len(set_a.union(set_b))

class ShingledText:
    """MinHash signature of a text, built from word-level shingles."""

    def __init__(self, text, random_seed=5, shingle_length=5, minhash_size=200):
        split_text = text.split()
        if len(split_text) < shingle_length:
            raise ValueError('input text is too short for specified shingle length of {}'.format(shingle_length))

        self.minhash = []
        self.shingles = list(ngrams(split_text, shingle_length))

        # For each seed, record the minimum mmh3 hash over all shingles.
        for hash_seed in generate_random_seeds(minhash_size, random_seed):
            min_value = float('inf')
            for shingle in self.shingles:
                value = mmh3.hash(' '.join(shingle), hash_seed)
                min_value = min(min_value, value)
            self.minhash.append(min_value)

    def similarity(self, other_shingled_text):
        return jaccard_similarity(set(self.minhash),
                                  set(other_shingled_text.minhash))

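# Quick illustration of the class above (hypothetical texts, not part of the
# CLI flow): two near-duplicate sentences share most of their minhash values.
#
#   a = ShingledText('the quick brown fox jumps over the lazy dog every day')
#   b = ShingledText('the quick brown fox leaps over the lazy dog every day')
#   a.similarity(b)  # Jaccard similarity of the two minhash sets, in [0.0, 1.0]
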
def apply_shingled(row, urls, shingles):
    """Find the most similar other URL for this row's address."""
    url = row['address']
    urli = urls.index(url)
    urlsh = shingles[urli]
    high = 0.0
    match = ""

    if not urlsh:
        row['Sim Score'] = 0.0
        row['Sim Match'] = ""
        return row

    # Compare this URL's signature against every other URL's signature.
    for i, sh in enumerate(shingles):
        if urli != i and sh:
            sim = jaccard_similarity(set(urlsh), set(sh))
            if sim > high:
                high = sim
                match = urls[i]

    row['Sim Score'] = high
    row['Sim Match'] = match

    return row


def main(args):

    print('Loading file: {}'.format(args.in_file))
    df = pandas.read_csv(args.in_file)

    # Screaming Frog exports may begin with a report-title row; if so, re-read
    # the file skipping it.
    if df.columns[0] == 'Internal - HTML':
        df = pandas.read_csv(args.in_file, skiprows=1)

    df.columns = [c.lower() for c in df.columns]

    content_col = args.content_column.lower()

    # Drop rows with no extracted content.
    df = df.dropna(subset=[content_col])
    df.reset_index(drop=True, inplace=True)

    urls = []
    shingles = []

    # Placeholder text for pages with too little content to shingle; such
    # pages share a signature and will therefore match each other.
    default = "Maecenas vestibulum euismod dui id scelerisque."

    print('Building content shingles.')
    # Build the minhash signature for every URL in crawl order.
    for i, row in tqdm(df.iterrows(), total=df.shape[0]):

        text = row[content_col]
        url = row['address']

        urls.append(url)
        if isinstance(text, str) and len(text.split()) > 5:
            shingles.append(ShingledText(text).minhash)
        else:
            shingles.append(ShingledText(default).minhash)

    print('Applying scores to data.')
    df_comp = df.apply(apply_shingled, args=(urls, shingles), axis=1)

    print('Saving to file: {}'.format(args.out_file))
    df_comp.to_csv(args.out_file, encoding='utf-8', index=False)


'''
Example Usage:

-i : Input filename
-o : Output filename
-c : Column from Screaming Frog that contains your extracted content.

Example invocation:
python sf_shingling.py -i internal_html_ap.csv -o output_html_ap.csv -c "BodyContent 1"

'''

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--in_file', type=str, required=True, help='Input Screaming Frog CSV filename')
    parser.add_argument('-o', '--out_file', type=str, required=True, help='Output CSV filename')
    parser.add_argument('-c', '--content_column', type=str, required=True, help='The name of the column holding the extracted content.')

    args = parser.parse_args()

    main(args)

--------------------------------------------------------------------------------