├── .gitignore ├── Pipfile ├── setup.py ├── LICENSE ├── Pipfile.lock ├── aggregate.py ├── map.py └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | data/* 2 | dist/* 3 | *.egg-info 4 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [dev-packages] 7 | 8 | [packages] 9 | python-levenshtein = "*" 10 | 11 | [requires] 12 | python_version = "3.6" 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup 4 | 5 | setup(name='redirect-mapper', 6 | version='0.1.0', 7 | description='Generates a redirect map from two sitemaps for website migration.', 8 | author='Joseph Paul', 9 | author_email='joseph@sehrgute.software', 10 | url='https://github.com/jsphpl/redirect-mapper', 11 | install_requires=['python-Levenshtein'], 12 | ) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 2 | Version 2, December 2004 3 | 4 | Copyright (C) 2017 Joseph Paul 5 | 6 | Everyone is permitted to copy and distribute verbatim or modified 7 | copies of this license document, and changing it is allowed as long 8 | as the name is changed. 9 | 10 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 11 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 12 | 13 | 0. You just DO WHAT THE FUCK YOU WANT TO. -------------------------------------------------------------------------------- /Pipfile.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "hash": { 4 | "sha256": "9956d6dc248039b17dd9d9c75f6f6abbcd9547c9a2f73ae9b202a31bbd362eca" 5 | }, 6 | "pipfile-spec": 6, 7 | "requires": { 8 | "python_version": "3.6" 9 | }, 10 | "sources": [ 11 | { 12 | "name": "pypi", 13 | "url": "https://pypi.org/simple", 14 | "verify_ssl": true 15 | } 16 | ] 17 | }, 18 | "default": { 19 | "python-levenshtein": { 20 | "hashes": [ 21 | "sha256:033a11de5e3d19ea25c9302d11224e1a1898fe5abd23c61c7c360c25195e3eb1" 22 | ], 23 | "index": "pypi", 24 | "version": "==0.12.0" 25 | } 26 | }, 27 | "develop": {} 28 | } 29 | -------------------------------------------------------------------------------- /aggregate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Aggregates URLs from a set of XML sitemaps listed under the entry path. 5 | 6 | This script processes the XML file at given path, opens all sitemaps 7 | listed inside, and prints all URLs inside those maps to stdout. 8 | It should support most sitemaps that comply with the spec at 9 | https://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd. 10 | 11 | It was tested with sitemaps generated by the following WP plugins: 12 | - (Google XML Sitemaps)[https://wordpress.org/plugins/google-sitemap-generator/] 13 | - (XML Sitemap & Google News feeds)[https://wordpress.org/plugins/xml-sitemap-feed/] 14 | - (Yoast SEO)[https://wordpress.org/plugins/wordpress-seo/] 15 | 16 | Issues & Documentation: https://github.com/jsphpl/redirect-mapper 17 | """ 18 | 19 | import argparse 20 | import ssl 21 | from pprint import pprint 22 | from urllib2 import urlopen 23 | from xml.etree import ElementTree 24 | 25 | # Ignore self-signed certificates 26 | ssl_context = ssl.create_default_context() 27 | ssl_context.check_hostname = False 28 | ssl_context.verify_mode = ssl.CERT_NONE 29 | 30 | TAGS = { 31 | 'SITEMAP': '{http://www.sitemaps.org/schemas/sitemap/0.9}sitemap', 32 | 'LOCATION': '{http://www.sitemaps.org/schemas/sitemap/0.9}loc', 33 | } 34 | 35 | def main(args): 36 | sources = aggregate(args.entry) 37 | for source in sources: 38 | process(source) 39 | 40 | def aggregate(entry): 41 | """Aggregate urls of sitemaps from one entry xml sitemap""" 42 | file = urlopen(entry, context=ssl_context) 43 | tree = ElementTree.parse(file) 44 | 45 | for sitemap in tree.getroot().findall(TAGS['SITEMAP']): 46 | location = sitemap.find(TAGS['LOCATION']) 47 | if location is not None: 48 | yield location.text 49 | 50 | file.close() 51 | 52 | def process(source): 53 | """Process a single xml sitemap path""" 54 | file = urlopen(source, context=ssl_context) 55 | tree = ElementTree.parse(file) 56 | 57 | for item in tree.getroot(): 58 | location = item.find(TAGS['LOCATION']) 59 | if location is not None: 60 | print(location.text) 61 | 62 | file.close() 63 | 64 | if __name__ == '__main__': 65 | parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawTextHelpFormatter) 66 | parser.add_argument('entry', type=str, metavar='URL/PATH', help='Path or URL of the root sitemap.') 67 | 68 | main(parser.parse_args()) 69 | -------------------------------------------------------------------------------- /map.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Generates a redirect map from two sitemaps for website migration. 5 | 6 | By default, all matches are dumped on the standard output. If an item 7 | from list1 is exactly contained in list2, it will be assigned right 8 | away, without calculating distance or checking for ambiguity. 9 | 10 | Issues & Documentation: https://github.com/jsphpl/redirect-mapper 11 | """ 12 | 13 | import csv 14 | import argparse 15 | import Levenshtein 16 | 17 | def main(args): 18 | # Read files into memory and remove trailing newlines 19 | list1 = [line[:-1] for line in args.list1] 20 | list2 = [line[:-1] for line in args.list2] 21 | 22 | # Inform the user what's happening 23 | print('%i lines in list1' % len(list1)) 24 | print('%i lines in list2' % len(list2)) 25 | print('Threshold is %f' % args.threshold) 26 | 27 | # Do the hard work 28 | iterator = levenshteinMatch(list1, list2, args.threshold, args.drop_exact) 29 | 30 | # Ouput results 31 | if (args.csv): 32 | print('Writing CSV output to "%s"' % args.csv) 33 | with open(args.csv, 'w') as file: 34 | writer = csv.writer(file) 35 | writer.writerow(('Item (list1)', 'Match (list2)', 'Score', 'Ambiguous', 'Exact', 'Alternatives')) 36 | writer.writerows(iterator) 37 | else: 38 | print('\nResults:') 39 | print('--------------------------------------------------------------------------------') 40 | for item in iterator: 41 | print(item) 42 | print('--------------------------------------------------------------------------------') 43 | print('Note: Use the --csv flag to save results into a file') 44 | 45 | def levenshteinMatch(list1, list2, threshold, drop_exact): 46 | """Find matches based on the levenshtein distance. 47 | 48 | Arguments: 49 | list1 {List} -- List of target items for which to find matches 50 | list2 {List} -- List of search items on which to search for matches 51 | threshold {float} -- Range within which two scores are considered equal 52 | drop_exact {bool} -- Omit exact matches 53 | """ 54 | for key in list1: 55 | if key in list2: 56 | # Can skip all the Levenshtein in this case 57 | if not drop_exact: 58 | yield((key, key, 1.0, False, True, [])) 59 | 60 | else: 61 | # Calculate all scores 62 | scores = [round(Levenshtein.ratio(key, value), 2) for value in list2] 63 | 64 | # Find all matches within `threshold` of the highest score 65 | max_score = max(scores) 66 | winners = [i for i, score in enumerate(scores) if max_score - score <= threshold] 67 | winners.sort(reverse=True) 68 | 69 | # Prepare result values for current pass 70 | is_ambiguous = len(winners) > 1 71 | winner_index = None 72 | winner = None 73 | winner_score = None 74 | alternatives = [] 75 | 76 | winner_index = winners[0] 77 | 78 | if winner_index is not None: 79 | winner = list2[winner_index] 80 | winner_score = scores[winner_index] 81 | 82 | if is_ambiguous: 83 | winners.remove(winner_index) 84 | alternatives = [list2[index] for index in winners] 85 | 86 | yield (key, winner, winner_score, is_ambiguous, False, alternatives) 87 | 88 | if __name__ == '__main__': 89 | parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawTextHelpFormatter) 90 | parser.add_argument('list1', type=argparse.FileType('r'), 91 | help='List of target items for which to find matches. (1 item per line)') 92 | parser.add_argument('list2', type=argparse.FileType('r'), 93 | help='List of search items on which to search for matches. (1 item per line)') 94 | parser.add_argument('-t', '--threshold', type=float, default=0.05, metavar='VALUE', 95 | help='Range within which two scores are considered equal. (default: 0.05)') 96 | parser.add_argument('-c', '--csv', type=str, metavar='PATH', 97 | help='If specified, the output will be formatted as CSV and written to PATH') 98 | parser.add_argument('-d', '--drop-exact', action='store_true', 99 | help='If specified, exact matches will be ommited from the output') 100 | 101 | main(parser.parse_args()) 102 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # redirect-mapper 2 | 3 | Takes two lists of URLs and outputs a mapping that assigns each entry in list 1 an item from list 2 along with a score that indicates how likely the two refer to the same thing. 4 | 5 | ## Use case 6 | 7 | This script was created to automatically generate a map of redirects when migrating a website. The input lists would be a sitemap of each the old and new website, both plain text files containing one url per line. The URLs are required to be "pretty", meaning not just `/post.php?id=123` but rather something like `/blog/why-wordpress-sucks` and ideally have their protocol- and domain parts removed. 8 | 9 | It can of course be used as a generic tool to fuzzy match two sets of strings. It uses the [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) metric as implemented by [python-Levenshtein](https://rawgit.com/ztane/python-Levenshtein/master/docs/Levenshtein.html#Levenshtein-ratio). 10 | 11 | **Warning: Always check the results manually. Never trust the output of the script blindly. It will assign each item in list 1 one item from list 2, even if it's a really bad match.** 12 | 13 | ## map.py usage 14 | 15 | 1. Clone this repository `git clone https://github.com/jsphpl/redirect-mapper` 16 | 2. Enter it `cd redirect-mapper` 17 | 3. Install dependencies `python setup.py install` 18 | 4. Use it: 19 | 20 | ``` 21 | $ python map.py [-h] [-t VALUE] [-c PATH] [-d] list1 list2 22 | 23 | Generates a redirect map from two sitemaps for website migration. 24 | 25 | By default, all matches are dumped on the standard output. If an item 26 | from list1 is exactly contained in list2, it will be assigned right 27 | away, without calculating distance or checking for ambiguity. 28 | 29 | Issues & Documentation: https://github.com/jsphpl/redirect-mapper 30 | 31 | positional arguments: 32 | list1 List of target items for which to find matches. (1 item per line) 33 | list2 List of search items on which to search for matches. (1 item per line) 34 | 35 | optional arguments: 36 | -h, --help show this help message and exit 37 | -t VALUE, --threshold VALUE 38 | Range within which two scores are considered equal. (default: 0.05) 39 | -c PATH, --csv PATH If specified, the output will be formatted as CSV and written to PATH 40 | -d, --drop-exact If specified, exact matches will be ommited from the output 41 | ``` 42 | 43 | ### Examples 44 | 45 | #### Generate a list of redirects 46 | 47 | Say your're asking **where to redirect all the urls from *old_sitemap.txt* ?**. Pass it as the first argument like so: 48 | 49 | ```bash 50 | python map.py old_sitemap.txt new_sitemap.txt 51 | ``` 52 | 53 | #### Adjust ambiguity threshold 54 | 55 | To influence the level at which two matches are considered equally good, use the `-t VALUE` argument. 56 | 57 | ```bash 58 | python map.py -t 0.1 old_sitemap.txt new_sitemap.txt 59 | ``` 60 | 61 | #### Omit exact matches 62 | 63 | If the results are used to set up 301 redirects on the new website to catch all traffic arriving at old URLs, exact matches can be omitted. They will be handled by actual pages exisiting on the new site (list2). Use the `-d` flag here. 64 | 65 | ```bash 66 | python map.py -d old_sitemap.txt new_sitemap.txt 67 | ``` 68 | 69 | #### Save output to CSV file 70 | 71 | Specify the output filename with `-c PATH`. 72 | 73 | ```bash 74 | python map.py -c results.csv old_sitemap.txt new_sitemap.txt 75 | ``` 76 | 77 | ## Aggregating URLs from an XML sitemap 78 | 79 | A helper exists that lets you crawl an XML sitemap and outputs a flat list of URLs, as required as input by `map.py`. Together with that tool, the whole process of generating a redirect map could look like the following. After that, you would of course manually check the results.csv, taking special care of matches with a low score (≤0.8). 80 | 81 | ```bash 82 | python aggregate.py https://old-website.com/sitemap.xml > old.txt 83 | python aggregate.py https://new-website.com/sitemap.xml > new.txt 84 | python map.py --drop-exact --csv results.csv old.txt new.txt 85 | ``` 86 | 87 | ### aggregate.py usage 88 | 89 | ``` 90 | $ python aggregate.py [-h] URL/PATH 91 | 92 | Aggregates URLs from a set of XML sitemaps listed under the entry path. 93 | 94 | This script processes the XML file at given path, opens all sitemaps 95 | listed inside, and prints all URLs inside those maps to stdout. 96 | It should support most sitemaps that comply with the spec at 97 | https://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd. 98 | 99 | It was tested with sitemaps generated by the following WP plugins: 100 | - (Google XML Sitemaps)[https://wordpress.org/plugins/google-sitemap-generator/] 101 | - (XML Sitemap & Google News feeds)[https://wordpress.org/plugins/xml-sitemap-feed/] 102 | - (Yoast SEO)[https://wordpress.org/plugins/wordpress-seo/] 103 | 104 | Issues & Documentation: https://github.com/jsphpl/redirect-mapper 105 | 106 | positional arguments: 107 | URL/PATH Path or URL of the root sitemap. 108 | 109 | optional arguments: 110 | -h, --help show this help message and exit 111 | ``` 112 | --------------------------------------------------------------------------------