├── .funcignore ├── .github └── dependabot.yml ├── .gitignore ├── .vscode ├── extensions.json ├── launch.json ├── settings.json └── tasks.json ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── analyze_results.py ├── companylookup.py ├── docintelligence.py ├── example.env ├── function_app.py ├── gptvision.py ├── host.json ├── localtest.py ├── orchestrator.py ├── requirements.txt └── rig.py /.funcignore: -------------------------------------------------------------------------------- 1 | 2 | 3 | .venv -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for more information: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | # https://containers.dev/guide/dependabot 6 | 7 | version: 2 8 | updates: 9 | - package-ecosystem: "devcontainers" 10 | directory: "/" 11 | schedule: 12 | interval: weekly 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | #.idea/ 163 | .python_packages 164 | 165 | local.settings.json 166 | 167 | datafiles/ 168 | localtest/ -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "ms-azuretools.vscode-azurefunctions", 4 | "ms-python.python" 5 | ] 6 | } -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "CodeWithMS - Local Testing", 6 | "type": "debugpy", 7 | "request": "launch", 8 | "program": "localtest.py", 9 | "console": "integratedTerminal", 10 | "args": "datafiles/documents > datafiles/test/output.txt" 11 | }, 12 | { 13 | "name": "Python Debugger: Current File with Arguments", 14 | "type": "debugpy", 15 | "request": "launch", 16 | "program": "${file}", 17 | "console": "integratedTerminal", 18 | "args": "${command:pickArgs}" 19 | }, 20 | { 21 | "name": "Attach to Python Functions", 22 | "type": "debugpy", 23 | "request": "attach", 24 | "connect": { 25 | "host": "localhost", 26 | "port": 9091 27 | }, 28 | "preLaunchTask": "func: host start" 29 | } 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "azureFunctions.deploySubpath": ".", 3 | "azureFunctions.scmDoBuildDuringDeployment": true, 4 | "azureFunctions.pythonVenv": ".venv", 5 | "azureFunctions.projectLanguage": "Python", 6 | "azureFunctions.projectRuntime": "~4", 7 | "debug.internalConsoleOptions": "neverOpen", 8 | "azureFunctions.projectLanguageModel": 2 9 | } -------------------------------------------------------------------------------- /.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0.0", 3 | "tasks": [ 4 | { 5 | "type": "func", 6 | "label": "func: host start", 7 | "command": "host start", 8 | "problemMatcher": "$func-python-watch", 9 | "isBackground": true, 10 | "dependsOn": "pip install (functions)" 11 | }, 12 | { 13 | "label": "pip install (functions)", 14 | "type": "shell", 15 | "osx": { 16 | "command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt" 17 | }, 18 | "windows": { 19 | "command": "${config:azureFunctions.pythonVenv}\\Scripts\\python -m pip install -r requirements.txt" 20 | }, 21 | "linux": { 22 | "command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt" 23 | }, 24 | "problemMatcher": [] 25 | } 26 | ] 27 | } -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Intelligent Document Processing with GPT Fallback 2 | 3 | The use of Azure Document Intelligence and its pre-built models is a well-established method to crack unstructured documents. Nevertheless, there are always documents that it is unable to perform reilable extraction with. This process attempts to extract required data points for an invoice. If those elements are not found then we fall back to other means. 4 | - Fuzzy matching against a list of known vales 5 | - Use of GPT models to extract data 6 | 7 | ## Prerequisites for local dev 8 | * Python 3.11 9 | * Azure Functions Core Tools 10 | * Azure Document Intelligence instance with endpoint and API key 11 | * CSV that matches format Type, Code, Name, Name 1, Name 2, City, District, Postal Code, Street, House Number, Country Key, Region 12 | 13 | ## Environment Variables 14 | * ARTIFACT_STORAGE 15 | * DOCUMENT_INTELLIGENCE_ENDPOINT 16 | * DOCUMENT_INTELLIGENCE_KEY 17 | * COMPANY_FILE_PATH 18 | * MODEL_CONFIDENCE_THRESHHOLD=0.8 19 | * GPT4_API_KEY 20 | * GPT4_ENDPOINT 21 | * APPLICATIONINSIGHTS_CONNECTION_STRING 22 | 23 | For local dev add these to a local.settings.json file 24 | 25 | ## Storage account setup 26 | The storage account should have three containers that the function app can write to: 27 | 1. invoices-inbox 28 | 2. invoices-results 29 | 3. invoices-processed 30 | 31 | ## Contributing 32 | 33 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 34 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 35 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 36 | 37 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 38 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 39 | provided by the bot. You will only need to do this once across all repos using our CLA. 40 | 41 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 42 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 43 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 44 | 45 | ## Trademarks 46 | 47 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft 48 | trademarks or logos is subject to and must follow 49 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). 50 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. 51 | Any use of third-party trademarks or logos are subject to those third-party's policies. 52 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps. 7 | - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /analyze_results.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pandas as pd 4 | import glob 5 | import pathlib as Path 6 | import json 7 | 8 | def analyze(): 9 | # pass in a directory 10 | fsarg = os.fsencode(sys.argv[1]) 11 | if ( not os.path.isdir(fsarg)): 12 | print ('bad directory') 13 | return 14 | 15 | # create a dataframe over all json files in this directory 16 | dfs = [] # an empty list to store the data frames 17 | print (f'found {len(os.listdir(fsarg))} files in {sys.argv[1]}') 18 | 19 | for file in glob.glob(sys.argv[1] + '/*.json'): 20 | if (os.path.getsize(file) == 0): 21 | continue 22 | with open(file, 'r', encoding='utf-8') as f: 23 | data = json.load(f) 24 | df = pd.json_normalize(data["candidate_process"]) 25 | print(df) 26 | dfs.append(df) # append the data frame to the list 27 | 28 | temp = pd.concat(dfs, ignore_index=True) # concatenate all the data frames in the list. 29 | temp.to_csv(f'{sys.argv[1]}{os.sep}results.csv', sep=',', mode='w') 30 | 31 | 32 | analyze() 33 | 34 | # run this at command line with python -m analyze_results C:\temp\ipg-inv\processed\[batch stamp] (path to json output files) -------------------------------------------------------------------------------- /companylookup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import abc 3 | import io 4 | import pandas as pd 5 | import requests 6 | 7 | from fuzzywuzzy import process 8 | from fuzzywuzzy import fuzz 9 | 10 | # "valueAddress": { 11 | # "houseNumber": "5454", 12 | # "road": "BEETHOVEN STREET", 13 | # "postalCode": "90066", 14 | # "city": "LOS ANGELES", 15 | # "state": "CA", 16 | # "countryRegion": "USA", 17 | # "streetAddress": "5454 BEETHOVEN STREET" 18 | # } 19 | 20 | # TODO: currently each strategy loops through the whole dataframe 21 | # at current volume that is acceptable but consider other alernatives 22 | class MatchStrategy(metaclass=abc.ABCMeta): 23 | @abc.abstractmethod 24 | def execute(self, df: pd.DataFrame, invoice_data_dict: dict) -> list: 25 | return 26 | 27 | @abc.abstractmethod 28 | def dict_has_required_fields(self, invoice_data_dict: dict) -> bool: 29 | return 30 | 31 | def safe_string(self, input: str) -> str: 32 | return input.replace("\n","").replace("\r","").replace("\t","").strip() 33 | 34 | class ExternalCompanyNameLookup_MatchStrategy(MatchStrategy): 35 | def dict_has_required_fields(self, invoice_data_dict: dict) -> bool: 36 | return invoice_data_dict.get('CustomerName') or None 37 | 38 | def execute(self, df: pd.DataFrame, invoice_data_dict: dict) -> list: 39 | matches = [] 40 | 41 | url = "https://postman-echo.com/post" 42 | payload = {"customer_name": self.safe_string(invoice_data_dict.get('CustomerName').get('valueString'))} 43 | 44 | # Send the HTTP POST request 45 | response = requests.post(url, data=payload) 46 | 47 | # Check if the request was successful 48 | if response.status_code == 200: 49 | # Parse the JSON response 50 | data = response.json() 51 | 52 | # assuming you got back an array you can pass it 53 | # back to the orchestrator 54 | for result in data: 55 | matches.append({'company_code': result['code'], 'company_name': result['name']}) 56 | 57 | return(matches) 58 | else: 59 | print(f"Request failed with status code {response.status_code}") 60 | return(matches) 61 | 62 | class FuzzyCompanyName_PostCode_City_RefineByStreetAndHouse_MatchStrategy(MatchStrategy): 63 | 64 | def dict_has_required_fields(self, invoice_data_dict: dict) -> bool: 65 | customer_name = invoice_data_dict.get('CustomerName') or None 66 | customer_address = invoice_data_dict.get('CustomerAddress') or None 67 | address_components = None if customer_address is None else customer_address.get('valueAddress') 68 | 69 | return ( 70 | customer_name 71 | and customer_address 72 | and address_components 73 | and customer_name.get("valueString") and customer_name.get('confidence') > 0.8 74 | and invoice_data_dict.get('CustomerAddress').get('confidence') > 0.8 ) 75 | 76 | def fuzzy_search_combined(self,query, df, threshold=90, alternative=60 ,limit=10): 77 | matches = process.extract(query, df['Combined'], limit=limit, scorer=fuzz.token_sort_ratio) 78 | results = [df.iloc[match[2]] for match in matches if match[1] >= threshold ] 79 | alternative_results = [df.iloc[match[2]] for match in matches if match[1] >= alternative and match[1] < threshold] 80 | return results, alternative_results 81 | 82 | def refine_results(self,initial_results, address_queries, threshold=80): 83 | for column, query in zip(address_queries.keys(), address_queries.values()): 84 | refined_results = [record for record in initial_results if fuzz.token_set_ratio(record[column], query) >= threshold] 85 | return refined_results 86 | 87 | def append_final_results_to_matches(self,result,final_results): 88 | for record in result: 89 | final_results.append({'company_code': record['Code'], 'company_name': record['Name']}) 90 | return final_results 91 | 92 | def combine_name_address(self,row): 93 | name_parts = [row['Name'], row['Name 1'], row['Name 2'], row['Postal Code'], row['City']] 94 | combined = ' '.join(filter(None, name_parts)) 95 | seen = set() 96 | unique_words = [word for word in combined.split() if not (word in seen or seen.add(word))] 97 | return ' '.join(unique_words) 98 | 99 | def execute(self, df: pd.DataFrame, invoice_data_dict: dict) -> list: 100 | matches = [] 101 | 102 | company_name = invoice_data_dict.get('CustomerName').get('valueString') 103 | address_components = invoice_data_dict.get('CustomerAddress').get('valueAddress') 104 | 105 | # Create a combined column for initial search 106 | df['Combined'] = df.apply(lambda x: self.combine_name_address(x), axis=1) 107 | #Query the column by company name, postal Code and City 108 | initial_query = ' '.join(filter(None,[ company_name.casefold(),(address_components.get('postalCode') or '').casefold(),(address_components.get('city') or '').casefold()])) 109 | 110 | #Get the Initial Search resuult 111 | best_results, alternative_results = self.fuzzy_search_combined(initial_query, df) 112 | 113 | #Store the Best Result 114 | matches = self.append_final_results_to_matches(best_results,matches) 115 | 116 | #Define the refine search components 117 | refine_components = { 118 | 'Street': ' '.join(filter(None,[(address_components.get('house') or '').casefold(),(address_components.get('streetAddress') or '').casefold()])) 119 | } 120 | 121 | #Refine the Initial alternative Result 122 | refine_results = self.refine_results(alternative_results, refine_components) 123 | 124 | #Append the refined result to matches 125 | matches = self.append_final_results_to_matches(refine_results,matches) 126 | 127 | return matches 128 | 129 | class FuzzyCompanyName_FuzzyStreet_ExactCity_ExactPostal_MatchStrategy(MatchStrategy): 130 | def dict_has_required_fields(self, invoice_data_dict: dict) -> bool: 131 | customer_name = invoice_data_dict.get('CustomerName') or None 132 | customer_address = invoice_data_dict.get('CustomerAddress') or None 133 | address_components = None if customer_address is None else invoice_data_dict.get('CustomerAddress').get('valueAddress') 134 | 135 | return ( 136 | customer_name 137 | and customer_name.get("valueString") and customer_name.get('confidence') > 0.8 138 | and customer_address 139 | and address_components 140 | and invoice_data_dict.get('CustomerAddress').get('confidence') > 0.8 141 | and address_components.get('houseNumber') 142 | and address_components.get('road') 143 | and address_components.get('city') 144 | and address_components.get('postalCode')) 145 | 146 | def execute(self, df: pd.DataFrame, invoice_data_dict: dict) -> list: 147 | matches = [] 148 | 149 | company_name = self.safe_string(invoice_data_dict.get('CustomerName').get('valueString')) 150 | address_components = invoice_data_dict.get('CustomerAddress').get('valueAddress') 151 | for key, val in address_components.items(): 152 | address_components[key] = self.safe_string(val) 153 | 154 | # Iterate over the rows in the DataFrame 155 | # TODO: is there a better way besides the brute force loop? 156 | # for company lookup this is probably fine but if more volume is expected than move to a database 157 | for index, row in df.iterrows(): 158 | # Compare the company name and address with the input 159 | name_match_ratio = fuzz.ratio(row['Name'].casefold(), company_name.casefold()) 160 | street_match_ratio = fuzz.ratio(row['Street'], address_components.get('houseNumber') + ' ' + address_components.get('road')) 161 | city_match = address_components.get('city').casefold() == row['City'].casefold() 162 | state_match = True # address_components.get('state').casefold() == row['Region'].casefold() # TODO: state abbreviations? non-US addresses? 163 | postal_match = address_components.get('postalCode') == row['Postal Code'] # TODO: is this US specific?? 164 | 165 | # If the match is above a certain threshold, add the company to the list of matches 166 | # TODO: make the threshold configurable 167 | # TODO: does this matching logic make sense? 168 | if name_match_ratio > 80 and street_match_ratio > 80 and city_match and state_match and postal_match: 169 | matches.append({'company_code': row['Code'], 'company_name': row['Name']}) 170 | 171 | return matches 172 | 173 | class ExactCompanyName_FuzzyStreet_ExactCity_ExactPostal_MatchStrategy(MatchStrategy): 174 | def dict_has_required_fields(self, invoice_data_dict: dict) -> bool: 175 | customer_name = invoice_data_dict.get('CustomerName') or None 176 | customer_address = invoice_data_dict.get('CustomerAddress') or None 177 | address_components = None if customer_address is None else invoice_data_dict.get('CustomerAddress').get('valueAddress') 178 | 179 | return ( 180 | customer_name 181 | and customer_name.get("valueString") and customer_name.get('confidence') > 0.8 182 | and customer_address 183 | and address_components 184 | and invoice_data_dict.get('CustomerAddress').get('confidence') > 0.8 185 | and address_components.get('houseNumber') 186 | and address_components.get('road') 187 | and address_components.get('city') 188 | and address_components.get('postalCode')) 189 | 190 | def execute(self, df: pd.DataFrame, invoice_data_dict: dict) -> list: 191 | 192 | matches = [] 193 | 194 | company_name = self.safe_string(invoice_data_dict.get('CustomerName').get('valueString')) 195 | address_components = invoice_data_dict.get('CustomerAddress').get('valueAddress') 196 | for key, val in address_components.items(): 197 | address_components[key] = self.safe_string(val) 198 | 199 | # Iterate over the rows in the DataFrame 200 | # TODO: is there a better way besides the brute force loop? 201 | # for company lookup this is probably fine but if more volume is expected than move to a database 202 | for index, row in df.iterrows(): 203 | # Compare the company name and address with the input 204 | name_match = row['Name'].casefold() == company_name.casefold() 205 | street_match_ratio = fuzz.ratio(row['Street'], address_components.get('houseNumber') + ' ' + address_components.get('road')) 206 | city_match = address_components.get('city').casefold() == row['City'].casefold() 207 | state_match = True # address_components.get('state').casefold() == row['Region'].casefold() # TODO: state abbreviations? non-US addresses? 208 | postal_match = address_components.get('postalCode') == row['Postal Code'] # TODO: is this US specific?? 209 | 210 | # If the match is above a certain threshold, add the company to the list of matches 211 | # TODO: make the threshold configurable 212 | # TODO: does this matching logic make sense? 213 | if name_match and street_match_ratio > 80 and city_match and state_match and postal_match: 214 | matches.append({'company_code': row['Code'], 'company_name': row['Name']}) 215 | 216 | return matches 217 | 218 | class CompanyMatcher(): 219 | strategy: MatchStrategy 220 | company_listing_df: pd.DataFrame 221 | 222 | def __init__(self, matching_strategy: MatchStrategy, company_listing_df: pd.DataFrame) -> None: 223 | self.strategy = matching_strategy 224 | self.company_listing_df = company_listing_df 225 | 226 | def match_companies(self, invoice_data_dict: dict) -> list: 227 | return self.strategy.execute(self.company_listing_df, invoice_data_dict) 228 | -------------------------------------------------------------------------------- /docintelligence.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import json 4 | 5 | from azure.core.credentials import AzureKeyCredential 6 | from azure.ai.documentintelligence import DocumentIntelligenceClient 7 | from azure.ai.documentintelligence.models import AnalyzeResult 8 | from azure.ai.documentintelligence.models import AnalyzeDocumentRequest 9 | 10 | def crack_invoice(invoice: bytes) -> dict: 11 | # create a doc intelligence client 12 | # https://learn.microsoft.com/en-us/python/api/overview/azure/ai-documentintelligence-readme?view=azure-python-preview 13 | 14 | """ 15 | This code sample shows Prebuilt Invoice operations with the Azure Form Recognizer client library. 16 | The async versions of the samples require Python 3.6 or later. 17 | 18 | To learn more, please visit the documentation - Quickstart: Document Intelligence (formerly Form Recognizer) SDKs 19 | https://learn.microsoft.com/azure/ai-services/document-intelligence/quickstarts/get-started-sdks-rest-api?pivots=programming-language-python 20 | """ 21 | endpoint = os.environ["DOCUMENT_INTELLIGENCE_ENDPOINT"] 22 | key = os.environ["DOCUMENT_INTELLIGENCE_KEY"] 23 | 24 | document_intelligence_client = DocumentIntelligenceClient( 25 | endpoint=endpoint, credential=AzureKeyCredential(key) 26 | ) 27 | 28 | poller = document_intelligence_client.begin_analyze_document("prebuilt-invoice", analyze_request=invoice, content_type="application/octet-stream") 29 | invoice_data: AnalyzeResult = poller.result() 30 | 31 | # TODO: validation and error handling 32 | if invoice_data.documents: 33 | return invoice_data.as_dict().get("documents")[0].get("fields") 34 | 35 | return {} -------------------------------------------------------------------------------- /example.env: -------------------------------------------------------------------------------- 1 | DOCUMENT_INTELLIGENCE_ENDPOINT={URL of your Azure Cognitive Services endpoint} 2 | DOCUMENT_INTELLIGENCE_KEY={Your Azure Cognitive Services key} 3 | COMPANY_FILE_PATH={Path to your company file} 4 | MODEL_CONFIDENCE_THRESHHOLD=0.8 5 | GPT4_API_KEY={Azure OpenAI GPT-4o key} 6 | GPT4_ENDPOINT={Azure OpenAI GPT-4o endpoint} 7 | ARTIFACT_STORAGE='DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;' -------------------------------------------------------------------------------- /function_app.py: -------------------------------------------------------------------------------- 1 | import azure.functions as func 2 | import datetime 3 | import json 4 | import logging 5 | import pandas 6 | import os 7 | 8 | from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient 9 | from azure.monitor.opentelemetry import configure_azure_monitor 10 | from opentelemetry import trace 11 | from orchestrator import ingest_invoice 12 | from pathlib import Path 13 | from datetime import datetime 14 | 15 | app = func.FunctionApp() 16 | 17 | @app.blob_trigger( 18 | arg_name="invoiceblob", 19 | path="invoices-inbox/{filename}", 20 | connection="ARTIFACT_STORAGE") 21 | @app.blob_input( 22 | arg_name="companylistingcsv", 23 | path="metadata/comp_plant_addr.csv", 24 | connection="ARTIFACT_STORAGE") 25 | @app.blob_output( 26 | arg_name="outputblob", 27 | path="invoices-results/{filename}-{rand-guid}.json", 28 | connection="ARTIFACT_STORAGE") 29 | def new_invoice_file( 30 | invoiceblob: func.InputStream, 31 | companylistingcsv: func.InputStream, 32 | outputblob: func.Out[str]): 33 | 34 | # Configure OpenTelemetry to use Azure Monitor with the 35 | # APPLICATIONINSIGHTS_CONNECTION_STRING environment variable. 36 | if ( "APPLICATIONINSIGHTS_CONNECTION_STRING" in os.environ ): 37 | configure_azure_monitor() 38 | 39 | logging.info(f"Python blob trigger function processed blob" 40 | f"Name: {invoiceblob.name} Blob Size: {invoiceblob.length} bytes") 41 | 42 | df = pandas.read_csv(companylistingcsv, dtype={'Postal Code':str}, keep_default_na=False) 43 | invoice_blob_bytes = invoiceblob.read() 44 | results = ingest_invoice(invoiceblob.name, invoice_blob_bytes, df) 45 | 46 | # do something with results 47 | outputblob.set(json.dumps(results)) 48 | 49 | # copy to processed container 50 | blob_service_client = BlobServiceClient.from_connection_string(os.getenv('ARTIFACT_STORAGE')) 51 | # TODO: container name should be soft-coded here 52 | container_client = blob_service_client.get_container_client("invoices-processed") 53 | container_client.upload_blob(f'{Path(invoiceblob.name).stem}_{datetime.now():%Y%m%d_%H%M%S.%f}{Path(invoiceblob.name).suffix}', invoice_blob_bytes) 54 | 55 | # TODO: container name should be soft-coded here 56 | blob_client = blob_service_client.get_blob_client("invoices-inbox",Path(invoiceblob.name).name) 57 | blob_client.delete_blob() 58 | 59 | return 60 | 61 | -------------------------------------------------------------------------------- /gptvision.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import pymupdf 4 | from openai import AzureOpenAI 5 | import base64 6 | import json 7 | 8 | # "valueAddress": { 9 | # "houseNumber": "5454", 10 | # "road": "BEETHOVEN STREET", 11 | # "postalCode": "90066", 12 | # "city": "LOS ANGELES", 13 | # "state": "CA", 14 | # "countryRegion": "USA", 15 | # "streetAddress": "5454 BEETHOVEN STREET" 16 | # } 17 | 18 | def scan_invoice_with_gpt(invoice: bytes) -> dict: 19 | # create a GPT-4o client 20 | # https://learn.microsoft.com/en-us/python/api/overview/azure/ai-documentintelligence-readme?view=azure-python- 21 | # https://platform.openai.com/docs/guides/vision 22 | # We currently support PNG (.png), JPEG (.jpeg and .jpg), WEBP (.webp), and non-animated GIF (.gif). 23 | images = convert_pdf_to_images(invoice) 24 | 25 | # this prompt tries to do a complete extraction in one shot 26 | prompt_structure = { 27 | "PurchaseOrder": {"valueString": "", "confidence": 100}, 28 | "CustomerName": {"valueString": "", "confidence": 100}, 29 | "CustomerAddress": { 30 | "valueAddress": { 31 | "houseNumber": "", 32 | "road": "", 33 | "city": "", 34 | "state": "", 35 | "postalCode": "" 36 | }, 37 | "confidence": 100 38 | } 39 | } 40 | 41 | # TODO: quality of extract might be improved with more thorough prompting, perhaps with examples 42 | prompt = ("Extract the data from this invoice and return it as json. If a value is not present, provide an empty string. Do not format the response with markdown. Use the following structure, setting the valueString key with the value: " + json.dumps(prompt_structure)) 43 | 44 | messages_content = [ 45 | { 46 | "type": "text", 47 | "text": prompt, 48 | } 49 | ] 50 | 51 | for image in images: 52 | messages_content.append( 53 | { 54 | "type": "image_url", 55 | "image_url": { 56 | "url": f"data:image/jpeg;base64,{base64.b64encode(image).decode('utf-8')}" 57 | } 58 | }) 59 | 60 | oai_client = AzureOpenAI( 61 | # https://learn.microsoft.com/azure/ai-services/openai/reference#rest-api-versioning 62 | api_version="2024-02-01", 63 | # https://learn.microsoft.com/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal#create-a-resource 64 | azure_endpoint=os.getenv("AZURE_OPENAI_API_ENDPOINT") 65 | ) 66 | 67 | api_response = oai_client.chat.completions.create( 68 | model="gpt-4o", 69 | response_format={"type": "json_object"}, 70 | messages= [ 71 | { 72 | "role":"user", 73 | "content":messages_content 74 | } 75 | ] 76 | ) 77 | 78 | # TODO: validate api_response 79 | 80 | return (json.loads(api_response.choices[0].message.content)) 81 | 82 | def convert_pdf_to_images(invoice: bytes) -> list: 83 | images = [] 84 | 85 | # TODO: when run locally invoice is not a byte array 86 | # and this stream parameter throws an exception 87 | pdf_document = pymupdf.open(stream=invoice) 88 | for page_number in range(len(pdf_document)): 89 | page = pdf_document.load_page(page_number) 90 | image = page.get_pixmap().pil_tobytes(format="PNG") 91 | images.append(image) 92 | 93 | return images -------------------------------------------------------------------------------- /host.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0", 3 | "logging": { 4 | "applicationInsights": { 5 | "samplingSettings": { 6 | "isEnabled": true, 7 | "excludedTypes": "Request" 8 | } 9 | } 10 | }, 11 | "extensionBundle": { 12 | "id": "Microsoft.Azure.Functions.ExtensionBundle", 13 | "version": "[4.*, 5.0.0)" 14 | } 15 | } -------------------------------------------------------------------------------- /localtest.py: -------------------------------------------------------------------------------- 1 | from orchestrator import ingest_invoice 2 | from dotenv import load_dotenv 3 | import json 4 | import sys 5 | import os 6 | 7 | def run_test(): 8 | load_dotenv() 9 | 10 | fsarg = os.fsencode(sys.argv[1]) 11 | 12 | if ( os.path.isdir(fsarg)): 13 | for file in os.listdir(fsarg): 14 | filename = os.fsdecode(file) 15 | if filename.endswith(".pdf"): 16 | with open(sys.argv[1] + '/' + filename, "rb") as f: 17 | print ('******* ' + filename + ' *******\n') 18 | print(json.dumps(ingest_invoice(f), indent=2)) 19 | print ('\n\n') 20 | continue 21 | else: 22 | continue 23 | elif ( os.path.isfile(fsarg)): 24 | with open(fsarg, "rb") as f: 25 | print(json.dumps(ingest_invoice(f), indent=2)) 26 | 27 | run_test() -------------------------------------------------------------------------------- /orchestrator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import json 3 | import datetime 4 | import os 5 | import io 6 | import pandas 7 | 8 | from docintelligence import crack_invoice 9 | import companylookup 10 | from gptvision import scan_invoice_with_gpt 11 | 12 | model_confidence_threshhold = float(os.environ.get("MODEL_CONFIDENCE_THRESHHOLD", 0.8)) 13 | 14 | # 15 | def attempt_company_lookup_strategies( 16 | invoice_data_dict: dict, 17 | company_listing_df: pandas.DataFrame, 18 | ai_service: str, 19 | candidateprocess_dict: dict) -> dict: 20 | """ Attempt to match the company name and address to known companies using various strategies """ 21 | 22 | # this list has all the strategies we want to try 23 | # as soon as one returns candidates we exit 24 | match_strategies = [ 25 | companylookup.ExactCompanyName_FuzzyStreet_ExactCity_ExactPostal_MatchStrategy(), 26 | companylookup.FuzzyCompanyName_FuzzyStreet_ExactCity_ExactPostal_MatchStrategy(), 27 | companylookup.FuzzyCompanyName_PostCode_City_RefineByStreetAndHouse_MatchStrategy()] 28 | 29 | for match_strategy in match_strategies: 30 | # only execute this strategy if we have the required data 31 | if ( not match_strategy.dict_has_required_fields(invoice_data_dict) ): 32 | continue 33 | 34 | # create a matcher engine with this strategy 35 | matcher = companylookup.CompanyMatcher(match_strategy, company_listing_df) 36 | 37 | # execute matcher 38 | company_candidates = matcher.match_companies(invoice_data_dict) 39 | 40 | # if we found candidates then we return and stop processing other strategies 41 | if ( len(company_candidates) > 0): 42 | candidateprocess_dict["process"] = 'COMPANY_MATCH' 43 | candidateprocess_dict["ai_service"] = ai_service 44 | candidateprocess_dict["strategy"] = match_strategy.__class__.__name__ 45 | candidateprocess_dict["company_candidates"] = company_candidates 46 | candidateprocess_dict["execution_end"] = datetime.datetime.now().isoformat() 47 | return {'candidate_process':candidateprocess_dict, 'invoice_data': invoice_data_dict} 48 | 49 | return None 50 | 51 | def validate_po_number(invoice_data: dict) -> bool: 52 | # customer_id = invoice_data.get("CustomerId") 53 | # if customer_id and customer_id.get("confidence") > model_confidence_threshhold: 54 | # return True 55 | 56 | # TODO: is there any format to PO number we could verify? 57 | purchase_order = invoice_data.get("PurchaseOrder") or None 58 | if purchase_order and purchase_order.get("confidence") > model_confidence_threshhold: 59 | return True 60 | 61 | return False 62 | 63 | def validate_gpt_invoice_data(invoice_data: dict) -> bool: 64 | # the GPT-4o data does not guarantee it will match 65 | # the DI schema so we need to validate and possibly 66 | # scrub 67 | return True 68 | 69 | def process_extracted_invoice_data( 70 | invoice_data_dict: dict, 71 | company_listing_df: pandas.DataFrame, 72 | ai_service: str, 73 | candidateprocess_dict: dict) -> dict: 74 | 75 | # check the data dictionary for PO, or Company code. If any of these are found, it writes all the data and 76 | # their corresponding confidence scores, along with the number of pages in the document, to the suggested company file 77 | # in `.csv` format. If DI didn't extract anything for a data element, write `NONE` in that position. 78 | # and exit 79 | 80 | # PO Number is a special case because we immediately exit 81 | # since it doesn't return a list of candidates we won't 82 | # make it a company strategy 83 | if validate_po_number(invoice_data_dict): 84 | candidateprocess_dict["process"] = 'PONUMBER' 85 | candidateprocess_dict["ai_service"] = ai_service 86 | candidateprocess_dict["purchaseorder"] = invoice_data_dict.get('PurchaseOrder').get('valueString') 87 | candidateprocess_dict["execution_end"] = datetime.datetime.now().isoformat() 88 | return {'candidate_process':candidateprocess_dict, 'invoice_data': invoice_data_dict} 89 | 90 | ## move to company metadata search 91 | company_candidates = attempt_company_lookup_strategies(invoice_data_dict, company_listing_df, ai_service, candidateprocess_dict) 92 | if ( company_candidates ): 93 | return company_candidates 94 | 95 | return None 96 | 97 | def ingest_invoice(source_file_name: str, invoice: bytes, company_listing_df: pandas.DataFrame) -> dict: 98 | """ Manage the orchestration of invoice processing """ 99 | # TODO: add logging 100 | 101 | candidateprocess_dict = { 102 | 'source_file': source_file_name, 103 | 'process':'', 104 | 'ai_service':'', 105 | 'strategy':'', 106 | 'purchaseorder':'', 107 | 'company_candidates':[], 108 | 'execution_start': datetime.datetime.now().isoformat(), 109 | 'execution_end': None} 110 | 111 | # call the document analyze and poll for completion using pre-built invoice model 112 | di_invoice_data_dict = crack_invoice(invoice) 113 | 114 | results = process_extracted_invoice_data(di_invoice_data_dict, company_listing_df, 'DocIntelligence', candidateprocess_dict) 115 | 116 | if results: 117 | return results 118 | 119 | # no dice from cracked document data, move to GPT-4o 120 | gpt_invoice_data_dict = scan_invoice_with_gpt(invoice) 121 | 122 | results = process_extracted_invoice_data(gpt_invoice_data_dict, company_listing_df, 'GPT-4o', candidateprocess_dict) 123 | 124 | if results: 125 | return results 126 | 127 | # TODO: failover to manual intervention 128 | candidateprocess_dict["execution_end"] = datetime.datetime.now().isoformat() 129 | 130 | # in case of no matches we return the Doc Intelligence invoice data 131 | return {'candidate_process':candidateprocess_dict, 'invoice_data': di_invoice_data_dict} 132 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Do not include azure-functions-worker in this file 2 | # The Python Worker is managed by the Azure Functions platform 3 | # Manually managing azure-functions-worker may cause unexpected issues 4 | 5 | azure-functions 6 | azure-storage-blob 7 | azure-ai-documentintelligence 8 | python-dotenv 9 | pandas 10 | fuzzywuzzy 11 | python-Levenshtein 12 | pymupdf 13 | openai 14 | pillow 15 | azure-monitor-opentelemetry 16 | requests -------------------------------------------------------------------------------- /rig.py: -------------------------------------------------------------------------------- 1 | from orchestrator import ingest_invoice 2 | from dotenv import load_dotenv 3 | import json 4 | import sys 5 | import os 6 | import pandas 7 | from datetime import datetime 8 | from multiprocessing.dummy import Pool as ThreadPool 9 | from pathlib import Path 10 | import glob 11 | 12 | def handle_file(filename): 13 | with open(filename, "rb") as f: 14 | if ( os.path.getsize(filename) > 4000000): 15 | return # free tier max doc size is 4MB 16 | print (f'******* Starting {filename} *******\n') 17 | with open(f'{root}{Path(filename).stem}.json', 'w', encoding='utf-8') as g: 18 | json.dump(ingest_invoice(filename, f, companies_df), g, ensure_ascii=False, indent=2) 19 | print (f'******* Completed {filename} *******\n') 20 | 21 | # def run_test(): 22 | 23 | # fsarg = os.fsencode(sys.argv[1]) 24 | # batchstamp = f'{datetime.now():%Y%m%d_%H%M%S}' 25 | 26 | # if ( os.path.isdir(fsarg)): 27 | # root = f'{sys.argv[1]}{os.sep}processed{os.sep}{batchstamp}{os.sep}' 28 | # Path(root).mkdir(parents=True, exist_ok=True) 29 | 30 | 31 | # pool = ThreadPool(4) 32 | # results = pool.map(handle_file, my_array) 33 | 34 | # for file in os.listdir(fsarg)[0:100]: 35 | # filename = os.fsdecode(file) 36 | # if filename.endswith(".pdf"): 37 | # handle_file(root, filename, f, df) 38 | # continue 39 | # else: 40 | # continue 41 | # elif ( os.path.isfile(fsarg)): 42 | # with open(fsarg, "rb") as f: 43 | # print(json.dumps(ingest_invoice(f.read(),df), indent=2)) 44 | 45 | load_dotenv(override=True) 46 | companies_df = pandas.read_csv(os.environ["COMPANY_FILE_PATH"], dtype={'Postal Code':str}, keep_default_na=False) 47 | fsarg = os.fsencode(sys.argv[1]) 48 | batchstamp = f'{datetime.now():%Y%m%d_%H%M%S}' 49 | 50 | if ( not os.path.isdir(fsarg)): 51 | print ('bad directory') 52 | exit() 53 | 54 | root = f'{sys.argv[1]}{os.sep}processed{os.sep}{batchstamp}{os.sep}' 55 | Path(root).mkdir(parents=True, exist_ok=True) 56 | 57 | print(f'Starting batch {batchstamp}') 58 | pool = ThreadPool(10) 59 | results = pool.map(handle_file, glob.glob(sys.argv[1] + '/*.pdf')) 60 | 61 | # run this at command line with python -m rig C:\temp\ipg-inv (path to pdf invoices) --------------------------------------------------------------------------------