├── .funcignore
├── .github
    └── dependabot.yml
├── .gitignore
├── .vscode
    ├── extensions.json
    ├── launch.json
    ├── settings.json
    └── tasks.json
├── CODE_OF_CONDUCT.md
├── LICENSE
├── README.md
├── SECURITY.md
├── SUPPORT.md
├── analyze_results.py
├── companylookup.py
├── docintelligence.py
├── example.env
├── function_app.py
├── gptvision.py
├── host.json
├── localtest.py
├── orchestrator.py
├── requirements.txt
└── rig.py


/.funcignore:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | .venv


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | # To get started with Dependabot version updates, you'll need to specify which
 2 | # package ecosystems to update and where the package manifests are located.
 3 | # Please see the documentation for more information:
 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
 5 | # https://containers.dev/guide/dependabot
 6 | 
 7 | version: 2
 8 | updates:
 9 |  - package-ecosystem: "devcontainers"
10 |    directory: "/"
11 |    schedule:
12 |      interval: weekly
13 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 | 
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 | 
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 | 
121 | # SageMath parsed files
122 | *.sage.py
123 | 
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 | 
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 | 
137 | # Rope project settings
138 | .ropeproject
139 | 
140 | # mkdocs documentation
141 | /site
142 | 
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 | 
148 | # Pyre type checker
149 | .pyre/
150 | 
151 | # pytype static type analyzer
152 | .pytype/
153 | 
154 | # Cython debug symbols
155 | cython_debug/
156 | 
157 | # PyCharm
158 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
161 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 | .python_packages
164 | 
165 | local.settings.json
166 | 
167 | datafiles/
168 | localtest/


--------------------------------------------------------------------------------
/.vscode/extensions.json:
--------------------------------------------------------------------------------
1 | {
2 |   "recommendations": [
3 |     "ms-azuretools.vscode-azurefunctions",
4 |     "ms-python.python"
5 |   ]
6 | }


--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": "0.2.0",
 3 |   "configurations": [
 4 |     {
 5 |       "name": "CodeWithMS - Local Testing",
 6 |       "type": "debugpy",
 7 |       "request": "launch",
 8 |       "program": "localtest.py",
 9 |       "console": "integratedTerminal",
10 |       "args": "datafiles/documents > datafiles/test/output.txt"
11 |     },    
12 |     {
13 |       "name": "Python Debugger: Current File with Arguments",
14 |       "type": "debugpy",
15 |       "request": "launch",
16 |       "program": "${file}",
17 |       "console": "integratedTerminal",
18 |       "args": "${command:pickArgs}"
19 |     },
20 |     {
21 |       "name": "Attach to Python Functions",
22 |       "type": "debugpy",
23 |       "request": "attach",
24 |       "connect": {
25 |         "host": "localhost",
26 |         "port": 9091
27 |       },
28 |       "preLaunchTask": "func: host start"
29 |     }
30 |   ]
31 | }
32 | 


--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |     "azureFunctions.deploySubpath": ".",
3 |     "azureFunctions.scmDoBuildDuringDeployment": true,
4 |     "azureFunctions.pythonVenv": ".venv",
5 |     "azureFunctions.projectLanguage": "Python",
6 |     "azureFunctions.projectRuntime": "~4",
7 |     "debug.internalConsoleOptions": "neverOpen",
8 |     "azureFunctions.projectLanguageModel": 2
9 | }


--------------------------------------------------------------------------------
/.vscode/tasks.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"version": "2.0.0",
 3 | 	"tasks": [
 4 | 		{
 5 | 			"type": "func",
 6 | 			"label": "func: host start",
 7 | 			"command": "host start",
 8 | 			"problemMatcher": "$func-python-watch",
 9 | 			"isBackground": true,
10 | 			"dependsOn": "pip install (functions)"
11 | 		},
12 | 		{
13 | 			"label": "pip install (functions)",
14 | 			"type": "shell",
15 | 			"osx": {
16 | 				"command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt"
17 | 			},
18 | 			"windows": {
19 | 				"command": "${config:azureFunctions.pythonVenv}\\Scripts\\python -m pip install -r requirements.txt"
20 | 			},
21 | 			"linux": {
22 | 				"command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt"
23 | 			},
24 | 			"problemMatcher": []
25 | 		}
26 | 	]
27 | }


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Microsoft Open Source Code of Conduct
 2 | 
 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
 4 | 
 5 | Resources:
 6 | 
 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 |     MIT License
 2 | 
 3 |     Copyright (c) Microsoft Corporation.
 4 | 
 5 |     Permission is hereby granted, free of charge, to any person obtaining a copy
 6 |     of this software and associated documentation files (the "Software"), to deal
 7 |     in the Software without restriction, including without limitation the rights
 8 |     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 |     copies of the Software, and to permit persons to whom the Software is
10 |     furnished to do so, subject to the following conditions:
11 | 
12 |     The above copyright notice and this permission notice shall be included in all
13 |     copies or substantial portions of the Software.
14 | 
15 |     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 |     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 |     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 |     SOFTWARE
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Intelligent Document Processing with GPT Fallback
 2 | 
 3 | The use of Azure Document Intelligence and its pre-built models is a well-established method to crack unstructured documents. Nevertheless, there are always documents that it is unable to perform reilable extraction with. This process attempts to extract required data points for an invoice. If those elements are not found then we fall back to other means.
 4 | - Fuzzy matching against a list of known vales
 5 | - Use of GPT models to extract data
 6 | 
 7 | ## Prerequisites for local dev
 8 | * Python 3.11
 9 | * Azure Functions Core Tools
10 | * Azure Document Intelligence instance with endpoint and API key
11 | * CSV that matches format Type, Code, Name, Name 1, Name 2, City, District, Postal Code, Street, House Number, Country Key, Region
12 | 
13 | ## Environment Variables
14 | * ARTIFACT_STORAGE
15 | * DOCUMENT_INTELLIGENCE_ENDPOINT
16 | * DOCUMENT_INTELLIGENCE_KEY
17 | * COMPANY_FILE_PATH
18 | * MODEL_CONFIDENCE_THRESHHOLD=0.8
19 | * GPT4_API_KEY
20 | * GPT4_ENDPOINT
21 | * APPLICATIONINSIGHTS_CONNECTION_STRING
22 | 
23 | For local dev add these to a local.settings.json file 
24 | 
25 | ## Storage account setup
26 | The storage account should have three containers that the function app can write to:
27 | 1. invoices-inbox
28 | 2. invoices-results
29 | 3. invoices-processed
30 | 
31 | ## Contributing
32 | 
33 | This project welcomes contributions and suggestions.  Most contributions require you to agree to a
34 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
35 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
36 | 
37 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide
38 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
39 | provided by the bot. You will only need to do this once across all repos using our CLA.
40 | 
41 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
42 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
43 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
44 | 
45 | ## Trademarks
46 | 
47 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft 
48 | trademarks or logos is subject to and must follow 
49 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
50 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
51 | Any use of third-party trademarks or logos are subject to those third-party's policies.
52 | 


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
 1 | <!-- BEGIN MICROSOFT SECURITY.MD V0.0.9 BLOCK -->
 2 | 
 3 | ## Security
 4 | 
 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).
 6 | 
 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.
 8 | 
 9 | ## Reporting Security Issues
10 | 
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 | 
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).
14 | 
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com).  If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).
16 | 
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 
18 | 
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 | 
21 |   * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 |   * Full paths of source file(s) related to the manifestation of the issue
23 |   * The location of the affected source code (tag/branch/commit or direct URL)
24 |   * Any special configuration required to reproduce the issue
25 |   * Step-by-step instructions to reproduce the issue
26 |   * Proof-of-concept or exploit code (if possible)
27 |   * Impact of the issue, including how an attacker might exploit the issue
28 | 
29 | This information will help us triage your report more quickly.
30 | 
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
32 | 
33 | ## Preferred Languages
34 | 
35 | We prefer all communications to be in English.
36 | 
37 | ## Policy
38 | 
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).
40 | 
41 | <!-- END MICROSOFT SECURITY.MD BLOCK -->
42 | 


--------------------------------------------------------------------------------
/SUPPORT.md:
--------------------------------------------------------------------------------
 1 | # TODO: The maintainer of this repo has not yet edited this file
 2 | 
 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?
 4 | 
 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help.
 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps.
 7 | - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide.
 8 | 
 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.*
10 | 
11 | # Support
12 | 
13 | ## How to file issues and get help  
14 | 
15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 
16 | issues before filing new issues to avoid duplicates.  For new issues, file your bug or 
17 | feature request as a new Issue.
18 | 
19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 
20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**.
22 | 
23 | ## Microsoft Support Policy  
24 | 
25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above.
26 | 


--------------------------------------------------------------------------------
/analyze_results.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import pandas as pd
 4 | import glob
 5 | import pathlib as Path
 6 | import json
 7 | 
 8 | def analyze():
 9 |     # pass in a directory
10 |     fsarg = os.fsencode(sys.argv[1])
11 |     if ( not os.path.isdir(fsarg)):
12 |         print ('bad directory')
13 |         return
14 | 
15 |     # create a dataframe over all json files in this directory
16 |     dfs = [] # an empty list to store the data frames
17 |     print (f'found {len(os.listdir(fsarg))} files in {sys.argv[1]}')
18 | 
19 |     for file in glob.glob(sys.argv[1] + '/*.json'):
20 |         if (os.path.getsize(file) == 0):
21 |             continue
22 |         with open(file, 'r', encoding='utf-8') as f:
23 |             data = json.load(f)
24 |         df = pd.json_normalize(data["candidate_process"])
25 |         print(df)
26 |         dfs.append(df) # append the data frame to the list
27 |     
28 |     temp = pd.concat(dfs, ignore_index=True) # concatenate all the data frames in the list.
29 |     temp.to_csv(f'{sys.argv[1]}{os.sep}results.csv', sep=',', mode='w')
30 | 
31 | 
32 | analyze()
33 | 
34 | # run this at command line with python -m analyze_results C:\temp\ipg-inv\processed\[batch stamp] (path to json output files)


--------------------------------------------------------------------------------
/companylookup.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import abc
  3 | import io
  4 | import pandas as pd
  5 | import requests
  6 | 
  7 | from fuzzywuzzy import process
  8 | from fuzzywuzzy import fuzz
  9 | 
 10 |     # "valueAddress": {
 11 |     #   "houseNumber": "5454",
 12 |     #   "road": "BEETHOVEN STREET",
 13 |     #   "postalCode": "90066",
 14 |     #   "city": "LOS ANGELES",
 15 |     #   "state": "CA",
 16 |     #   "countryRegion": "USA",
 17 |     #   "streetAddress": "5454 BEETHOVEN STREET"
 18 |     # }
 19 | 
 20 | # TODO: currently each strategy loops through the whole dataframe
 21 | # at current volume that is acceptable but consider other alernatives
 22 | class MatchStrategy(metaclass=abc.ABCMeta):
 23 |     @abc.abstractmethod
 24 |     def execute(self, df: pd.DataFrame, invoice_data_dict: dict) -> list:
 25 |         return
 26 |     
 27 |     @abc.abstractmethod
 28 |     def dict_has_required_fields(self, invoice_data_dict: dict) -> bool:
 29 |         return
 30 |     
 31 |     def safe_string(self, input: str) -> str:
 32 |         return input.replace("\n","").replace("\r","").replace("\t","").strip()
 33 |     
 34 | class ExternalCompanyNameLookup_MatchStrategy(MatchStrategy):
 35 |     def dict_has_required_fields(self, invoice_data_dict: dict) -> bool:
 36 |         return invoice_data_dict.get('CustomerName') or None
 37 |     
 38 |     def execute(self, df: pd.DataFrame, invoice_data_dict: dict) -> list:
 39 |         matches = []
 40 | 
 41 |         url = "https://postman-echo.com/post"
 42 |         payload = {"customer_name": self.safe_string(invoice_data_dict.get('CustomerName').get('valueString'))}
 43 | 
 44 |         # Send the HTTP POST request
 45 |         response = requests.post(url, data=payload)
 46 | 
 47 |         # Check if the request was successful
 48 |         if response.status_code == 200:
 49 |             # Parse the JSON response
 50 |             data = response.json()
 51 | 
 52 |             # assuming you got back an array you can pass it
 53 |             # back to the orchestrator
 54 |             for result in data:
 55 |                 matches.append({'company_code': result['code'], 'company_name': result['name']})
 56 | 
 57 |             return(matches)
 58 |         else:
 59 |             print(f"Request failed with status code {response.status_code}")
 60 |             return(matches)
 61 | 
 62 | class FuzzyCompanyName_PostCode_City_RefineByStreetAndHouse_MatchStrategy(MatchStrategy):
 63 |     
 64 |     def dict_has_required_fields(self, invoice_data_dict: dict) -> bool:
 65 |         customer_name = invoice_data_dict.get('CustomerName') or None
 66 |         customer_address = invoice_data_dict.get('CustomerAddress') or None
 67 |         address_components = None if customer_address is None else customer_address.get('valueAddress')
 68 | 
 69 |         return (
 70 |             customer_name
 71 |             and customer_address
 72 |             and address_components
 73 |             and customer_name.get("valueString") and customer_name.get('confidence') > 0.8
 74 |             and invoice_data_dict.get('CustomerAddress').get('confidence') > 0.8 )
 75 |     
 76 |     def fuzzy_search_combined(self,query, df, threshold=90, alternative=60 ,limit=10):
 77 |         matches = process.extract(query, df['Combined'], limit=limit, scorer=fuzz.token_sort_ratio)
 78 |         results = [df.iloc[match[2]] for match in matches if match[1] >= threshold ] 
 79 |         alternative_results = [df.iloc[match[2]] for match in matches if match[1] >= alternative and  match[1] < threshold] 
 80 |         return results, alternative_results
 81 | 
 82 |     def refine_results(self,initial_results, address_queries, threshold=80):
 83 |         for column, query in zip(address_queries.keys(), address_queries.values()):
 84 |             refined_results = [record for record in initial_results if fuzz.token_set_ratio(record[column], query) >= threshold]
 85 |         return refined_results
 86 | 
 87 |     def append_final_results_to_matches(self,result,final_results):
 88 |         for record in result:
 89 |             final_results.append({'company_code': record['Code'], 'company_name': record['Name']})
 90 |         return final_results
 91 | 
 92 |     def combine_name_address(self,row):
 93 |         name_parts = [row['Name'], row['Name 1'], row['Name 2'], row['Postal Code'], row['City']]
 94 |         combined = ' '.join(filter(None, name_parts))
 95 |         seen = set()
 96 |         unique_words = [word for word in combined.split() if not (word in seen or seen.add(word))]
 97 |         return ' '.join(unique_words)
 98 |         
 99 |     def execute(self, df: pd.DataFrame, invoice_data_dict: dict) -> list:
100 |         matches = []
101 | 
102 |         company_name = invoice_data_dict.get('CustomerName').get('valueString')
103 |         address_components = invoice_data_dict.get('CustomerAddress').get('valueAddress')
104 |         
105 |         # Create a combined column for initial search
106 |         df['Combined'] = df.apply(lambda x: self.combine_name_address(x), axis=1)
107 |         #Query the column by company name, postal Code and City
108 |         initial_query = ' '.join(filter(None,[ company_name.casefold(),(address_components.get('postalCode') or '').casefold(),(address_components.get('city') or '').casefold()]))
109 |    
110 |         #Get the Initial Search resuult
111 |         best_results, alternative_results = self.fuzzy_search_combined(initial_query, df)
112 | 
113 |         #Store the Best Result
114 |         matches = self.append_final_results_to_matches(best_results,matches)
115 | 
116 |         #Define the refine search components
117 |         refine_components = {
118 |             'Street': ' '.join(filter(None,[(address_components.get('house') or '').casefold(),(address_components.get('streetAddress') or '').casefold()]))
119 |         }
120 | 
121 |         #Refine the Initial alternative Result 
122 |         refine_results = self.refine_results(alternative_results, refine_components)
123 | 
124 |         #Append the refined result to matches 
125 |         matches = self.append_final_results_to_matches(refine_results,matches)
126 | 
127 |         return matches
128 | 
129 | class FuzzyCompanyName_FuzzyStreet_ExactCity_ExactPostal_MatchStrategy(MatchStrategy):
130 |     def dict_has_required_fields(self, invoice_data_dict: dict) -> bool:
131 |         customer_name = invoice_data_dict.get('CustomerName') or None
132 |         customer_address = invoice_data_dict.get('CustomerAddress') or None
133 |         address_components = None if customer_address is None else invoice_data_dict.get('CustomerAddress').get('valueAddress')
134 | 
135 |         return (
136 |             customer_name
137 |             and customer_name.get("valueString") and customer_name.get('confidence') > 0.8
138 |             and customer_address
139 |             and address_components
140 |             and invoice_data_dict.get('CustomerAddress').get('confidence') > 0.8
141 |             and address_components.get('houseNumber')
142 |             and address_components.get('road')
143 |             and address_components.get('city')
144 |             and address_components.get('postalCode'))
145 |     
146 |     def execute(self, df: pd.DataFrame, invoice_data_dict: dict) -> list:
147 |         matches = []
148 | 
149 |         company_name = self.safe_string(invoice_data_dict.get('CustomerName').get('valueString'))
150 |         address_components = invoice_data_dict.get('CustomerAddress').get('valueAddress')
151 |         for key, val in address_components.items():
152 |             address_components[key] = self.safe_string(val)
153 | 
154 |         # Iterate over the rows in the DataFrame
155 |         # TODO: is there a better way besides the brute force loop?
156 |         # for company lookup this is probably fine but if more volume is expected than move to a database
157 |         for index, row in df.iterrows():
158 |             # Compare the company name and address with the input
159 |             name_match_ratio = fuzz.ratio(row['Name'].casefold(), company_name.casefold())
160 |             street_match_ratio = fuzz.ratio(row['Street'], address_components.get('houseNumber') + ' ' + address_components.get('road'))
161 |             city_match = address_components.get('city').casefold() == row['City'].casefold()        
162 |             state_match = True # address_components.get('state').casefold() == row['Region'].casefold() # TODO: state abbreviations? non-US addresses?
163 |             postal_match = address_components.get('postalCode') == row['Postal Code'] # TODO: is this US specific??
164 | 
165 |             # If the match is above a certain threshold, add the company to the list of matches
166 |             # TODO: make the threshold configurable
167 |             # TODO: does this matching logic make sense?
168 |             if name_match_ratio > 80 and street_match_ratio > 80 and city_match and state_match and postal_match:
169 |                 matches.append({'company_code': row['Code'], 'company_name': row['Name']})
170 | 
171 |         return matches
172 | 
173 | class ExactCompanyName_FuzzyStreet_ExactCity_ExactPostal_MatchStrategy(MatchStrategy):
174 |     def dict_has_required_fields(self, invoice_data_dict: dict) -> bool:
175 |         customer_name = invoice_data_dict.get('CustomerName') or None
176 |         customer_address = invoice_data_dict.get('CustomerAddress') or None
177 |         address_components = None if customer_address is None else invoice_data_dict.get('CustomerAddress').get('valueAddress')
178 | 
179 |         return (
180 |             customer_name
181 |             and customer_name.get("valueString") and customer_name.get('confidence') > 0.8
182 |             and customer_address
183 |             and address_components
184 |             and invoice_data_dict.get('CustomerAddress').get('confidence') > 0.8
185 |             and address_components.get('houseNumber')
186 |             and address_components.get('road')
187 |             and address_components.get('city')
188 |             and address_components.get('postalCode'))
189 |     
190 |     def execute(self, df: pd.DataFrame, invoice_data_dict: dict) -> list:
191 | 
192 |         matches = []
193 | 
194 |         company_name = self.safe_string(invoice_data_dict.get('CustomerName').get('valueString'))
195 |         address_components = invoice_data_dict.get('CustomerAddress').get('valueAddress')
196 |         for key, val in address_components.items():
197 |             address_components[key] = self.safe_string(val)
198 | 
199 |         # Iterate over the rows in the DataFrame
200 |         # TODO: is there a better way besides the brute force loop?
201 |         # for company lookup this is probably fine but if more volume is expected than move to a database
202 |         for index, row in df.iterrows():
203 |             # Compare the company name and address with the input
204 |             name_match = row['Name'].casefold() == company_name.casefold()
205 |             street_match_ratio = fuzz.ratio(row['Street'], address_components.get('houseNumber') + ' ' + address_components.get('road'))
206 |             city_match = address_components.get('city').casefold() == row['City'].casefold()        
207 |             state_match = True # address_components.get('state').casefold() == row['Region'].casefold() # TODO: state abbreviations? non-US addresses?
208 |             postal_match = address_components.get('postalCode') == row['Postal Code'] # TODO: is this US specific??
209 | 
210 |             # If the match is above a certain threshold, add the company to the list of matches
211 |             # TODO: make the threshold configurable
212 |             # TODO: does this matching logic make sense?
213 |             if name_match and street_match_ratio > 80 and city_match and state_match and postal_match:
214 |                 matches.append({'company_code': row['Code'], 'company_name': row['Name']})
215 | 
216 |         return matches
217 | 
218 | class CompanyMatcher():
219 |     strategy: MatchStrategy
220 |     company_listing_df: pd.DataFrame
221 | 
222 |     def __init__(self, matching_strategy: MatchStrategy, company_listing_df: pd.DataFrame) -> None:
223 |         self.strategy = matching_strategy
224 |         self.company_listing_df = company_listing_df
225 | 
226 |     def match_companies(self, invoice_data_dict: dict) -> list:        
227 |         return self.strategy.execute(self.company_listing_df, invoice_data_dict)
228 |         


--------------------------------------------------------------------------------
/docintelligence.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import logging
 3 | import json
 4 | 
 5 | from azure.core.credentials import AzureKeyCredential
 6 | from azure.ai.documentintelligence import DocumentIntelligenceClient
 7 | from azure.ai.documentintelligence.models import AnalyzeResult
 8 | from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
 9 | 
10 | def crack_invoice(invoice: bytes) -> dict:
11 |     # create a doc intelligence client
12 |     # https://learn.microsoft.com/en-us/python/api/overview/azure/ai-documentintelligence-readme?view=azure-python-preview
13 | 
14 |     """
15 |     This code sample shows Prebuilt Invoice operations with the Azure Form Recognizer client library. 
16 |     The async versions of the samples require Python 3.6 or later.
17 | 
18 |     To learn more, please visit the documentation - Quickstart: Document Intelligence (formerly Form Recognizer) SDKs
19 |     https://learn.microsoft.com/azure/ai-services/document-intelligence/quickstarts/get-started-sdks-rest-api?pivots=programming-language-python
20 |     """
21 |     endpoint = os.environ["DOCUMENT_INTELLIGENCE_ENDPOINT"]
22 |     key = os.environ["DOCUMENT_INTELLIGENCE_KEY"]
23 | 
24 |     document_intelligence_client  = DocumentIntelligenceClient(
25 |         endpoint=endpoint, credential=AzureKeyCredential(key)
26 |     )
27 |         
28 |     poller = document_intelligence_client.begin_analyze_document("prebuilt-invoice", analyze_request=invoice, content_type="application/octet-stream")
29 |     invoice_data: AnalyzeResult = poller.result()
30 | 
31 |     # TODO: validation and error handling
32 |     if invoice_data.documents:
33 |         return invoice_data.as_dict().get("documents")[0].get("fields")
34 |     
35 |     return {}


--------------------------------------------------------------------------------
/example.env:
--------------------------------------------------------------------------------
1 | DOCUMENT_INTELLIGENCE_ENDPOINT={URL of your Azure Cognitive Services endpoint}
2 | DOCUMENT_INTELLIGENCE_KEY={Your Azure Cognitive Services key}
3 | COMPANY_FILE_PATH={Path to your company file}
4 | MODEL_CONFIDENCE_THRESHHOLD=0.8
5 | GPT4_API_KEY={Azure OpenAI GPT-4o key}
6 | GPT4_ENDPOINT={Azure OpenAI GPT-4o endpoint}
7 | ARTIFACT_STORAGE='DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;'


--------------------------------------------------------------------------------
/function_app.py:
--------------------------------------------------------------------------------
 1 | import azure.functions as func
 2 | import datetime
 3 | import json
 4 | import logging
 5 | import pandas
 6 | import os
 7 | 
 8 | from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
 9 | from azure.monitor.opentelemetry import configure_azure_monitor
10 | from opentelemetry import trace
11 | from orchestrator import ingest_invoice
12 | from pathlib import Path
13 | from datetime import datetime
14 | 
15 | app = func.FunctionApp()
16 | 
17 | @app.blob_trigger(
18 |     arg_name="invoiceblob", 
19 |     path="invoices-inbox/{filename}",
20 |     connection="ARTIFACT_STORAGE") 
21 | @app.blob_input(
22 |     arg_name="companylistingcsv", 
23 |     path="metadata/comp_plant_addr.csv", 
24 |     connection="ARTIFACT_STORAGE")
25 | @app.blob_output(
26 |     arg_name="outputblob",
27 |     path="invoices-results/{filename}-{rand-guid}.json",
28 |     connection="ARTIFACT_STORAGE")
29 | def new_invoice_file(
30 |     invoiceblob: func.InputStream, 
31 |     companylistingcsv: func.InputStream,
32 |     outputblob: func.Out[str]):
33 | 
34 |     # Configure OpenTelemetry to use Azure Monitor with the 
35 |     # APPLICATIONINSIGHTS_CONNECTION_STRING environment variable.
36 |     if ( "APPLICATIONINSIGHTS_CONNECTION_STRING" in os.environ ):
37 |         configure_azure_monitor()
38 | 
39 |     logging.info(f"Python blob trigger function processed blob"
40 |                 f"Name: {invoiceblob.name} Blob Size: {invoiceblob.length} bytes")
41 |     
42 |     df = pandas.read_csv(companylistingcsv, dtype={'Postal Code':str}, keep_default_na=False)
43 |     invoice_blob_bytes = invoiceblob.read()
44 |     results = ingest_invoice(invoiceblob.name, invoice_blob_bytes, df)
45 | 
46 |     # do something with results
47 |     outputblob.set(json.dumps(results))
48 | 
49 |     # copy to processed container
50 |     blob_service_client = BlobServiceClient.from_connection_string(os.getenv('ARTIFACT_STORAGE'))
51 |     # TODO: container name should be soft-coded here
52 |     container_client = blob_service_client.get_container_client("invoices-processed")
53 |     container_client.upload_blob(f'{Path(invoiceblob.name).stem}_{datetime.now():%Y%m%d_%H%M%S.%f}{Path(invoiceblob.name).suffix}', invoice_blob_bytes)
54 | 
55 |     # TODO: container name should be soft-coded here
56 |     blob_client = blob_service_client.get_blob_client("invoices-inbox",Path(invoiceblob.name).name)
57 |     blob_client.delete_blob()
58 | 
59 |     return
60 | 
61 | 


--------------------------------------------------------------------------------
/gptvision.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import tempfile
 3 | import pymupdf
 4 | from openai import AzureOpenAI
 5 | import base64
 6 | import json
 7 | 
 8 |     # "valueAddress": {
 9 |     #   "houseNumber": "5454",
10 |     #   "road": "BEETHOVEN STREET",
11 |     #   "postalCode": "90066",
12 |     #   "city": "LOS ANGELES",
13 |     #   "state": "CA",
14 |     #   "countryRegion": "USA",
15 |     #   "streetAddress": "5454 BEETHOVEN STREET"
16 |     # }
17 | 
18 | def scan_invoice_with_gpt(invoice: bytes) -> dict:
19 |     # create a GPT-4o client
20 |     # https://learn.microsoft.com/en-us/python/api/overview/azure/ai-documentintelligence-readme?view=azure-python-
21 |     # https://platform.openai.com/docs/guides/vision
22 |     # We currently support PNG (.png), JPEG (.jpeg and .jpg), WEBP (.webp), and non-animated GIF (.gif).
23 |     images = convert_pdf_to_images(invoice)
24 | 
25 |     # this prompt tries to do a complete extraction in one shot
26 |     prompt_structure = {
27 |         "PurchaseOrder": {"valueString": "", "confidence": 100},
28 |         "CustomerName": {"valueString": "", "confidence": 100},
29 |         "CustomerAddress": {
30 |             "valueAddress": {
31 |                 "houseNumber": "",
32 |                 "road": "",
33 |                 "city": "",
34 |                 "state": "",
35 |                 "postalCode": ""
36 |             },
37 |             "confidence": 100
38 |         }
39 |     }
40 | 
41 |     # TODO: quality of extract might be improved with more thorough prompting, perhaps with examples
42 |     prompt = ("Extract the data from this invoice and return it as json. If a value is not present, provide an empty string. Do not format the response with markdown. Use the following structure, setting the valueString key with the value: " + json.dumps(prompt_structure))
43 | 
44 |     messages_content = [
45 |         {
46 |             "type": "text",
47 |             "text": prompt,
48 |         }
49 |     ]
50 | 
51 |     for image in images:
52 |         messages_content.append(
53 |         {
54 |             "type": "image_url",
55 |             "image_url": {
56 |                 "url": f"data:image/jpeg;base64,{base64.b64encode(image).decode('utf-8')}"
57 |             }
58 |         })
59 | 
60 |     oai_client = AzureOpenAI(
61 |         # https://learn.microsoft.com/azure/ai-services/openai/reference#rest-api-versioning
62 |         api_version="2024-02-01",
63 |         # https://learn.microsoft.com/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal#create-a-resource
64 |         azure_endpoint=os.getenv("AZURE_OPENAI_API_ENDPOINT")
65 |     )
66 | 
67 |     api_response = oai_client.chat.completions.create(
68 |         model="gpt-4o",
69 |         response_format={"type": "json_object"},
70 |         messages= [
71 |             {
72 |                 "role":"user",
73 |                 "content":messages_content
74 |             }
75 |         ]
76 |     )
77 | 
78 |     # TODO: validate api_response
79 | 
80 |     return (json.loads(api_response.choices[0].message.content))
81 | 
82 | def convert_pdf_to_images(invoice: bytes) -> list:
83 |     images = []
84 | 
85 |     # TODO: when run locally invoice is not a byte array
86 |     # and this stream parameter throws an exception
87 |     pdf_document = pymupdf.open(stream=invoice)
88 |     for page_number in range(len(pdf_document)):  
89 |         page = pdf_document.load_page(page_number)  
90 |         image = page.get_pixmap().pil_tobytes(format="PNG")
91 |         images.append(image)
92 | 
93 |     return images


--------------------------------------------------------------------------------
/host.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": "2.0",
 3 |   "logging": {
 4 |     "applicationInsights": {
 5 |       "samplingSettings": {
 6 |         "isEnabled": true,
 7 |         "excludedTypes": "Request"
 8 |       }
 9 |     }
10 |   },
11 |   "extensionBundle": {
12 |     "id": "Microsoft.Azure.Functions.ExtensionBundle",
13 |     "version": "[4.*, 5.0.0)"
14 |   }
15 | }


--------------------------------------------------------------------------------
/localtest.py:
--------------------------------------------------------------------------------
 1 | from orchestrator import ingest_invoice
 2 | from dotenv import load_dotenv
 3 | import json
 4 | import sys
 5 | import os
 6 | 
 7 | def run_test():
 8 |     load_dotenv()
 9 | 
10 |     fsarg = os.fsencode(sys.argv[1])
11 | 
12 |     if ( os.path.isdir(fsarg)):
13 |         for file in os.listdir(fsarg):
14 |             filename = os.fsdecode(file)
15 |             if filename.endswith(".pdf"): 
16 |                 with open(sys.argv[1] + '/' + filename, "rb") as f:
17 |                     print ('******* ' + filename + ' *******\n')
18 |                     print(json.dumps(ingest_invoice(f), indent=2))
19 |                     print ('\n\n')
20 |                 continue
21 |             else:
22 |                 continue
23 |     elif ( os.path.isfile(fsarg)):
24 |         with open(fsarg, "rb") as f:
25 |             print(json.dumps(ingest_invoice(f), indent=2))
26 | 
27 | run_test()


--------------------------------------------------------------------------------
/orchestrator.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import json
  3 | import datetime
  4 | import os
  5 | import io
  6 | import pandas
  7 | 
  8 | from docintelligence import crack_invoice
  9 | import companylookup
 10 | from gptvision import scan_invoice_with_gpt
 11 | 
 12 | model_confidence_threshhold = float(os.environ.get("MODEL_CONFIDENCE_THRESHHOLD", 0.8))
 13 | 
 14 | #
 15 | def attempt_company_lookup_strategies(
 16 |         invoice_data_dict: dict, 
 17 |         company_listing_df: pandas.DataFrame,
 18 |         ai_service: str,
 19 |         candidateprocess_dict: dict) -> dict:
 20 |     """ Attempt to match the company name and address to known companies using various strategies """
 21 | 
 22 |     # this list has all the strategies we want to try
 23 |     # as soon as one returns candidates we exit
 24 |     match_strategies = [
 25 |         companylookup.ExactCompanyName_FuzzyStreet_ExactCity_ExactPostal_MatchStrategy(),
 26 |         companylookup.FuzzyCompanyName_FuzzyStreet_ExactCity_ExactPostal_MatchStrategy(),
 27 |         companylookup.FuzzyCompanyName_PostCode_City_RefineByStreetAndHouse_MatchStrategy()]
 28 | 
 29 |     for match_strategy in match_strategies:
 30 |         # only execute this strategy if we have the required data
 31 |         if ( not match_strategy.dict_has_required_fields(invoice_data_dict) ):
 32 |             continue
 33 | 
 34 |         # create a matcher engine with this strategy
 35 |         matcher = companylookup.CompanyMatcher(match_strategy, company_listing_df)
 36 | 
 37 |         # execute matcher
 38 |         company_candidates = matcher.match_companies(invoice_data_dict)
 39 |         
 40 |         # if we found candidates then we return and stop processing other strategies
 41 |         if ( len(company_candidates) > 0):
 42 |             candidateprocess_dict["process"] = 'COMPANY_MATCH'
 43 |             candidateprocess_dict["ai_service"] = ai_service
 44 |             candidateprocess_dict["strategy"] = match_strategy.__class__.__name__
 45 |             candidateprocess_dict["company_candidates"] = company_candidates
 46 |             candidateprocess_dict["execution_end"] = datetime.datetime.now().isoformat()
 47 |             return {'candidate_process':candidateprocess_dict, 'invoice_data': invoice_data_dict}
 48 |         
 49 |     return None
 50 |     
 51 | def validate_po_number(invoice_data: dict) -> bool:
 52 |     # customer_id = invoice_data.get("CustomerId")
 53 |     # if customer_id and customer_id.get("confidence") > model_confidence_threshhold:
 54 |     #     return True
 55 |     
 56 |     # TODO: is there any format to PO number we could verify?
 57 |     purchase_order = invoice_data.get("PurchaseOrder") or None
 58 |     if purchase_order and purchase_order.get("confidence") > model_confidence_threshhold:
 59 |         return True
 60 |     
 61 |     return False
 62 | 
 63 | def validate_gpt_invoice_data(invoice_data: dict) -> bool:
 64 |     # the GPT-4o data does not guarantee it will match 
 65 |     # the DI schema so we need to validate and possibly
 66 |     # scrub
 67 |     return True
 68 | 
 69 | def process_extracted_invoice_data(
 70 |         invoice_data_dict: dict,
 71 |         company_listing_df: pandas.DataFrame,
 72 |         ai_service: str,
 73 |         candidateprocess_dict: dict) -> dict:
 74 |     
 75 |     # check the data dictionary for PO, or Company code. If any of these are found, it writes all the data and 
 76 |     # their corresponding confidence scores, along with the number of pages in the document, to the suggested company file 
 77 |     # in `.csv` format. If DI didn't extract anything for a data element, write `NONE` in that position.
 78 |     # and exit
 79 | 
 80 |     # PO Number is a special case because we immediately exit
 81 |     # since it doesn't return a list of candidates we won't
 82 |     # make it a company strategy
 83 |     if validate_po_number(invoice_data_dict):
 84 |         candidateprocess_dict["process"] = 'PONUMBER'
 85 |         candidateprocess_dict["ai_service"] = ai_service
 86 |         candidateprocess_dict["purchaseorder"] = invoice_data_dict.get('PurchaseOrder').get('valueString')
 87 |         candidateprocess_dict["execution_end"] = datetime.datetime.now().isoformat()
 88 |         return {'candidate_process':candidateprocess_dict, 'invoice_data': invoice_data_dict}
 89 |     
 90 |     ## move to company metadata search
 91 |     company_candidates = attempt_company_lookup_strategies(invoice_data_dict, company_listing_df, ai_service, candidateprocess_dict)
 92 |     if ( company_candidates ):
 93 |         return company_candidates
 94 |     
 95 |     return None
 96 | 
 97 | def ingest_invoice(source_file_name: str, invoice: bytes, company_listing_df: pandas.DataFrame) -> dict:
 98 |     """ Manage the orchestration of invoice processing """
 99 |     # TODO: add logging
100 |     
101 |     candidateprocess_dict = {
102 |         'source_file': source_file_name,
103 |         'process':'',
104 |         'ai_service':'',
105 |         'strategy':'',
106 |         'purchaseorder':'',
107 |         'company_candidates':[],
108 |         'execution_start': datetime.datetime.now().isoformat(),
109 |         'execution_end': None}
110 | 
111 |     # call the document analyze and poll for completion using pre-built invoice model
112 |     di_invoice_data_dict = crack_invoice(invoice)
113 | 
114 |     results = process_extracted_invoice_data(di_invoice_data_dict, company_listing_df, 'DocIntelligence', candidateprocess_dict)
115 | 
116 |     if results:
117 |         return results
118 | 
119 |     # no dice from cracked document data, move to GPT-4o
120 |     gpt_invoice_data_dict = scan_invoice_with_gpt(invoice)
121 | 
122 |     results = process_extracted_invoice_data(gpt_invoice_data_dict, company_listing_df, 'GPT-4o', candidateprocess_dict)
123 | 
124 |     if results:
125 |         return results
126 | 
127 |     # TODO: failover to manual intervention
128 |     candidateprocess_dict["execution_end"] = datetime.datetime.now().isoformat()
129 | 
130 |     # in case of no matches we return the Doc Intelligence invoice data
131 |     return {'candidate_process':candidateprocess_dict, 'invoice_data': di_invoice_data_dict}
132 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | # Do not include azure-functions-worker in this file
 2 | # The Python Worker is managed by the Azure Functions platform
 3 | # Manually managing azure-functions-worker may cause unexpected issues
 4 | 
 5 | azure-functions
 6 | azure-storage-blob
 7 | azure-ai-documentintelligence
 8 | python-dotenv
 9 | pandas
10 | fuzzywuzzy
11 | python-Levenshtein
12 | pymupdf
13 | openai
14 | pillow
15 | azure-monitor-opentelemetry
16 | requests


--------------------------------------------------------------------------------
/rig.py:
--------------------------------------------------------------------------------
 1 | from orchestrator import ingest_invoice
 2 | from dotenv import load_dotenv
 3 | import json
 4 | import sys
 5 | import os
 6 | import pandas
 7 | from datetime import datetime
 8 | from multiprocessing.dummy import Pool as ThreadPool
 9 | from pathlib import Path
10 | import glob
11 | 
12 | def handle_file(filename):
13 |     with open(filename, "rb") as f:
14 |         if ( os.path.getsize(filename) > 4000000):
15 |             return # free tier max doc size is 4MB
16 |         print (f'******* Starting {filename} *******\n')
17 |         with open(f'{root}{Path(filename).stem}.json', 'w', encoding='utf-8') as g:
18 |             json.dump(ingest_invoice(filename, f, companies_df), g, ensure_ascii=False, indent=2)
19 |         print (f'******* Completed {filename} *******\n')
20 | 
21 | # def run_test():
22 |     
23 | #     fsarg = os.fsencode(sys.argv[1])
24 | #     batchstamp = f'{datetime.now():%Y%m%d_%H%M%S}'
25 |     
26 | #     if ( os.path.isdir(fsarg)):
27 | #         root = f'{sys.argv[1]}{os.sep}processed{os.sep}{batchstamp}{os.sep}'
28 | #         Path(root).mkdir(parents=True, exist_ok=True)
29 | 
30 |         
31 | #         pool = ThreadPool(4)
32 | #         results = pool.map(handle_file, my_array)
33 | 
34 | #         for file in os.listdir(fsarg)[0:100]:
35 | #             filename = os.fsdecode(file)
36 | #             if filename.endswith(".pdf"): 
37 | #                 handle_file(root, filename, f, df)
38 | #                 continue
39 | #             else:
40 | #                 continue
41 | #     elif ( os.path.isfile(fsarg)):
42 | #         with open(fsarg, "rb") as f:            
43 | #             print(json.dumps(ingest_invoice(f.read(),df), indent=2))
44 | 
45 | load_dotenv(override=True)
46 | companies_df = pandas.read_csv(os.environ["COMPANY_FILE_PATH"], dtype={'Postal Code':str}, keep_default_na=False)
47 | fsarg = os.fsencode(sys.argv[1])
48 | batchstamp = f'{datetime.now():%Y%m%d_%H%M%S}'
49 | 
50 | if ( not os.path.isdir(fsarg)):
51 |     print ('bad directory')
52 |     exit()
53 | 
54 | root = f'{sys.argv[1]}{os.sep}processed{os.sep}{batchstamp}{os.sep}'
55 | Path(root).mkdir(parents=True, exist_ok=True)
56 | 
57 | print(f'Starting batch {batchstamp}')
58 | pool = ThreadPool(10)
59 | results = pool.map(handle_file, glob.glob(sys.argv[1] + '/*.pdf'))
60 | 
61 | # run this at command line with python -m rig C:\temp\ipg-inv (path to pdf invoices)


--------------------------------------------------------------------------------