├── .editorconfig ├── .flake8 ├── .gitattributes ├── .github ├── CONTRIBUTING.md ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── config.yml │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE.md ├── actions │ ├── merge-branch │ │ └── action.yml │ └── tests │ │ └── python │ │ └── action.yml ├── dependabot.yml └── workflows │ ├── auto-assign.yml │ ├── precommitVersionBumps.yml │ ├── pullRequestController.yml │ ├── pushMain.yml │ ├── semanticVersionBump.yml │ └── testsPython.yml ├── .gitignore ├── .mergify.yml ├── .pre-commit-config.yaml ├── .prettierignore ├── .prettierrc ├── .pylintrc ├── .vscode ├── extensions.json └── settings.json ├── CHANGELOG.md ├── LICENSE ├── Makefile ├── README.md ├── codespell.txt ├── commitlint.config.js ├── models ├── __init__.py ├── __version__.py ├── conf.py ├── const.py ├── examples │ ├── __init__.py │ ├── certification_programs.py │ ├── load.py │ ├── online_courses.py │ ├── pinecone_init.py │ ├── prompt.py │ └── rag.py ├── exceptions.py ├── hybrid_search_retreiver.py ├── pinecone.py ├── prompt_templates.py ├── tests │ ├── __init__.py │ ├── mock_data │ │ ├── .env.test_01 │ │ ├── .env.test_illegal_nulls │ │ ├── .env.test_legal_nulls │ │ └── test_load.pdf │ ├── test_configuration.py │ ├── test_examples.py │ ├── test_hsr.py │ ├── test_openai.py │ ├── test_pinecone.py │ ├── test_prompt_templates.py │ └── test_prompts.py └── yt.py ├── package.json ├── pyproject.toml ├── release.config.js ├── requirements ├── base.txt └── local.txt ├── run_pylint.sh ├── setup.py ├── setup_test.py ├── setup_utils.py └── tox.ini /.editorconfig: -------------------------------------------------------------------------------- 1 | # see http://editorconfig.org 2 | root = true 3 | 4 | [*] 5 | end_of_line = lf 6 | trim_trailing_whitespace = true 7 | insert_final_newline = true 8 | indent_style = space 9 | indent_size = 2 10 | charset = utf-8 11 | tab_width = 4 12 | 13 | [*.md] 14 | trim_trailing_whitespace = false 15 | 16 | [*.py] 17 | indent_size = 4 
18 | 19 | [go.mod] 20 | indent_style = tab 21 | indent_size = 1 22 | 23 | [*.go] 24 | indent_style = tab 25 | indent_size = 1 26 | 27 | [Makefile] 28 | indent_style = tab 29 | indent_size = 1 30 | 31 | [Makefile.*] 32 | indent_style = tab 33 | indent_size = 1 34 | 35 | [LICENSE] 36 | indent_size = none 37 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore=D205,D413,D400,D401 3 | max-line-length=120 4 | max-complexity=10 5 | exclude=venv 6 | extend-exclude="*__init__.py,*__version__.py,venv" 7 | select="C101" 8 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | # * text eol=lf 3 | -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | The repository is released under the GNU AFFERO GENERAL PUBLIC LICENSE license, and follows a standard Github development process, using Github tracker for issues and merging pull requests into master. 
4 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: FullStackWithLawrence 4 | patreon: FullStackWithLawrence 5 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | --- 5 | 6 | **Describe the bug** 7 | A clear and concise description of what the bug is. 8 | 9 | **Workflow** 10 | If applicable, provide a workflow file to help explain your problem. 11 | 12 | **Expected behavior** 13 | A clear and concise description of what you expected to happen. 14 | 15 | **Additional context** 16 | Add any other context about the problem here. 17 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | --- 5 | 6 | **Is your feature request related to a problem? Please describe.** 7 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 8 | 9 | **Describe the solution you'd like** 10 | A clear and concise description of what you want to happen. 11 | 12 | **Describe alternatives you've considered** 13 | A clear and concise description of any alternative solutions or features you've considered. 
14 | 15 | **Additional context** 16 | Add any other context or screenshots about the feature request here. 17 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | # Pull Request Template 2 | 3 | ## Type of Change 4 | 5 | 6 | 7 | - [ ] New feature 8 | - [ ] Bug fix 9 | - [ ] Documentation 10 | - [ ] Refactor 11 | - [ ] Chore 12 | 13 | ## Resolves 14 | 15 | - Fixes #[Add issue number here.] 16 | 17 | ## Changes 18 | 19 | 20 | 21 | _Describe what this Pull Request does_ 22 | 23 | ## Testing 24 | 25 | 26 | 27 | _Describe the testing that has been done or needs to be done_ 28 | 29 | ## Screenshots 30 | 31 | 32 | 33 | _Add any relevant screenshots_ 34 | 35 | ## Dependencies 36 | 37 | 38 | 39 | _List dependencies_ 40 | 41 | ## Breaking Changes 42 | 43 | 44 | 45 | _Describe any breaking changes_ 46 | -------------------------------------------------------------------------------- /.github/actions/merge-branch/action.yml: -------------------------------------------------------------------------------- 1 | --- 2 | #------------------------------------------------------------------------------ 3 | # Run pre-commit 4 | #------------------------------------------------------------------------------ 5 | name: Merge 6 | branding: 7 | icon: "git-pull-request" 8 | color: "orange" 9 | inputs: 10 | github-token: 11 | description: "The GitHub token to use for authentication" 12 | required: true 13 | type: string 14 | source-branch: 15 | description: "The branch to merge from" 16 | required: false 17 | type: string 18 | default: "main" 19 | target-branch: 20 | description: "The branch to merge to" 21 | required: true 22 | type: string 23 | 24 | python-version: 25 | description: "The version of Python to use, such as 3.12" 26 | required: true 27 | type: string 28 | 29 | runs: 30 | using: "composite" 31 | steps: 32 | - name: Checkout code 
33 | id: checkout 34 | uses: actions/checkout@v4 35 | with: 36 | fetch-depth: 0 37 | persist-credentials: false 38 | 39 | - name: Remember current branch 40 | shell: bash 41 | run: | 42 | echo "CURRENT_BRANCH=$(git branch --show-current)" >> $GITHUB_ENV 43 | 44 | - name: Merge 45 | id: merge 46 | shell: bash 47 | run: | 48 | git config --local user.email "action@github.com" 49 | git config --local user.name "GitHub Action" 50 | git checkout ${{ inputs.source-branch }} 51 | git pull 52 | git checkout ${{ inputs.target-branch }} 53 | git merge -Xtheirs ${{ inputs.source-branch }} 54 | git push https://${{ inputs.github-token }}@github.com/${{ github.repository }}.git HEAD:${{ inputs.target-branch }} 55 | 56 | - name: Checkout current branch 57 | shell: bash 58 | run: | 59 | git checkout ${{ env.CURRENT_BRANCH }} 60 | -------------------------------------------------------------------------------- /.github/actions/tests/python/action.yml: -------------------------------------------------------------------------------- 1 | --- 2 | #------------------------------------------------------------------------------ 3 | # Run Python unit tests 4 | #------------------------------------------------------------------------------ 5 | name: Test Python 6 | branding: 7 | icon: "git-pull-request" 8 | color: "orange" 9 | inputs: 10 | python-version: 11 | description: "The version of Python to use, such as 3.12" 12 | required: true 13 | type: string 14 | openai-api-organization: 15 | description: "The OpenAI API organization" 16 | required: true 17 | type: string 18 | openai-api-key: 19 | description: "The OpenAI API key" 20 | required: true 21 | type: string 22 | pinecone-api-key: 23 | description: "The Pinecone API key" 24 | required: true 25 | type: string 26 | pinecone-environment: 27 | description: "The Pinecone environment" 28 | required: true 29 | type: string 30 | 31 | runs: 32 | using: "composite" 33 | steps: 34 | - name: Checkout code 35 | id: checkout 36 | uses: 
actions/checkout@v4 37 | 38 | - name: Cache Python dependencies 39 | uses: actions/cache@v3 40 | with: 41 | path: ~/.cache/pip 42 | key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements/local.txt') }} 43 | restore-keys: | 44 | ${{ runner.os }}-pip 45 | 46 | - name: Set up Python 47 | uses: actions/setup-python@v4 48 | with: 49 | python-version: ${{ inputs.python-version }} 50 | 51 | - name: locate site-packages path 52 | shell: bash 53 | run: | 54 | echo "SITE_PACKAGES_PATH=$(python -c 'import site; print(site.getsitepackages()[0])')" >> $GITHUB_ENV 55 | 56 | - name: Install pip 57 | shell: bash 58 | run: | 59 | python -m pip install --upgrade pip 60 | 61 | - name: Install dependencies 62 | shell: bash 63 | run: | 64 | pip install -r ./requirements/local.txt 65 | env: 66 | SITE_PACKAGES_PATH: ${{ env.SITE_PACKAGES_PATH }} 67 | 68 | - name: Create .env 69 | shell: bash 70 | run: | 71 | touch ./.env 72 | echo "OPENAI_API_ORGANIZATION=${{ env.OPENAI_API_ORGANIZATION }}" >> ./.env 73 | echo "OPENAI_API_KEY=${{ env.OPENAI_API_KEY }}" >> ./.env 74 | echo "PINECONE_API_KEY=${{ env.PINECONE_API_KEY }}" >> ./.env 75 | echo "PINECONE_ENVIRONMENT=${{ env.PINECONE_ENVIRONMENT }}" >> ./.env 76 | echo "DEBUG_MODE=False" >> ./.env 77 | env: 78 | OPENAI_API_ORGANIZATION: ${{ inputs.openai-api-organization }} 79 | OPENAI_API_KEY: ${{ inputs.openai-api-key }} 80 | PINECONE_API_KEY: ${{ inputs.pinecone-api-key }} 81 | PINECONE_ENVIRONMENT: ${{ inputs.pinecone-environment }} 82 | 83 | - name: Run Python unit tests 84 | shell: bash 85 | run: | 86 | make test 87 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "monthly" 7 | assignees: 8 | - "lpm0073" 9 | reviewers: 10 | - "lpm0073" 11 | - package-ecosystem: "npm" 12 | directory: 
"/" 13 | schedule: 14 | interval: "monthly" 15 | labels: 16 | - "dependencies" 17 | - "javascript" 18 | assignees: 19 | - "FullStackWithLawrence" 20 | reviewers: 21 | - "FullStackWithLawrence" 22 | - package-ecosystem: "pip" 23 | directory: "/" 24 | schedule: 25 | interval: "monthly" 26 | labels: 27 | - "dependencies" 28 | - "python" 29 | assignees: 30 | - "lpm0073" 31 | reviewers: 32 | - "lpm0073" 33 | -------------------------------------------------------------------------------- /.github/workflows/auto-assign.yml: -------------------------------------------------------------------------------- 1 | name: Auto Assign 2 | on: 3 | issues: 4 | types: [opened] 5 | pull_request: 6 | types: [opened] 7 | jobs: 8 | run: 9 | runs-on: ubuntu-latest 10 | permissions: 11 | issues: write 12 | pull-requests: write 13 | steps: 14 | - name: "Auto-assign issue" 15 | uses: pozil/auto-assign-issue@v2 16 | with: 17 | repo-token: ${{ secrets.GITHUB_TOKEN }} 18 | assignees: lpm0073 19 | numOfAssignee: 1 20 | -------------------------------------------------------------------------------- /.github/workflows/precommitVersionBumps.yml: -------------------------------------------------------------------------------- 1 | --- 2 | #------------------------------------------------------------------------------ 3 | # Lawrence McDaniel - https://lawrencemcdaniel.com 4 | # Version Bump Workflow for .pre-commit-config.yaml 5 | # 6 | # This workflow runs on a cron schedule and checks for updates to the 7 | # .pre-commit-config.yaml file. If updates are found, the workflow 8 | # commits the changes to the next branch and pushes the changes to GitHub. 9 | # 10 | # This is a workaround for the fact that the pre-commit autoupdate command 11 | # is not supported by Dependabot. 
12 | #------------------------------------------------------------------------------ 13 | name: pre-commit Version Bumps 14 | 15 | on: 16 | schedule: 17 | - cron: "0 0 * * 3" 18 | workflow_dispatch: 19 | 20 | jobs: 21 | evaluate_precommit_config: 22 | runs-on: ubuntu-latest 23 | 24 | steps: 25 | - uses: actions/checkout@v4 26 | with: 27 | persist-credentials: false 28 | 29 | - name: Checkout next branch 30 | run: | 31 | git fetch 32 | git checkout next 33 | git pull origin next 34 | 35 | - name: Cache NPM dependencies 36 | uses: actions/cache@v4 37 | with: 38 | path: ~/.npm 39 | key: ${{ runner.os }}-node-${{ hashFiles('**/package-lock.json') }} 40 | restore-keys: | 41 | ${{ runner.os }}-node 42 | 43 | - name: Cache Python dependencies 44 | uses: actions/cache@v4 45 | with: 46 | path: ~/.cache/pip 47 | key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements/local.txt') }} 48 | restore-keys: | 49 | ${{ runner.os }}-pip 50 | 51 | - name: Set up Python 52 | uses: actions/setup-python@v5 53 | with: 54 | python-version: "3.12" 55 | 56 | - name: locate site-packages path 57 | shell: bash 58 | run: | 59 | echo "SITE_PACKAGES_PATH=$(python -c 'import site; print(site.getsitepackages()[0])')" >> $GITHUB_ENV 60 | 61 | - name: Install pip 62 | shell: bash 63 | run: | 64 | python -m pip install --upgrade pip 65 | 66 | - name: Install dependencies 67 | shell: bash 68 | run: | 69 | pip install -r ./requirements/local.txt 70 | env: 71 | SITE_PACKAGES_PATH: ${{ env.SITE_PACKAGES_PATH }} 72 | 73 | - name: Setup Node.js environment 74 | uses: actions/setup-node@v4 75 | with: 76 | node-version: "20.9.0" 77 | 78 | - name: Install npm dev dependencies 79 | run: npm install 80 | 81 | - name: Update .pre-commit-config.yaml 82 | run: | 83 | pre-commit autoupdate 84 | 85 | - name: Check for unstaged changes 86 | id: check_changes 87 | run: | 88 | if [[ -n "$(git status --porcelain .pre-commit-config.yaml)" ]]; then 89 | echo "::set-output name=changes::true" 90 | else 91 | echo 
"::set-output name=changes::false" 92 | fi 93 | 94 | - name: Commit and push changes 95 | if: steps.check_changes.outputs.changes == 'true' 96 | shell: bash 97 | run: | 98 | git config --local user.email "action@github.com" 99 | git config --local user.name "GitHub Action" 100 | git add .pre-commit-config.yaml 101 | git commit -m "chore: [gh] version bumps in .pre-commit-config.yaml [skip ci]" 102 | git push https://${{ secrets.PAT }}@github.com/${{ github.repository }}.git HEAD:next 103 | -------------------------------------------------------------------------------- /.github/workflows/pullRequestController.yml: -------------------------------------------------------------------------------- 1 | --- 2 | #------------------------------------------------------------------------------ 3 | # Pull Request Workflow Controller. 4 | # 5 | # Triggers: 6 | # - Called automatically on relevant actions performed on pull requests. 7 | # - Can also be run manually by clicking the "Run workflow" button. 8 | # 9 | # Actions: 10 | # - Use semantic release rules to determine if a new release will be published. 11 | # - run Python tests, but only if Python-related files have changed. 12 | # - run Terraform tests, but only if Terraform-related files have changed. 13 | # - run ReactJS tests, but only if ReactJS-related files have changed. 14 | # - run pre-commit hooks to ensure code is formatted correctly. 15 | # 16 | # To-Do: 17 | # If a new release is to be published then we want to consider running QA tests 18 | # to ensure formatting and documentation is correct. 19 | #------------------------------------------------------------------------------ 20 | name: Pull Request Controller 21 | 22 | on: 23 | workflow_dispatch: 24 | # GitHub Copilot: The `pull_request` and `pull_request_target` are two different 25 | # event types in GitHub Actions that trigger workflows when activity related 26 | # to pull requests occurs. 
27 | # - `pull_request`: This event triggers a workflow run whenever a pull 28 | # request is opened, synchronized, or closed. The workflow runs in the context of the 29 | # pull request, meaning it has access to the code and environment variables of the head 30 | # branch of the pull request. This is safe for pull requests within the same repository, 31 | # but for pull requests from a fork, this could potentially expose sensitive information. 32 | # 33 | # - `pull_request_target`: This event is similar to `pull_request`, but it runs in the context 34 | # of the base of the pull request, rather than the head. This means it has access to the code 35 | # and environment variables of the base branch, not the head branch. This is safer for 36 | # pull requests from forks, as it prevents the fork from accessing sensitive information 37 | # in the base repository. However, it means the workflow does not have access to the code 38 | # in the pull request by default. If you need to access the code in the pull request, 39 | # you can use the `actions/checkout` action with the `ref` input 40 | # set to `github.event.pull_request.head.ref`. 41 | # 42 | # In general, use `pull_request` for workflows that need to access the code in the pull request, 43 | # and `pull_request_target` for workflows that need to be safe for pull requests from forks. 
44 | pull_request_target: 45 | types: [opened, synchronize] 46 | paths: 47 | - "**.py" 48 | - "./requirements" 49 | - "**.package.json" 50 | - "./models/**" 51 | 52 | env: 53 | python-version: "3.12" 54 | 55 | jobs: 56 | check_for_pending_release: 57 | name: test-semantic-release 58 | runs-on: ubuntu-latest 59 | steps: 60 | - name: Checkout 61 | uses: actions/checkout@v4 62 | 63 | - name: Semantic Release 64 | uses: cycjimmy/semantic-release-action@v4 65 | id: semantic 66 | with: 67 | dry_run: true 68 | branches: | 69 | [ 70 | '+([0-9])?(.{+([0-9]),x}).x', 71 | 'main', 72 | 'next', 73 | 'next-major', 74 | { 75 | name: 'beta', 76 | prerelease: true 77 | }, 78 | { 79 | name: 'alpha', 80 | prerelease: true 81 | } 82 | ] 83 | extra_plugins: | 84 | @semantic-release/git 85 | @semantic-release/changelog 86 | env: 87 | GITHUB_TOKEN: ${{ secrets.PAT }} 88 | 89 | - name: Test Outputs 90 | if: steps.semantic.outputs.new_release_published == 'true' 91 | run: | 92 | echo ${{ steps.semantic.outputs.new_release_version }} 93 | echo ${{ steps.semantic.outputs.new_release_major_version }} 94 | echo ${{ steps.semantic.outputs.new_release_minor_version }} 95 | echo ${{ steps.semantic.outputs.new_release_patch_version }} 96 | -------------------------------------------------------------------------------- /.github/workflows/pushMain.yml: -------------------------------------------------------------------------------- 1 | --- 2 | #--------------------------------------------------------- 3 | # - Create a semantical release 4 | # - Merge main into next, alpha, beta, and next-major 5 | #--------------------------------------------------------- 6 | name: Push to main 7 | 8 | on: 9 | workflow_dispatch: 10 | push: 11 | branches: 12 | - main 13 | jobs: 14 | merge-main-to-dev-branches: 15 | runs-on: ubuntu-latest 16 | env: 17 | GITHUB_TOKEN: ${{ secrets.PAT }} 18 | 19 | steps: 20 | - name: Checkout code 21 | id: checkout 22 | uses: actions/checkout@v4 23 | 24 | - name: Merge main into next 
25 | uses: ./.github/actions/merge-branch 26 | with: 27 | github-token: ${{ env.GITHUB_TOKEN }} 28 | source-branch: main 29 | target-branch: next 30 | 31 | - name: Merge main into next-major 32 | uses: ./.github/actions/merge-branch 33 | with: 34 | github-token: ${{ env.GITHUB_TOKEN }} 35 | source-branch: main 36 | target-branch: next-major 37 | 38 | - name: Merge main into alpha 39 | uses: ./.github/actions/merge-branch 40 | with: 41 | github-token: ${{ env.GITHUB_TOKEN }} 42 | source-branch: main 43 | target-branch: alpha 44 | 45 | - name: Merge main into beta 46 | uses: ./.github/actions/merge-branch 47 | with: 48 | github-token: ${{ env.GITHUB_TOKEN }} 49 | source-branch: main 50 | target-branch: beta 51 | 52 | semantic-release: 53 | needs: merge-main-to-dev-branches 54 | runs-on: ubuntu-latest 55 | env: 56 | GITHUB_TOKEN: ${{ secrets.PAT }} 57 | 58 | steps: 59 | - uses: actions/checkout@v4 60 | id: checkout 61 | with: 62 | persist-credentials: false 63 | 64 | - name: Semantic Release 65 | uses: cycjimmy/semantic-release-action@v4 66 | id: semantic 67 | with: 68 | branches: | 69 | [ 70 | '+([0-9])?(.{+([0-9]),x}).x', 71 | 'main', 72 | 'next', 73 | 'next-major', 74 | { 75 | name: 'beta', 76 | prerelease: true 77 | }, 78 | { 79 | name: 'alpha', 80 | prerelease: true 81 | } 82 | ] 83 | extra_plugins: | 84 | @semantic-release/git 85 | @semantic-release/changelog 86 | env: 87 | GIT_COMMITTER_NAME: github-actions[bot] 88 | GIT_COMMITTER_EMAIL: github-actions[bot]@users.noreply.github.com 89 | GIT_AUTHOR_NAME: github-actions[bot] 90 | GIT_AUTHOR_EMAIL: github-actions[bot]@users.noreply.github.com 91 | 92 | - name: Publish To GitHub Package Registry 93 | id: publish 94 | if: steps.semantic.outputs.new_release_published == 'true' 95 | run: echo "new release was published" 96 | shell: bash 97 | 98 | - name: Push updates to branch for major version 99 | id: push_major 100 | if: steps.semantic.outputs.new_release_published == 'true' 101 | run: "git push 
https://x-access-token:${{ env.GITHUB_TOKEN }}@github.com/${GITHUB_REPOSITORY}.git HEAD:refs/heads/v${{steps.semantic.outputs.new_release_major_version}}" 102 | shell: bash 103 | -------------------------------------------------------------------------------- /.github/workflows/semanticVersionBump.yml: -------------------------------------------------------------------------------- 1 | --- 2 | #------------------------------------------------------------------------------ 3 | # Lawrence McDaniel - https://lawrencemcdaniel.com 4 | # Version Bump Workflow for Python package. 5 | # 6 | # Calculate the version of the 'next' branch based on semantic-release rules. 7 | # Compares the existing value of __version__.py to the calculated value. 8 | # If they are different, it will update __version__.py and push the changes 9 | # to the main branch. 10 | #------------------------------------------------------------------------------ 11 | name: Semantic Version Bump (next) 12 | 13 | on: 14 | workflow_dispatch: 15 | push: 16 | branches: 17 | - alpha 18 | - beta 19 | - next 20 | - next-major 21 | 22 | jobs: 23 | bump-version-next: 24 | runs-on: ubuntu-latest 25 | env: 26 | VERSION_FILE: __version__.py 27 | PACKAGE_PATH: ${{ github.workspace }}/models/ 28 | 29 | steps: 30 | - uses: actions/checkout@v4 31 | with: 32 | persist-credentials: false 33 | 34 | - name: Cache NPM dependencies 35 | uses: actions/cache@v4 36 | with: 37 | path: ~/.npm 38 | key: ${{ runner.os }}-node-${{ hashFiles('**/package-lock.json') }} 39 | restore-keys: | 40 | ${{ runner.os }}-node 41 | 42 | - name: Set up Python 3.12 43 | uses: actions/setup-python@v5 44 | with: 45 | python-version: "3.12" 46 | 47 | - name: Setup Node.js environment 48 | uses: actions/setup-node@v4 49 | with: 50 | node-version: "20.9.0" 51 | 52 | - name: Install npm dev dependencies 53 | run: npm install 54 | 55 | - name: Get current version 56 | # step 1 57 | # the current version persisted to __version__.py 58 | id: current_version 
59 | run: | 60 | cd ${{ env.PACKAGE_PATH }} 61 | echo "CURRENT_VERSION=$(python -c 'from __version__ import __version__; print(__version__)')" >> $GITHUB_ENV 62 | env: 63 | GITHUB_TOKEN: ${{ secrets.PAT }} 64 | 65 | - name: null step 66 | id: null_step1 67 | run: echo "i ensure that CURRENT_VERSION is set." 68 | 69 | - name: Get next version 70 | # step 2 71 | # calculate the next version based on semantic-release rules 72 | # this will return a null string is there in fact is no version bump. 73 | # so set NEXT_VERSION to CURRENT_VERSION if there is no version bump. 74 | id: next_version 75 | run: | 76 | NEXT_VERSION=$(npx semantic-release --dry-run --no-ci | awk '/The next release version is/{print $NF}') 77 | echo "NEXT_VERSION=${NEXT_VERSION:-${{ env.CURRENT_VERSION }}}" >> $GITHUB_ENV 78 | env: 79 | GITHUB_TOKEN: ${{ secrets.PAT }} 80 | CURRENT_VERSION: ${{ env.CURRENT_VERSION }} 81 | 82 | - name: null step 83 | id: null_step2 84 | run: echo "i ensure that NEXT_VERSION is set." 85 | 86 | - name: Check versions 87 | # step 3 88 | # compare the current version to the next version. 89 | # if they are different, set VERSION_CHANGED to true 90 | id: check_versions 91 | run: | 92 | if [ "$CURRENT_VERSION" != "$NEXT_VERSION" ]; then 93 | echo "VERSION_CHANGED=true" >> $GITHUB_ENV 94 | else 95 | echo "VERSION_CHANGED=false" >> $GITHUB_ENV 96 | fi 97 | env: 98 | CURRENT_VERSION: ${{ env.CURRENT_VERSION }} 99 | NEXT_VERSION: ${{ env.NEXT_VERSION }} 100 | 101 | - name: another null step 102 | id: null_step3 103 | run: echo "i ensure that CURRENT_VERSION, NEXT_VERSION and VERSION_CHANGED are set." 104 | 105 | - name: Update __version__.py 106 | # step 4 107 | # if VERSION_CHANGED is true, update __version__.py and push the changes to the 108 | # branch that triggered this workflow. 109 | if: env.VERSION_CHANGED == 'true' 110 | id: update_version 111 | run: | 112 | echo "# -*- coding: utf-8 -*-" > ${{ env.VERSION_FILE }} 113 | echo "# DO NOT EDIT." 
>> ${{ env.VERSION_FILE }} 114 | echo "# Managed via automated CI/CD in .github/workflows/semanticVersionBump.yml." >> ${{ env.VERSION_FILE }}
}}" 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | .env 3 | data 4 | .DS_Store 5 | *.zip 6 | 7 | # Python 8 | build 9 | __pycache__ 10 | .pytest_cache 11 | *.egg-info 12 | venv 13 | .venv 14 | .pytest_cache 15 | *.pyc 16 | *.pyo 17 | *.pyd 18 | *.swp 19 | *.log 20 | 21 | # npm 22 | node_modules 23 | package-lock.json 24 | -------------------------------------------------------------------------------- /.mergify.yml: -------------------------------------------------------------------------------- 1 | # see: 2 | # - https://docs.mergify.com/getting-started/ 3 | pull_request_rules: 4 | - name: automatic approve dependabot pull requests 5 | conditions: 6 | - "author~=dependabot[bot]|dependabot-preview[bot]|dependabot" 7 | actions: 8 | review: 9 | type: APPROVE 10 | 11 | - name: automatic merge dependabot pull requests 12 | conditions: 13 | - "author~=dependabot[bot]|dependabot-preview[bot]|dependabot" 14 | - "#approved-reviews-by>=1" 15 | - "base=main" # replace 'main' with the name of the branch you want to auto-merge into 16 | actions: 17 | merge: 18 | method: merge 19 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_language_version: 2 | # default language version for each language 3 | python: python3.12 4 | repos: 5 | - repo: https://github.com/codespell-project/codespell 6 | rev: v2.3.0 7 | hooks: 8 | - id: codespell 9 | args: ["--ignore-words=codespell.txt"] 10 | exclude: 'codespell.txt|\.svg$' 11 | - repo: https://github.com/pre-commit/mirrors-prettier 12 | rev: v4.0.0-alpha.8 13 | hooks: 14 | - id: prettier 15 | - repo: https://github.com/psf/black 16 | rev: 24.10.0 17 | hooks: 18 | - id: black 19 | - repo: https://github.com/PyCQA/flake8 20 | rev: 7.1.1 21 | hooks: 22 | - 
id: flake8 23 | - repo: https://github.com/PyCQA/isort 24 | rev: 5.13.2 25 | hooks: 26 | - id: isort 27 | args: ["--settings-path=pyproject.toml"] 28 | - repo: local 29 | hooks: 30 | - id: pylint 31 | name: pylint 32 | entry: ./run_pylint.sh 33 | language: script 34 | types: [python] 35 | - repo: https://github.com/PyCQA/bandit 36 | rev: 1.8.0 37 | hooks: 38 | - id: bandit 39 | args: ["-ll"] 40 | - repo: https://github.com/pre-commit/pre-commit-hooks 41 | rev: v5.0.0 42 | hooks: 43 | # See https://pre-commit.com/hooks.html for more hooks 44 | #- id: check-added-large-files 45 | - id: fix-byte-order-marker 46 | - id: fix-encoding-pragma 47 | - id: check-case-conflict 48 | - id: check-json 49 | - id: check-merge-conflict 50 | - id: check-symlinks 51 | - id: check-toml 52 | - id: check-xml 53 | - id: check-yaml 54 | - id: destroyed-symlinks 55 | - id: detect-aws-credentials 56 | - id: detect-private-key 57 | - id: end-of-file-fixer 58 | - id: forbid-new-submodules 59 | - id: trailing-whitespace 60 | - id: check-case-conflict 61 | - id: check-merge-conflict 62 | - id: debug-statements 63 | - repo: https://github.com/alessandrojcm/commitlint-pre-commit-hook 64 | rev: v9.20.0 65 | hooks: 66 | - id: commitlint 67 | stages: [commit-msg] 68 | additional_dependencies: ["@commitlint/config-angular"] 69 | ci: 70 | # for more information, see https://pre-commit.ci 71 | autofix_commit_msg: | 72 | [pre-commit.ci] auto fixes from pre-commit.com hooks 73 | autofix_prs: true 74 | autoupdate_branch: "" 75 | autoupdate_commit_msg: "[pre-commit.ci] pre-commit autoupdate" 76 | autoupdate_schedule: weekly 77 | skip: [shellcheck, markdown-link-check, commitlint] 78 | submodules: false 79 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/FullStackWithLawrence/openai-embeddings/a72c8f78404f65938f59543d89072393717856f0/.prettierignore -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "tabWidth": 2 3 | } 4 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | init-hook='import sys; print(sys.executable); print(sys.path)' 3 | ignore-paths=venv 4 | ignore=__version__.py 5 | 6 | [FORMAT] 7 | max-line-length=120 8 | 9 | [MESSAGES CONTROL] 10 | disable=C0103 11 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": ["ms-python.black-formatter"] 3 | } 4 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "cornflakes.linter.executablePath": "./venv/bin/flake8", 3 | "[python]": { 4 | "editor.defaultFormatter": "ms-python.black-formatter" 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## [1.3.8](https://github.com/FullStackWithLawrence/openai-embeddings/compare/v1.3.7...v1.3.8) (2025-05-14) 2 | 3 | 4 | ### Bug Fixes 5 | 6 | * force a new release ([48d8a70](https://github.com/FullStackWithLawrence/openai-embeddings/commit/48d8a70b6f2c53733d05366040de9d2812428084)) 7 | 8 | ## [1.3.7](https://github.com/FullStackWithLawrence/openai-embeddings/compare/v1.3.6...v1.3.7) (2025-02-07) 9 | 10 | 11 | ### Bug Fixes 12 | 13 | * broken yaml 
([db3ccfa](https://github.com/FullStackWithLawrence/openai-embeddings/commit/db3ccfa8a6310f04c24a72f49140d6eada7c8f18)) 14 | * remove superfluous checks ([716ede1](https://github.com/FullStackWithLawrence/openai-embeddings/commit/716ede136628193040f4d9863aa2a36b34e3e345)) 15 | 16 | ## [1.3.6](https://github.com/FullStackWithLawrence/openai-embeddings/compare/v1.3.5...v1.3.6) (2025-02-07) 17 | 18 | 19 | ### Bug Fixes 20 | 21 | * breaking changes in unit tests ([90926a9](https://github.com/FullStackWithLawrence/openai-embeddings/commit/90926a95a30a30f12e98841ecce6ac910625be90)) 22 | 23 | # Change Log 24 | 25 | All notable changes to this project will be documented in this file. 26 | The format is based on [Keep a Changelog](http://keepachangelog.com/) and this project adheres to [Semantic Versioning](http://semver.org/). 27 | 28 | ## [1.3.5](https://github.com/FullStackWithLawrence/openai-embeddings/compare/v1.3.4...v1.3.5) (2025-02-05) 29 | 30 | ### Bug Fixes 31 | 32 | - LangChain breaking changes and deprecations ([ac7b57e](https://github.com/FullStackWithLawrence/openai-embeddings/commit/ac7b57e75705afdea1d563c6a9e929504d782e87)) 33 | 34 | ## [1.3.4](https://github.com/FullStackWithLawrence/openai-embeddings/compare/v1.3.3...v1.3.4) (2025-02-05) 35 | 36 | ### Bug Fixes 37 | 38 | - deprecation warnings and breaking changes ([604353e](https://github.com/FullStackWithLawrence/openai-embeddings/commit/604353e60d1197a60c517b14c02dd02909754307)) 39 | 40 | ## [1.3.2](https://github.com/FullStackWithLawrence/openai-embeddings/compare/v1.3.1...v1.3.2) (2024-04-12) 41 | 42 | ### Bug Fixes 43 | 44 | - fix deprecations and breaking changes in LangChain and Pinecone 45 | 46 | ## [1.3.0](https://github.com/FullStackWithLawrence/openai-embeddings/compare/v1.2.2...v1.3.0) (2023-12-19) 47 | 48 | ### Features 49 | 50 | - add pydantic and refactor settings and credentials management 
([332e4da](https://github.com/FullStackWithLawrence/openai-embeddings/commit/332e4dab89924b6ac2436e6d260e645bed26a0b4)) 51 | 52 | ## [1.2.2](https://github.com/FullStackWithLawrence/openai-embeddings/compare/v1.2.1...v1.2.2) (2023-12-19) 55 | 56 | ### Bug Fixes 57 | 58 | - force a new release ([6c04b0b](https://github.com/FullStackWithLawrence/openai-embeddings/commit/6c04b0b95486fa25b40c6f4d1954bd22b58df7c9)) 59 | 60 | ## [1.2.1](https://github.com/FullStackWithLawrence/openai-embeddings/compare/v1.2.0...v1.2.1) (2023-12-04) 61 | 62 | ### Bug Fixes 63 | 64 | - force a new release ([e21f9c5](https://github.com/FullStackWithLawrence/openai-embeddings/commit/e21f9c56b6dc3be3320afb88a491b43fc04d365b)) 65 | 66 | ## [1.2.0](https://github.com/lpm0073/hybrid-search-retriever/compare/v1.1.3...v1.2.0) (2023-12-03) 67 | 68 | ### Features 69 | 70 | - refactor pinecone logic and add pinecone unit tests ([2b8585b](https://github.com/lpm0073/hybrid-search-retriever/commit/2b8585b36e400d04f22e2a5565ea96f4482fd5f4)) 71 | 72 | ## [1.1.3](https://github.com/lpm0073/hybrid-search-retriever/compare/v1.1.2...v1.1.3) (2023-12-02) 73 | 74 | ### Bug Fixes 75 | 76 | - add langchain-experimental for yt example ([f9d6d6d](https://github.com/lpm0073/hybrid-search-retriever/commit/f9d6d6d0b11ff9c1f06faf7eb69511bc5702066d)) 77 | - correct type error with DEBUG_MODE ([a96bdfd](https://github.com/lpm0073/hybrid-search-retriever/commit/a96bdfdb5a0b015740110e02f9f9b06917cd31c7)) 78 | - move retriever results to system_message ([203c8b3](https://github.com/lpm0073/hybrid-search-retriever/commit/203c8b300cda156ac44a0c6e02510c2ab6a2b074)) 79 | 80 | ## [1.1.2](https://github.com/lpm0073/hybrid-search-retriever/compare/v1.1.1...v1.1.2) (2023-12-01) 81 | 82 | ### Bug Fixes 83 | 84 | - syntax error in examples.prompt
([230b709](https://github.com/lpm0073/hybrid-search-retriever/commit/230b7090c96bdd4d7d8757b182f891ab1b82c6f4)) 85 | 86 | ## [1.1.1](https://github.com/lpm0073/netec-llm/compare/v1.1.0...v1.1.1) (2023-12-01) 87 | 88 | ### Bug Fixes 89 | 90 | - had to switch to bm25_encoder so that vector store is searchable ([bad6994](https://github.com/lpm0073/netec-llm/commit/bad699481d217dde81877d85124395529652dabe)) 91 | 92 | # [1.1.0](https://github.com/lpm0073/netec-llm/compare/v1.0.0...v1.1.0) (2023-12-01) 93 | 94 | ### Bug Fixes 95 | 96 | - fix load problem with existing index ([62cd18f](https://github.com/lpm0073/netec-llm/commit/62cd18f8088873a794ec363c4e18770dfdc41ea5)) 97 | 98 | ### Features 99 | 100 | - perfect load(). revert rag() to openai only calls ([8de793d](https://github.com/lpm0073/netec-llm/commit/8de793dcca77ec23f09e68ca9e8dba7f64623b3c)) 101 | - ssm.rag() w load, split, embed, store ([2335d22](https://github.com/lpm0073/netec-llm/commit/2335d22c5fd9092642ff1eb67a34fbcd712d7f9b)) 102 | 103 | # 1.0.0 (2023-11-30) 104 | 105 | ### Features 106 | 107 | - first commit ([9fe5fbb](https://github.com/lpm0073/netec-llm/commit/9fe5fbbd03d278a90a7351a4d907a74783e48684)) 108 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU AFFERO GENERAL PUBLIC LICENSE 2 | Version 3, 19 November 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU Affero General Public License is a free, copyleft license for 11 | software and other kinds of works, specifically designed to ensure 12 | cooperation with the community in the case of network server software. 
13 | 14 | The licenses for most software and other practical works are designed 15 | to take away your freedom to share and change the works. By contrast, 16 | our General Public Licenses are intended to guarantee your freedom to 17 | share and change all versions of a program--to make sure it remains free 18 | software for all its users. 19 | 20 | When we speak of free software, we are referring to freedom, not 21 | price. Our General Public Licenses are designed to make sure that you 22 | have the freedom to distribute copies of free software (and charge for 23 | them if you wish), that you receive source code or can get it if you 24 | want it, that you can change the software or use pieces of it in new 25 | free programs, and that you know you can do these things. 26 | 27 | Developers that use our General Public Licenses protect your rights 28 | with two steps: (1) assert copyright on the software, and (2) offer 29 | you this License which gives you legal permission to copy, distribute 30 | and/or modify the software. 31 | 32 | A secondary benefit of defending all users' freedom is that 33 | improvements made in alternate versions of the program, if they 34 | receive widespread use, become available for other developers to 35 | incorporate. Many developers of free software are heartened and 36 | encouraged by the resulting cooperation. However, in the case of 37 | software used on network servers, this result may fail to come about. 38 | The GNU General Public License permits making a modified version and 39 | letting the public access it on a server without ever releasing its 40 | source code to the public. 41 | 42 | The GNU Affero General Public License is designed specifically to 43 | ensure that, in such cases, the modified source code becomes available 44 | to the community. It requires the operator of a network server to 45 | provide the source code of the modified version running there to the 46 | users of that server. 
Therefore, public use of a modified version, on 47 | a publicly accessible server, gives the public access to the source 48 | code of the modified version. 49 | 50 | An older license, called the Affero General Public License and 51 | published by Affero, was designed to accomplish similar goals. This is 52 | a different license, not a version of the Affero GPL, but Affero has 53 | released a new version of the Affero GPL which permits relicensing under 54 | this license. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | TERMS AND CONDITIONS 60 | 61 | 0. Definitions. 62 | 63 | "This License" refers to version 3 of the GNU Affero General Public License. 64 | 65 | "Copyright" also means copyright-like laws that apply to other kinds of 66 | works, such as semiconductor masks. 67 | 68 | "The Program" refers to any copyrightable work licensed under this 69 | License. Each licensee is addressed as "you". "Licensees" and 70 | "recipients" may be individuals or organizations. 71 | 72 | To "modify" a work means to copy from or adapt all or part of the work 73 | in a fashion requiring copyright permission, other than the making of an 74 | exact copy. The resulting work is called a "modified version" of the 75 | earlier work or a work "based on" the earlier work. 76 | 77 | A "covered work" means either the unmodified Program or a work based 78 | on the Program. 79 | 80 | To "propagate" a work means to do anything with it that, without 81 | permission, would make you directly or secondarily liable for 82 | infringement under applicable copyright law, except executing it on a 83 | computer or modifying a private copy. Propagation includes copying, 84 | distribution (with or without modification), making available to the 85 | public, and in some countries other activities as well. 86 | 87 | To "convey" a work means any kind of propagation that enables other 88 | parties to make or receive copies. 
Mere interaction with a user through 89 | a computer network, with no transfer of a copy, is not conveying. 90 | 91 | An interactive user interface displays "Appropriate Legal Notices" 92 | to the extent that it includes a convenient and prominently visible 93 | feature that (1) displays an appropriate copyright notice, and (2) 94 | tells the user that there is no warranty for the work (except to the 95 | extent that warranties are provided), that licensees may convey the 96 | work under this License, and how to view a copy of this License. If 97 | the interface presents a list of user commands or options, such as a 98 | menu, a prominent item in the list meets this criterion. 99 | 100 | 1. Source Code. 101 | 102 | The "source code" for a work means the preferred form of the work 103 | for making modifications to it. "Object code" means any non-source 104 | form of a work. 105 | 106 | A "Standard Interface" means an interface that either is an official 107 | standard defined by a recognized standards body, or, in the case of 108 | interfaces specified for a particular programming language, one that 109 | is widely used among developers working in that language. 110 | 111 | The "System Libraries" of an executable work include anything, other 112 | than the work as a whole, that (a) is included in the normal form of 113 | packaging a Major Component, but which is not part of that Major 114 | Component, and (b) serves only to enable use of the work with that 115 | Major Component, or to implement a Standard Interface for which an 116 | implementation is available to the public in source code form. A 117 | "Major Component", in this context, means a major essential component 118 | (kernel, window system, and so on) of the specific operating system 119 | (if any) on which the executable work runs, or a compiler used to 120 | produce the work, or an object code interpreter used to run it. 
121 | 122 | The "Corresponding Source" for a work in object code form means all 123 | the source code needed to generate, install, and (for an executable 124 | work) run the object code and to modify the work, including scripts to 125 | control those activities. However, it does not include the work's 126 | System Libraries, or general-purpose tools or generally available free 127 | programs which are used unmodified in performing those activities but 128 | which are not part of the work. For example, Corresponding Source 129 | includes interface definition files associated with source files for 130 | the work, and the source code for shared libraries and dynamically 131 | linked subprograms that the work is specifically designed to require, 132 | such as by intimate data communication or control flow between those 133 | subprograms and other parts of the work. 134 | 135 | The Corresponding Source need not include anything that users 136 | can regenerate automatically from other parts of the Corresponding 137 | Source. 138 | 139 | The Corresponding Source for a work in source code form is that 140 | same work. 141 | 142 | 2. Basic Permissions. 143 | 144 | All rights granted under this License are granted for the term of 145 | copyright on the Program, and are irrevocable provided the stated 146 | conditions are met. This License explicitly affirms your unlimited 147 | permission to run the unmodified Program. The output from running a 148 | covered work is covered by this License only if the output, given its 149 | content, constitutes a covered work. This License acknowledges your 150 | rights of fair use or other equivalent, as provided by copyright law. 151 | 152 | You may make, run and propagate covered works that you do not 153 | convey, without conditions so long as your license otherwise remains 154 | in force. 
You may convey covered works to others for the sole purpose 155 | of having them make modifications exclusively for you, or provide you 156 | with facilities for running those works, provided that you comply with 157 | the terms of this License in conveying all material for which you do 158 | not control copyright. Those thus making or running the covered works 159 | for you must do so exclusively on your behalf, under your direction 160 | and control, on terms that prohibit them from making any copies of 161 | your copyrighted material outside their relationship with you. 162 | 163 | Conveying under any other circumstances is permitted solely under 164 | the conditions stated below. Sublicensing is not allowed; section 10 165 | makes it unnecessary. 166 | 167 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 168 | 169 | No covered work shall be deemed part of an effective technological 170 | measure under any applicable law fulfilling obligations under article 171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 172 | similar laws prohibiting or restricting circumvention of such 173 | measures. 174 | 175 | When you convey a covered work, you waive any legal power to forbid 176 | circumvention of technological measures to the extent such circumvention 177 | is effected by exercising rights under this License with respect to 178 | the covered work, and you disclaim any intention to limit operation or 179 | modification of the work as a means of enforcing, against the work's 180 | users, your or third parties' legal rights to forbid circumvention of 181 | technological measures. 182 | 183 | 4. Conveying Verbatim Copies. 
184 | 185 | You may convey verbatim copies of the Program's source code as you 186 | receive it, in any medium, provided that you conspicuously and 187 | appropriately publish on each copy an appropriate copyright notice; 188 | keep intact all notices stating that this License and any 189 | non-permissive terms added in accord with section 7 apply to the code; 190 | keep intact all notices of the absence of any warranty; and give all 191 | recipients a copy of this License along with the Program. 192 | 193 | You may charge any price or no price for each copy that you convey, 194 | and you may offer support or warranty protection for a fee. 195 | 196 | 5. Conveying Modified Source Versions. 197 | 198 | You may convey a work based on the Program, or the modifications to 199 | produce it from the Program, in the form of source code under the 200 | terms of section 4, provided that you also meet all of these conditions: 201 | 202 | a) The work must carry prominent notices stating that you modified 203 | it, and giving a relevant date. 204 | 205 | b) The work must carry prominent notices stating that it is 206 | released under this License and any conditions added under section 207 | 7. This requirement modifies the requirement in section 4 to 208 | "keep intact all notices". 209 | 210 | c) You must license the entire work, as a whole, under this 211 | License to anyone who comes into possession of a copy. This 212 | License will therefore apply, along with any applicable section 7 213 | additional terms, to the whole of the work, and all its parts, 214 | regardless of how they are packaged. This License gives no 215 | permission to license the work in any other way, but it does not 216 | invalidate such permission if you have separately received it. 
217 | 218 | d) If the work has interactive user interfaces, each must display 219 | Appropriate Legal Notices; however, if the Program has interactive 220 | interfaces that do not display Appropriate Legal Notices, your 221 | work need not make them do so. 222 | 223 | A compilation of a covered work with other separate and independent 224 | works, which are not by their nature extensions of the covered work, 225 | and which are not combined with it such as to form a larger program, 226 | in or on a volume of a storage or distribution medium, is called an 227 | "aggregate" if the compilation and its resulting copyright are not 228 | used to limit the access or legal rights of the compilation's users 229 | beyond what the individual works permit. Inclusion of a covered work 230 | in an aggregate does not cause this License to apply to the other 231 | parts of the aggregate. 232 | 233 | 6. Conveying Non-Source Forms. 234 | 235 | You may convey a covered work in object code form under the terms 236 | of sections 4 and 5, provided that you also convey the 237 | machine-readable Corresponding Source under the terms of this License, 238 | in one of these ways: 239 | 240 | a) Convey the object code in, or embodied in, a physical product 241 | (including a physical distribution medium), accompanied by the 242 | Corresponding Source fixed on a durable physical medium 243 | customarily used for software interchange. 
244 | 245 | b) Convey the object code in, or embodied in, a physical product 246 | (including a physical distribution medium), accompanied by a 247 | written offer, valid for at least three years and valid for as 248 | long as you offer spare parts or customer support for that product 249 | model, to give anyone who possesses the object code either (1) a 250 | copy of the Corresponding Source for all the software in the 251 | product that is covered by this License, on a durable physical 252 | medium customarily used for software interchange, for a price no 253 | more than your reasonable cost of physically performing this 254 | conveying of source, or (2) access to copy the 255 | Corresponding Source from a network server at no charge. 256 | 257 | c) Convey individual copies of the object code with a copy of the 258 | written offer to provide the Corresponding Source. This 259 | alternative is allowed only occasionally and noncommercially, and 260 | only if you received the object code with such an offer, in accord 261 | with subsection 6b. 262 | 263 | d) Convey the object code by offering access from a designated 264 | place (gratis or for a charge), and offer equivalent access to the 265 | Corresponding Source in the same way through the same place at no 266 | further charge. You need not require recipients to copy the 267 | Corresponding Source along with the object code. If the place to 268 | copy the object code is a network server, the Corresponding Source 269 | may be on a different server (operated by you or a third party) 270 | that supports equivalent copying facilities, provided you maintain 271 | clear directions next to the object code saying where to find the 272 | Corresponding Source. Regardless of what server hosts the 273 | Corresponding Source, you remain obligated to ensure that it is 274 | available for as long as needed to satisfy these requirements. 
275 | 276 | e) Convey the object code using peer-to-peer transmission, provided 277 | you inform other peers where the object code and Corresponding 278 | Source of the work are being offered to the general public at no 279 | charge under subsection 6d. 280 | 281 | A separable portion of the object code, whose source code is excluded 282 | from the Corresponding Source as a System Library, need not be 283 | included in conveying the object code work. 284 | 285 | A "User Product" is either (1) a "consumer product", which means any 286 | tangible personal property which is normally used for personal, family, 287 | or household purposes, or (2) anything designed or sold for incorporation 288 | into a dwelling. In determining whether a product is a consumer product, 289 | doubtful cases shall be resolved in favor of coverage. For a particular 290 | product received by a particular user, "normally used" refers to a 291 | typical or common use of that class of product, regardless of the status 292 | of the particular user or of the way in which the particular user 293 | actually uses, or expects or is expected to use, the product. A product 294 | is a consumer product regardless of whether the product has substantial 295 | commercial, industrial or non-consumer uses, unless such uses represent 296 | the only significant mode of use of the product. 297 | 298 | "Installation Information" for a User Product means any methods, 299 | procedures, authorization keys, or other information required to install 300 | and execute modified versions of a covered work in that User Product from 301 | a modified version of its Corresponding Source. The information must 302 | suffice to ensure that the continued functioning of the modified object 303 | code is in no case prevented or interfered with solely because 304 | modification has been made. 
305 | 306 | If you convey an object code work under this section in, or with, or 307 | specifically for use in, a User Product, and the conveying occurs as 308 | part of a transaction in which the right of possession and use of the 309 | User Product is transferred to the recipient in perpetuity or for a 310 | fixed term (regardless of how the transaction is characterized), the 311 | Corresponding Source conveyed under this section must be accompanied 312 | by the Installation Information. But this requirement does not apply 313 | if neither you nor any third party retains the ability to install 314 | modified object code on the User Product (for example, the work has 315 | been installed in ROM). 316 | 317 | The requirement to provide Installation Information does not include a 318 | requirement to continue to provide support service, warranty, or updates 319 | for a work that has been modified or installed by the recipient, or for 320 | the User Product in which it has been modified or installed. Access to a 321 | network may be denied when the modification itself materially and 322 | adversely affects the operation of the network or violates the rules and 323 | protocols for communication across the network. 324 | 325 | Corresponding Source conveyed, and Installation Information provided, 326 | in accord with this section must be in a format that is publicly 327 | documented (and with an implementation available to the public in 328 | source code form), and must require no special password or key for 329 | unpacking, reading or copying. 330 | 331 | 7. Additional Terms. 332 | 333 | "Additional permissions" are terms that supplement the terms of this 334 | License by making exceptions from one or more of its conditions. 335 | Additional permissions that are applicable to the entire Program shall 336 | be treated as though they were included in this License, to the extent 337 | that they are valid under applicable law. 
If additional permissions 338 | apply only to part of the Program, that part may be used separately 339 | under those permissions, but the entire Program remains governed by 340 | this License without regard to the additional permissions. 341 | 342 | When you convey a copy of a covered work, you may at your option 343 | remove any additional permissions from that copy, or from any part of 344 | it. (Additional permissions may be written to require their own 345 | removal in certain cases when you modify the work.) You may place 346 | additional permissions on material, added by you to a covered work, 347 | for which you have or can give appropriate copyright permission. 348 | 349 | Notwithstanding any other provision of this License, for material you 350 | add to a covered work, you may (if authorized by the copyright holders of 351 | that material) supplement the terms of this License with terms: 352 | 353 | a) Disclaiming warranty or limiting liability differently from the 354 | terms of sections 15 and 16 of this License; or 355 | 356 | b) Requiring preservation of specified reasonable legal notices or 357 | author attributions in that material or in the Appropriate Legal 358 | Notices displayed by works containing it; or 359 | 360 | c) Prohibiting misrepresentation of the origin of that material, or 361 | requiring that modified versions of such material be marked in 362 | reasonable ways as different from the original version; or 363 | 364 | d) Limiting the use for publicity purposes of names of licensors or 365 | authors of the material; or 366 | 367 | e) Declining to grant rights under trademark law for use of some 368 | trade names, trademarks, or service marks; or 369 | 370 | f) Requiring indemnification of licensors and authors of that 371 | material by anyone who conveys the material (or modified versions of 372 | it) with contractual assumptions of liability to the recipient, for 373 | any liability that these contractual assumptions directly impose on 
374 | those licensors and authors. 375 | 376 | All other non-permissive additional terms are considered "further 377 | restrictions" within the meaning of section 10. If the Program as you 378 | received it, or any part of it, contains a notice stating that it is 379 | governed by this License along with a term that is a further 380 | restriction, you may remove that term. If a license document contains 381 | a further restriction but permits relicensing or conveying under this 382 | License, you may add to a covered work material governed by the terms 383 | of that license document, provided that the further restriction does 384 | not survive such relicensing or conveying. 385 | 386 | If you add terms to a covered work in accord with this section, you 387 | must place, in the relevant source files, a statement of the 388 | additional terms that apply to those files, or a notice indicating 389 | where to find the applicable terms. 390 | 391 | Additional terms, permissive or non-permissive, may be stated in the 392 | form of a separately written license, or stated as exceptions; 393 | the above requirements apply either way. 394 | 395 | 8. Termination. 396 | 397 | You may not propagate or modify a covered work except as expressly 398 | provided under this License. Any attempt otherwise to propagate or 399 | modify it is void, and will automatically terminate your rights under 400 | this License (including any patent licenses granted under the third 401 | paragraph of section 11). 402 | 403 | However, if you cease all violation of this License, then your 404 | license from a particular copyright holder is reinstated (a) 405 | provisionally, unless and until the copyright holder explicitly and 406 | finally terminates your license, and (b) permanently, if the copyright 407 | holder fails to notify you of the violation by some reasonable means 408 | prior to 60 days after the cessation. 
409 | 410 | Moreover, your license from a particular copyright holder is 411 | reinstated permanently if the copyright holder notifies you of the 412 | violation by some reasonable means, this is the first time you have 413 | received notice of violation of this License (for any work) from that 414 | copyright holder, and you cure the violation prior to 30 days after 415 | your receipt of the notice. 416 | 417 | Termination of your rights under this section does not terminate the 418 | licenses of parties who have received copies or rights from you under 419 | this License. If your rights have been terminated and not permanently 420 | reinstated, you do not qualify to receive new licenses for the same 421 | material under section 10. 422 | 423 | 9. Acceptance Not Required for Having Copies. 424 | 425 | You are not required to accept this License in order to receive or 426 | run a copy of the Program. Ancillary propagation of a covered work 427 | occurring solely as a consequence of using peer-to-peer transmission 428 | to receive a copy likewise does not require acceptance. However, 429 | nothing other than this License grants you permission to propagate or 430 | modify any covered work. These actions infringe copyright if you do 431 | not accept this License. Therefore, by modifying or propagating a 432 | covered work, you indicate your acceptance of this License to do so. 433 | 434 | 10. Automatic Licensing of Downstream Recipients. 435 | 436 | Each time you convey a covered work, the recipient automatically 437 | receives a license from the original licensors, to run, modify and 438 | propagate that work, subject to this License. You are not responsible 439 | for enforcing compliance by third parties with this License. 440 | 441 | An "entity transaction" is a transaction transferring control of an 442 | organization, or substantially all assets of one, or subdividing an 443 | organization, or merging organizations. 
If propagation of a covered 444 | work results from an entity transaction, each party to that 445 | transaction who receives a copy of the work also receives whatever 446 | licenses to the work the party's predecessor in interest had or could 447 | give under the previous paragraph, plus a right to possession of the 448 | Corresponding Source of the work from the predecessor in interest, if 449 | the predecessor has it or can get it with reasonable efforts. 450 | 451 | You may not impose any further restrictions on the exercise of the 452 | rights granted or affirmed under this License. For example, you may 453 | not impose a license fee, royalty, or other charge for exercise of 454 | rights granted under this License, and you may not initiate litigation 455 | (including a cross-claim or counterclaim in a lawsuit) alleging that 456 | any patent claim is infringed by making, using, selling, offering for 457 | sale, or importing the Program or any portion of it. 458 | 459 | 11. Patents. 460 | 461 | A "contributor" is a copyright holder who authorizes use under this 462 | License of the Program or a work on which the Program is based. The 463 | work thus licensed is called the contributor's "contributor version". 464 | 465 | A contributor's "essential patent claims" are all patent claims 466 | owned or controlled by the contributor, whether already acquired or 467 | hereafter acquired, that would be infringed by some manner, permitted 468 | by this License, of making, using, or selling its contributor version, 469 | but do not include claims that would be infringed only as a 470 | consequence of further modification of the contributor version. For 471 | purposes of this definition, "control" includes the right to grant 472 | patent sublicenses in a manner consistent with the requirements of 473 | this License. 
474 | 475 | Each contributor grants you a non-exclusive, worldwide, royalty-free 476 | patent license under the contributor's essential patent claims, to 477 | make, use, sell, offer for sale, import and otherwise run, modify and 478 | propagate the contents of its contributor version. 479 | 480 | In the following three paragraphs, a "patent license" is any express 481 | agreement or commitment, however denominated, not to enforce a patent 482 | (such as an express permission to practice a patent or covenant not to 483 | sue for patent infringement). To "grant" such a patent license to a 484 | party means to make such an agreement or commitment not to enforce a 485 | patent against the party. 486 | 487 | If you convey a covered work, knowingly relying on a patent license, 488 | and the Corresponding Source of the work is not available for anyone 489 | to copy, free of charge and under the terms of this License, through a 490 | publicly available network server or other readily accessible means, 491 | then you must either (1) cause the Corresponding Source to be so 492 | available, or (2) arrange to deprive yourself of the benefit of the 493 | patent license for this particular work, or (3) arrange, in a manner 494 | consistent with the requirements of this License, to extend the patent 495 | license to downstream recipients. "Knowingly relying" means you have 496 | actual knowledge that, but for the patent license, your conveying the 497 | covered work in a country, or your recipient's use of the covered work 498 | in a country, would infringe one or more identifiable patents in that 499 | country that you have reason to believe are valid. 
500 | 501 | If, pursuant to or in connection with a single transaction or 502 | arrangement, you convey, or propagate by procuring conveyance of, a 503 | covered work, and grant a patent license to some of the parties 504 | receiving the covered work authorizing them to use, propagate, modify 505 | or convey a specific copy of the covered work, then the patent license 506 | you grant is automatically extended to all recipients of the covered 507 | work and works based on it. 508 | 509 | A patent license is "discriminatory" if it does not include within 510 | the scope of its coverage, prohibits the exercise of, or is 511 | conditioned on the non-exercise of one or more of the rights that are 512 | specifically granted under this License. You may not convey a covered 513 | work if you are a party to an arrangement with a third party that is 514 | in the business of distributing software, under which you make payment 515 | to the third party based on the extent of your activity of conveying 516 | the work, and under which the third party grants, to any of the 517 | parties who would receive the covered work from you, a discriminatory 518 | patent license (a) in connection with copies of the covered work 519 | conveyed by you (or copies made from those copies), or (b) primarily 520 | for and in connection with specific products or compilations that 521 | contain the covered work, unless you entered into that arrangement, 522 | or that patent license was granted, prior to 28 March 2007. 523 | 524 | Nothing in this License shall be construed as excluding or limiting 525 | any implied license or other defenses to infringement that may 526 | otherwise be available to you under applicable patent law. 527 | 528 | 12. No Surrender of Others' Freedom. 529 | 530 | If conditions are imposed on you (whether by court order, agreement or 531 | otherwise) that contradict the conditions of this License, they do not 532 | excuse you from the conditions of this License. 
If you cannot convey a 533 | covered work so as to satisfy simultaneously your obligations under this 534 | License and any other pertinent obligations, then as a consequence you may 535 | not convey it at all. For example, if you agree to terms that obligate you 536 | to collect a royalty for further conveying from those to whom you convey 537 | the Program, the only way you could satisfy both those terms and this 538 | License would be to refrain entirely from conveying the Program. 539 | 540 | 13. Remote Network Interaction; Use with the GNU General Public License. 541 | 542 | Notwithstanding any other provision of this License, if you modify the 543 | Program, your modified version must prominently offer all users 544 | interacting with it remotely through a computer network (if your version 545 | supports such interaction) an opportunity to receive the Corresponding 546 | Source of your version by providing access to the Corresponding Source 547 | from a network server at no charge, through some standard or customary 548 | means of facilitating copying of software. This Corresponding Source 549 | shall include the Corresponding Source for any work covered by version 3 550 | of the GNU General Public License that is incorporated pursuant to the 551 | following paragraph. 552 | 553 | Notwithstanding any other provision of this License, you have 554 | permission to link or combine any covered work with a work licensed 555 | under version 3 of the GNU General Public License into a single 556 | combined work, and to convey the resulting work. The terms of this 557 | License will continue to apply to the part which is the covered work, 558 | but the work with which it is combined will remain governed by version 559 | 3 of the GNU General Public License. 560 | 561 | 14. Revised Versions of this License. 562 | 563 | The Free Software Foundation may publish revised and/or new versions of 564 | the GNU Affero General Public License from time to time. 
Such new versions 565 | will be similar in spirit to the present version, but may differ in detail to 566 | address new problems or concerns. 567 | 568 | Each version is given a distinguishing version number. If the 569 | Program specifies that a certain numbered version of the GNU Affero General 570 | Public License "or any later version" applies to it, you have the 571 | option of following the terms and conditions either of that numbered 572 | version or of any later version published by the Free Software 573 | Foundation. If the Program does not specify a version number of the 574 | GNU Affero General Public License, you may choose any version ever published 575 | by the Free Software Foundation. 576 | 577 | If the Program specifies that a proxy can decide which future 578 | versions of the GNU Affero General Public License can be used, that proxy's 579 | public statement of acceptance of a version permanently authorizes you 580 | to choose that version for the Program. 581 | 582 | Later license versions may give you additional or different 583 | permissions. However, no additional obligations are imposed on any 584 | author or copyright holder as a result of your choosing to follow a 585 | later version. 586 | 587 | 15. Disclaimer of Warranty. 588 | 589 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 590 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 594 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 595 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 597 | 598 | 16. Limitation of Liability. 
599 | 600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 608 | SUCH DAMAGES. 609 | 610 | 17. Interpretation of Sections 15 and 16. 611 | 612 | If the disclaimer of warranty and limitation of liability provided 613 | above cannot be given local legal effect according to their terms, 614 | reviewing courts shall apply local law that most closely approximates 615 | an absolute waiver of all civil liability in connection with the 616 | Program, unless a warranty or assumption of liability accompanies a 617 | copy of the Program in return for a fee. 618 | 619 | END OF TERMS AND CONDITIONS 620 | 621 | How to Apply These Terms to Your New Programs 622 | 623 | If you develop a new program, and you want it to be of the greatest 624 | possible use to the public, the best way to achieve this is to make it 625 | free software which everyone can redistribute and change under these terms. 626 | 627 | To do so, attach the following notices to the program. It is safest 628 | to attach them to the start of each source file to most effectively 629 | state the exclusion of warranty; and each file should have at least 630 | the "copyright" line and a pointer to where the full notice is found. 
631 | 632 | 633 | Copyright (C) 634 | 635 | This program is free software: you can redistribute it and/or modify 636 | it under the terms of the GNU Affero General Public License as published by 637 | the Free Software Foundation, either version 3 of the License, or 638 | (at your option) any later version. 639 | 640 | This program is distributed in the hope that it will be useful, 641 | but WITHOUT ANY WARRANTY; without even the implied warranty of 642 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 643 | GNU Affero General Public License for more details. 644 | 645 | You should have received a copy of the GNU Affero General Public License 646 | along with this program. If not, see . 647 | 648 | Also add information on how to contact you by electronic and paper mail. 649 | 650 | If your software can interact with users remotely through a computer 651 | network, you should also make sure that it provides a way for users to 652 | get its source. For example, if your program is a web application, its 653 | interface could display a "Source" link that leads users to an archive 654 | of the code. There are many ways you could offer source, and different 655 | solutions will be better for different programs; see section 13 for the 656 | specific requirements. 657 | 658 | You should also get your employer (if you work as a programmer) or school, 659 | if any, to sign a "copyright disclaimer" for the program, if necessary. 660 | For more information on this, and how to apply and follow the GNU AGPL, see 661 | . 662 | 663 | EdX Inc. wishes to state, in clarification of the above license terms, that 664 | any public, independently available web service offered over the network and 665 | communicating with edX's copyrighted works by any form of inter-service 666 | communication, including but not limited to Remote Procedure Call (RPC) 667 | interfaces, is not a work based on our copyrighted work within the meaning 668 | of the license. 
"Corresponding Source" of this work, or works based on this 669 | work, as defined by the terms of this license do not include source code 670 | files for programs used solely to provide those public, independently 671 | available web services. 672 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL := /bin/bash 2 | ifeq ($(OS),Windows_NT) 3 | PYTHON = python.exe 4 | ACTIVATE_VENV = venv\Scripts\activate 5 | else 6 | PYTHON = python3.12 7 | ACTIVATE_VENV = source venv/bin/activate 8 | endif 9 | PIP = $(PYTHON) -m pip 10 | 11 | ifneq ("$(wildcard .env)","") 12 | include .env 13 | else 14 | $(shell echo -e "OPENAI_API_ORGANIZATION=PLEASE-ADD-ME\n\ 15 | OPENAI_API_KEY=PLEASE-ADD-ME\n\ 16 | PINECONE_API_KEY=PLEASE-ADD-ME\n\ 17 | PINECONE_ENVIRONMENT=gcp-starter\n\ 18 | PINECONE_INDEX_NAME=rag\n\ 19 | PINECONE_VECTORSTORE_TEXT_KEY=lc_id\n\ 20 | PINECONE_METRIC=dotproduct\n\ 21 | PINECONE_DIMENSIONS=1536\n\ 22 | OPENAI_CHAT_MODEL_NAME=gpt-4\n\ 23 | OPENAI_PROMPT_MODEL_NAME=gpt-4\n\ 24 | OPENAI_CHAT_TEMPERATURE=0.0\n\ 25 | OPENAI_CHAT_MAX_RETRIES=3\n\ 26 | DEBUG_MODE=True\n" >> .env) 27 | endif 28 | 29 | .PHONY: analyze init activate test lint clean 30 | 31 | # Default target executed when no arguments are given to make. 32 | all: help 33 | 34 | analyze: 35 | cloc . --exclude-ext=svg,json,zip --vcs=git 36 | 37 | init: 38 | make clean && \ 39 | $(PYTHON) -m venv venv && \ 40 | $(ACTIVATE_VENV) && \ 41 | $(PIP) install --upgrade pip && \ 42 | $(PIP) install -r requirements/local.txt && \ 43 | npm install && \ 44 | pre-commit install 45 | 46 | activate: 47 | . venv/bin/activate 48 | 49 | test: 50 | cd models && pytest -v -s tests/ 51 | python -m setup_test 52 | 53 | lint: 54 | pre-commit run --all-files && \ 55 | pylint models && \ 56 | flake8 . && \ 57 | isort . && \ 58 | black . 
59 | 60 | clean: 61 | rm -rf venv && rm -rf node_modules && \ 62 | find ./models/ -name __pycache__ -type d -exec rm -rf {} + 63 | 64 | release: 65 | git commit -m "fix: force a new release" --allow-empty && git push 66 | 67 | ###################### 68 | # HELP 69 | ###################### 70 | 71 | help: 72 | @echo '====================================================================' 73 | @echo 'analyze - generate code analysis report' 74 | @echo 'init - create a Python virtual environment and install dependencies' 75 | @echo 'activate - activate the Python virtual environment' 76 | @echo 'test - run Python unit tests' 77 | @echo 'lint - run Python linting' 78 | @echo 'clean - destroy the Python virtual environment' 79 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OpenAI Embeddings Example 2 | 3 | 🤖 Retrieval Augmented Generation and Hybrid Search 🤖 4 | 5 | [![FullStackWithLawrence](https://a11ybadges.com/badge?text=FullStackWithLawrence&badgeColor=orange&logo=youtube&logoColor=282828)](https://www.youtube.com/@FullStackWithLawrence)
6 | [![OpenAI](https://a11ybadges.com/badge?logo=openai)](https://platform.openai.com/) 7 | [![LangChain](https://a11ybadges.com/badge?text=LangChain&badgeColor=0834ac)](https://www.langchain.com/) 8 | [![Pinecone](https://a11ybadges.com/badge?text=Pinecone&badgeColor=000000)](https://www.pinecone.io/) 9 | [![Python](https://a11ybadges.com/badge?logo=python)](https://www.python.org/) 10 | [![Pydantic](https://a11ybadges.com/badge?text=Pydantic&badgeColor=E520E9)](https://pydantic.dev/)
11 | [![Release Notes](https://img.shields.io/github/release/FullStackWithLawrence/openai-embeddings)](https://github.com/FullStackWithLawrence/openai-embeddings/releases) 12 | ![GHA pushMain Status](https://img.shields.io/github/actions/workflow/status/FullStackWithLawrence/openai-embeddings/pushMain.yml?branch=main) 13 | [![AGPL License](https://img.shields.io/github/license/overhangio/tutor.svg?style=flat-square)](https://www.gnu.org/licenses/agpl-3.0.en.html) 14 | [![hack.d Lawrence McDaniel](https://img.shields.io/badge/hack.d-Lawrence%20McDaniel-orange.svg)](https://lawrencemcdaniel.com) 15 | 16 | A Hybrid Search and Augmented Generation prompting solution using Python [OpenAI API Embeddings](https://platform.openai.com/docs/guides/embeddings) persisted to a [Pinecone](https://docs.pinecone.io/docs/python-client) vector database index and managed by [LangChain](https://www.langchain.com/). Implements the following: 17 | 18 | - **PDF Loader**. a command-line pdf loader program that extracts text, vectorizes, and 19 | loads into a Pinecone dot product vector database that is dimensioned to match OpenAI embeddings. 20 | - **Retrieval Augmented Generation**. A chatGPT prompt based on a hybrid search retriever that locates relevant documents from the vector database and includes these in OpenAI prompts. 21 | 22 | Secondarily, I also use this repo for demonstrating how to setup [Pydantic](https://docs.pydantic.dev/latest/) to manage your project settings and how to safely work with sensitive credentials data inside your project. 23 | 24 | ## Installation 25 | 26 | ```console 27 | git clone https://github.com/FullStackWithLawrence/openai-embeddings.git 28 | cd openai-embeddings 29 | make init 30 | 31 | # Linux/macOS 32 | source venv/bin/activate 33 | 34 | # Windows Powershell (admin) 35 | venv\Scripts\activate 36 | ``` 37 | 38 | You'll also need to add your api keys to the .env file in the root of the repo. 
 39 | 40 | - Get your [OpenAI API key](https://platform.openai.com/api-keys) 41 | - Get your [Pinecone API Key](https://app.pinecone.io/) 42 | 43 | ```console 44 | OPENAI_API_ORGANIZATION=PLEASE-ADD-ME 45 | OPENAI_API_KEY=PLEASE-ADD-ME 46 | PINECONE_API_KEY=PLEASE-ADD-ME 47 | ``` 48 | 49 | ## Usage 50 | 51 | ```console 52 | # example 1 - generic assistant 53 | python3 -m models.examples.prompt "you are a helpful assistant" "What analytics and accounting courses does Wharton offer?" 54 | 55 | # example 2 - assistant with improved system prompting 56 | python3 -m models.examples.prompt "You are a student advisor at University of Pennsylvania. You provide concise answers of 100 words or less." "What analytics and accounting courses does Wharton offer?" 57 | 58 | # example 3 - templated assistant: Online courses 59 | python3 -m models.examples.online_courses "analytics and accounting" 60 | 61 | # example 4 - templated assistant: Certification programs 62 | python3 -m models.examples.certification_programs "analytics and accounting" 63 | 64 | # example 5 - Retrieval Augmented Generation 65 | python3 -m models.examples.load "/path/to/your/pdf/documents" 66 | python3 -m models.examples.rag "What analytics and accounting courses does Wharton offer?" 67 | ``` 68 | 69 | ### Retrieval Augmented Generation 70 | 71 | For the question, _"What analytics and accounting courses does Wharton offer?"_, an 72 | embedding can potentially dramatically alter the response generated by chatGPT. To illustrate, I uploaded a batch of 21 sets of lecture notes in PDF format for an online analytics course taught by Wharton professor [Brian Bushee](https://accounting.wharton.upenn.edu/profile/bushee/). You can download these from https://cdn.lawrencemcdaniel.com/fswl/openai-embeddings-data.zip to test whether your results are consistent.
73 | 74 | #### The control set 75 | 76 | Example 1 above, a generic chatGPT prompt with no additional guidance provided by a system prompt nor an embedding, generates the following response: 77 | 78 | ```console 79 | Wharton offers a variety of analytics and accounting courses. Some of the analytics courses include: 80 | 81 | 1. Introduction to Business Analytics: This course provides an overview of the fundamentals of business analytics, including data analysis, statistical modeling, and decision-making. 82 | 83 | 2. Data Visualization and Communication: This course focuses on the effective presentation and communication of data through visualizations and storytelling techniques. 84 | 85 | 3. Predictive Analytics: This course explores the use of statistical models and machine learning algorithms to predict future outcomes and make data-driven decisions. 86 | 87 | 4. Big Data Analytics: This course covers the analysis of large and complex datasets using advanced techniques and tools, such as Hadoop and Spark. 88 | 89 | In terms of accounting courses, Wharton offers: 90 | 91 | 1. Financial Accounting: This course provides an introduction to the principles and concepts of financial accounting, including the preparation and analysis of financial statements. 92 | 93 | 2. Managerial Accounting: This course focuses on the use of accounting information for internal decision-making and planning, including cost analysis and budgeting. 94 | 95 | 3. Advanced Financial Accounting: This course delves into more complex accounting topics, such as consolidations, partnerships, and international accounting standards. 96 | 97 | 4. Auditing and Assurance Services: This course covers the principles and practices of auditing, including risk assessment, internal controls, and audit procedures. 98 | 99 | These are just a few examples of the analytics and accounting courses offered at Wharton. 
The school offers a wide range of courses to cater to different interests and skill levels in these fields. 100 | (venv) (base) mcdaniel@MacBookAir-Lawrence openai-embeddings % python3 -m models.examples.online_courses "analytics and accounting" 101 | ``` 102 | 103 | #### Same prompt but with an embedding 104 | 105 | After creating an embedding from the sample set of pdf documents, you can prompt models.examples.rag with the same question, and it should provide a quite different response compared to the control from example 1. It should resemble the following: 106 | 107 | ```console 108 | Wharton offers a variety of analytics and accounting courses. Some of the courses offered include: 109 | 110 | 1. Accounting-Based Valuation: This course, taught by Professor Brian Bushee, focuses on using accounting information to value companies and make investment decisions. 111 | 112 | 2. Review of Financial Statements: Also taught by Professor Brian Bushee, this course provides an in-depth understanding of financial statements and how to analyze them for decision-making purposes. 113 | 114 | 3. Discretionary Accruals Model: Another course taught by Professor Brian Bushee, this course explores the concept of discretionary accruals and their impact on financial statements and financial analysis. 115 | 116 | 4. Discretionary Accruals Cases: This course, also taught by Professor Brian Bushee, provides practical applications of the discretionary accruals model through case studies and real-world examples. 117 | 118 | These are just a few examples of the analytics and accounting courses offered at Wharton. The school offers a wide range of courses in these areas to provide students with a comprehensive understanding of financial analysis and decision-making. 119 | ``` 120 | 121 | ## Requirements 122 | 123 | - [git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git). _pre-installed on Linux and macOS_ 124 | - [make](https://gnuwin32.sourceforge.net/packages/make.htm). 
_pre-installed on Linux and macOS._ 125 | - [OpenAI platform API key](https://platform.openai.com/). 126 | _If you're new to OpenAI API then see [How to Get an OpenAI API Key](./doc/OPENAI_API_GETTING_STARTED_GUIDE.md)_ 127 | - [Pinecone](https://www.pinecone.io/) API key. A vector database for storing embedding results. 128 | - [Python 3.12](https://www.python.org/downloads/): for creating virtual environment. Also used by pre-commit linters and code formatters. 129 | - [NodeJS](https://nodejs.org/en/download): used with NPM for configuring/testing Semantic Release. 130 | 131 | ## Configuration defaults 132 | 133 | Set these as environment variables on the command line, or in a .env file that should be located in the root of the repo. 134 | 135 | ```console 136 | # OpenAI API 137 | OPENAI_API_ORGANIZATION=ADD-ME-PLEASE 138 | OPENAI_API_KEY=ADD-ME-PLEASE 139 | OPENAI_CHAT_MODEL_NAME=gpt-4 140 | OPENAI_PROMPT_MODEL_NAME=gpt-4 141 | OPENAI_CHAT_TEMPERATURE=0.0 142 | OPENAI_CHAT_MAX_RETRIES=3 143 | 144 | # Pinecone API 145 | PINECONE_API_KEY=ADD-ME-PLEASE 146 | PINECONE_ENVIRONMENT=gcp-starter 147 | PINECONE_INDEX_NAME=openai-embeddings 148 | PINECONE_VECTORSTORE_TEXT_KEY=lc_id 149 | PINECONE_METRIC=dotproduct 150 | PINECONE_DIMENSIONS=1536 151 | 152 | # This package 153 | DEBUG_MODE=False 154 | ``` 155 | 156 | ## Contributing 157 | 158 | This project uses a mostly automated pull request and unit testing process. See the resources in .github for additional details. You additionally should ensure that pre-commit is installed and working correctly on your dev machine by running the following command from the root of the repo. 
159 | 160 | ```console 161 | pre-commit run --all-files 162 | ``` 163 | 164 | Pull requests should pass these tests before being submitted: 165 | 166 | ```console 167 | make test 168 | ``` 169 | 170 | ### Developer setup 171 | 172 | ```console 173 | git clone https://github.com/lpm0073/automatic-models.git 174 | cd automatic-models 175 | make init 176 | make activate 177 | ``` 178 | 179 | ### Github Actions 180 | 181 | Actions requires the following secrets: 182 | 183 | ```console 184 | PAT: {{ secrets.PAT }} # a GitHub Personal Access Token 185 | OPENAI_API_ORGANIZATION: {{ secrets.OPENAI_API_ORGANIZATION }} 186 | OPENAI_API_KEY: {{ secrets.OPENAI_API_KEY }} 187 | PINECONE_API_KEY: {{ secrets.PINECONE_API_KEY }} 188 | PINECONE_ENVIRONMENT: {{ secrets.PINECONE_ENVIRONMENT }} 189 | PINECONE_INDEX_NAME: {{ secrets.PINECONE_INDEX_NAME }} 190 | ``` 191 | 192 | ## Additional reading 193 | 194 | - [Youtube - Vector Embeddings Tutorial – Code Your Own AI Assistant with GPT-4 API + LangChain + NLP](https://www.youtube.com/watch?v=yfHHvmaMkcA) 195 | - [Youtube - LangChain Explained in 13 Minutes | QuickStart Tutorial for Beginners](https://www.youtube.com/watch?v=aywZrzNaKjs) 196 | - [OpenAI Embeddings](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings) 197 | - [What is a Vector Database?](https://www.pinecone.io/learn/vector-database/) 198 | - [LangChain RAG](https://python.langchain.com/docs/use_cases/question_answering/) 199 | - [LangChain Document Loaders](https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf) 200 | - [LanchChain Caching](https://python.langchain.com/docs/modules/model_io/llms/llm_caching) 201 | -------------------------------------------------------------------------------- /codespell.txt: -------------------------------------------------------------------------------- 1 | OCE 2 | -------------------------------------------------------------------------------- /commitlint.config.js: 
-------------------------------------------------------------------------------- 1 | const Configuration = { 2 | /* 3 | * Resolve and load @commitlint/config-conventional from node_modules. 4 | * Referenced packages must be installed 5 | */ 6 | extends: ["@commitlint/config-conventional", "@commitlint/config-angular"], 7 | /* 8 | * Resolve and load conventional-changelog-atom from node_modules. 9 | * Referenced packages must be installed 10 | */ 11 | parserPreset: "conventional-changelog-atom", 12 | /* 13 | * Resolve and load @commitlint/format from node_modules. 14 | * Referenced package must be installed 15 | */ 16 | formatter: "@commitlint/format", 17 | /* 18 | * Any rules defined here will override rules from @commitlint/config-conventional 19 | */ 20 | rules: {}, 21 | /* 22 | * Array of functions that return true if commitlint should ignore the given message. 23 | * Given array is merged with predefined functions, which consist of matchers like: 24 | * 25 | * - 'Merge pull request', 'Merge X into Y' or 'Merge branch X' 26 | * - 'Revert X' 27 | * - 'v1.2.3' (ie semver matcher) 28 | * - 'Automatic merge X' or 'Auto-merged X into Y' 29 | * 30 | * To see full list, check https://github.com/conventional-changelog/commitlint/blob/master/%40commitlint/is-ignored/src/defaults.ts. 31 | * To disable those ignores and run rules always, set `defaultIgnores: false` as shown below. 32 | */ 33 | /* 34 | ignores: [(commit) => commit === ''], 35 | * Whether commitlint uses the default ignore rules, see the description above. 
36 | */ 37 | defaultIgnores: true, 38 | /* 39 | * Custom URL to show upon failure 40 | */ 41 | helpUrl: 42 | "https://github.com/conventional-changelog/commitlint/#what-is-commitlint", 43 | /* 44 | * Custom prompt configs 45 | */ 46 | prompt: { 47 | messages: {}, 48 | questions: { 49 | type: { 50 | description: "please input type:", 51 | }, 52 | }, 53 | }, 54 | }; 55 | 56 | module.exports = Configuration; 57 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FullStackWithLawrence/openai-embeddings/a72c8f78404f65938f59543d89072393717856f0/models/__init__.py -------------------------------------------------------------------------------- /models/__version__.py: -------------------------------------------------------------------------------- 1 | # Managed via automated CI/CD in .github/workflows/semanticVersionBump.yml. 2 | __version__ = "1.3.7" 3 | -------------------------------------------------------------------------------- /models/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # pylint: disable=no-member 3 | # pylint: disable=E0213,C0103 4 | """ 5 | Configuration for Lambda functions. 6 | 7 | This module is used to configure the Lambda functions. It uses the pydantic_settings 8 | library to validate the configuration values. 
The configuration values are read from 9 | any of the following sources: 10 | - constructor arguments 11 | - environment variables 12 | - terraform.tfvars 13 | - default values 14 | """ 15 | 16 | import importlib.util 17 | import os # library for interacting with the operating system 18 | import platform # library to view information about the server host this Lambda runs on 19 | import re 20 | from typing import Any, Dict, List, Optional 21 | 22 | from dotenv import load_dotenv 23 | from pydantic import Field, SecretStr, ValidationError, field_validator 24 | from pydantic_settings import BaseSettings 25 | 26 | from models.const import HERE 27 | from models.exceptions import ModelConfigurationError, ModelValueError 28 | 29 | 30 | DOT_ENV_LOADED = load_dotenv() 31 | 32 | 33 | def load_version() -> Dict[str, str]: 34 | """Stringify the __version__ module.""" 35 | version_file_path = os.path.join(HERE, "__version__.py") 36 | spec = importlib.util.spec_from_file_location("__version__", version_file_path) 37 | version_module = importlib.util.module_from_spec(spec) 38 | spec.loader.exec_module(version_module) 39 | return version_module.__dict__ 40 | 41 | 42 | VERSION = load_version() 43 | 44 | 45 | def get_semantic_version() -> str: 46 | """ 47 | Return the semantic version number. 48 | 49 | Example valid values of __version__.py are: 50 | 0.1.17 51 | 0.1.17-next.1 52 | 0.1.17-next.2 53 | 0.1.17-next.123456 54 | 0.1.17-next-major.1 55 | 0.1.17-next-major.2 56 | 0.1.17-next-major.123456 57 | 58 | Note: 59 | - pypi does not allow semantic version numbers to contain a dash. 60 | - pypi does not allow semantic version numbers to contain a 'v' prefix. 61 | - pypi does not allow semantic version numbers to contain a 'next' suffix. 
62 | """ 63 | version = VERSION["__version__"] 64 | version = re.sub(r"-next\.\d+", "", version) 65 | return re.sub(r"-next-major\.\d+", "", version) 66 | 67 | 68 | # pylint: disable=too-few-public-methods 69 | class SettingsDefaults: 70 | """Default values for Settings""" 71 | 72 | DEBUG_MODE = False 73 | DUMP_DEFAULTS = False 74 | 75 | LANGCHAIN_MEMORY_KEY = "chat_history" 76 | 77 | PINECONE_API_KEY: SecretStr = SecretStr(None) 78 | PINECONE_ENVIRONMENT = "gcp-starter" 79 | PINECONE_INDEX_NAME = "openai-embeddings" 80 | PINECONE_VECTORSTORE_TEXT_KEY = "lc_id" 81 | PINECONE_METRIC = "dotproduct" 82 | PINECONE_DIMENSIONS = 1536 83 | 84 | OPENAI_API_ORGANIZATION: str = None 85 | OPENAI_API_KEY: SecretStr = SecretStr(None) 86 | OPENAI_ENDPOINT_IMAGE_N = 4 87 | OPENAI_ENDPOINT_IMAGE_SIZE = "1024x768" 88 | OPENAI_CHAT_CACHE = True 89 | OPENAI_CHAT_MODEL_NAME = "gpt-4" 90 | OPENAI_PROMPT_MODEL_NAME = "gpt-4" 91 | OPENAI_CHAT_TEMPERATURE = 0.0 92 | OPENAI_CHAT_MAX_RETRIES = 3 93 | 94 | @classmethod 95 | def to_dict(cls): 96 | """Convert SettingsDefaults to dict""" 97 | return { 98 | key: value 99 | for key, value in SettingsDefaults.__dict__.items() 100 | if not key.startswith("__") and not callable(key) and key != "to_dict" 101 | } 102 | 103 | 104 | def empty_str_to_bool_default(v: str, default: bool) -> bool: 105 | """Convert empty string to default boolean value""" 106 | if v in [None, ""]: 107 | return default 108 | return v.lower() in ["true", "1", "t", "y", "yes"] 109 | 110 | 111 | def empty_str_to_int_default(v: str, default: int) -> int: 112 | """Convert empty string to default integer value""" 113 | if v in [None, ""]: 114 | return default 115 | try: 116 | return int(v) 117 | except ValueError: 118 | return default 119 | 120 | 121 | # pylint: disable=too-many-public-methods 122 | # pylint: disable=too-many-instance-attributes 123 | class Settings(BaseSettings): 124 | """Settings for Lambda functions""" 125 | 126 | _dump: dict = None 127 | 
_pinecone_api_key_source: str = "unset" 128 | _openai_api_key_source: str = "unset" 129 | _initialized: bool = False 130 | 131 | def __init__(self, **data: Any): 132 | super().__init__(**data) 133 | if "PINECONE_API_KEY" in os.environ: 134 | self._pinecone_api_key_source = "environment variable" 135 | elif data.get("pinecone_api_key"): 136 | self._pinecone_api_key_source = "init argument" 137 | if "OPENAI_API_KEY" in os.environ: 138 | self._openai_api_key_source = "environment variable" 139 | elif data.get("openai_api_key"): 140 | self._openai_api_key_source = "init argument" 141 | self._initialized = True 142 | 143 | debug_mode: Optional[bool] = Field( 144 | SettingsDefaults.DEBUG_MODE, 145 | env="DEBUG_MODE", 146 | pre=True, 147 | getter=lambda v: empty_str_to_bool_default(v, SettingsDefaults.DEBUG_MODE), 148 | ) 149 | dump_defaults: Optional[bool] = Field( 150 | SettingsDefaults.DUMP_DEFAULTS, 151 | env="DUMP_DEFAULTS", 152 | pre=True, 153 | getter=lambda v: empty_str_to_bool_default(v, SettingsDefaults.DUMP_DEFAULTS), 154 | ) 155 | 156 | langchain_memory_key: Optional[str] = Field(SettingsDefaults.LANGCHAIN_MEMORY_KEY, env="LANGCHAIN_MEMORY_KEY") 157 | 158 | openai_api_organization: Optional[str] = Field( 159 | SettingsDefaults.OPENAI_API_ORGANIZATION, env="OPENAI_API_ORGANIZATION" 160 | ) 161 | openai_api_key: Optional[SecretStr] = Field(SettingsDefaults.OPENAI_API_KEY, env="OPENAI_API_KEY") 162 | openai_endpoint_image_n: Optional[int] = Field( 163 | SettingsDefaults.OPENAI_ENDPOINT_IMAGE_N, env="OPENAI_ENDPOINT_IMAGE_N" 164 | ) 165 | openai_endpoint_image_size: Optional[str] = Field( 166 | SettingsDefaults.OPENAI_ENDPOINT_IMAGE_SIZE, env="OPENAI_ENDPOINT_IMAGE_SIZE" 167 | ) 168 | openai_chat_cache: Optional[bool] = Field( 169 | SettingsDefaults.OPENAI_CHAT_CACHE, 170 | env="OPENAI_CHAT_CACHE", 171 | pre=True, 172 | getter=lambda v: empty_str_to_bool_default(v, SettingsDefaults.OPENAI_CHAT_CACHE), 173 | ) 174 | openai_chat_model_name: Optional[str] = 
Field(SettingsDefaults.OPENAI_CHAT_MODEL_NAME, env="OPENAI_CHAT_MODEL_NAME") 175 | openai_prompt_model_name: Optional[str] = Field( 176 | SettingsDefaults.OPENAI_PROMPT_MODEL_NAME, env="OPENAI_PROMPT_MODEL_NAME" 177 | ) 178 | openai_chat_temperature: Optional[float] = Field( 179 | SettingsDefaults.OPENAI_CHAT_TEMPERATURE, 180 | env="OPENAI_CHAT_TEMPERATURE", 181 | ge=0.0, 182 | le=1.0, 183 | ) 184 | openai_chat_max_retries: Optional[int] = Field( 185 | SettingsDefaults.OPENAI_CHAT_MAX_RETRIES, 186 | env="OPENAI_CHAT_MAX_RETRIES", 187 | ge=0, 188 | ) 189 | 190 | pinecone_api_key: Optional[SecretStr] = Field(SettingsDefaults.PINECONE_API_KEY, env="PINECONE_API_KEY") 191 | pinecone_environment: Optional[str] = Field(SettingsDefaults.PINECONE_ENVIRONMENT, env="PINECONE_ENVIRONMENT") 192 | pinecone_index_name: Optional[str] = Field(SettingsDefaults.PINECONE_INDEX_NAME, env="PINECONE_INDEX_NAME") 193 | pinecone_vectorstore_text_key: Optional[str] = Field( 194 | SettingsDefaults.PINECONE_VECTORSTORE_TEXT_KEY, env="PINECONE_VECTORSTORE_TEXT_KEY" 195 | ) 196 | pinecone_metric: Optional[str] = Field(SettingsDefaults.PINECONE_METRIC, env="PINECONE_METRIC") 197 | pinecone_dimensions: Optional[int] = Field(SettingsDefaults.PINECONE_DIMENSIONS, env="PINECONE_DIMENSIONS", gt=0) 198 | 199 | @property 200 | def pinecone_api_key_source(self) -> str: 201 | """Pinecone API key source""" 202 | return self._pinecone_api_key_source 203 | 204 | @property 205 | def openai_api_key_source(self) -> str: 206 | """OpenAI API key source""" 207 | return self._openai_api_key_source 208 | 209 | @property 210 | def is_using_dotenv_file(self) -> bool: 211 | """Is the dotenv file being used?""" 212 | return DOT_ENV_LOADED 213 | 214 | @property 215 | def environment_variables(self) -> List[str]: 216 | """Environment variables""" 217 | return list(os.environ.keys()) 218 | 219 | @property 220 | def is_using_tfvars_file(self) -> bool: 221 | """Is the tfvars file being used?""" 222 | return False 223 | 224 
| @property 225 | def tfvars_variables(self) -> List[str]: 226 | """Terraform variables""" 227 | return [] 228 | 229 | @property 230 | def is_using_aws_rekognition(self) -> bool: 231 | """Future: Is the AWS Rekognition service being used?""" 232 | return False 233 | 234 | @property 235 | def is_using_aws_dynamodb(self) -> bool: 236 | """Future: Is the AWS DynamoDB service being used?""" 237 | return False 238 | 239 | @property 240 | def version(self) -> str: 241 | """OpenAI API version""" 242 | return get_semantic_version() 243 | 244 | @property 245 | def dump(self) -> dict: 246 | """Dump all settings.""" 247 | 248 | def recursive_sort_dict(d): 249 | return {k: recursive_sort_dict(v) if isinstance(v, dict) else v for k, v in sorted(d.items())} 250 | 251 | if self._dump and self._initialized: 252 | return self._dump 253 | 254 | self._dump = { 255 | "secrets": { 256 | "openai_api_source": self.openai_api_key_source, 257 | "pinecone_api_source": self.pinecone_api_key_source, 258 | }, 259 | "environment": { 260 | "is_using_tfvars_file": self.is_using_tfvars_file, 261 | "is_using_dotenv_file": self.is_using_dotenv_file, 262 | "os": os.name, 263 | "system": platform.system(), 264 | "release": platform.release(), 265 | "debug_mode": self.debug_mode, 266 | "dump_defaults": self.dump_defaults, 267 | "version": self.version, 268 | }, 269 | "langchain": { 270 | "langchain_memory_key": self.langchain_memory_key, 271 | }, 272 | "openai_api": { 273 | "openai_endpoint_image_n": self.openai_endpoint_image_n, 274 | "openai_endpoint_image_size": self.openai_endpoint_image_size, 275 | "openai_chat_cache": self.openai_chat_cache, 276 | "openai_chat_model_name": self.openai_chat_model_name, 277 | "openai_prompt_model_name": self.openai_prompt_model_name, 278 | "openai_chat_temperature": self.openai_chat_temperature, 279 | "openai_chat_max_retries": self.openai_chat_max_retries, 280 | }, 281 | "pinecone_api": { 282 | "pinecone_environment": self.pinecone_environment, 283 | 
"pinecone_index_name": self.pinecone_index_name, 284 | "pinecone_vectorstore_text_key": self.pinecone_vectorstore_text_key, 285 | "pinecone_metric": self.pinecone_metric, 286 | "pinecone_dimensions": self.pinecone_dimensions, 287 | }, 288 | } 289 | if self.dump_defaults: 290 | settings_defaults = SettingsDefaults.to_dict() 291 | self._dump["settings_defaults"] = settings_defaults 292 | 293 | if self.is_using_dotenv_file: 294 | self._dump["environment"]["dotenv"] = self.environment_variables 295 | 296 | if self.is_using_tfvars_file: 297 | self._dump["environment"]["tfvars"] = self.tfvars_variables 298 | 299 | self._dump = recursive_sort_dict(self._dump) 300 | return self._dump 301 | 302 | # pylint: disable=too-few-public-methods 303 | class Config: 304 | """Pydantic configuration""" 305 | 306 | frozen = True 307 | 308 | @field_validator("debug_mode") 309 | def parse_debug_mode(cls, v) -> bool: 310 | """Parse debug_mode""" 311 | if isinstance(v, bool): 312 | return v 313 | if v in [None, ""]: 314 | return SettingsDefaults.DEBUG_MODE 315 | return v.lower() in ["true", "1", "t", "y", "yes"] 316 | 317 | @field_validator("dump_defaults") 318 | def parse_dump_defaults(cls, v) -> bool: 319 | """Parse dump_defaults""" 320 | if isinstance(v, bool): 321 | return v 322 | if v in [None, ""]: 323 | return SettingsDefaults.DUMP_DEFAULTS 324 | return v.lower() in ["true", "1", "t", "y", "yes"] 325 | 326 | @field_validator("langchain_memory_key") 327 | def check_langchain_memory_key(cls, v) -> str: 328 | """Check langchain_memory_key""" 329 | if v in [None, ""]: 330 | return SettingsDefaults.LANGCHAIN_MEMORY_KEY 331 | return v 332 | 333 | @field_validator("openai_api_organization") 334 | def check_openai_api_organization(cls, v) -> str: 335 | """Check openai_api_organization""" 336 | if v in [None, ""]: 337 | return SettingsDefaults.OPENAI_API_ORGANIZATION 338 | return v 339 | 340 | @field_validator("openai_api_key") 341 | def check_openai_api_key(cls, v) -> SecretStr: 342 | 
"""Check openai_api_key""" 343 | if v in [None, ""]: 344 | return SettingsDefaults.OPENAI_API_KEY 345 | return v 346 | 347 | @field_validator("openai_endpoint_image_n") 348 | def check_openai_endpoint_image_n(cls, v) -> int: 349 | """Check openai_endpoint_image_n""" 350 | if isinstance(v, int): 351 | return v 352 | if v in [None, ""]: 353 | return SettingsDefaults.OPENAI_ENDPOINT_IMAGE_N 354 | return int(v) 355 | 356 | @field_validator("openai_endpoint_image_size") 357 | def check_openai_endpoint_image_size(cls, v) -> str: 358 | """Check openai_endpoint_image_size""" 359 | if v in [None, ""]: 360 | return SettingsDefaults.OPENAI_ENDPOINT_IMAGE_SIZE 361 | return v 362 | 363 | @field_validator("openai_chat_cache") 364 | def check_openai_chat_cache(cls, v) -> bool: 365 | """Check openai_chat_cache""" 366 | if isinstance(v, bool): 367 | return v 368 | if v in [None, ""]: 369 | return SettingsDefaults.OPENAI_CHAT_CACHE 370 | return v.lower() in ["true", "1", "t", "y", "yes"] 371 | 372 | @field_validator("openai_chat_model_name") 373 | def check_openai_chat_model_name(cls, v) -> str: 374 | """Check openai_chat_model_name""" 375 | if v in [None, ""]: 376 | return SettingsDefaults.OPENAI_CHAT_MODEL_NAME 377 | return v 378 | 379 | @field_validator("openai_prompt_model_name") 380 | def check_openai_prompt_model_name(cls, v) -> str: 381 | """Check openai_prompt_model_name""" 382 | if v in [None, ""]: 383 | return SettingsDefaults.OPENAI_PROMPT_MODEL_NAME 384 | return v 385 | 386 | @field_validator("openai_chat_temperature") 387 | def check_openai_chat_temperature(cls, v) -> float: 388 | """Check openai_chat_temperature""" 389 | if v in [None, ""]: 390 | return SettingsDefaults.OPENAI_CHAT_TEMPERATURE 391 | return float(v) 392 | 393 | @field_validator("openai_chat_max_retries") 394 | def check_openai_chat_max_retries(cls, v) -> int: 395 | """Check openai_chat_max_retries""" 396 | if v in [None, ""]: 397 | return SettingsDefaults.OPENAI_CHAT_MAX_RETRIES 398 | return int(v) 399 
| 400 | @field_validator("pinecone_api_key") 401 | def check_pinecone_api_key(cls, v) -> SecretStr: 402 | """Check pinecone_api_key""" 403 | if v in [None, ""]: 404 | return SettingsDefaults.PINECONE_API_KEY 405 | return v 406 | 407 | @field_validator("pinecone_environment") 408 | def check_pinecone_environment(cls, v) -> str: 409 | """Check pinecone_environment""" 410 | if v in [None, ""]: 411 | return SettingsDefaults.PINECONE_ENVIRONMENT 412 | return v 413 | 414 | @field_validator("pinecone_index_name") 415 | def check_pinecone_index_name(cls, v) -> str: 416 | """Check pinecone_index_name""" 417 | if v in [None, ""]: 418 | return SettingsDefaults.PINECONE_INDEX_NAME 419 | return v 420 | 421 | @field_validator("pinecone_vectorstore_text_key") 422 | def check_pinecone_vectorstore_text_key(cls, v) -> str: 423 | """Check pinecone_vectorstore_text_key""" 424 | if v in [None, ""]: 425 | return SettingsDefaults.PINECONE_VECTORSTORE_TEXT_KEY 426 | return v 427 | 428 | @field_validator("pinecone_metric") 429 | def check_pinecone_metric(cls, v) -> str: 430 | """Check pinecone_metric""" 431 | if v in [None, ""]: 432 | return SettingsDefaults.PINECONE_METRIC 433 | return v 434 | 435 | @field_validator("pinecone_dimensions") 436 | def check_pinecone_dimensions(cls, v) -> int: 437 | """Check pinecone_dimensions""" 438 | if v in [None, ""]: 439 | return SettingsDefaults.PINECONE_DIMENSIONS 440 | return int(v) 441 | 442 | 443 | settings = None 444 | try: 445 | settings = Settings() 446 | except (ValidationError, ValueError, ModelConfigurationError, ModelValueError) as e: 447 | raise ModelConfigurationError("Invalid configuration: " + str(e)) from e 448 | -------------------------------------------------------------------------------- /models/const.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # pylint: disable=too-few-public-methods 3 | """Sales Support Model (hsr) for the LangChain project.""" 4 | 5 | 
import os 6 | from pathlib import Path 7 | 8 | 9 | MODULE_NAME = "models" 10 | HERE = os.path.abspath(os.path.dirname(__file__)) 11 | REPO_ROOT = str(Path(HERE).parent) 12 | IS_USING_TFVARS = False 13 | -------------------------------------------------------------------------------- /models/examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FullStackWithLawrence/openai-embeddings/a72c8f78404f65938f59543d89072393717856f0/models/examples/__init__.py -------------------------------------------------------------------------------- /models/examples/certification_programs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Sales Support Model (hsr) for the LangChain project.""" 3 | import argparse 4 | 5 | from models.hybrid_search_retreiver import HybridSearchRetriever 6 | from models.prompt_templates import UofPennPromptTemplates 7 | 8 | 9 | hsr = HybridSearchRetriever() 10 | templates = UofPennPromptTemplates() 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser(description="Hybrid search retrieval - University of Pennsylvania examples") 14 | parser.add_argument("concept", type=str, help="A certification program.") 15 | args = parser.parse_args() 16 | 17 | prompt = templates.certification_programs 18 | result = hsr.prompt_with_template(prompt=prompt, concept=args.concept) 19 | print(result) 20 | -------------------------------------------------------------------------------- /models/examples/load.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Sales Support Model (hsr) Retrieval Augmented Generation (RAG)""" 3 | import argparse 4 | 5 | from models.hybrid_search_retreiver import HybridSearchRetriever 6 | 7 | 8 | hsr = HybridSearchRetriever() 9 | 10 | if __name__ == "__main__": 11 | parser = 
argparse.ArgumentParser(description="RAG example") 12 | parser.add_argument("filepath", type=str, help="Location of PDF documents") 13 | args = parser.parse_args() 14 | 15 | hsr.load(filepath=args.filepath) 16 | -------------------------------------------------------------------------------- /models/examples/online_courses.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Sales Support Model (hsr) for the LangChain project.""" 3 | import argparse 4 | 5 | from models.hybrid_search_retreiver import HybridSearchRetriever 6 | from models.prompt_templates import UofPennPromptTemplates 7 | 8 | 9 | hsr = HybridSearchRetriever() 10 | templates = UofPennPromptTemplates() 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser(description="Hybrid search retrieval - University of Pennsylvania examples") 14 | parser.add_argument("concept", type=str, help="A subject to study: accounting, finance, etc.") 15 | args = parser.parse_args() 16 | 17 | prompt = templates.online_courses 18 | result = hsr.prompt_with_template(prompt=prompt, concept=args.concept) 19 | print(result) 20 | -------------------------------------------------------------------------------- /models/examples/pinecone_init.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Sales Support Model (hsr) Retrieval Augmented Generation (RAG)""" 3 | 4 | import logging 5 | 6 | # this project 7 | from models.conf import settings 8 | from models.pinecone import PineconeIndex 9 | 10 | 11 | logging.basicConfig(level=logging.DEBUG if settings.debug_mode else logging.INFO) 12 | logger = logging.getLogger(__name__) 13 | 14 | pinecone = PineconeIndex() 15 | 16 | if __name__ == "__main__": 17 | pinecone.initialize() 18 | print("Pinecone index initialized. 
name: ", pinecone.index_name) 19 | print(pinecone.index.describe_index_stats()) 20 | -------------------------------------------------------------------------------- /models/examples/prompt.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Sales Support Model (hsr)""" 3 | import argparse 4 | 5 | from langchain.schema import HumanMessage, SystemMessage 6 | 7 | from models.hybrid_search_retreiver import HybridSearchRetriever 8 | 9 | 10 | hsr = HybridSearchRetriever() 11 | 12 | 13 | if __name__ == "__main__": 14 | parser = argparse.ArgumentParser(description="hybrid search retrieval examples") 15 | parser.add_argument("system_message", type=str, help="A system prompt to send to the model.") 16 | parser.add_argument("human_message", type=str, help="A human prompt to send to the model.") 17 | args = parser.parse_args() 18 | 19 | system_message = SystemMessage(content=args.system_message) 20 | human_message = HumanMessage(content=args.human_message) 21 | result = hsr.cached_chat_request(system_message=system_message, human_message=human_message) 22 | print(result.content) 23 | -------------------------------------------------------------------------------- /models/examples/rag.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Sales Support Model (hsr) Retrieval Augmented Generation (RAG)""" 3 | import argparse 4 | 5 | from langchain.schema import HumanMessage 6 | 7 | from models.hybrid_search_retreiver import HybridSearchRetriever 8 | 9 | 10 | hsr = HybridSearchRetriever() 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser(description="Retrieval Augmented Generation (RAG)") 14 | parser.add_argument("prompt", type=str, help="A question about the vectorized PDF contents") 15 | args = parser.parse_args() 16 | 17 | human_message = HumanMessage(content=args.prompt) 18 | result = hsr.rag(human_message=human_message) 
19 | print(result) 20 | -------------------------------------------------------------------------------- /models/exceptions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Module exceptions.py""" 3 | 4 | import openai 5 | 6 | 7 | # pylint: disable=too-few-public-methods 8 | class OpenAIResponseCodes: 9 | """Http response codes from openai API""" 10 | 11 | HTTP_RESPONSE_OK = 200 12 | HTTP_RESPONSE_BAD_REQUEST = 400 13 | HTTP_RESPONSE_INTERNAL_SERVER_ERROR = 500 14 | 15 | 16 | class ModelConfigurationError(Exception): 17 | """Exception raised for errors in the configuration.""" 18 | 19 | def __init__(self, message): 20 | self.message = message 21 | super().__init__(self.message) 22 | 23 | 24 | class ModelValueError(Exception): 25 | """Exception raised for errors in the configuration.""" 26 | 27 | def __init__(self, message): 28 | self.message = message 29 | super().__init__(self.message) 30 | 31 | 32 | class ModelIlligalInvocationError(Exception): 33 | """Exception raised when the service is called by an unknown service.""" 34 | 35 | def __init__(self, message): 36 | self.message = message 37 | super().__init__(self.message) 38 | 39 | 40 | EXCEPTION_MAP = { 41 | ModelValueError: (OpenAIResponseCodes.HTTP_RESPONSE_BAD_REQUEST, "BadRequest"), 42 | ModelConfigurationError: (OpenAIResponseCodes.HTTP_RESPONSE_INTERNAL_SERVER_ERROR, "InternalServerError"), 43 | ModelIlligalInvocationError: (OpenAIResponseCodes.HTTP_RESPONSE_INTERNAL_SERVER_ERROR, "InternalServerError"), 44 | openai.APIError: (OpenAIResponseCodes.HTTP_RESPONSE_BAD_REQUEST, "BadRequest"), 45 | ValueError: (OpenAIResponseCodes.HTTP_RESPONSE_BAD_REQUEST, "BadRequest"), 46 | TypeError: (OpenAIResponseCodes.HTTP_RESPONSE_BAD_REQUEST, "BadRequest"), 47 | NotImplementedError: (OpenAIResponseCodes.HTTP_RESPONSE_BAD_REQUEST, "BadRequest"), 48 | openai.OpenAIError: (OpenAIResponseCodes.HTTP_RESPONSE_INTERNAL_SERVER_ERROR, "InternalServerError"), 
49 | Exception: (OpenAIResponseCodes.HTTP_RESPONSE_INTERNAL_SERVER_ERROR, "InternalServerError"), 50 | } 51 | 52 | 53 | class ConfigurationError(Exception): 54 | """Exception raised for errors in the configuration.""" 55 | 56 | def __init__(self, message): 57 | self.message = message 58 | super().__init__(self.message) 59 | -------------------------------------------------------------------------------- /models/hybrid_search_retreiver.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # pylint: disable=E0611,E1101 3 | """ 4 | Hybrid Search Retriever. A class that combines the following: 5 | - OpenAI prompting and ChatModel 6 | - PromptingWrapper 7 | - Vector embedding with Pinecone 8 | - Hybrid Retriever to combine vector embeddings with text search 9 | 10 | Provides a pdf loader program that extracts text, vectorizes, and 11 | loads into a Pinecone dot product vector database that is dimensioned 12 | to match OpenAI embeddings. 
13 | 14 | See: https://python.langchain.com/docs/modules/model_io/llms/llm_caching 15 | https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf 16 | https://python.langchain.com/docs/integrations/retrievers/pinecone_hybrid_search 17 | """ 18 | 19 | # general purpose imports 20 | import logging 21 | import textwrap 22 | from typing import Union 23 | 24 | # embedding 25 | from langchain.globals import set_llm_cache 26 | from langchain.prompts import PromptTemplate 27 | from langchain.schema import BaseMessage, HumanMessage, SystemMessage 28 | 29 | # pinecone integration 30 | from langchain_community.cache import InMemoryCache 31 | 32 | # hybrid search capability 33 | from langchain_community.retrievers.pinecone_hybrid_search import ( 34 | PineconeHybridSearchRetriever, 35 | ) 36 | 37 | # from langchain_community.chat_models import ChatOpenAI 38 | # prompting and chat 39 | from langchain_openai import ChatOpenAI 40 | from pinecone_text.sparse import BM25Encoder # pylint: disable=import-error 41 | 42 | # this project 43 | from models.conf import settings 44 | from models.pinecone import PineconeIndex 45 | 46 | 47 | logging.basicConfig(level=logging.DEBUG if settings.debug_mode else logging.INFO) 48 | logger = logging.getLogger(__name__) 49 | 50 | 51 | class HybridSearchRetriever: 52 | """Hybrid Search Retriever""" 53 | 54 | _chat: ChatOpenAI = None 55 | _b25_encoder: BM25Encoder = None 56 | _pinecone: PineconeIndex = None 57 | _retriever: PineconeHybridSearchRetriever = None 58 | 59 | def __init__(self): 60 | """Constructor""" 61 | set_llm_cache(InMemoryCache()) 62 | 63 | @property 64 | def pinecone(self) -> PineconeIndex: 65 | """PineconeIndex lazy read-only property.""" 66 | if self._pinecone is None: 67 | self._pinecone = PineconeIndex() 68 | return self._pinecone 69 | 70 | # prompting wrapper 71 | @property 72 | def chat(self) -> ChatOpenAI: 73 | """ChatOpenAI lazy read-only property.""" 74 | if self._chat is None: 75 | self._chat = 
ChatOpenAI( 76 | api_key=settings.openai_api_key.get_secret_value(), # pylint: disable=no-member 77 | organization=settings.openai_api_organization, 78 | cache=settings.openai_chat_cache, 79 | max_retries=settings.openai_chat_max_retries, 80 | model=settings.openai_chat_model_name, 81 | temperature=settings.openai_chat_temperature, 82 | ) 83 | return self._chat 84 | 85 | @property 86 | def bm25_encoder(self) -> BM25Encoder: 87 | """BM25Encoder lazy read-only property.""" 88 | if self._b25_encoder is None: 89 | self._b25_encoder = BM25Encoder().default() 90 | return self._b25_encoder 91 | 92 | @property 93 | def retriever(self) -> PineconeHybridSearchRetriever: 94 | """PineconeHybridSearchRetriever lazy read-only property.""" 95 | if self._retriever is None: 96 | self._retriever = PineconeHybridSearchRetriever( 97 | embeddings=self.pinecone.openai_embeddings, sparse_encoder=self.bm25_encoder, index=self.pinecone.index 98 | ) 99 | return self._retriever 100 | 101 | def cached_chat_request( 102 | self, system_message: Union[str, SystemMessage], human_message: Union[str, HumanMessage] 103 | ) -> BaseMessage: 104 | """Cached chat request.""" 105 | if not isinstance(system_message, SystemMessage): 106 | logger.info("Converting system message to SystemMessage") 107 | system_message = SystemMessage(content=str(system_message)) 108 | 109 | if not isinstance(human_message, HumanMessage): 110 | logger.info("Converting human message to HumanMessage") 111 | human_message = HumanMessage(content=str(human_message)) 112 | messages = [system_message, human_message] 113 | # pylint: disable=not-callable 114 | # retval = self.chat(messages) 115 | retval = self.chat.invoke(messages) 116 | return retval 117 | 118 | # pylint: disable=unused-argument 119 | def prompt_with_template( 120 | self, prompt: PromptTemplate, concept: str, model: str = settings.openai_prompt_model_name 121 | ) -> str: 122 | """Prompt with template.""" 123 | retval = self.chat.invoke(prompt.format(concept=concept)) 
124 | return str(retval.content) if retval else "no response" 125 | 126 | def load(self, filepath: str): 127 | """Pdf loader.""" 128 | self.pinecone.pdf_loader(filepath=filepath) 129 | 130 | def rag(self, human_message: Union[str, HumanMessage]): 131 | """ 132 | Retrieval Augmented Generation prompt. 133 | 1. Retrieve human message prompt: Given a user input, relevant splits are retrieved 134 | from storage using a Retriever. 135 | 2. Generate: A ChatModel / LLM produces an answer using a prompt that includes 136 | the question and the retrieved data 137 | 138 | To prompt OpenAI's GPT-3 model to consider the embeddings from the Pinecone 139 | vector database, you would typically need to convert the embeddings back 140 | into a format that GPT-3 can understand, such as text. However, GPT-3 does 141 | not natively support direct input of embeddings. 142 | 143 | The typical workflow is to use the embeddings to retrieve relevant documents, 144 | and then use the text of these documents as part of the prompt for GPT-3. 145 | """ 146 | if not isinstance(human_message, HumanMessage): 147 | logger.info("Converting human_message to HumanMessage") 148 | human_message = HumanMessage(content=human_message) 149 | 150 | # --------------------------------------------------------------------- 151 | # 1.) Retrieve relevant documents from Pinecone vector database 152 | # --------------------------------------------------------------------- 153 | documents = self.pinecone.vector_store.similarity_search(query=human_message.content) 154 | 155 | # Extract the text from the documents 156 | document_texts = [doc.page_content for doc in documents] 157 | leader = textwrap.dedent( 158 | """\n 159 | You are a helpful assistant. You should assume that all of the 160 | following bullet points that follow are completely factual. 
161 | You should prioritize these enumerated facts when formulating your response:""" 162 | ) 163 | system_message_content = f"{leader} {'\n\n'.join(document_texts)}" 164 | system_message_content = ( 165 | f"{leader} {''.join([f'\n\n{40 * "-"}\n{i + 1}.) {text}\n' for i, text in enumerate(document_texts)])}" 166 | ) 167 | system_message = SystemMessage(content=system_message_content) 168 | # --------------------------------------------------------------------- 169 | # finished with hybrid search setup 170 | # --------------------------------------------------------------------- 171 | star_line = 80 * "*" 172 | logger.info( 173 | "\n%s\n" 174 | "rag() Retrieval Augmented Generation prompt" 175 | "Diagnostic information:\n" 176 | " Retrieved %i related documents from Pinecone\n" 177 | " System messages contains %i words\n" 178 | " System Prompt:" 179 | "\n <============================ BEGIN ===============================>" 180 | "%s" 181 | "\n <============================= END ================================>\n\n", 182 | star_line, 183 | len(documents), 184 | len(system_message.content.split()), 185 | system_message.content, 186 | ) 187 | 188 | # 2.) 
get a response from the chat model 189 | response = self.cached_chat_request(system_message=system_message, human_message=human_message) 190 | 191 | return str(response.content) 192 | -------------------------------------------------------------------------------- /models/pinecone.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # pylint: disable=E0611,E1101 3 | """A class to manage the lifecycle of Pinecone vector database indexes.""" 4 | 5 | # document loading 6 | import glob 7 | 8 | # general purpose imports 9 | import json 10 | import logging 11 | import os 12 | 13 | from langchain.text_splitter import RecursiveCharacterTextSplitter 14 | from langchain_community.document_loaders.pdf import PyPDFLoader 15 | from langchain_openai import OpenAIEmbeddings 16 | from langchain_pinecone import PineconeVectorStore 17 | 18 | # pinecone integration 19 | from pinecone import Pinecone, ServerlessSpec 20 | from pinecone.core.openapi.shared.exceptions import PineconeApiException 21 | from pinecone.models import IndexList 22 | 23 | # this project 24 | from models.conf import settings 25 | 26 | 27 | logging.basicConfig(level=logging.DEBUG if settings.debug_mode else logging.INFO) 28 | 29 | 30 | class PineconeIndex: 31 | """Pinecone helper class.""" 32 | 33 | _pinecone = None 34 | _index: Pinecone.Index = None 35 | _index_name: str = None 36 | _text_splitter: RecursiveCharacterTextSplitter = None 37 | _openai_embeddings: OpenAIEmbeddings = None 38 | _vector_store: PineconeVectorStore = None 39 | 40 | def __init__(self, index_name: str = None): 41 | self.init() 42 | self.index_name = index_name or settings.pinecone_index_name 43 | logging.debug("PineconeIndex initialized with index_name: %s", self.index_name) 44 | logging.debug(self.index_stats) 45 | 46 | @property 47 | def index_name(self) -> str: 48 | """index name.""" 49 | return self._index_name 50 | 51 | @index_name.setter 52 | def index_name(self, value: 
str) -> None: 53 | """Set index name.""" 54 | if self._index_name != value: 55 | self.init() 56 | self._index_name = value 57 | self.init_index() 58 | 59 | @property 60 | def index(self) -> Pinecone.Index: 61 | """pinecone.Index lazy read-only property.""" 62 | if self._index is None: 63 | self.init_index() 64 | self._index = self.pinecone.Index(name=self.index_name) 65 | return self._index 66 | 67 | @property 68 | def index_stats(self) -> dict: 69 | """index stats.""" 70 | retval = self.index.describe_index_stats() 71 | return json.dumps(retval.to_dict(), indent=4) 72 | 73 | @property 74 | def initialized(self) -> bool: 75 | """initialized read-only property.""" 76 | indexes = self.pinecone.list_indexes() 77 | return self.index_name in indexes.names() 78 | 79 | @property 80 | def vector_store(self) -> PineconeVectorStore: 81 | """Pinecone lazy read-only property.""" 82 | if self._vector_store is None: 83 | if not self.initialized: 84 | self.init_index() 85 | self._vector_store = PineconeVectorStore( 86 | index=self.index, 87 | embedding=self.openai_embeddings, 88 | text_key=settings.pinecone_vectorstore_text_key, 89 | ) 90 | return self._vector_store 91 | 92 | @property 93 | def openai_embeddings(self) -> OpenAIEmbeddings: 94 | """OpenAIEmbeddings lazy read-only property.""" 95 | if self._openai_embeddings is None: 96 | # pylint: disable=no-member 97 | self._openai_embeddings = OpenAIEmbeddings( 98 | api_key=settings.openai_api_key.get_secret_value(), 99 | organization=settings.openai_api_organization, 100 | ) 101 | return self._openai_embeddings 102 | 103 | @property 104 | def pinecone(self) -> Pinecone: 105 | """Pinecone lazy read-only property.""" 106 | if self._pinecone is None: 107 | print("Initializing Pinecone...") 108 | api_key = settings.pinecone_api_key.get_secret_value() 109 | print(f"API Key: {api_key[:12]}****------") 110 | self._pinecone = Pinecone(api_key=api_key) 111 | return self._pinecone 112 | 113 | @property 114 | def text_splitter(self) -> 
RecursiveCharacterTextSplitter: 115 | """lazy read-only property.""" 116 | if self._text_splitter is None: 117 | self._text_splitter = RecursiveCharacterTextSplitter() 118 | return self._text_splitter 119 | 120 | def init_index(self): 121 | """Verify that an index named self.index_name exists in Pinecone. If not, create it.""" 122 | indexes: IndexList = None 123 | indexes = self.pinecone.list_indexes() 124 | if self.index_name not in indexes.names(): 125 | logging.debug("Index does not exist.") 126 | self.create() 127 | 128 | # pylint: disable=no-member 129 | def init(self): 130 | """Initialize Pinecone.""" 131 | 132 | self._index = None 133 | self._index_name = None 134 | self._text_splitter = None 135 | self._openai_embeddings = None 136 | self._vector_store = None 137 | 138 | def delete(self): 139 | """Delete index.""" 140 | if not self.initialized: 141 | logging.debug("Index does not exist. Nothing to delete.") 142 | return 143 | print("Deleting index...") 144 | self.pinecone.delete_index(self.index_name) 145 | 146 | def create(self): 147 | """Create index.""" 148 | print("Creating index. This may take a few minutes...") 149 | serverless_spec = ServerlessSpec( 150 | cloud="aws", 151 | region="us-east-1", 152 | ) 153 | try: 154 | self.pinecone.create_index( 155 | name=self.index_name, 156 | dimension=settings.pinecone_dimensions, 157 | metric=settings.pinecone_metric, 158 | spec=serverless_spec, 159 | ) 160 | print("Index created.") 161 | except PineconeApiException: 162 | pass 163 | 164 | def initialize(self): 165 | """Initialize index.""" 166 | self.delete() 167 | self.create() 168 | 169 | def pdf_loader(self, filepath: str): 170 | """ 171 | Embed PDF. 172 | 1. Load PDF document text data 173 | 2. Split into pages 174 | 3. Embed each page 175 | 4. Store in Pinecone 176 | 177 | Note: it's important to make sure that the "context" field that holds the document text 178 | in the metadata is not indexed. 
Currently you need to specify explicitly the fields you 179 | do want to index. For more information checkout 180 | https://docs.pinecone.io/docs/manage-indexes#selective-metadata-indexing 181 | """ 182 | self.initialize() 183 | 184 | pdf_files = glob.glob(os.path.join(filepath, "*.pdf")) 185 | i = 0 186 | for pdf_file in pdf_files: 187 | i += 1 188 | j = len(pdf_files) 189 | print(f"Loading PDF {i} of {j}: {pdf_file}") 190 | loader = PyPDFLoader(file_path=pdf_file) 191 | docs = loader.load() 192 | k = 0 193 | for doc in docs: 194 | k += 1 195 | print(k * "-", end="\r") 196 | documents = self.text_splitter.create_documents([doc.page_content]) 197 | document_texts = [doc.page_content for doc in documents] 198 | embeddings = self.openai_embeddings.embed_documents(document_texts) 199 | self.vector_store.add_documents(documents=documents, embeddings=embeddings) 200 | 201 | print("Finished loading PDFs. \n" + self.index_stats) 202 | -------------------------------------------------------------------------------- /models/prompt_templates.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # pylint: disable=too-few-public-methods 3 | """Sales Support Model (hsr) prompt templates""" 4 | 5 | from langchain.prompts import PromptTemplate 6 | 7 | 8 | class UofPennPromptTemplates: 9 | """University of Pennsylvania (Wharton) student-advisor prompt templates.""" 10 | 11 | sales_role: str = """You are a helpful student advisor at Wharton School of the 12 | University of Pennsylvania.
You provide concise explanations to questions about 13 | the courses they offer in 100 words or less.""" 14 | 15 | @classmethod 16 | def get_properties(cls): 17 | """Return a list of the @property member names of this class.""" 18 | return [attr for attr in dir(cls) if isinstance(getattr(cls, attr), property)] 19 | 20 | @property 21 | def online_courses(self) -> PromptTemplate: 22 | """PromptTemplate: explain Wharton's online courses about {concept}.""" 23 | template = ( 24 | self.sales_role 25 | + """ 26 | Explain the online courses Wharton offers about {concept} 27 | """ 28 | ) 29 | return PromptTemplate(input_variables=["concept"], template=template) 30 | 31 | @property 32 | def certification_programs(self) -> PromptTemplate: 33 | """PromptTemplate: summarize Wharton certificate-granting programs for {concept}.""" 34 | template = ( 35 | self.sales_role 36 | + """ 37 | Summarize their executive and online programs in which learner 38 | can earns certificates for {concept} 39 | """ 40 | ) 41 | return PromptTemplate(input_variables=["concept"], template=template) 42 | 43 | 44 | class NetecPromptTemplates: 45 | """Netec Prompt Templates.""" 46 | 47 | sales_role: str = """You are a helpful sales assistant at Netec who sells 48 | specialized training and exam preparation services to existing customers.
49 | You provide concise explanations of the services that Netec offers in 100 50 | words or less.""" 51 | 52 | @classmethod 53 | def get_properties(cls): 54 | """Return a list of the @property member names of this class.""" 55 | return [attr for attr in dir(cls) if isinstance(getattr(cls, attr), property)] 56 | 57 | @property 58 | def training_services(self) -> PromptTemplate: 59 | """PromptTemplate: explain Netec's training services about {concept}.""" 60 | template = ( 61 | self.sales_role 62 | + """ 63 | Explain the training services that Netec offers about {concept} 64 | """ 65 | ) 66 | return PromptTemplate(input_variables=["concept"], template=template) 67 | 68 | @property 69 | def oracle_training_services(self) -> PromptTemplate: 70 | """PromptTemplate: summarize Netec's Oracle certification training programs for {concept}.""" 71 | template = ( 72 | self.sales_role 73 | + """ 74 | Note that Netec is the exclusive provider in Latin America of Oracle training services 75 | for the 6 levels of Oracle Certification credentials: Oracle Certified Junior Associate (OCJA), 76 | Oracle Certified Associate (OCA), Oracle Certified Professional (OCP), 77 | Oracle Certified Master (OCM), Oracle Certified Expert (OCE) and 78 | Oracle Certified Specialist (OCS).
79 | Summarize their programs for {concept} 80 | """ 81 | ) 82 | return PromptTemplate(input_variables=["concept"], template=template) 83 | -------------------------------------------------------------------------------- /models/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FullStackWithLawrence/openai-embeddings/a72c8f78404f65938f59543d89072393717856f0/models/tests/__init__.py -------------------------------------------------------------------------------- /models/tests/mock_data/.env.test_01: -------------------------------------------------------------------------------- 1 | DEBUG_MODE = True 2 | DUMP_DEFAULTS = True 3 | LANGCHAIN_MEMORY_KEY = "TEST_chat_history" 4 | PINECONE_ENVIRONMENT = "TEST_gcp-starter" 5 | PINECONE_INDEX_NAME = "TEST_rag" 6 | PINECONE_VECTORSTORE_TEXT_KEY = "TEST_lc_id" 7 | PINECONE_METRIC = "TEST_dotproduct" 8 | PINECONE_DIMENSIONS = 1 9 | OPENAI_ENDPOINT_IMAGE_N = 1 10 | OPENAI_ENDPOINT_IMAGE_SIZE = "TEST_1024x768" 11 | OPENAI_CHAT_CACHE = False 12 | OPENAI_CHAT_MODEL_NAME = "TEST_gpt-4" 13 | OPENAI_PROMPT_MODEL_NAME = "TEST_gpt-4" 14 | OPENAI_CHAT_TEMPERATURE = 1.0 15 | OPENAI_CHAT_MAX_RETRIES = 5 16 | -------------------------------------------------------------------------------- /models/tests/mock_data/.env.test_illegal_nulls: -------------------------------------------------------------------------------- 1 | DEBUG_MODE= 2 | AWS_REKOGNITION_FACE_DETECT_MAX_FACES_COUNT= 3 | AWS_REKOGNITION_FACE_DETECT_THRESHOLD= 4 | -------------------------------------------------------------------------------- /models/tests/mock_data/.env.test_legal_nulls: -------------------------------------------------------------------------------- 1 | LANGCHAIN_MEMORY_KEY= 2 | OPENAI_ENDPOINT_IMAGE_SIZE= 3 | -------------------------------------------------------------------------------- /models/tests/mock_data/test_load.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/FullStackWithLawrence/openai-embeddings/a72c8f78404f65938f59543d89072393717856f0/models/tests/mock_data/test_load.pdf -------------------------------------------------------------------------------- /models/tests/test_configuration.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # flake8: noqa: F401 3 | """ 4 | Test conf module. 5 | """ 6 | import os 7 | from unittest.mock import patch 8 | 9 | import pytest # pylint: disable=unused-import 10 | from dotenv import load_dotenv 11 | from pydantic import ValidationError as PydanticValidationError 12 | 13 | from models.conf import Settings, SettingsDefaults 14 | 15 | 16 | HERE = os.path.dirname(os.path.abspath(__file__)) 17 | 18 | 19 | class TestConfig: 20 | """Test config.settings.""" 21 | 22 | def env_path(self, filename): 23 | """Return the path to the .env file.""" 24 | return os.path.join(HERE, "mock_data", filename) 25 | 26 | def test_conf_defaults(self): 27 | """Test that settings == SettingsDefaults when no .env is in use.""" 28 | os.environ.clear() 29 | mock_settings = Settings() 30 | assert mock_settings.langchain_memory_key == SettingsDefaults.LANGCHAIN_MEMORY_KEY 31 | assert mock_settings.debug_mode == SettingsDefaults.DEBUG_MODE 32 | 33 | assert mock_settings.openai_api_key == SettingsDefaults.OPENAI_API_KEY 34 | assert mock_settings.openai_api_organization == SettingsDefaults.OPENAI_API_ORGANIZATION 35 | assert mock_settings.openai_chat_cache == SettingsDefaults.OPENAI_CHAT_CACHE 36 | assert mock_settings.openai_chat_max_retries == SettingsDefaults.OPENAI_CHAT_MAX_RETRIES 37 | assert mock_settings.openai_chat_model_name == SettingsDefaults.OPENAI_CHAT_MODEL_NAME 38 | assert mock_settings.openai_endpoint_image_n == SettingsDefaults.OPENAI_ENDPOINT_IMAGE_N 39 | assert mock_settings.openai_endpoint_image_size == 
SettingsDefaults.OPENAI_ENDPOINT_IMAGE_SIZE 40 | assert mock_settings.openai_prompt_model_name == SettingsDefaults.OPENAI_PROMPT_MODEL_NAME 41 | 42 | assert mock_settings.pinecone_api_key == SettingsDefaults.PINECONE_API_KEY 43 | assert mock_settings.pinecone_dimensions == SettingsDefaults.PINECONE_DIMENSIONS 44 | assert mock_settings.pinecone_environment == SettingsDefaults.PINECONE_ENVIRONMENT 45 | assert mock_settings.pinecone_index_name == SettingsDefaults.PINECONE_INDEX_NAME 46 | assert mock_settings.pinecone_metric == SettingsDefaults.PINECONE_METRIC 47 | assert mock_settings.pinecone_vectorstore_text_key == SettingsDefaults.PINECONE_VECTORSTORE_TEXT_KEY 48 | 49 | # pylint: disable=no-member 50 | def test_conf_defaults_secrets(self): 51 | """Test that settings secrets match the defaults.""" 52 | os.environ.clear() 53 | mock_settings = Settings() 54 | assert mock_settings.openai_api_key.get_secret_value() == SettingsDefaults.OPENAI_API_KEY.get_secret_value() 55 | assert mock_settings.pinecone_api_key.get_secret_value() == SettingsDefaults.PINECONE_API_KEY.get_secret_value() 56 | 57 | def test_env_legal_nulls(self): 58 | """Test that settings falls back to defaults for .env keys that may legally be null.""" 59 | os.environ.clear() 60 | env_path = self.env_path(".env.test_legal_nulls") 61 | print("env_path", env_path) 62 | loaded = load_dotenv(env_path) 63 | assert loaded 64 | 65 | mock_settings = Settings() 66 | assert mock_settings.langchain_memory_key == SettingsDefaults.LANGCHAIN_MEMORY_KEY 67 | assert mock_settings.openai_endpoint_image_size == SettingsDefaults.OPENAI_ENDPOINT_IMAGE_SIZE 68 | 69 | def test_env_illegal_nulls(self): 70 | """Test that nulled .env values for required keys raise a Pydantic validation error.""" 71 | os.environ.clear() 72 | env_path = self.env_path(".env.test_illegal_nulls") 73 | print("env_path", env_path) 74 | loaded = load_dotenv(env_path) 75 | assert loaded 76 | 77 | with pytest.raises(PydanticValidationError): 78 | Settings() 79 | 80 | def test_env_overrides(self): 81 | """Test that settings
takes custom .env values.""" 82 | os.environ.clear() 83 | env_path = self.env_path(".env.test_01") 84 | loaded = load_dotenv(env_path) 85 | assert loaded 86 | 87 | mock_settings = Settings() 88 | 89 | assert mock_settings.debug_mode is True 90 | assert mock_settings.dump_defaults is True 91 | assert mock_settings.langchain_memory_key == "TEST_chat_history" 92 | assert mock_settings.pinecone_environment == "TEST_gcp-starter" 93 | assert mock_settings.pinecone_index_name == "TEST_rag" 94 | assert mock_settings.pinecone_vectorstore_text_key == "TEST_lc_id" 95 | assert mock_settings.pinecone_metric == "TEST_dotproduct" 96 | assert mock_settings.pinecone_dimensions == 1 97 | assert mock_settings.openai_endpoint_image_n == 1 98 | assert mock_settings.openai_endpoint_image_size == "TEST_1024x768" 99 | assert mock_settings.openai_chat_cache is False 100 | assert mock_settings.openai_chat_model_name == "TEST_gpt-4" 101 | assert mock_settings.openai_prompt_model_name == "TEST_gpt-4" 102 | assert mock_settings.openai_chat_temperature == 1.0 103 | assert mock_settings.openai_chat_max_retries == 5 104 | 105 | @patch.dict(os.environ, {"OPENAI_CHAT_MAX_RETRIES": "-1"}) 106 | def test_invalid_chat_max_retries(self): 107 | """Test that Pydantic raises a validation error for environment variable w negative integer values.""" 108 | 109 | with pytest.raises(PydanticValidationError): 110 | Settings() 111 | 112 | @patch.dict(os.environ, {"OPENAI_CHAT_TEMPERATURE": "-1"}) 113 | def test_invalid_chat_temperature(self): 114 | """Test that Pydantic raises a validation error for environment variable w negative integer values.""" 115 | 116 | with pytest.raises(PydanticValidationError): 117 | Settings() 118 | 119 | @patch.dict(os.environ, {"PINECONE_DIMENSIONS": "-1"}) 120 | def test_invalid_pinecone_dimensions(self): 121 | """Test that Pydantic raises a validation error for environment variable w negative integer values.""" 122 | 123 | with pytest.raises(PydanticValidationError): 124 | 
Settings() 125 | 126 | def test_configure_with_class_constructor(self): 127 | """test that we can set values with the class constructor""" 128 | os.environ.clear() 129 | 130 | mock_settings = Settings( 131 | debug_mode=True, 132 | dump_defaults=True, 133 | langchain_memory_key="TEST_chat_history", 134 | pinecone_environment="TEST_gcp-starter", 135 | pinecone_index_name="TEST_rag", 136 | pinecone_vectorstore_text_key="TEST_lc_id", 137 | pinecone_metric="TEST_dotproduct", 138 | pinecone_dimensions=1, 139 | openai_endpoint_image_n=1, 140 | openai_endpoint_image_size="TEST_1024x768", 141 | openai_chat_cache=False, 142 | openai_chat_model_name="TEST_gpt-4", 143 | openai_prompt_model_name="TEST_text-davinci-003", 144 | openai_chat_temperature=1.0, 145 | openai_chat_max_retries=5, 146 | ) 147 | 148 | assert mock_settings.debug_mode is True 149 | assert mock_settings.dump_defaults is True 150 | assert mock_settings.langchain_memory_key == "TEST_chat_history" 151 | assert mock_settings.pinecone_environment == "TEST_gcp-starter" 152 | assert mock_settings.pinecone_index_name == "TEST_rag" 153 | assert mock_settings.pinecone_vectorstore_text_key == "TEST_lc_id" 154 | assert mock_settings.pinecone_metric == "TEST_dotproduct" 155 | assert mock_settings.pinecone_dimensions == 1 156 | assert mock_settings.openai_endpoint_image_n == 1 157 | assert mock_settings.openai_endpoint_image_size == "TEST_1024x768" 158 | assert mock_settings.openai_chat_cache is False 159 | assert mock_settings.openai_chat_model_name == "TEST_gpt-4" 160 | assert mock_settings.openai_prompt_model_name == "TEST_text-davinci-003" 161 | assert mock_settings.openai_chat_temperature == 1.0 162 | assert mock_settings.openai_chat_max_retries == 5 163 | 164 | def test_readonly_settings(self): 165 | """test that we can't set readonly values with the class constructor""" 166 | 167 | mock_settings = Settings() 168 | with pytest.raises(PydanticValidationError): 169 | mock_settings.langchain_memory_key = 
"TEST_chat_history" 170 | with pytest.raises(PydanticValidationError): 171 | mock_settings.pinecone_environment = "TEST_gcp-starter" 172 | with pytest.raises(PydanticValidationError): 173 | mock_settings.pinecone_index_name = "TEST_rag" 174 | with pytest.raises(PydanticValidationError): 175 | mock_settings.pinecone_vectorstore_text_key = "TEST_lc_id" 176 | with pytest.raises(PydanticValidationError): 177 | mock_settings.pinecone_metric = "TEST_dotproduct" 178 | with pytest.raises(PydanticValidationError): 179 | mock_settings.pinecone_dimensions = 1 180 | with pytest.raises(PydanticValidationError): 181 | mock_settings.openai_endpoint_image_n = 1 182 | with pytest.raises(PydanticValidationError): 183 | mock_settings.openai_endpoint_image_size = "TEST_1024x768" 184 | with pytest.raises(PydanticValidationError): 185 | mock_settings.openai_chat_cache = False 186 | with pytest.raises(PydanticValidationError): 187 | mock_settings.openai_chat_model_name = "TEST_gpt-4" 188 | with pytest.raises(PydanticValidationError): 189 | mock_settings.openai_prompt_model_name = "TEST_text-davinci-003" 190 | with pytest.raises(PydanticValidationError): 191 | mock_settings.openai_chat_temperature = 1.0 192 | with pytest.raises(PydanticValidationError): 193 | mock_settings.openai_chat_max_retries = 5 194 | 195 | def test_dump(self): 196 | """Test that dump is a dict.""" 197 | 198 | mock_settings = Settings() 199 | assert isinstance(mock_settings.dump, dict) 200 | 201 | def test_dump_keys(self): 202 | """Test that dump contains the expected keys.""" 203 | 204 | dump = Settings().dump 205 | assert "secrets" in dump.keys() 206 | assert "environment" in dump.keys() 207 | assert "langchain" in dump.keys() 208 | assert "openai_api" in dump.keys() 209 | assert "pinecone_api" in dump.keys() 210 | -------------------------------------------------------------------------------- /models/tests/test_examples.py: -------------------------------------------------------------------------------- 1 | # -*- 
coding: utf-8 -*- 2 | # flake8: noqa: F401 3 | """ 4 | Test command line example prompts. 5 | """ 6 | from unittest.mock import MagicMock, patch 7 | 8 | import pytest # pylint: disable=unused-import 9 | from langchain.schema import HumanMessage, SystemMessage 10 | 11 | from models.examples.certification_programs import hsr as uofpenn_certification_program 12 | from models.examples.online_courses import hsr as uofpenn_online_hsr 13 | from models.examples.prompt import hsr as prompt_hrs 14 | from models.examples.rag import hsr as rag_hsr 15 | from models.prompt_templates import NetecPromptTemplates 16 | 17 | 18 | HUMAN_MESSAGE = "this is a test" 19 | SYSTEM_PROMPT = """you are a helpful assistant. If you are prompted, 20 | 'this is a test', then return the word 'SUCCESS' in upper case. Return only 21 | this single word, in upper case. Do not embellish. do not further prompt 22 | the user for any reason.""" 23 | 24 | 25 | class TestExamples: 26 | """Test command line examples.""" 27 | 28 | @patch("argparse.ArgumentParser.parse_args") 29 | def test_prompt(self, mock_parse_args): 30 | """Test prompt example.""" 31 | 32 | mock_args = MagicMock() 33 | mock_args.system_prompt = SYSTEM_PROMPT 34 | mock_args.human_prompt = HUMAN_MESSAGE 35 | mock_parse_args.return_value = mock_args 36 | 37 | system_message = SystemMessage(content=SYSTEM_PROMPT) 38 | human_message = HumanMessage(content=HUMAN_MESSAGE) 39 | result = prompt_hrs.cached_chat_request(system_message=system_message, human_message=human_message) 40 | assert result.content == "SUCCESS" 41 | 42 | @patch("argparse.ArgumentParser.parse_args") 43 | def test_rag(self, mock_parse_args): 44 | """Test RAG example.""" 45 | mock_args = MagicMock() 46 | mock_args.human_message = HUMAN_MESSAGE 47 | mock_parse_args.return_value = mock_args 48 | 49 | human_message = HumanMessage(content=mock_args.human_message) 50 | result = rag_hsr.rag(human_message=human_message) 51 | assert isinstance(result, str) 52 | assert len(result) > 0 53 
| 54 | @patch("argparse.ArgumentParser.parse_args") 55 | def test_training_services(self, mock_parse_args): 56 | """Test training services templates.""" 57 | mock_args = MagicMock() 58 | mock_args.human_message = HUMAN_MESSAGE 59 | mock_parse_args.return_value = mock_args 60 | 61 | templates = NetecPromptTemplates() 62 | prompt = templates.training_services 63 | 64 | result = uofpenn_certification_program.prompt_with_template(prompt=prompt, concept=mock_args.human_message) 65 | assert isinstance(result, str) 66 | assert len(result) > 0 67 | 68 | @patch("argparse.ArgumentParser.parse_args") 69 | def test_oracle_training_services(self, mock_parse_args): 70 | """Test oracle training services.""" 71 | mock_args = MagicMock() 72 | mock_args.human_message = HUMAN_MESSAGE 73 | mock_parse_args.return_value = mock_args 74 | 75 | templates = NetecPromptTemplates() 76 | prompt = templates.oracle_training_services 77 | 78 | result = uofpenn_online_hsr.prompt_with_template(prompt=prompt, concept=mock_args.human_message) 79 | assert isinstance(result, str) 80 | assert len(result) > 0 81 | -------------------------------------------------------------------------------- /models/tests/test_hsr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # pylint: disable=E0611,E1101 3 | # flake8: noqa: F401 4 | """ 5 | Test integrity of base class. 
6 | """ 7 | import pytest # pylint: disable=unused-import 8 | 9 | # from langchain_community.chat_models import ChatOpenAI 10 | from langchain_openai import ChatOpenAI 11 | 12 | from models.hybrid_search_retreiver import HybridSearchRetriever 13 | from models.pinecone import PineconeIndex 14 | 15 | 16 | class TestSalesSupportModel: 17 | """Test HybridSearchRetriever class.""" 18 | 19 | def test_01_basic(self): 20 | """Ensure that we can instantiate the class.""" 21 | 22 | # pylint: disable=broad-except 23 | try: 24 | HybridSearchRetriever() 25 | except Exception as e: 26 | assert False, f"initialization of HybridSearchRetriever() failed with exception: {e}" 27 | 28 | def test_02_class_aatribute_types(self): 29 | """ensure that class attributes are of the correct type""" 30 | 31 | hsr = HybridSearchRetriever() 32 | assert isinstance(hsr.chat, ChatOpenAI) 33 | assert isinstance(hsr.pinecone, PineconeIndex) 34 | -------------------------------------------------------------------------------- /models/tests/test_openai.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # flake8: noqa: F401 3 | # pylint: disable=too-few-public-methods 4 | """ 5 | Test integrity of base class. 6 | """ 7 | import pytest # pylint: disable=unused-import 8 | 9 | from models.hybrid_search_retreiver import HybridSearchRetriever 10 | 11 | 12 | class TestOpenAI: 13 | """Test HybridSearchRetriever class.""" 14 | 15 | def test_03_test_openai_connectivity(self): 16 | """Ensure that we have connectivity to OpenAI.""" 17 | 18 | hsr = HybridSearchRetriever() 19 | retval = hsr.cached_chat_request( 20 | "your are a helpful assistant", "please return the value 'CORRECT' in all upper case." 
21 | ) 22 | assert retval.content == "CORRECT" 23 | -------------------------------------------------------------------------------- /models/tests/test_pinecone.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # flake8: noqa: F401 3 | """ 4 | Test this model's Pinecone helper class. 5 | """ 6 | 7 | import os 8 | 9 | import pinecone as oem_pinecone 10 | import pytest # pylint: disable=unused-import 11 | from pinecone import Pinecone 12 | 13 | from models.conf import settings 14 | from models.pinecone import PineconeIndex 15 | 16 | 17 | class TestPinecone: 18 | """Test HybridSearchRetriever class.""" 19 | 20 | def test_01_can_instantiate(self): 21 | """Ensure that we instantiate the object.""" 22 | # pylint: disable=broad-except 23 | try: 24 | PineconeIndex() 25 | except Exception as e: 26 | assert False, f"Pinecone() failed with exception: {e}" 27 | 28 | def test_02_init(self): 29 | """Ensure that we can initialize Pinecone.""" 30 | pinecone = PineconeIndex() 31 | # pylint: disable=broad-except 32 | try: 33 | pinecone.init() 34 | except Exception as e: 35 | assert False, f"Pinecone.init() failed with exception: {e}" 36 | 37 | def test_03_index(self): 38 | """Test that the index name is correct.""" 39 | pinecone = PineconeIndex() 40 | assert pinecone.index_name == settings.pinecone_index_name 41 | 42 | def test_04_initialize(self): 43 | """Test that the index initializes.""" 44 | pinecone = PineconeIndex() 45 | # pylint: disable=broad-except 46 | try: 47 | pinecone.initialize() 48 | except Exception as e: 49 | assert False, f"Pinecone.initialize() failed with exception: {e}" 50 | assert isinstance(pinecone.index, oem_pinecone.Index) 51 | 52 | def test_05_delete(self): 53 | """Test that the index can be deleted.""" 54 | pinecone_index = PineconeIndex() 55 | 56 | # pylint: disable=E1101 57 | api_key = settings.pinecone_api_key.get_secret_value() 58 | pinecone = Pinecone(api_key=api_key) 59 | indexes = 
pinecone.list_indexes().names() 60 | assert pinecone_index.index_name in indexes 61 | # pylint: disable=broad-except 62 | try: 63 | pinecone_index.delete() 64 | except Exception as e: 65 | assert False, f"Pinecone.delete() failed with exception: {e}" 66 | 67 | def test_06_create(self): 68 | """Test that the index can be created.""" 69 | pinecone_index = PineconeIndex() 70 | 71 | # pylint: disable=E1101 72 | api_key = settings.pinecone_api_key.get_secret_value() 73 | pinecone = Pinecone(api_key=api_key) 74 | 75 | indexes = pinecone.list_indexes().names() 76 | if pinecone_index.index_name in indexes: 77 | pinecone_index.delete() 78 | 79 | # pylint: disable=broad-except 80 | try: 81 | pinecone_index.create() 82 | except Exception as e: 83 | assert False, f"Pinecone.create() failed with exception: {e}" 84 | assert isinstance(pinecone_index.index, oem_pinecone.Index) 85 | pinecone_index.delete() 86 | 87 | def test_07_load_pdf(self): 88 | """Test that we can load a PDF document to the index.""" 89 | HERE = os.path.dirname(os.path.abspath(__file__)) 90 | test_file = os.path.join(HERE, "mock_data", "test_load.pdf") 91 | 92 | if not os.path.exists(test_file): 93 | pytest.skip(f"File {test_file} does not exist") 94 | 95 | pinecone = PineconeIndex() 96 | # pylint: disable=broad-except 97 | try: 98 | pinecone.pdf_loader(filepath=test_file) 99 | except Exception as e: 100 | assert False, f"Pinecone.load_pdf() failed with exception: {e}" 101 | pinecone.delete() 102 | -------------------------------------------------------------------------------- /models/tests/test_prompt_templates.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # flake8: noqa: F401 3 | # pylint: disable=too-few-public-methods 4 | """ 5 | Test integrity of base class. 
6 | """ 7 | import pytest # pylint: disable=unused-import 8 | from langchain.prompts import PromptTemplate 9 | 10 | from models.prompt_templates import NetecPromptTemplates 11 | 12 | 13 | class TestPromptTemplates: 14 | """Test HybridSearchRetriever class.""" 15 | 16 | def test_01_prompt_with_template(self): 17 | """Ensure that all properties of the template class are PromptTemplate instances.""" 18 | templates = NetecPromptTemplates() 19 | for prop_name in templates.get_properties(): 20 | prop = getattr(templates, prop_name) 21 | assert isinstance(prop, PromptTemplate) 22 | -------------------------------------------------------------------------------- /models/tests/test_prompts.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # flake8: noqa: F401 3 | """ 4 | Test integrity of base class. 5 | """ 6 | import pytest # pylint: disable=unused-import 7 | 8 | from models.hybrid_search_retreiver import HybridSearchRetriever 9 | from models.prompt_templates import NetecPromptTemplates 10 | 11 | 12 | class TestPrompts: 13 | """Test HybridSearchRetriever class.""" 14 | 15 | hsr = HybridSearchRetriever() 16 | templates = NetecPromptTemplates() 17 | 18 | def test_oracle_training_services(self): 19 | """Test a prompt with the Oracle training services template""" 20 | 21 | prompt = self.templates.oracle_training_services 22 | result = self.hsr.prompt_with_template(prompt=prompt, concept="Oracle database administrator") 23 | assert result 24 | assert "Oracle" in result 25 | assert "training" in result 26 | 27 | def test_training_services(self): 28 | """Test a prompt with the training services template""" 29 | 30 | prompt = self.templates.training_services 31 | result = self.hsr.prompt_with_template(prompt=prompt, concept="Microsoft certified Azure AI engineer associate") 32 | assert result 33 | assert "Microsoft" in result or "Azure" in result 34 | assert "training" in result 35 | 
-------------------------------------------------------------------------------- /models/yt.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # pylint: disable=E0611 3 | """ 4 | LangChain Quickstart 5 | ~~~~~~~~~~~~~~~~~~~~ 6 | LangChain Explained in 13 Minutes | QuickStart Tutorial for Beginners 7 | 8 | see: https://www.youtube.com/watch?v=aywZrzNaKjs 9 | https://github.com/rabbitmetrics/langchain-13-min 10 | """ 11 | import logging 12 | import os 13 | 14 | import pinecone 15 | from dotenv import find_dotenv, load_dotenv 16 | 17 | # 5.) sequential chains 18 | # 4.) chains 19 | from langchain.chains.llm import LLMChain 20 | from langchain.chains.sequential import SimpleSequentialChain 21 | 22 | # 3.) prompt templates 23 | from langchain.prompts import PromptTemplate 24 | 25 | # 2.) models and messages 26 | from langchain.schema import HumanMessage, SystemMessage # AIMessage (not used) 27 | 28 | # 6.) embeddings 29 | from langchain.text_splitter import RecursiveCharacterTextSplitter 30 | 31 | # 1.) wrappers 32 | from langchain_community.llms.openai import OpenAI 33 | 34 | # 8.) LangChain agents 35 | from langchain_experimental.agents.agent_toolkits.python.base import create_python_agent 36 | from langchain_experimental.utilities.python import PythonREPL 37 | 38 | # from langchain_community.chat_models import ChatOpenAI 39 | from langchain_openai import ChatOpenAI, OpenAIEmbeddings 40 | 41 | # 7.) 
pinecode client 42 | from langchain_pinecone import PineconeVectorStore as Pinecone 43 | 44 | from models.conf import settings 45 | 46 | 47 | logging.basicConfig(level=logging.DEBUG if settings.debug_mode else logging.INFO) 48 | logger = logging.getLogger(__name__) 49 | 50 | # Load environment variables from .env file in all folders 51 | # pylint: disable=duplicate-code 52 | dotenv_path = find_dotenv() 53 | if os.path.exists(dotenv_path): 54 | load_dotenv(dotenv_path=dotenv_path, verbose=True) 55 | OPENAI_API_KEY = os.environ["OPENAI_API_KEY"] 56 | OPENAI_API_ORGANIZATION = os.environ["OPENAI_API_ORGANIZATION"] 57 | else: 58 | raise FileNotFoundError("No .env file found in root directory of repository") 59 | 60 | 61 | class LangChainDev: 62 | """LangChain Quickstart""" 63 | 64 | PINECONE_INDEX_NAME = "langchain-quickstart" 65 | 66 | multi_prompt_explanation = None 67 | texts_splitter_results = None 68 | pinecone_search = None 69 | openai_embedding = OpenAIEmbeddings(model_name="ada") # minute: 10:05 70 | query_result = None 71 | agent_executor = create_python_agent( # minute: 11:45 72 | llm=OpenAI(temperature=0, max_tokens=1000), 73 | tool=PythonREPL(), 74 | verbose=True, 75 | ) 76 | # pylint: disable=no-member 77 | pinecone.init( 78 | api_key=settings.pinecone_api_key.get_secret_value(), environment=settings.pinecone_environment 79 | ) # minute 10:43 80 | 81 | # LLM wrappers. minute 5:46 82 | def test_01_basic(self): 83 | """Test a basic request""" 84 | 85 | llm = OpenAI(model_name="gpt-4") 86 | retval = llm("explain large language models in one sentence") 87 | print(retval) 88 | 89 | # 2.) models and messages. 
minute 6:08 90 | def test_02_chat_model(self): 91 | """Test a chat model""" 92 | chat = ChatOpenAI(model_name="gpt-4", temperature=0.3) 93 | messages = [ 94 | SystemMessage(content="You are an expert data scientist"), 95 | HumanMessage(content="Write a Python script that trains a neural network on simulated data"), 96 | ] 97 | retval = chat(messages) 98 | print(retval.content, end="\n") 99 | 100 | # 3.) prompt templates. minute 6:56 101 | def get_prompt(self): 102 | """Get a prompt""" 103 | template = """ 104 | You are an expert data scientist with an expertise in building deep learning models. 105 | Explain the concept of {concept} in a couple of lines. 106 | """ 107 | prompt = PromptTemplate(input_variables=["concept"], template=template) 108 | return prompt 109 | 110 | def test_03_prompt_templates(self): 111 | """Test prompt templates""" 112 | llm = OpenAI(model_name="gpt-4") 113 | prompt = self.get_prompt() 114 | retval = llm(prompt.format(concept="regularization")) 115 | print(retval) 116 | 117 | # 4.) chains. minute 7:45 118 | def get_chain(self, llm, prompt): 119 | """Get a chain""" 120 | chain = LLMChain(llm=llm, prompt=prompt) 121 | return chain 122 | 123 | def test_04_chain(self): 124 | """Test a chain""" 125 | llm = OpenAI(model_name="gpt-4") 126 | prompt = self.get_prompt() 127 | chain = self.get_chain(llm=llm, prompt=prompt) 128 | print(chain.run("autoencoder")) 129 | 130 | # 5.) sequential chains. minute 8:06 131 | def get_overall_chain(self, chains): 132 | """Get an overall chain""" 133 | return SimpleSequentialChain(chains=chains, verbose=True) 134 | 135 | def get_prompt_two(self): 136 | """Get a second prompt""" 137 | second_prompt = PromptTemplate( 138 | input_variables=["ml_concept"], 139 | template=""" 140 | Turn the concept description of {ml_concept} and explain it to me like I'm five in 500 words. 
141 | """, 142 | ) 143 | return second_prompt 144 | 145 | def get_explanation(self): 146 | """Get an explanation""" 147 | llm = OpenAI(model_name="gpt-4") 148 | prompt = self.get_prompt() 149 | chain_one = self.get_chain(llm=llm, prompt=prompt) 150 | 151 | second_prompt = self.get_prompt_two() 152 | chain_two = self.get_chain(llm=llm, prompt=second_prompt) 153 | overall_chain = self.get_overall_chain(chains=[chain_one, chain_two]) 154 | return overall_chain.run("autoencoder") 155 | 156 | def test_05_chains(self): 157 | """Test chains""" 158 | self.multi_prompt_explanation = self.get_explanation() 159 | print(self.multi_prompt_explanation) 160 | 161 | # 6.) embeddings. minute 9:00 162 | def test_06_embeddings(self): 163 | """Test embeddings""" 164 | # minute 9:32 165 | text_splitter = RecursiveCharacterTextSplitter( 166 | chunk_size=100, 167 | chunk_overlap=0, 168 | ) 169 | self.multi_prompt_explanation = self.get_explanation() 170 | if not self.texts_splitter_results: 171 | self.texts_splitter_results = text_splitter.create_documents([self.multi_prompt_explanation]) 172 | print(self.texts_splitter_results[0].page_content) 173 | 174 | # minute 10:05 175 | def test_06_embeddings_b(self): 176 | """Test embeddings b""" 177 | if not self.query_result: 178 | self.query_result = self.openai_embedding.embed_query( # minute 10:21 179 | self.texts_splitter_results[0].page_content 180 | ) 181 | print(self.query_result) 182 | 183 | # 7.) pinecone client. minute 11:00 184 | self.pinecone_search = Pinecone.from_documents( 185 | documents=self.texts_splitter_results, 186 | embedding=self.openai_embedding, 187 | index_name=self.PINECONE_INDEX_NAME, 188 | ) 189 | 190 | # pinecone (continued). minute 11:12 191 | def test_07_pinecone_search(self): 192 | """Test pinecone search""" 193 | query = "What is magical about an autoencoder?" 194 | result = self.pinecone_search.similarity_search(query) 195 | print(result) 196 | 197 | # 8.) LangChain agents. minute 11:45 198 | # (unrelated.) 
199 | def test_08_agent_executor(self): 200 | """Test agent executor""" 201 | retval = self.agent_executor.run("Find the roots (zeros) of the quadratic function 3 * x**2 + 2*x -1") 202 | print(retval) 203 | 204 | def main(self): 205 | """Main function""" 206 | # self.test_06_embeddings() 207 | # self.test_06_embeddings_b() 208 | # self.test_07_pinecone_search() 209 | # self.test_08_agent_executor 210 | self.test_03_prompt_templates() 211 | 212 | 213 | def main(): 214 | """Main function""" 215 | pintcode_tests = LangChainDev() 216 | pintcode_tests.main() 217 | 218 | 219 | if __name__ == "__main__": 220 | main() 221 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "private": true, 3 | "scripts": { 4 | "test": "echo \"Error: no test specified\" && exit 1", 5 | "prettier": "prettier --write \"**/*.{js,jsx,ts,tsx,json,css,scss,md}\"" 6 | }, 7 | "devDependencies": { 8 | "@semantic-release/changelog": "^6.0.3", 9 | "@semantic-release/commit-analyzer": "^13.0.0", 10 | "@semantic-release/git": "^10.0.1", 11 | "@semantic-release/github": "^11.0.0", 12 | "@semantic-release/release-notes-generator": "^14.0.0", 13 | "prettier": "^3.1.1", 14 | "typescript": "^5.2.2" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel"] 3 | 4 | [tool.isort] 5 | profile = "black" 6 | lines_after_imports = 2 7 | 8 | [tool.black] 9 | line-length = 120 10 | target-version = ['py311'] 11 | include = '\.pyi?$' 12 | exclude = ''' 13 | /( 14 | \.git 15 | | \.hg 16 | | \.mypy_cache 17 | | \.tox 18 | | \.venv 19 | | venv 20 | | node_modules 21 | | build 22 | | buck-out 23 | | build 24 | | dist 25 | )/ 26 | ''' 27 | 28 | [tool.codespell] 29 | skip = 
'*.svg,models/prompt_templates.py'
ignore-words = 'codespell.txt'
--------------------------------------------------------------------------------
/release.config.js:
--------------------------------------------------------------------------------
module.exports = {
  dryRun: false,
  plugins: [
    "@semantic-release/commit-analyzer",
    "@semantic-release/release-notes-generator",
    [
      "@semantic-release/changelog",
      {
        changelogFile: "CHANGELOG.md",
      },
    ],
    "@semantic-release/github",
    [
      "@semantic-release/git",
      {
        assets: ["CHANGELOG.md", "requirements/base.txt"],
        message:
          "chore(release): ${nextRelease.version} [skip ci]\n\n${nextRelease.notes}",
      },
    ],
  ],
};
--------------------------------------------------------------------------------
/requirements/base.txt:
--------------------------------------------------------------------------------

python-decouple==3.8
langchainhub==0.1.21
langchain-openai==0.3.18
langchain-experimental
openai>=1.40.0
langchain
langchain-pinecone
# NOTE: removed a duplicate "langchain-experimental" entry that was listed twice
pinecone-client==5.0.1
pinecone-text==0.10.0
pydantic==2.10.4
pydantic-settings==2.9.1
python-dotenv==1.1.0
pypdf==5.6.0
tiktoken==0.9.0
--------------------------------------------------------------------------------
/requirements/local.txt:
--------------------------------------------------------------------------------

-r base.txt

# dev and test
# ------------
pytest==8.3.4
pytest_mock==3.14.0

# Code linters, formatters, and security scanners
# ------------
black==25.1.0
flake8==7.2.0
flake8-coding==1.3.2
pre-commit==4.0.1
isort==6.0.1
mypy==1.16.0
pylint==3.3.7
bandit==1.8.3
pydocstringformatter==0.7.3
tox==4.25.0
codespell==2.4.1

-------------------------------------------------------------------------------- /run_pylint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Called from pre-commit. Run pylint on all python files in the current directory 3 | python -m pylint "$@" 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Future use: setup for openai_embeddings package. I use this for instructional purposes, 4 | for demonstrating best practices on how to create a Python package. 5 | 6 | This package is not actually published to PyPi. 7 | """ 8 | import io 9 | import os 10 | from typing import List 11 | 12 | from setuptools import find_packages, setup 13 | 14 | from setup_utils import get_semantic_version # pylint: disable=import-error 15 | 16 | 17 | HERE = os.path.abspath(os.path.dirname(__file__)) 18 | 19 | 20 | def is_requirement(line: str) -> bool: 21 | """ 22 | True if line is a valid requirement line from a 23 | Python requirements file. 24 | """ 25 | return not (line.strip() == "" or line.startswith("#")) 26 | 27 | 28 | def load_requirements(filename: str) -> List[str]: 29 | """ 30 | Returns Python package requirements as a list of semantically 31 | versioned pip packages. 32 | 33 | Args: 34 | filename: The name of the requirements file to load. example: "base.txt" 35 | 36 | Returns: 37 | A list of package requirements. 38 | ['pytest==8.3.4', 'pytest_mock==3.14.0', 'black==25.1.0', ... 
more packages ] 39 | """ 40 | with io.open(os.path.join(HERE, "requirements", filename), "rt", encoding="utf-8") as f: 41 | return [line.strip() for line in f if is_requirement(line) and not line.startswith("-r")] 42 | 43 | 44 | setup( 45 | name="openai_embeddings", 46 | version=get_semantic_version(), 47 | description="""A Hybrid Search and Augmented Generation prompting solution using 48 | Python [OpenAI](https://openai.com/) embeddings sourced from 49 | [Pinecone](https://docs.pinecone.io/docs/python-client) vector database indexes and 50 | managed by [LangChain](https://www.langchain.com/).""", 51 | author="Lawrence McDaniel", 52 | author_email="lpm0073@gmail.com", 53 | url="https://lawrencemcdaniel.com/", 54 | packages=find_packages(), 55 | package_data={ 56 | "openai_embeddings": ["*.md"], 57 | }, 58 | install_requires=load_requirements("base.txt"), 59 | extras_require={ 60 | "dev": load_requirements("local.txt"), 61 | }, 62 | ) 63 | -------------------------------------------------------------------------------- /setup_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Test setup.py.""" 3 | import subprocess 4 | import unittest 5 | 6 | 7 | class TestSetup(unittest.TestCase): 8 | """Test setup.py.""" 9 | 10 | def test_setup_syntax(self): 11 | """Test setup.py syntax.""" 12 | result = subprocess.run(["python", "setup.py", "check"], capture_output=True, text=True, check=False) 13 | assert result.returncode == 0, f"setup.py failed with output:\n{result.stdout}\n{result.stderr}" 14 | assert not result.stderr, "Expected no error output" 15 | 16 | 17 | if __name__ == "__main__": 18 | unittest.main() 19 | -------------------------------------------------------------------------------- /setup_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # pylint: disable=duplicate-code 3 | """Lawrence McDaniel 
https://lawrencemcdaniel.com."""
import importlib.util
import os
import re
from typing import Dict


MODULE_NAME = "models"
HERE = os.path.abspath(os.path.dirname(__file__))
PROJECT_ROOT = os.path.abspath(os.path.join(HERE, MODULE_NAME))

# allow setup.py to be run from any path
os.chdir(os.path.normpath(os.path.join(os.path.abspath(__file__), os.pardir)))


def load_version() -> Dict[str, str]:
    """Stringify the __version__ module.

    Loads models/__version__.py directly by file path (no package import
    required) and returns its module namespace dict; callers read the
    "__version__" key from the result.
    """
    version_file_path = os.path.join(PROJECT_ROOT, "__version__.py")
    spec = importlib.util.spec_from_file_location("__version__", version_file_path)
    version_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(version_module)
    return version_module.__dict__


VERSION = load_version()


def get_semantic_version() -> str:
    """
    Return the semantic version number.

    Example valid values of __version__.py are:
    0.1.17
    0.1.17-next.1
    0.1.17-next.2
    0.1.17-next.123456
    0.1.17-next-major.1
    0.1.17-next-major.2
    0.1.17-next-major.123456

    Note:
    - pypi does not allow semantic version numbers to contain a dash.
    - pypi does not allow semantic version numbers to contain a 'v' prefix.
    - pypi does not allow semantic version numbers to contain a 'next' suffix.
    """
    version = VERSION["__version__"]
    # Strip "-next.N" first; the pattern requires a literal "." right after
    # "next", so it cannot accidentally match "-next-major.N", which the
    # second substitution then removes.
    version = re.sub(r"-next\.\d+", "", version)
    return re.sub(r"-next-major\.\d+", "", version)
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
# setup a basic tox environment for flake8 with the default python3.11
# environment
[tox]
envlist = py3.11,flake8
skip_missing_interpreters = true

# NOTE(review): this section uses pyproject-style quoting; isort does not read
# tox.ini sections written this way, so it is presumably inert -- confirm.
[tool.isort]
profile = "black"
skip =venv,node_modules

# NOTE(review): 3.12 maps to the py311 env below -- envlist only defines a 3.11
# environment, so this looks like deliberate reuse; verify.
[gh-actions]
python =
    3.8: gitlint,py38,flake8
    3.9: gitlint,py39,flake8
    3.10: gitlint,py310,flake8
    3.11: gitlint,py311,flake8,mypy,black,pylint
    3.12: gitlint,py311,flake8,mypy,black,pylint

# NOTE(review): requirements live under requirements/ (base.txt, local.txt) in
# this repo; a top-level requirements.txt is not visible here -- confirm path.
[testenv]
deps = -rrequirements.txt
commands = pytest

[testenv:flake8]
skip_install = True
deps = flake8
commands = flake8

[testenv:gitlint]
skip_install = True
deps = gitlint
commands = gitlint {posargs}

[testenv:bumpversion]
skip_install = True
passenv =
    # Git can only find its global configuration if it knows where the
    # user's HOME is.
    HOME
    # We set sign_tags in .bumpversion.cfg, so pass in the GnuPG agent
    # reference to avoid having to retype the passphrase for an
    # already-cached private key.
    GPG_AGENT_INFO
deps = bump2version
commands = bump2version {posargs}

[testenv:pylint]
deps = pylint
commands =
    pylint .