├── .gitattributes ├── .github ├── CODEOWNERS ├── renovate.json └── workflows │ └── ci.yml ├── examples ├── llama_index_examples │ ├── __init__.py │ ├── pyproject.toml │ ├── 05-github-check-access.py │ ├── 08-confluence-check-access.py │ ├── 07-jira-check-acess.py │ ├── README.md │ ├── 02-gdrive-check-access.py │ ├── 03-rag-LlamaIndex-gdrive-filter.py │ ├── 01-rag-LlamaIndex-gdrive-processor.py │ └── 02-rag-LlamaIndex-all-sources-processor.py ├── .gitignore ├── multipass_examples │ ├── 06-gitlab-check-access.py │ ├── pyproject.toml │ ├── 01-github-check-access.py │ ├── 03-slack-check-access.py │ ├── 04-dropbox-check-access.py │ └── README.md ├── langchain_examples │ ├── pyproject.toml │ ├── 05-github-check-access.py │ ├── 02-rag-LangChain-gdrive.py │ └── 01-rag-LangChain-all-sources.py └── README.md ├── packages ├── pangea-multipass │ ├── tests │ │ ├── __init__.py │ │ └── integration │ │ │ ├── __init__.py │ │ │ ├── test_github.py │ │ │ ├── test_gitlab.py │ │ │ ├── test_slack.py │ │ │ └── test_dropbox.py │ ├── pangea_multipass │ │ ├── py.typed │ │ ├── sources │ │ │ ├── github │ │ │ │ ├── __init__.py │ │ │ │ └── github.py │ │ │ ├── gitlab │ │ │ │ ├── __init__.py │ │ │ │ └── gitlab.py │ │ │ ├── slack │ │ │ │ ├── __init__.py │ │ │ │ └── slack.py │ │ │ ├── dropbox │ │ │ │ ├── __init__.py │ │ │ │ └── dropbox.py │ │ │ ├── jira │ │ │ │ ├── __init__.py │ │ │ │ └── jira.py │ │ │ ├── gdrive │ │ │ │ └── __init__.py │ │ │ ├── confluence │ │ │ │ └── __init__.py │ │ │ └── __init__.py │ │ ├── __init__.py │ │ ├── utils.py │ │ ├── dropbox_reader.py │ │ ├── oauth.py │ │ ├── gitlab_reader.py │ │ ├── github_reader.py │ │ ├── slack_reader.py │ │ └── core.py │ ├── README.md │ ├── pyproject.toml │ └── CHANGELOG.md ├── pangea-multipass-langchain │ ├── pangea_multipass_langchain │ │ ├── py.typed │ │ ├── __init__.py │ │ └── langchain.py │ ├── CHANGELOG.md │ ├── pyproject.toml │ └── README.md ├── pangea-multipass-llama-index │ ├── pangea_multipass_llama_index │ │ ├── py.typed │ │ ├── __init__.py │ │ └── llama_index.py │ ├── CHANGELOG.md │ ├── pyproject.toml │ └── README.md └── .gitignore ├── dev ├── setup_repo.sh └── validate_tag.sh ├── LICENSE ├── .pre-commit-config.yaml ├── README.md ├── .gitignore └── EXTENDING.md /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto eol=lf 2 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @pangeacyber/sdks 2 | -------------------------------------------------------------------------------- /examples/llama_index_examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /packages/pangea-multipass/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /packages/pangea-multipass-langchain/pangea_multipass_langchain/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/packages/pangea-multipass-llama-index/pangea_multipass_llama_index/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | .env_* 3 | storage/ 4 | .mypy_cache/ 5 | *.json 6 | -------------------------------------------------------------------------------- /packages/.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | .env_* 3 | storage/ 4 | .mypy_cache/ 5 | *.json 6 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/sources/github/__init__.py: -------------------------------------------------------------------------------- 1 | from .github import GitHubClient, GitHubProcessor 2 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/sources/gitlab/__init__.py: -------------------------------------------------------------------------------- 1 | from .gitlab import GitLabClient, GitLabProcessor 2 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/sources/slack/__init__.py: -------------------------------------------------------------------------------- 1 | from .slack import SlackClient, SlackProcessor 2 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/sources/dropbox/__init__.py: -------------------------------------------------------------------------------- 1 | from .dropbox import DropboxClient, DropboxProcessor 2 | -------------------------------------------------------------------------------- /packages/pangea-multipass/tests/integration/__init__.py: -------------------------------------------------------------------------------- 1 | from .test_github import TestGitHub 2 | from .test_gitlab import TestGitLab 3 | from .test_slack import TestSlack 4 | -------------------------------------------------------------------------------- /packages/pangea-multipass-langchain/pangea_multipass_langchain/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | from .langchain import * 5 | -------------------------------------------------------------------------------- /packages/pangea-multipass-llama-index/pangea_multipass_llama_index/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | from .llama_index import * 5 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/sources/jira/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | from .jira import JiraAuth, JiraME, JiraProcessor 5 | -------------------------------------------------------------------------------- /dev/setup_repo.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env zsh 2 | 3 | # Currently this script only supports Macs/ZSH, please add more 
as needed 4 | brew install pre-commit 5 | 6 | echo "Installing pre-commit hooks" 7 | pre-commit install 8 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/sources/gdrive/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | from .gdrive import GDriveAPI, GDriveME, GDriveProcessor 5 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/sources/confluence/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | from .confluence import ConfluenceAuth, ConfluenceME, ConfluenceProcessor 5 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/sources/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | from .confluence import * 5 | from .dropbox import * 6 | from .gdrive import * 7 | from .github import * 8 | from .gitlab import * 9 | from .jira import * 10 | from .slack import * 11 | -------------------------------------------------------------------------------- /examples/multipass_examples/06-gitlab-check-access.py: -------------------------------------------------------------------------------- 1 | # Ingestion time 2 | import os 3 | 4 | from pangea_multipass import GitLabProcessor, GitLabReader, get_document_metadata 5 | 6 | token = os.getenv("GITLAB_ADMIN_TOKEN") 7 | assert token 8 | 9 | username = os.getenv("GITLAB_USERNAME") 10 | assert username 11 | 12 | reader = GitLabReader(token=token) 13 | print("Loading data...") 14 | files = reader.load_data() 15 | print(f"Loaded {len(files)} files.") 16 | 17 | 18 | # Inference time 19 | processor = GitLabProcessor(admin_token=token, username=username, get_node_metadata=get_document_metadata) 20 | 21 | authorized_files = processor.filter(files) 22 | print(f"User '{username}' has access to {len(authorized_files)} files.") 23 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | from .core import ( 5 | Constant, 6 | DocumentReader, 7 | FilterOperator, 8 | HasherSHA256, 9 | MetadataFilter, 10 | MultipassDocument, 11 | PangeaGenericNodeProcessor, 12 | PangeaMetadataKeys, 13 | PangeaMetadataValues, 14 | PangeaNodeProcessorMixer, 15 | enrich_metadata, 16 | generate_id, 17 | get_document_metadata, 18 | ) 19 | from .dropbox_reader import DropboxReader 20 | from .github_reader import GitHubReader 21 | from .gitlab_reader import GitLabReader 22 | from .oauth import OauthFlow 23 | from .slack_reader import SlackReader 24 | from .sources import * 25 | from .utils import * 26 | -------------------------------------------------------------------------------- /examples/multipass_examples/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "multipass_examples" 3 | version = "0.1.0" 4 | description = "Pangea Multipass authorization 
library examples" 5 | authors = [ 6 | { name = "Apurv Jawle", email = "apurv.jawle@pangea.cloud" } 7 | ] 8 | license = "MIT" 9 | readme = "README.md" 10 | package-mode = false 11 | requires-python = ">=3.10,<3.13" 12 | dependencies = [ 13 | "pangea-multipass", 14 | "llama-index-readers-google==0.7.2", 15 | ] 16 | 17 | [dependency-groups] 18 | dev = [ 19 | "mypy==1.19.0", 20 | "types-requests==2.32.4.20250913", 21 | ] 22 | 23 | [tool.uv.sources] 24 | pangea-multipass = { path = "../../packages/pangea-multipass", editable = true } 25 | 26 | [tool.isort] 27 | profile = "black" 28 | line_length = 120 29 | 30 | [tool.mypy] 31 | plugins = ["pydantic.mypy"] 32 | -------------------------------------------------------------------------------- /packages/pangea-multipass/README.md: -------------------------------------------------------------------------------- 1 | # Pangea Multipass: Your Authorization Helper 2 | 3 | Pangea Multipass is a Python library for checking user access to upstream data sources. 4 | 5 | In practice, you can use it to check if a specific user has access to a file in a Google Drive, a ticket in Jira, or a page in Confluence. In concept, we've built this library to be extensible to eventually support Slack channels, GitHub repositories, Salesforce opportunities, and more. 6 | 7 | We originally built this to support our customers' Retrieval-Augmented Generation (RAG) applications to mitigate data leaks. In a RAG architecture, the application inserts additional context at inference time. If you don't check the user's authorization to that context, you could inadvertently leak sensitive information. 8 | 9 | While this is useful in AI/LLM apps, we've abstracted this to work independently so you can use it in any app. 10 | -------------------------------------------------------------------------------- /examples/multipass_examples/01-github-check-access.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | import os 5 | 6 | from pangea_multipass import GitHubProcessor, GitHubReader, PangeaMetadataKeys, get_document_metadata 7 | 8 | # Ingestion time 9 | admin_token = os.getenv("GITHUB_ADMIN_TOKEN") 10 | assert admin_token 11 | 12 | reader = GitHubReader(admin_token) 13 | documents = reader.load_data() 14 | print(f"Loaded {len(documents)} docs:") 15 | 16 | for doc in documents: 17 | print(doc.metadata.get(PangeaMetadataKeys.FILE_NAME), "") 18 | 19 | # Inference time 20 | username = os.getenv("GITHUB_USERNAME") 21 | assert username 22 | 23 | processor = GitHubProcessor(admin_token, get_document_metadata, username=username) 24 | authorized_docs = processor.filter(documents) 25 | 26 | print(f"\nAuthorized docs: {len(authorized_docs)}") 27 | for doc in authorized_docs: 28 | print(doc.metadata.get(PangeaMetadataKeys.FILE_NAME), "") 29 | -------------------------------------------------------------------------------- /.github/renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json", 3 | "extends": [ 4 | "config:best-practices", 5 | "local>pangeacyber/.github:renovate-config" 6 | ], 7 | "automerge": true, 8 | "automergeStrategy": "rebase", 9 | "ignorePaths": [], 10 | "packageRules": [ 11 | { 12 | "matchManagers": ["github-actions"], 13 | "extends": [ 14 | ":semanticPrefixChore", 15 | ":semanticCommitScope(ci)" 16 | ] 17 | }, 18 | { 19 | "matchFileNames": 
["examples/**"], 20 | "extends": [ 21 | ":semanticPrefixChore", 22 | ":semanticCommitScope(examples)" 23 | ], 24 | "additionalBranchPrefix": "{{parentDir}}/" 25 | }, 26 | { 27 | "matchFileNames": ["packages/**"], 28 | "additionalBranchPrefix": "{{parentDir}}/", 29 | "semanticCommitScope": "{{parentDir}}" 30 | }, 31 | { 32 | "matchDepNames": ["python"], 33 | "enabled": false 34 | } 35 | ] 36 | } 37 | -------------------------------------------------------------------------------- /dev/validate_tag.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | if [ $# -lt 1 ]; then 6 | echo "usage: validate_tag.sh " 7 | exit 1 8 | fi 9 | 10 | GIT_TAG=$1 11 | 12 | if [[ ! $GIT_TAG == *"/"* ]]; then 13 | echo "Git tag must contain a forward slash to delimit the package name from the version number." 14 | exit 1 15 | fi 16 | 17 | PACKAGE_NAME=$(echo "$GIT_TAG" | cut -d "/" -f 1) 18 | VERSION=$(echo "$GIT_TAG" | cut -d "/" -f 2) 19 | 20 | if [[ ! "$VERSION" == *"v"* ]]; then 21 | echo "Git tag must contain a version number that's prefixed with 'v'." 22 | exit 1 23 | fi 24 | 25 | # Move to repo root. 26 | PARENT_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd -P) 27 | pushd "$PARENT_PATH/.." 28 | 29 | PYPROJECT_VERSION=v$(poetry version --directory packages/"$PACKAGE_NAME" --dry-run --short) 30 | 31 | if [[ ! "$VERSION" == "$PYPROJECT_VERSION" ]]; then 32 | echo "Git tag version '$VERSION' does not match pyproject.toml version '$PYPROJECT_VERSION'." 33 | exit 1 34 | fi 35 | 36 | popd 37 | -------------------------------------------------------------------------------- /examples/langchain_examples/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "langchain_examples" 3 | version = "0.1.0" 4 | description = "Pangea Multipass authorization library for LangChain" 5 | authors = [ 6 | { name = "Apurv Jawle", email = "apurv.jawle@pangea.cloud" } 7 | ] 8 | license = "MIT" 9 | package-mode = false 10 | requires-python = ">=3.10,<3.13" 11 | dependencies = [ 12 | "pangea-multipass-langchain (>=0.2.0)", 13 | "pangea-multipass (>=0.2.0)", 14 | "google-api-python-client==2.187.0", 15 | "google-auth-httplib2 (>=0.2.1)", 16 | "langchain-google-community==2.0.10", 17 | "lxml==6.0.2", 18 | "faiss-cpu==1.13.1", 19 | "boto3==1.42.7", 20 | "langchain-aws==0.2.35", 21 | ] 22 | 23 | [dependency-groups] 24 | dev = [ 25 | "mypy==1.19.0", 26 | ] 27 | 28 | [tool.uv.sources] 29 | pangea-multipass = { path = "../../packages/pangea-multipass", editable = true } 30 | pangea-multipass-langchain = { path = "../../packages/pangea-multipass-langchain", editable = true } 31 | 32 | [tool.isort] 33 | profile = "black" 34 | line_length = 120 35 | 36 | [tool.mypy] 37 | plugins = ["pydantic.mypy"] 38 | -------------------------------------------------------------------------------- /examples/langchain_examples/05-github-check-access.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | import os 5 | 6 | from pangea_multipass import GitHubReader, PangeaMetadataKeys 7 | from pangea_multipass_langchain import LangChainGitHubFilter, from_multipass 8 | 9 | # Ingestion time 10 | admin_token = os.getenv("GITHUB_ADMIN_TOKEN") 11 | assert admin_token 12 | 13 | reader = GitHubReader(admin_token) 14 | mp_documents = reader.load_data() 15 | print(f"Loaded {len(mp_documents)} docs:") 16 | 
17 | # Convert documents to LangChain format 18 | documents = from_multipass(mp_documents) 19 | for doc in documents: 20 | print(doc.metadata.get(PangeaMetadataKeys.FILE_NAME), "") 21 | 22 | # Inference time 23 | username = os.getenv("GITHUB_USERNAME") 24 | assert username, "GITHUB_USERNAME is not set" 25 | 26 | processor = LangChainGitHubFilter(admin_token, username=username) 27 | authorized_docs = processor.filter(documents) 28 | 29 | print(f"\nAuthorized docs: {len(authorized_docs)}") 30 | for doc in authorized_docs: 31 | print(doc.metadata.get(PangeaMetadataKeys.FILE_NAME), "") 32 | -------------------------------------------------------------------------------- /examples/multipass_examples/03-slack-check-access.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | import logging 5 | import os 6 | 7 | from pangea_multipass import SlackReader 8 | from pangea_multipass.utils import set_logger_to_stdout 9 | 10 | set_logger_to_stdout("multipass", logging.INFO) 11 | 12 | admin_token = os.getenv("SLACK_ADMIN_TOKEN") 13 | assert admin_token 14 | 15 | reader = SlackReader(token=admin_token) 16 | documents = reader.load_data(max_messages_per_channel=1000) 17 | print(f"Loaded {len(documents)} messages.") 18 | 19 | # Inference time 20 | from pangea_multipass import SlackProcessor, get_document_metadata 21 | 22 | user_email = os.getenv("SLACK_USER_EMAIL") 23 | assert user_email 24 | 25 | processor = SlackProcessor(token=admin_token, get_node_metadata=get_document_metadata, user_email=user_email) 26 | filter = processor.get_filter() 27 | print("User has access to channel ids:") 28 | for id in filter.value: 29 | print(f"\t{id}") 30 | 31 | filtered_docs = processor.filter(nodes=documents) 32 | print(f"User has access to {len(filtered_docs)} messages") 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Pangea 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /examples/llama_index_examples/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "llama_index_examples" 3 | version = "0.2.0" 4 | description = "Pangea Multipass authorization library for Llama Index" 5 | authors = [ 6 | { name = "Apurv Jawle", email = "apurv.jawle@pangea.cloud" } 7 | ] 8 | license = "MIT" 9 | readme = "README.md" 10 | package-mode = false 11 | requires-python = ">=3.10,<3.13" 12 | dependencies = [ 13 | "pangea-multipass-llama-index (>=0.2.0)", 14 | "pangea-multipass (>=0.2.0)", 15 | "llama-index-llms-bedrock (==0.4.2)", 16 | "llama-index-embeddings-bedrock (==0.7.2)", 17 | "llama-index-readers-google (==0.7.2)", 18 | "llama-index-readers-confluence (==0.6.0)", 19 | "llama-index-readers-jira (==0.5.1)", 20 | ] 21 | 22 | [dependency-groups] 23 | dev = [ 24 | "mypy==1.19.0", 25 | ] 26 | 27 | [tool.uv.sources] 28 | pangea-multipass = { path = "../../packages/pangea-multipass", editable = true } 29 | pangea-multipass-llama-index = { path = "../../packages/pangea-multipass-llama-index", editable = true } 30 | 31 | [tool.mypy] 32 | plugins = ["pydantic.mypy"] 33 | 34 | [tool.isort] 35 | profile = "black" 36 | line_length = 120 37 | -------------------------------------------------------------------------------- /examples/llama_index_examples/05-github-check-access.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | import os 5 | 6 | from pangea_multipass import GitHubReader, PangeaMetadataKeys 7 | from pangea_multipass_llama_index import LlamaIndexGitHubProcessor, from_multipass 8 | 9 | # Ingestion time 10 | admin_token = os.getenv("GITHUB_ADMIN_TOKEN") 11 | assert admin_token 12 | 13 | reader = GitHubReader(admin_token) 14 | print("Loading data...") 15 | documents = reader.load_data() 16 | print(f"Loaded {len(documents)} docs:") 17 | 18 | # Convert documents to Llama Index format 19 | documents = from_multipass(documents) # type: ignore 20 | for doc in documents: 21 | print(doc.metadata.get(PangeaMetadataKeys.FILE_NAME), "") 22 | 23 | # Inference time 24 | 25 | 26 | username = os.getenv("GITHUB_USERNAME") 27 | assert username, "GITHUB_USERNAME is not set" 28 | 29 | processor = LlamaIndexGitHubProcessor(admin_token, username=username) 30 | authorized_docs = processor.filter(documents) # type: ignore 31 | 32 | print(f"\nAuthorized docs: {len(authorized_docs)}") 33 | for d in authorized_docs: 34 | print(d.metadata.get(PangeaMetadataKeys.FILE_NAME), "") 35 | -------------------------------------------------------------------------------- /packages/pangea-multipass-langchain/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | ## Unreleased 9 | 10 | ### Added 11 | 12 | - GitLabReader and GitLabProcessor 13 | - Dropbox processor 14 | 15 | ### Fixed 16 | 17 | - `pydantic` error of `node_processor` default value. 18 | 19 | ## 0.2.0 - 2025-01-15 20 | 21 | ### Added 22 | 23 | - GitHub repository's reader and processor. 24 | - Slack channel's reader and processor. 
25 | - `account_id` support on JiraProcessor. 26 | - Check user email permissions with admin credentials in GDriveProcessor. 27 | - Check username permissions with admin token in GitHubProcessor. 28 | - Check user email permissions with admin token in SlackProcessor. 29 | - `py.typed` marker file. 30 | - Check user account id permissions with admin token in JiraProcessor. 31 | - Check user account id permission with admin token in ConfluenceProcessor. 32 | 33 | ## 0.1.0 - 2024-12-24 34 | 35 | ### Added 36 | 37 | - Metadata enricher and processor for Google Drive, Jira and Confluence data sources. 38 | -------------------------------------------------------------------------------- /packages/pangea-multipass-langchain/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "pangea-multipass-langchain" 3 | version = "0.2.0" 4 | description = "Pangea Multipass authorization library for LangChain" 5 | authors = [ 6 | { name = "Apurv Jawle", email = "apurv.jawle@pangea.cloud" } 7 | ] 8 | license = "MIT" 9 | readme = "README.md" 10 | requires-python = ">=3.10,<4.0" 11 | dependencies = [ 12 | "langchain (==0.3.27)", 13 | "pangea-multipass (>=0.2.0)", 14 | ] 15 | 16 | [dependency-groups] 17 | dev = [ 18 | "mypy==1.19.0", 19 | ] 20 | 21 | [tool.uv.sources] 22 | pangea-multipass = { path = "../pangea-multipass", editable = true } 23 | 24 | [build-system] 25 | requires = ["hatchling==1.28.0"] 26 | build-backend = "hatchling.build" 27 | 28 | [tool.black] 29 | line-length = 120 30 | 31 | [tool.isort] 32 | profile = "black" 33 | line_length = 120 34 | src_paths = ["pangea_multipass_langchain", "tests"] 35 | known_local_folder = ["pangea_multipass_langchain", "tests"] 36 | 37 | [tool.mypy] 38 | python_version = "3.10" 39 | color_output = true 40 | error_summary = true 41 | pretty = true 42 | show_column_numbers = true 43 | warn_unused_ignores = true 44 | 45 | [[tool.mypy.overrides]] 46 | module = ["pangea_multipass.*"] 47 | follow_untyped_imports = true 48 | -------------------------------------------------------------------------------- /packages/pangea-multipass-llama-index/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | ## Unreleased 9 | 10 | ### Added 11 | 12 | - GitLabReader and GitLabProcessor 13 | - Dropbox processor 14 | 15 | ### Fixed 16 | 17 | - `pydantic` error of `node_processor` default value. 18 | 19 | ## 0.2.0 - 2025-01-15 20 | 21 | ### Added 22 | 23 | - GitHub repository's reader and processor. 24 | - Slack channel's reader and processor. 25 | - `account_id` support on JiraProcessor. 26 | - Check user email permissions with admin credentials in GDriveProcessor. 27 | - Check username permissions with admin token in GitHubProcessor. 28 | - Check user email permissions with admin token in SlackProcessor. 29 | - `py.typed` marker file. 30 | - Check user account id permissions with admin token in JiraProcessor. 31 | - Check user account id permission with admin token in ConfluenceProcessor. 32 | 33 | ## 0.1.0 - 2024-12-24 34 | 35 | ### Added 36 | 37 | - Metadata enricher and processor for Google Drive, Jira and Confluence data sources. 
38 | -------------------------------------------------------------------------------- /packages/pangea-multipass-llama-index/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "pangea-multipass-llama-index" 3 | version = "0.2.0" 4 | description = "Pangea Multipass authorization library for Llama Index" 5 | authors = [ 6 | { name = "Apurv Jawle", email = "apurv.jawle@pangea.cloud" } 7 | ] 8 | license = "MIT" 9 | readme = "README.md" 10 | requires-python = ">=3.10,<4.0" 11 | dependencies = [ 12 | "llama-index (>=0.14.10)", 13 | "pangea-multipass (>=0.2.0)", 14 | ] 15 | 16 | [dependency-groups] 17 | dev = [ 18 | "mypy==1.19.0", 19 | ] 20 | 21 | [tool.uv.sources] 22 | pangea-multipass = { path = "../pangea-multipass", editable = true } 23 | 24 | [build-system] 25 | requires = ["hatchling==1.28.0"] 26 | build-backend = "hatchling.build" 27 | 28 | [tool.black] 29 | line-length = 120 30 | 31 | [tool.isort] 32 | profile = "black" 33 | line_length = 120 34 | src_paths = ["pangea_multipass_llama_index", "tests"] 35 | known_local_folder = ["pangea_multipass_llama_index", "tests"] 36 | 37 | [tool.mypy] 38 | python_version = "3.10" 39 | color_output = true 40 | error_summary = true 41 | pretty = true 42 | show_column_numbers = true 43 | warn_unused_ignores = true 44 | 45 | [[tool.mypy.overrides]] 46 | module = ["pangea_multipass.*"] 47 | follow_untyped_imports = true 48 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "pangea-multipass" 3 | version = "0.2.0" 4 | description = "Pangea Multipass authorization library" 5 | authors = [ 6 | { name = "Apurv Jawle", email = "apurv.jawle@pangea.cloud" } 7 | ] 8 | license = "MIT" 9 | readme = "README.md" 10 | requires-python = ">=3.10,<4.0" 11 | dependencies = [ 12 | "google-auth-oauthlib (>=1.2.2)", 13 | "google-auth-httplib2 (>=0.2.1)", 14 | "google-api-python-client (>=2.187.0)", 15 | "google-auth (>=2.43.0)", 16 | "openpyxl (>=3.1.5)", 17 | "slack-sdk (>=3.39.0)", 18 | ] 19 | 20 | [dependency-groups] 21 | dev = [ 22 | "mypy==1.19.0", 23 | "types-requests==2.32.4.20250913", 24 | ] 25 | 26 | [build-system] 27 | requires = ["hatchling==1.28.0"] 28 | build-backend = "hatchling.build" 29 | 30 | [tool.black] 31 | line-length = 120 32 | 33 | [tool.isort] 34 | profile = "black" 35 | line_length = 120 36 | src_paths = ["pangea_multipass", "tests"] 37 | known_local_folder = ["pangea_multipass", "tests"] 38 | 39 | [tool.mypy] 40 | python_version = "3.10" 41 | color_output = true 42 | error_summary = true 43 | pretty = true 44 | show_column_numbers = true 45 | warn_unused_ignores = true 46 | 47 | [[tool.mypy.overrides]] 48 | module = ["google_auth_oauthlib.flow.*", "googleapiclient.discovery.*"] 49 | follow_untyped_imports = true 50 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v6.0.0 4 | hooks: 5 | - id: check-json 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | exclude: .md 9 | - id: check-merge-conflict 10 | - id: debug-statements 11 | - id: detect-aws-credentials 12 | args: 13 | - --allow-missing-credentials 14 | - id: check-executables-have-shebangs 15 | - id: 
check-shebang-scripts-are-executable 16 | - id: no-commit-to-branch 17 | args: 18 | - --branch 19 | - main 20 | - repo: https://github.com/pycqa/isort 21 | rev: 7.0.0 22 | hooks: 23 | - id: isort 24 | args: 25 | - --profile=black 26 | - --line-length=120 27 | - --resolve-all-configs 28 | - repo: https://github.com/psf/black 29 | rev: 25.12.0 30 | hooks: 31 | - id: black 32 | args: 33 | - --line-length=120 34 | - repo: https://github.com/pre-commit/mirrors-mypy 35 | rev: v1.19.0 36 | hooks: 37 | - id: mypy 38 | args: 39 | - --ignore-missing-imports 40 | # - --strict 41 | - --implicit-reexport 42 | additional_dependencies: 43 | - types-Deprecated==1.2.9.3 44 | - types-python-dateutil==2.8.19.14 45 | - types-requests==2.31.0.10 46 | -------------------------------------------------------------------------------- /packages/pangea-multipass/tests/integration/test_github.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from pangea_multipass import GitHubProcessor, GitHubReader, get_document_metadata 5 | 6 | token = os.getenv("GITHUB_ADMIN_TOKEN") or "" 7 | username = os.getenv("GITHUB_USERNAME") or "" 8 | 9 | _TOTAL_FILES = 8 10 | _AUTHORIZED_FILES = 5 11 | _AUTHORIZED_PROJECTS = 2 12 | 13 | 14 | class TestGitHub(unittest.TestCase): 15 | def setUp(self) -> None: 16 | assert token 17 | assert username 18 | 19 | def test_github(self) -> None: 20 | reader = GitHubReader(token=token) 21 | files = reader.load_data() 22 | assert len(files) == _TOTAL_FILES 23 | 24 | processor = GitHubProcessor(token=token, username=username, get_node_metadata=get_document_metadata) 25 | filter = processor.get_filter() 26 | assert len(filter.value) == _AUTHORIZED_PROJECTS 27 | 28 | authorized_files = processor.filter(files) 29 | assert len(authorized_files) == _AUTHORIZED_FILES 30 | 31 | def test_github_pagination(self) -> None: 32 | reader = GitHubReader(token=token) 33 | 34 | repos = reader.get_repos() 35 | assert len(repos) == 3 36 | 37 | all_files = [] 38 | 39 | for repo in repos: 40 | has_more_files = True 41 | while has_more_files: 42 | files = reader.read_repo_files(repo, page_size=1) 43 | all_files.extend(files) 44 | has_more_files = reader.has_more_files 45 | assert len(files) == 1 46 | 47 | assert len(all_files) == _TOTAL_FILES 48 | -------------------------------------------------------------------------------- /packages/pangea-multipass/tests/integration/test_gitlab.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from pangea_multipass import GitLabProcessor, GitLabReader, get_document_metadata 5 | 6 | token = os.getenv("GITLAB_ADMIN_TOKEN") or "" 7 | username = os.getenv("GITLAB_USERNAME") or "" 8 | 9 | _TOTAL_FILES = 8 10 | _AUTHORIZED_FILES = 5 11 | _AUTHORIZED_PROJECTS = 2 12 | 13 | 14 | class TestGitLab(unittest.TestCase): 15 | def setUp(self) -> None: 16 | assert token 17 | assert username 18 | 19 | def test_gitlab(self) -> None: 20 | reader = GitLabReader(token=token) 21 | files = reader.load_data() 22 | assert len(files) == _TOTAL_FILES 23 | 24 | processor = GitLabProcessor(admin_token=token, username=username, get_node_metadata=get_document_metadata) 25 | filter = processor.get_filter() 26 | assert len(filter.value) == _AUTHORIZED_PROJECTS 27 | 28 | authorized_files = processor.filter(files) 29 | assert len(authorized_files) == _AUTHORIZED_FILES 30 | 31 | def test_gitlab_pagination(self) -> None: 32 | reader = GitLabReader(token=token) 33 | 34 | 
repos = reader.get_repos() 35 | assert len(repos) == 3 36 | 37 | all_files = [] 38 | 39 | for repo in repos: 40 | has_more_files = True 41 | while has_more_files: 42 | files = reader.read_repo_files(repo, page_size=1) 43 | all_files.extend(files) 44 | has_more_files = reader.has_more_files 45 | assert len(files) == 1 46 | 47 | assert len(all_files) == _TOTAL_FILES 48 | -------------------------------------------------------------------------------- /packages/pangea-multipass/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | ## Unreleased 9 | 10 | ### Added 11 | 12 | - GitLabReader and GitLabProcessor 13 | - Dropbox reader and processor 14 | - Pagination support on GitHubReader 15 | - Pagination support on SlackReader 16 | - Logger to `GitLabReader`, `GitHubReader`, `SlackReader`, `GitLabClient`, `GitHubClient` and `SlackClient`. 17 | 18 | ### Fixed 19 | 20 | - Handle null fields on issues in JiraME 21 | - Handle trailing slash in Jira URL 22 | - GitLabProcessor `get_filter()` 23 | 24 | ### Changed 25 | 26 | - Rename `GitLabAPI` to `GitLabClient` 27 | - Rename `GitHubAPI` to `GitHubClient` 28 | - Rename `SlackAPI` to `SlackClient` 29 | 30 | 31 | ## 0.2.0 - 2025-01-15 32 | 33 | ### Added 34 | 35 | - GitHub repository's reader and processor. 36 | - Slack channel's reader and processor. 37 | - `account_id` support on JiraProcessor. 38 | - Check user email permissions with admin credentials in GDriveProcessor. 39 | - Check username permissions with admin token in GitHubProcessor. 40 | - Check user email permissions with admin token in SlackProcessor. 41 | - `py.typed` marker file. 42 | - Check user account id permissions with admin token in JiraProcessor. 43 | - Check user account id permission with admin token in ConfluenceProcessor. 44 | 45 | ## 0.1.0 - 2024-12-24 46 | 47 | ### Added 48 | 49 | - Metadata enricher and processor for Google Drive, Jira and Confluence data sources. 50 | -------------------------------------------------------------------------------- /packages/pangea-multipass-llama-index/README.md: -------------------------------------------------------------------------------- 1 | # Pangea Multipass for Llama Index 2 | 3 | This library extends the Pangea Multipass package to integrate metadata enrichment and document processing with Llama Index. It enables seamless use of authorization checks, metadata filtering, and custom processors on documents from Google Drive, JIRA, and Confluence, utilizing Llama Index structures for Retrieval-Augmented Generation (RAG) applications. 4 | 5 | ## Features 6 | 7 | - **Document Integration**: Adapts Pangea processors and enrichers to handle Llama Index documents. 8 | - **Llama Index-Compatible Filtering**: Provides metadata filtering with operators for fine-grained document access control. 9 | - **Authorization Processing**: Aggregates and applies multiple authorization checks on Llama Index nodes with custom, combinable node processors, as shown in the sketch below. 
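For example, here is a minimal end-to-end sketch of this flow for a GitHub source, closely following `examples/llama_index_examples/05-github-check-access.py` in this repository (it assumes a GitHub admin token and a target username are available in the environment):

```python
import os

from pangea_multipass import GitHubReader
from pangea_multipass_llama_index import LlamaIndexGitHubProcessor, from_multipass

admin_token = os.environ["GITHUB_ADMIN_TOKEN"]

# Ingestion time: load documents with admin credentials and convert them
# to Llama Index documents.
documents = from_multipass(GitHubReader(admin_token).load_data())

# Inference time: keep only the documents this user is authorized to see.
processor = LlamaIndexGitHubProcessor(admin_token, username=os.environ["GITHUB_USERNAME"])
authorized_docs = processor.filter(documents)
```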
10 | 11 | ## Installation 12 | 13 | Use [Poetry](https://python-poetry.org/) to install dependencies: 14 | 15 | ```bash 16 | poetry add pangea-multipass-llama-index 17 | ``` 18 | 19 | ## Usage 20 | ### Core Components 21 | - Document Reader: LIDocumentReader reads content from Llama Index documents for enrichment. 22 | - Processors for Llama Index: 23 | - LlamaIndexJiraProcessor — Handles JIRA documents within Llama Index. 24 | - LlamaIndexConfluenceProcessor — Processes Confluence documents in Llama Index. 25 | - LlamaIndexGDriveProcessor — Manages Google Drive documents in Llama Index. 26 | - Node Postprocessor Mixer: Combines multiple processors for complex, multi-source document filtering. 27 | - Metadata Filters: Filter documents based on metadata using operators like EQ, CONTAINS, and custom metadata keys. 28 | 29 | ## License 30 | This project is licensed under the MIT License. 31 | -------------------------------------------------------------------------------- /packages/pangea-multipass/tests/integration/test_slack.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from pangea_multipass import SlackProcessor, SlackReader, get_document_metadata 5 | 6 | token = os.getenv("SLACK_ADMIN_TOKEN") or "" 7 | user_email = os.getenv("SLACK_USER_EMAIL") or "" 8 | 9 | _TOTAL_FILES = 12 10 | _AUTHORIZED_FILES = 7 11 | _TOTAL_CHANNELS = 4 12 | _AUTHORIZED_CHANNELS = 3 13 | 14 | 15 | class TestSlack(unittest.TestCase): 16 | def setUp(self) -> None: 17 | assert token 18 | assert user_email 19 | 20 | def test_slack(self) -> None: 21 | reader = SlackReader(token=token) 22 | documents = reader.load_data() 23 | assert len(documents) == _TOTAL_FILES 24 | 25 | processor = SlackProcessor(token=token, get_node_metadata=get_document_metadata, user_email=user_email) 26 | filter = processor.get_filter() 27 | assert len(filter.value) == _AUTHORIZED_CHANNELS 28 | 29 | filtered_docs = processor.filter(nodes=documents) 30 | assert len(filtered_docs) == _AUTHORIZED_FILES 31 | 32 | def test_slack_pagination(self) -> None: 33 | reader = SlackReader(token=token) 34 | 35 | channels = reader.get_channels() 36 | assert len(channels) == _TOTAL_CHANNELS 37 | 38 | documents = [] 39 | 40 | for channel in channels: 41 | has_more_messages = True 42 | while has_more_messages: 43 | docs = reader.read_messages(channel=channel, page_size=1) 44 | assert len(docs) <= 1 # Some messages are filtered out so we can't guarantee the exact number 45 | documents.extend(docs) 46 | has_more_messages = reader.has_more_messages 47 | 48 | assert len(documents) == _TOTAL_FILES 49 | -------------------------------------------------------------------------------- /examples/multipass_examples/04-dropbox-check-access.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pangea_multipass import DropboxClient, DropboxReader, OauthFlow, data_load, data_save 4 | from requests.exceptions import HTTPError 5 | 6 | app_key = os.getenv("DROPBOX_APP_KEY") 7 | assert app_key 8 | 9 | # File to store tokens 10 | DROPBOX_TOKEN_FILE = "dropbox_tokens.json" 11 | 12 | if not os.path.exists(DROPBOX_TOKEN_FILE): 13 | code_verifier, code_challenge = OauthFlow.generate_pkce_pair() 14 | 15 | flow = OauthFlow( 16 | auth_url=DropboxClient.AUTH_URL, 17 | token_url=DropboxClient.TOKEN_URL, 18 | client_id=app_key, 19 | ) 20 | tokens = flow.run_pkce(code_verifier=code_verifier, code_challenge=code_challenge) 21 | else: 22 | tokens = 
data_load(DROPBOX_TOKEN_FILE) 23 | assert tokens 24 | access_token = OauthFlow.refresh_access_token( 25 | url=DropboxClient.TOKEN_URL, refresh_token=tokens["refresh_token"], client_id=app_key 26 | ) 27 | tokens.update(access_token) 28 | 29 | data_save(DROPBOX_TOKEN_FILE, tokens) 30 | access_token = tokens["access_token"] 31 | reader = DropboxReader(access_token) 32 | documents = [] 33 | 34 | print("Loading documents from Dropbox...") 35 | try: 36 | documents = reader.load_data() 37 | 38 | except HTTPError as e: 39 | if e.response: 40 | print(e.response.text) 41 | else: 42 | print(e) 43 | 44 | print(f"Loaded {len(documents)} docs") 45 | 46 | 47 | # Inference time 48 | from pangea_multipass import DropboxProcessor, get_document_metadata 49 | 50 | user_email = os.getenv("DROPBOX_USER_EMAIL") 51 | assert user_email 52 | 53 | processor = DropboxProcessor(access_token, user_email=user_email, get_node_metadata=get_document_metadata) 54 | print("Filtering authorized documents...") 55 | authorized_docs = processor.filter(documents) 56 | 57 | print(f"Authorized docs: {len(authorized_docs)}") 58 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | from logging.handlers import TimedRotatingFileHandler 5 | from typing import Dict, Optional 6 | 7 | 8 | def data_load(filename: str) -> Optional[dict]: 9 | if not os.path.exists(filename): 10 | return None 11 | 12 | with open(filename, "r") as f: 13 | return json.load(f) 14 | 15 | 16 | def data_save(filename: str, data: dict): 17 | with open(filename, "w") as f: 18 | json.dump(data, f) 19 | 20 | 21 | _loggers: Dict[str, bool] = {} 22 | 23 | 24 | def set_logger_to_json(logger_name: str, level=logging.DEBUG): 25 | if _loggers.get(logger_name) is not None: 26 | return 27 | 28 | _loggers[logger_name] = True 29 | logger = logging.getLogger(logger_name) 30 | logger.setLevel(level) 31 | handler = TimedRotatingFileHandler( 32 | filename="multipass_logs.json", when="D", interval=1, backupCount=90, encoding="utf-8", delay=False 33 | ) 34 | handler.setLevel(level) 35 | formatter = logging.Formatter( 36 | fmt='{"time": "%(asctime)s.%(msecs)03d", "name": "%(name)s", "level": "%(levelname)s", "message": %(message)s },', 37 | datefmt="%Y-%m-%d %H:%M:%S", 38 | ) 39 | handler.setFormatter(formatter) 40 | logger.addHandler(handler) 41 | 42 | 43 | def set_logger_to_stdout(logger_name: str, level=logging.DEBUG): 44 | if _loggers.get(logger_name) is not None: 45 | return 46 | 47 | _loggers[logger_name] = True 48 | logger = logging.getLogger(logger_name) 49 | logger.setLevel(level) 50 | handler = logging.StreamHandler() 51 | handler.setLevel(level) 52 | formatter = logging.Formatter( 53 | fmt="[%(asctime)s.%(msecs)03d %(name)s %(levelname)s]: %(message)s.", 54 | datefmt="%Y-%m-%d %H:%M:%S", 55 | ) 56 | handler.setFormatter(formatter) 57 | logger.addHandler(handler) 58 | -------------------------------------------------------------------------------- /packages/pangea-multipass/tests/integration/test_dropbox.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import unittest 4 | 5 | from pangea_multipass import DropboxClient, DropboxProcessor, DropboxReader, OauthFlow, get_document_metadata 6 | 7 | _TOTAL_FILES = 10 8 | _AUTHORIZED_FILES = 5 9 | _AUTHORIZED_FOLDERS = 3 10 | 11 | 12 | class 
TestDropbox(unittest.TestCase): 13 | def setUp(self) -> None: 14 | refresh_token = os.getenv("DROPBOX_REFRESH_TOKEN") or "" 15 | assert refresh_token 16 | app_key = os.getenv("DROPBOX_APP_KEY") or "" 17 | assert app_key 18 | self.user_email = os.getenv("DROPBOX_USER_EMAIL") or "" 19 | assert self.user_email 20 | token_data = OauthFlow.refresh_access_token( 21 | url=DropboxClient.TOKEN_URL, refresh_token=refresh_token, client_id=app_key 22 | ) 23 | self.access_token = token_data["access_token"] 24 | assert self.access_token 25 | 26 | def test_dropbox(self) -> None: 27 | reader = DropboxReader(token=self.access_token) 28 | files = reader.load_data() 29 | assert len(files) == _TOTAL_FILES 30 | 31 | processor = DropboxProcessor( 32 | token=self.access_token, user_email=self.user_email, get_node_metadata=get_document_metadata 33 | ) 34 | filter = processor.get_filter() 35 | assert len(filter.value) == _AUTHORIZED_FOLDERS 36 | 37 | authorized_files = processor.filter(files) 38 | assert len(authorized_files) == _AUTHORIZED_FILES 39 | 40 | def test_dropbox_pagination(self) -> None: 41 | reader = DropboxReader(token=self.access_token) 42 | has_more_files = True 43 | all_files = [] 44 | 45 | while has_more_files: 46 | files = reader.read_page(page_size=1) 47 | all_files.extend(files) 48 | has_more_files = reader.has_more_files 49 | 50 | assert len(all_files) == _TOTAL_FILES 51 | -------------------------------------------------------------------------------- /examples/llama_index_examples/08-confluence-check-access.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | import os 5 | from typing import List 6 | 7 | from llama_index.readers.confluence import ConfluenceReader 8 | from pangea_multipass import ConfluenceAuth, ConfluenceME, PangeaMetadataKeys, enrich_metadata 9 | from pangea_multipass_llama_index import LIDocument, LIDocumentReader 10 | 11 | # Fetch documents from Confluence 12 | confluence_space_key = "~71202041f9bfec117041348629ccf3e3c751b3" 13 | confluence_space_id = 393230 14 | 15 | admin_token = os.getenv("CONFLUENCE_ADMIN_TOKEN") 16 | assert admin_token 17 | admin_email = os.getenv("CONFLUENCE_ADMIN_EMAIL") 18 | assert admin_email 19 | url = os.getenv("CONFLUENCE_BASE_URL") 20 | assert url 21 | 22 | 23 | def confluence_read_docs() -> List[LIDocument]: 24 | """Fetch all documents from Confluence using ConfluenceReader.""" 25 | 26 | # Create a ConfluenceReader instance 27 | print("Loading Confluence docs...") 28 | reader = ConfluenceReader( 29 | base_url=url, 30 | user_name=admin_email, 31 | password=admin_token, 32 | ) 33 | documents: List[LIDocument] = reader.load_data(space_key=confluence_space_key, include_attachments=True) 34 | 35 | # Enrich metadata process 36 | print(f"Processing {len(documents)} Confluence docs...") 37 | confluence_me = ConfluenceME() 38 | enrich_metadata(documents, [confluence_me], reader=LIDocumentReader()) 39 | 40 | return documents 41 | 42 | 43 | documents = confluence_read_docs() 44 | print(f"Loaded {len(documents)} pages.") 45 | 46 | # Inference 47 | from pangea_multipass_llama_index import LlamaIndexConfluenceProcessor 48 | 49 | admin_token = os.getenv("CONFLUENCE_ADMIN_TOKEN") 50 | assert admin_token 51 | admin_email = os.getenv("CONFLUENCE_ADMIN_EMAIL") 52 | assert admin_email 53 | url = os.getenv("CONFLUENCE_BASE_URL") 54 | assert url 55 | account_id = os.getenv("CONFLUENCE_USER_ACCOUNT_ID") 56 | assert account_id 57 | 58 | # 
Create Confluence filter with admin token 59 | confluence_processor = LlamaIndexConfluenceProcessor( 60 | ConfluenceAuth(admin_email, admin_token, url), account_id=account_id 61 | ) 62 | 63 | authorized_docs = confluence_processor.filter(documents) # type: ignore 64 | 65 | print(f"\nAuthorized pages: {len(authorized_docs)}") 66 | for doc in authorized_docs: 67 | print(f"\t{doc.metadata.get(PangeaMetadataKeys.CONFLUENCE_PAGE_ID, '')}") 68 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | pull_request: 9 | types: 10 | - opened 11 | - synchronize 12 | - reopened 13 | - ready_for_review 14 | 15 | merge_group: 16 | 17 | workflow_dispatch: 18 | 19 | permissions: 20 | contents: read 21 | 22 | jobs: 23 | build: 24 | runs-on: ubuntu-24.04 25 | strategy: 26 | matrix: 27 | package: [pangea-multipass, pangea-multipass-langchain, pangea-multipass-llama-index] 28 | defaults: 29 | run: 30 | working-directory: ./packages/${{ matrix.package }} 31 | steps: 32 | - name: Checkout code 33 | uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 34 | 35 | - name: Install uv 36 | uses: astral-sh/setup-uv@ed21f2f24f8dd64503750218de024bcf64c7250a # v7.1.5 37 | with: 38 | enable-cache: true 39 | 40 | - name: Setup Python 41 | uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 42 | with: 43 | python-version: 3.12 44 | 45 | - name: Install dependencies 46 | run: uv sync --all-extras --dev 47 | 48 | - name: Build 49 | run: uv build 50 | 51 | mypy: 52 | runs-on: ubuntu-24.04 53 | strategy: 54 | fail-fast: false 55 | matrix: 56 | path: 57 | - examples/langchain_examples 58 | - examples/llama_index_examples 59 | - examples/multipass_examples 60 | - packages/pangea-multipass 61 | - packages/pangea-multipass-langchain 62 | - packages/pangea-multipass-llama-index 63 | defaults: 64 | run: 65 | working-directory: ${{ matrix.path }} 66 | steps: 67 | - name: Checkout code 68 | uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 69 | 70 | - name: Install cairo 71 | run: sudo apt-get install -y libcairo2-dev 72 | 73 | - name: Install uv 74 | uses: astral-sh/setup-uv@ed21f2f24f8dd64503750218de024bcf64c7250a # v7.1.5 75 | with: 76 | enable-cache: true 77 | 78 | - name: Setup Python 79 | uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 80 | with: 81 | python-version: 3.12 82 | 83 | - name: Install dependencies 84 | run: uv sync --all-extras --dev 85 | 86 | - name: mypy 87 | run: uv run mypy . 
--ignore-missing-imports --implicit-reexport 88 | -------------------------------------------------------------------------------- /examples/llama_index_examples/07-jira-check-acess.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | import os 5 | import warnings 6 | from typing import List 7 | 8 | from llama_index.core import Document 9 | from llama_index.readers.jira import JiraReader 10 | from pangea_multipass import JiraAuth, JiraME, PangeaMetadataKeys, enrich_metadata 11 | from pangea_multipass_llama_index import LIDocument, LIDocumentReader 12 | 13 | # Suppress specific warning 14 | warnings.filterwarnings("ignore", message='Field "model_name" has conflict with protected namespace') 15 | 16 | 17 | def jira_load_data(reader: JiraReader, query: str = "") -> List[Document]: 18 | max_results = 100 19 | start_at = 0 20 | keep_iterating = True 21 | all_documents: List[Document] = [] 22 | 23 | while keep_iterating: 24 | documents = reader.load_data(query, start_at=start_at, max_results=max_results) 25 | all_documents.extend(documents) 26 | l = len(documents) 27 | start_at = start_at + l 28 | keep_iterating = l >= max_results 29 | 30 | return all_documents 31 | 32 | 33 | def jira_read_docs() -> List[LIDocument]: 34 | # Jira credentials and base URL 35 | JIRA_BASE_URL = os.getenv("JIRA_BASE_URL") or "" 36 | assert JIRA_BASE_URL 37 | jira_admin_email = os.getenv("JIRA_ADMIN_EMAIL") or "" 38 | assert jira_admin_email 39 | jira_api_token = os.getenv("JIRA_ADMIN_TOKEN") or "" 40 | assert jira_api_token 41 | 42 | # Initialize LlamaIndex JiraReader 43 | print("Loading Jira docs...") 44 | jira_reader = JiraReader(server_url=JIRA_BASE_URL, email=jira_admin_email, api_token=jira_api_token) 45 | 46 | documents = jira_load_data(jira_reader, "") 47 | 48 | # Metadata enricher library 49 | print(f"Processing {len(documents)} Jira docs...") 50 | jira_me = JiraME(JIRA_BASE_URL, jira_admin_email, jira_api_token) 51 | enrich_metadata(documents, [jira_me], reader=LIDocumentReader()) 52 | 53 | return documents 54 | 55 | 56 | documents = jira_read_docs() 57 | 58 | # Inference 59 | from pangea_multipass_llama_index import LlamaIndexJiraProcessor 60 | 61 | # Create JIRA filter 62 | jira_user_token = os.getenv("JIRA_ADMIN_TOKEN") 63 | assert jira_user_token 64 | jira_user_email = os.getenv("JIRA_ADMIN_EMAIL") 65 | assert jira_user_email 66 | jira_url = os.getenv("JIRA_BASE_URL") 67 | assert jira_url 68 | jira_account_id = os.getenv("JIRA_USER_ACCOUNT_ID") 69 | assert jira_account_id 70 | 71 | jira_processor = LlamaIndexJiraProcessor( 72 | JiraAuth(jira_user_email, jira_user_token, jira_url), account_id=jira_account_id 73 | ) 74 | authorized_docs = jira_processor.filter(documents) # type: ignore 75 | 76 | print(f"\nAuthorized issues: {len(authorized_docs)}") 77 | for doc in authorized_docs: 78 | print(f"\t{doc.metadata.get(PangeaMetadataKeys.JIRA_ISSUE_ID, '')}") 79 | -------------------------------------------------------------------------------- /packages/pangea-multipass-langchain/README.md: -------------------------------------------------------------------------------- 1 | # Pangea Multipass for LangChain 2 | 3 | The `pangea-multipass-langchain` package extends Pangea Multipass to integrate with LangChain's document processing, providing enhanced security, metadata filtering, and access control for LangChain documents. 
This package supports integrations with Google Drive, JIRA, and Confluence, leveraging metadata-based filtering and authorization to control document access. 4 | 5 | ## Features 6 | 7 | - **Document Reader**: Custom `LangChainDocumentReader` class reads content from LangChain documents, adapting to Pangea's document model. 8 | - **Integration Processors**: 9 | - `LangChainJiraFilter`: Allows JIRA integration, authenticating and processing JIRA documents in LangChain. 10 | - `LangChainConfluenceFilter`: Provides Confluence integration for document access control in LangChain. 11 | - `LangChainGDriveFilter`: Uses Google OAuth credentials to access and filter Google Drive documents in LangChain. 12 | - **Document Filter Mixer**: The `DocumentFilterMixer` aggregates multiple processors, applying customized filters for advanced access control across various sources. 13 | 14 | ## Installation 15 | 16 | Use [Poetry](https://python-poetry.org/) to install dependencies: 17 | 18 | ```bash 19 | poetry add pangea-multipass-langchain 20 | ``` 21 | 22 | ## Usage 23 | ### Core Components 24 | 25 | - LangChainDocumentReader: The LangChainDocumentReader class enables reading content from LangChain documents for authorization and metadata filtering. This class acts as a bridge between LangChain documents and Pangea's authorization model. 26 | - Processors for LangChain Integration: The package includes processors that integrate with specific data sources using authentication credentials. Each processor retrieves metadata from documents, allowing fine-grained control over document access: 27 | - LangChainJiraFilter: Authenticates with JIRA and processes JIRA documents. 28 | - LangChainConfluenceFilter: Processes Confluence documents, applying access control. 29 | - LangChainGDriveFilter: Integrates Google Drive documents into LangChain using OAuth2 credentials. 30 | - DocumentFilterMixer: The DocumentFilterMixer aggregates multiple document processors, applying filters to handle complex document access control. It retrieves authorized and unauthorized documents based on the combined filters from each processor. 31 | - Filter Documents: filter() applies filters to a list of LangChain documents. 32 | - Retrieve Unauthorized Documents: get_unauthorized_documents() retrieves documents that fail authorization checks. 33 | - Retrieve Authorized Documents: get_authorized_documents() provides access to documents meeting authorization criteria. 34 | - Metadata Filtering: The package includes metadata-based filtering, allowing users to apply filters with operators like EQ, GT, LT, CONTAINS, and more. Each filter can be customized to match document metadata for precise access control. 35 | 36 | ## License 37 | This project is licensed under the MIT License. 38 | -------------------------------------------------------------------------------- /examples/multipass_examples/README.md: -------------------------------------------------------------------------------- 1 | # Pangea Multipass Examples 2 | 3 | ## Setting up your Data Sources 4 | 5 | Check out the README in the base `examples` folder for the environment variables you need for each data source. 6 | 7 | ## Running the Code 8 | 9 | Ensure you have [uv](https://docs.astral.sh/uv/) installed for dependency 10 | management and virtual environment setup. 
11 | 12 | ### Installing Dependencies 13 | 14 | Run the following command to install all dependencies: 15 | 16 | ```bash 17 | uv sync 18 | ``` 19 | 20 | ### Running the GitHub check: 21 | 22 | After you set the GitHub environment variables in the `examples\README.md` file, run this command: 23 | 24 | ```bash 25 | uv run 01-github-check.py 26 | ``` 27 | 28 | *Note:* If your admin account has access to numerous repositories - directly or via Organizations - this may take a while. For test purposes, we recommend using a smaller test account. 29 | 30 | Sample output: 31 | 32 | ```bash 33 | Loaded 8 docs: 34 | offices.txt 35 | strategy.txt 36 | capacitor.txt 37 | folder_1/internal_architecture.txt 38 | folder_2/react.txt 39 | folder_1/salaries.txt 40 | folder_2/venture-capital.txt 41 | interest-rate.txt 42 | 43 | Authorized docs: 5 44 | offices.txt 45 | strategy.txt 46 | capacitor.txt 47 | folder_1/internal_architecture.txt 48 | folder_2/react.txt 49 | ``` 50 | 51 | ### Running the Google Drive check: 52 | 53 | After you set the Google Drive environment variables in the `examples\README.md` file, run this command: 54 | 55 | ```bash 56 | uv run 02-gdrive-check.py 57 | ``` 58 | 59 | 60 | ### Running the Slack check: 61 | 62 | After you set the Slack environment variables in the `examples\README.md` file, run this command: 63 | 64 | ```bash 65 | uv run 03-slack-check.py 66 | ``` 67 | 68 | *Note:* In order to read messages from your Slack channel, your app/bot will need to be present in the channel. This applies to both public and private channels. Any public channels that the bot is not in will generate a "not_in_channel" message. 69 | 70 | Sample output: 71 | 72 | ```bash 73 | Error fetching messages for channel C021V27F8KU: not_in_channel 74 | Error fetching messages for channel C0LNSFJ6897: not_in_channel 75 | Loaded 38 messages. 76 | User has acess to channel ids: 77 | C021V27F8KU 78 | C0LNLKJ6897 79 | C021V8CDFMZ 80 | C029J65B4KH 81 | C02PFMW465Q 82 | C087CNAQGLV 83 | C087K7JPQQ4 84 | User has access to 32 messages 85 | ``` 86 | 87 | ### Running the Dropbox check: 88 | 89 | After you set the Dropbox environment variables in the `examples\README.md` file, run this command: 90 | 91 | ```bash 92 | uv run 04-dropbox-check.py 93 | ``` 94 | 95 | The first time this runs, it will open a browser window to authorize the Dropbox application to your account and store the resulting tokens. Later runs will simply continue to the output below. 96 | 97 | Sample output: 98 | 99 | ```bash 100 | Listening for authentication response on http://localhost:8080 ... 101 | 127.0.0.1 - - [18/Feb/2025 11:18:32] "GET /?code=crD1VEFcJzAAAAAAAABLxoiqCA5-LbgQiaGWQR2R3gA HTTP/1.1" 200 - 102 | Loading documents from Dropbox... 103 | Loaded page: 1. Docs: 0 104 | Loaded page: 2. Docs: 20 105 | Loaded page: 3. Docs: 18 106 | Loaded page: 4. Docs: 23 107 | Loaded 61 docs 108 | Filtering authorized documents... 109 | Authorized docs: 22 110 | ``` 111 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pangea Multipass: Your Authorization Helper 2 | 3 | Pangea Multipass is a Python library for checking user access to upstream data sources. 4 | 5 | In practice, you can use it to check if a specific user has access to a file in a Google Drive, a ticket in Jira, or a page in Confluence. 
In concept, we've built this library to be extensible to eventually support Slack channels, GitHub repositories, Salesforce opportunities, and more. 6 | 7 | We originally built this to support our customers' Retrieval-Augmented Generation (RAG) applications to mitigate data leaks. In a RAG architecture, the application inserts additional context at inference time. If you don't check the user's authorization to that context, you could inadvertently leak sensitive information. 8 | 9 | While this is useful in AI/LLM apps, we've abstracted this to work independently so you can use it in any app. 10 | 11 | Check out the `/examples` folder for AI-specific and generic examples. 12 | 13 | ## Features 14 | 15 | - **Document Reading**: Supports document content extraction for use in processing and enrichment. 16 | - **Metadata Enrichment**: Includes enrichers for hashing, constant value setting, and custom metadata. 17 | - **Metadata Filtering**: Provides flexible operators to filter document metadata for customized queries. 18 | - **Authorization Processing**: Manages authorized and unauthorized nodes with customizable node processors. 19 | - **Extensible**: Built on abstract base classes, allowing easy extension and customization of functionality. 20 | 21 | ## Installation 22 | 23 | To install `pangea-multipass`, you can use [Poetry](https://python-poetry.org/) for dependency management: 24 | 25 | ```bash 26 | poetry add pangea-multipass 27 | ``` 28 | 29 | There are full runnable demos in the `/examples` directory, but here are the key aspects. 30 | 31 | Using a set of Google Drive credentials - following the steps in the `llama_index_examples` folder - you initialize the data source: 32 | 33 | ```python 34 | gdrive_reader = GoogleDriveReader( 35 | folder_id=gdrive_fid, token_path=admin_token_filepath, credentials_path=credentials_filepath 36 | ) 37 | documents = gdrive_reader.load_data(folder_id=gdrive_fid) 38 | ``` 39 | 40 | This gives you a list of files. You can then use the processors to filter into the authorized and unauthorized resource lists: 41 | 42 | ```python 43 | gdrive_processor = LlamaIndexGDriveProcessor(creds) 44 | node_processor = NodePostprocessorMixer([gdrive_processor]) 45 | 46 | authorized_docs = node_processor.postprocess_nodes(documents) 47 | unauthorized_docs = node_processor.get_unauthorized_nodes() 48 | ``` 49 | 50 | In general, the authorized list is the one you will use, but you may also want to notify an admin or write a log entry when a user attempts to access resources they cannot see: it could be an attempt at data theft, or a sign that the user's permissions are incomplete. 51 | 52 | ## Roadmap 53 | 54 | At release, this library supports Google Workspace, Confluence, and Jira. For new systems, our top priorities are: 55 | 56 | - Box 57 | - Dropbox 58 | - Office 365 59 | 60 | Others we plan to support, or where we welcome contributions, are: 61 | 62 | - Zoom 63 | - Salesforce 64 | - GitLab 65 | - Zendesk 66 | - Notion 67 | - SharePoint 68 | - Asana 69 | - HubSpot 70 | 71 | Check out `EXTENDING.md` for the specific structure and requirements for extending Pangea Multipass for your data sources. Pull requests are welcome.
72 | -------------------------------------------------------------------------------- /examples/llama_index_examples/README.md: -------------------------------------------------------------------------------- 1 | # Pangea Multipass Llama Index Example Apps 2 | 3 | These example apps demonstrate the integration of Pangea Multipass with LlamaIndex to perform secure document retrieval and metadata enrichment from multiple data sources. The apps connect to Google Drive, Confluence, and JIRA, retrieve documents, enrich metadata for authorization, and perform query-based searches using vector indexing. 4 | 5 | ## Configuration 6 | 7 | ### Environment Variables 8 | 9 | Set the following environment variables to configure access to Confluence and JIRA: 10 | 11 | - **Confluence**: 12 | - `CONFLUENCE_ADMIN_TOKEN`: Admin token for Confluence authentication. 13 | - `CONFLUENCE_ADMIN_EMAIL`: Admin email for Confluence authentication. 14 | - `CONFLUENCE_BASE_URL`: Base URL of the Confluence server. 15 | - `CONFLUENCE_USER_TOKEN`: User token for accessing Confluence. 16 | - `CONFLUENCE_USER_EMAIL`: User email for accessing Confluence. 17 | 18 | - **JIRA**: 19 | - `JIRA_BASE_URL`: Base URL of the JIRA server. 20 | - `JIRA_ADMIN_EMAIL`: Admin email for JIRA authentication. 21 | - `JIRA_ADMIN_TOKEN`: Admin token for JIRA authentication. 22 | - `JIRA_USER_TOKEN`: User token for accessing JIRA. 23 | - `JIRA_USER_EMAIL`: User email for accessing JIRA. 24 | 25 | The `*_ADMIN_*` environment variables are used at ingestion time; this admin account should have access to all of the cloud files/issues/pages that you want included in the vector store. 26 | The `*_USER_*` environment variables are used at inference time; this user may have restricted access to some files/issues/pages, which is checked at runtime. 27 | 28 | ### Required Config Files 29 | 30 | 1. **Google OAuth2 Credentials**: `credentials.json` 31 | This file should contain your OAuth2 credentials for Google Drive API access. Download it from your Google Cloud Console and place it in the `examples` directory. 32 | 33 | 2. **Admin Access Token for Google Drive**: `admin_access_token.json` 34 | This file stores the access token for the Google Drive admin user and is generated through OAuth2. 35 | 36 | ## Installation and Setup 37 | 38 | ### Prerequisites 39 | 40 | Ensure you have [uv](https://docs.astral.sh/uv/) installed for dependency management and virtual environment setup. 41 | 42 | ### Installing Dependencies 43 | 44 | Run the following command to install all dependencies: 45 | 46 | ```bash 47 | uv sync 48 | ``` 49 | 50 | ## Running the Application 51 | - Set Up Environment Variables: Set all required environment variables in your terminal or in a `.env` file in the root directory. 52 | - Run the App: 53 | 54 | ```bash 55 | uv run 02-rag-LlamaIndex-all-sources-processor.py 56 | ``` 57 | 58 | ## How It Works 59 | ### Document Retrieval and Enrichment 60 | The app connects to Google Drive, Confluence, and JIRA to retrieve documents based on the provided credentials and metadata filters. Documents are enriched with metadata through the Pangea Multipass framework, ensuring enhanced access control. 61 | 62 | ### Query Engine and Access Control 63 | The app uses a vector store to index documents and enables query-based retrieval. The query engine notifies users if certain sources were unauthorized, ensuring that answers reflect accessible data only.
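
For orientation, the inference side boils down to a few lines. This is a minimal sketch based on `01-rag-LlamaIndex-gdrive-processor.py`; `user_creds` and `index` are assumed to already exist, and the question is a placeholder:

```python
from pangea_multipass_llama_index import LlamaIndexGDriveProcessor, NodePostprocessorMixer

# The mixer drops retrieved nodes the current user is not authorized to see.
node_processor = NodePostprocessorMixer([LlamaIndexGDriveProcessor(user_creds)])
query_engine = index.as_query_engine(similarity_top_k=10, node_postprocessors=[node_processor])

answer = query_engine.query("What offices do we have?")  # placeholder question
unauthorized_count = len(node_processor.get_unauthorized_nodes())  # drives the warning below
```
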
64 | 65 | ### Unauthorized Document Warning 66 | If documents from any source are unauthorized for the user, a warning is displayed, indicating missing context due to access restrictions. 67 | 68 | ## License 69 | This project is licensed under the MIT License. 70 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | #.idea/ 163 | *.whl 164 | -------------------------------------------------------------------------------- /examples/llama_index_examples/02-gdrive-check-access.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | import os 5 | import sys 6 | from typing import List 7 | 8 | from google.oauth2.credentials import Credentials 9 | from llama_index.readers.google import GoogleDriveReader 10 | from pangea_multipass import GDriveAPI, GDriveME, PangeaMetadataKeys, enrich_metadata 11 | from pangea_multipass_llama_index import LIDocument, LIDocumentReader 12 | 13 | if len(sys.argv) != 2: 14 | print(f"usage: {sys.argv[0]} <gdrive_folder_id>") 15 | exit(1) 16 | 17 | 18 | SCOPES = [ 19 | "openid", 20 | "https://www.googleapis.com/auth/userinfo.email", 21 | "https://www.googleapis.com/auth/userinfo.profile", 22 | "https://www.googleapis.com/auth/drive.metadata.readonly", 23 | "https://www.googleapis.com/auth/drive.readonly", 24 | ] 25 | 26 | # Sample data folder owned by apurv@gondwana.cloud https://drive.google.com/drive/u/1/folders/1Kj77oi2QGEOPKcIo_hKZPiHDJyKKFVgR 27 | 28 | # gdrive_fid = "1Kj77oi2QGEOPKcIo_hKZPiHDJyKKFVgR" 29 | gdrive_fid = sys.argv[1] 30 | 31 | # File name for the admin user 32 | admin_token_filepath = "admin_access_token.json" 33 | 34 | 35 | def google_drive_read_docs() -> List[LIDocument]: 36 | print(f"Loading Google Drive docs. 
Folder ID: {gdrive_fid}.") 37 | # Google Drive Data Ingestion 38 | credentials_filepath = os.path.abspath("../credentials.json") 39 | 40 | # Invoke Google /auth endpoint and save the token for later use 41 | if not os.path.isfile(admin_token_filepath): 42 | print("Sign in with the admin user account:") 43 | GDriveAPI.get_and_save_access_token(credentials_filepath, admin_token_filepath, SCOPES) 44 | 45 | # load the documents and create the index 46 | print("Login to GDrive as admin...") 47 | gdrive_reader = GoogleDriveReader( 48 | folder_id=gdrive_fid, token_path=admin_token_filepath, credentials_path=credentials_filepath 49 | ) 50 | print("Loading data...") 51 | documents: List[LIDocument] = gdrive_reader.load_data(folder_id=gdrive_fid) 52 | 53 | print(f"Processing {len(documents)} docs...") 54 | 55 | # Metadata enricher library 56 | creds = Credentials.from_authorized_user_file(admin_token_filepath, SCOPES) 57 | gdrive_me = GDriveME(creds, {}) 58 | enrich_metadata(documents, [gdrive_me], reader=LIDocumentReader()) 59 | # Finish metadata enrichment 60 | 61 | return documents 62 | 63 | 64 | documents = google_drive_read_docs() 65 | 66 | # Inference 67 | from pangea_multipass import GDriveAPI 68 | from pangea_multipass_llama_index import LlamaIndexGDriveProcessor, NodePostprocessorMixer 69 | 70 | # Create GDrive filter 71 | print("Login to GDrive as user...") 72 | creds = Credentials.from_authorized_user_file(admin_token_filepath, SCOPES) 73 | 74 | # User email to check permissions 75 | user_email = "alice@gondwana.cloud" 76 | 77 | gdrive_processor = LlamaIndexGDriveProcessor(creds, user_email=user_email) 78 | node_processor = NodePostprocessorMixer([gdrive_processor]) 79 | 80 | # Process documents 81 | authorized_docs = node_processor.postprocess_nodes(documents) # type: ignore 82 | unauthorized_docs = node_processor.get_unauthorized_nodes() 83 | 84 | if len(authorized_docs): 85 | print(f"User: '{user_email}' has access to the following files in folder '{gdrive_fid}'") 86 | for doc in authorized_docs: 87 | file_id = doc.metadata.get(PangeaMetadataKeys.GDRIVE_FILE_ID, "") 88 | name = doc.metadata.get(PangeaMetadataKeys.FILE_NAME, "") 89 | print(f"id: {file_id:44} filename: {name}.") 90 | else: 91 | print(f"User '{user_email}' has NO access to any file in folder '{gdrive_fid}'") 92 | 93 | if len(unauthorized_docs): 94 | print(f"\nUser '{user_email}' has NO access to the following files in folder '{gdrive_fid}'") 95 | for doc in unauthorized_docs: 96 | file_id = doc.metadata.get(PangeaMetadataKeys.GDRIVE_FILE_ID, "") 97 | name = doc.metadata.get(PangeaMetadataKeys.FILE_NAME, "") 98 | print(f"id: {file_id:44} filename: {name}.") 99 | else: 100 | print(f"\nUser '{user_email}' has access to all the files in folder '{gdrive_fid}'") 101 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/dropbox_reader.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import List, Optional 4 | 5 | import requests 6 | 7 | from .core import MultipassDocument, PangeaMetadataKeys, PangeaMetadataValues, generate_id 8 | from .sources import DropboxClient 9 | 10 | _actor = "dropbox_reader" 11 | 12 | 13 | class DropboxReader: 14 | _token: str 15 | """Dropbox access token""" 16 | 17 | _has_more: bool 18 | _cursor: Optional[str] 19 | _folder_path: str 20 | _recursive: bool 21 | 22 | def __init__(self, token: str, folder_path: str = "", recursive: bool = True, 
logger_name: str = "multipass"): 23 | self._token = token 24 | self._folder_path = folder_path 25 | self._recursive = recursive 26 | self.logger = logging.getLogger(logger_name) 27 | self._client = DropboxClient(logger_name) 28 | self.restart() 29 | 30 | def restart(self): 31 | self._has_more = True 32 | self._cursor = None 33 | 34 | @property 35 | def has_more_files(self): 36 | """Check if there are more files to read""" 37 | return self._has_more 38 | 39 | def load_data( 40 | self, 41 | ) -> List[MultipassDocument]: 42 | """ 43 | Retrieves all files in Dropbox. 44 | It downloads and returns only files, skipping folders. 45 | """ 46 | 47 | documents: List[MultipassDocument] = [] 48 | 49 | while self._has_more: 50 | documents.extend(self.read_page(page_size=50)) 51 | 52 | return documents 53 | 54 | def read_page(self, page_size: int = 50) -> List[MultipassDocument]: 55 | """ 56 | Read a page of files from Dropbox. 57 | Page size is an approximate number of files to read. 58 | It could be more due to Dropbox API limitations, or it could be less due to folders being skipped. 59 | """ 60 | 61 | documents: List[MultipassDocument] = [] 62 | 63 | url = DropboxClient.LIST_FILES_URL if self._cursor is None else DropboxClient.LIST_CONTINUE_URL 64 | data = {"path": self._folder_path, "recursive": self._recursive, "limit": page_size} 65 | if self._cursor: 66 | data = {"cursor": self._cursor} 67 | 68 | headers = {"Authorization": f"Bearer {self._token}", "Content-Type": "application/json"} 69 | response = requests.post(url, json=data, headers=headers) 70 | if response.status_code != 200: 71 | self.logger.error( 72 | json.dumps( 73 | { 74 | "actor": _actor, 75 | "fn": "read_page", 76 | "action": "post", 77 | "url": url, 78 | "data": data, 79 | "status_code": response.status_code, 80 | "reason": response.reason, 81 | "text": response.text, 82 | } 83 | ) 84 | ) 85 | 86 | response.raise_for_status() 87 | result = response.json() 88 | entries = result.get("entries", []) 89 | 90 | for entry in entries: 91 | if entry.get(".tag", "") != "file": 92 | continue 93 | 94 | file_path: str = entry.get("path_lower", None) 95 | if not file_path: 96 | continue 97 | 98 | name = entry.get("name", "") 99 | path = file_path.removesuffix(f"/{name}") 100 | 101 | file = self._client.download_file(token=self._token, file_path=file_path) 102 | metadata: dict[str, str] = { 103 | PangeaMetadataKeys.DATA_SOURCE: PangeaMetadataValues.DATA_SOURCE_DROPBOX, 104 | PangeaMetadataKeys.DROPBOX_ID: entry.get("id", ""), 105 | PangeaMetadataKeys.DROPBOX_PATH: path, 106 | PangeaMetadataKeys.DROPBOX_FILE_PATH: file_path, 107 | PangeaMetadataKeys.FILE_PATH: file_path, 108 | PangeaMetadataKeys.FILE_NAME: name, 109 | } 110 | documents.append(MultipassDocument(id=generate_id(), content=file, metadata=metadata)) 111 | 112 | self._has_more = result.get("has_more", False) 113 | self._cursor = result.get("cursor") 114 | return documents 115 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/oauth.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import hashlib 3 | import os 4 | import threading 5 | import webbrowser 6 | from http.server import BaseHTTPRequestHandler, HTTPServer 7 | 8 | import requests 9 | 10 | 11 | class OauthFlow: 12 | # TODO: improve to be thread safe 13 | _auth_code = None 14 | 15 | auth_url: str 16 | token_url: str 17 | client_id: str 18 | host: str 19 | port: int 20 | 21 | def __init__( 22 | self, 
23 | auth_url: str, 24 | token_url: str, 25 | client_id: str, 26 | host: str = "localhost", 27 | port: int = 8080, 28 | ): 29 | self.auth_url = auth_url 30 | self.token_url = token_url 31 | self.client_id = client_id 32 | self.host = host 33 | self.port = port 34 | 35 | def run_pkce(self, code_challenge: str, code_verifier: str, code_challenge_method: str = "S256"): 36 | OauthFlow._auth_code = None  # reset the class-level code that the handler below will set 37 | redirect_uri = f"http://{self.host}:{self.port}" 38 | 39 | auth_url = ( 40 | f"{self.auth_url}?" 41 | f"client_id={self.client_id}&" 42 | "response_type=code&" 43 | "token_access_type=offline&" 44 | f"redirect_uri={redirect_uri}&" 45 | f"code_challenge={code_challenge}&" 46 | f"code_challenge_method={code_challenge_method}" 47 | ) 48 | 49 | webbrowser.open(auth_url) 50 | 51 | server_thread = threading.Thread(target=OauthFlow._run_server, args=(self.host, self.port), daemon=True) 52 | server_thread.start() 53 | 54 | while OauthFlow._auth_code is None: 55 | pass # Busy wait (can be improved with event-based handling) 56 | 57 | response = requests.post( 58 | self.token_url, 59 | data={ 60 | "client_id": self.client_id, 61 | "grant_type": "authorization_code", 62 | "code": OauthFlow._auth_code, 63 | "redirect_uri": redirect_uri, 64 | "code_verifier": code_verifier, # PKCE verification 65 | }, 66 | ) 67 | response.raise_for_status() 68 | return response.json() 69 | 70 | class OAuthHandler(BaseHTTPRequestHandler): 71 | """Handles the OAuth redirect to capture auth code automatically.""" 72 | 73 | def do_GET(self): 74 | if "code=" in self.path: 75 | OauthFlow._auth_code = self.path.split("code=")[-1].split("&")[0] 76 | self.send_response(200) 77 | self.send_header("Content-type", "text/html") 78 | self.end_headers() 79 | self.wfile.write(b"
<html><body><h1>Authorization Successful!</h1><p>You can close this tab.</p></body></html>") 80 | else: 81 | self.send_response(400) 82 | self.end_headers() 83 | self.wfile.write(b"<html><body><h1>Authorization Failed</h1></body></html>
") 84 | 85 | @staticmethod 86 | def generate_pkce_pair(): 87 | code_verifier = base64.urlsafe_b64encode(os.urandom(32)).rstrip(b"=").decode("utf-8") 88 | code_challenge = ( 89 | base64.urlsafe_b64encode(hashlib.sha256(code_verifier.encode("utf-8")).digest()) 90 | .rstrip(b"=") 91 | .decode("utf-8") 92 | ) 93 | 94 | return code_verifier, code_challenge 95 | 96 | @staticmethod 97 | def _run_server(host: str = "localhost", port: int = 8080): 98 | """Starts a simple HTTP server to listen for OAuth callback.""" 99 | 100 | redirect_uri = f"http://{host}:{port}" 101 | server = HTTPServer((host, port), OauthFlow.OAuthHandler) 102 | print(f"\n🌍 Listening for authentication response on {redirect_uri} ...") 103 | server.handle_request() # Handles only one request (closes after first login) 104 | 105 | @staticmethod 106 | def refresh_access_token(url: str, refresh_token: str, client_id: str): 107 | """Refresh the access token using the refresh token.""" 108 | 109 | # Send request to refresh the token 110 | response = requests.post( 111 | url, 112 | data={ 113 | "grant_type": "refresh_token", 114 | "refresh_token": refresh_token, 115 | "client_id": client_id, 116 | }, 117 | headers={"Content-Type": "application/x-www-form-urlencoded"}, 118 | ) 119 | response.raise_for_status() 120 | 121 | return response.json() 122 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/gitlab_reader.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Any, List, Optional 3 | 4 | import requests 5 | 6 | from .sources import GitLabClient 7 | from pangea_multipass import MultipassDocument, PangeaMetadataKeys, PangeaMetadataValues, generate_id 8 | 9 | 10 | class GitLabReader: 11 | _token: str 12 | _has_more_files: bool 13 | _next_files_page: Optional[str] 14 | _current_repository: dict 15 | _logger_name: str 16 | 17 | def __init__(self, token: str, logger_name: str = "multipass"): 18 | self._token = token 19 | self.logger = logging.getLogger(logger_name) 20 | self._client = GitLabClient(logger_name) 21 | self._restart() 22 | 23 | def get_repos(self): 24 | """Get all the repositories the token has access to""" 25 | return self._client.get_user_projects(self._token) 26 | 27 | def read_repo_files(self, repository: dict, page_size: int = 100) -> List[MultipassDocument]: 28 | """ 29 | Read files from a given repository 30 | If the repository is different from the last one, it will restart the reader. 31 | If the repository is the same, it will continue reading from the last file. 
32 | """ 33 | self._read_repo_files_checks(repository, page_size) 34 | if self._next_files_page is None: 35 | return [] 36 | 37 | response = requests.get(self._next_files_page, headers={"Authorization": f"Bearer {self._token}"}) 38 | 39 | repo_id = self._current_repository.get("id", None) 40 | if response.status_code != 200: 41 | raise Exception(f"Could not fetch file tree for repository {repo_id}") 42 | 43 | documents: List[MultipassDocument] = [] 44 | files = response.json() 45 | for file in files: 46 | if file["type"] == "blob": # Only download actual files 47 | file_path = file["path"] 48 | file_name = file["name"] 49 | repo_name = self._current_repository.get("name", "") 50 | repo_namespace_path = self._current_repository.get("path_with_namespace", "") 51 | content = self._client.download_file(self._token, repo_id, file_path) # type: ignore[arg-type] 52 | metadata: dict[str, Any] = { 53 | PangeaMetadataKeys.DATA_SOURCE: PangeaMetadataValues.DATA_SOURCE_GITLAB, 54 | PangeaMetadataKeys.GITLAB_REPOSITORY_ID: repo_id, 55 | PangeaMetadataKeys.GITLAB_REPOSITORY_NAME: repo_name, 56 | PangeaMetadataKeys.GITLAB_REPOSITORY_NAMESPACE_WITH_PATH: repo_namespace_path, 57 | PangeaMetadataKeys.FILE_PATH: file_path, 58 | PangeaMetadataKeys.FILE_NAME: file_name, 59 | } 60 | documents.append(MultipassDocument(generate_id(), content, metadata)) 61 | 62 | self._next_files_page = response.links.get("next", {}).get("url", None) # Check if pagination has next page 63 | self._has_more_files = self._next_files_page is not None 64 | 65 | return documents 66 | 67 | def load_data(self) -> List[MultipassDocument]: 68 | """ 69 | Load all the data from the repositories 70 | This will read all the files from all the repositories the token has access to. 71 | This process is blocking and can take a long time. If working with a large number of repositories, 72 | consider using the read_repo_files method.
73 | """ 74 | documents: List[MultipassDocument] = [] 75 | repos = self.get_repos() 76 | 77 | for repo in repos: 78 | has_more_files = True 79 | while has_more_files: 80 | files = self.read_repo_files(repository=repo) 81 | documents.extend(files) 82 | has_more_files = self.has_more_files 83 | 84 | return documents 85 | 86 | @property 87 | def has_more_files(self): 88 | """Check if there are more files to read""" 89 | return self._has_more_files 90 | 91 | def _restart(self): 92 | self._has_more_files = True 93 | self._next_files_page = None 94 | self._current_repository = {} 95 | 96 | def _read_repo_files_checks(self, repository: dict, page_size: int) -> None: 97 | current_repo_id = self._current_repository.get("id", None) 98 | new_repo_id = repository.get("id", None) 99 | if current_repo_id is None or current_repo_id != new_repo_id: 100 | self._restart() 101 | self._current_repository = repository 102 | 103 | if self._has_more_files is True and self._next_files_page is None: 104 | repo_id = repository.get("id", None) 105 | if repo_id is None: 106 | raise Exception("Invalid repository id") 107 | 108 | self._next_files_page = f"https://gitlab.com/api/v4/projects/{repo_id}/repository/tree?recursive=true&per_page={page_size}&pagination=keyset" 109 | else: 110 | self._has_more_files = False 111 | -------------------------------------------------------------------------------- /examples/llama_index_examples/03-rag-LlamaIndex-gdrive-filter.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | import os 5 | import warnings 6 | from typing import List 7 | 8 | from google.oauth2.credentials import Credentials 9 | from llama_index.core import Settings, StorageContext, VectorStoreIndex, load_index_from_storage 10 | from llama_index.embeddings.bedrock import BedrockEmbedding 11 | from llama_index.llms.bedrock import Bedrock 12 | from llama_index.readers.google import GoogleDriveReader 13 | from pangea_multipass import GDriveAPI, GDriveME, enrich_metadata 14 | from pangea_multipass_llama_index import LIDocument, LIDocumentReader 15 | 16 | # Suppress specific warning 17 | warnings.filterwarnings("ignore", message='Field "model_name" has conflict with protected namespace') 18 | 19 | SCOPES = [ 20 | "openid", 21 | "https://www.googleapis.com/auth/userinfo.email", 22 | "https://www.googleapis.com/auth/userinfo.profile", 23 | "https://www.googleapis.com/auth/drive.metadata.readonly", 24 | ] 25 | 26 | # import logging 27 | # import sys 28 | 29 | # logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 30 | # logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout)) 31 | 32 | 33 | # Initialize LLM, anthropic deployed on bedrock 34 | llm = Bedrock( 35 | model="anthropic.claude-3-5-sonnet-20240620-v1:0", 36 | profile_name="dev", 37 | region_name="us-west-2", 38 | temperature=0.5, 39 | max_tokens=512, 40 | ) 41 | 42 | # Initialize Embedding model, amazon titan deployed on bedrock 43 | embed_model = BedrockEmbedding(model="amazon.titan-embed-g1-text-02", region_name="us-west-2", profile_name="dev") 44 | 45 | # Set up the models 46 | Settings.llm = llm 47 | Settings.embed_model = embed_model 48 | 49 | # Set up chunking parameters 50 | Settings.chunk_size = 1000 51 | Settings.chunk_overlap = 100 52 | 53 | 54 | def google_drive_read_docs() -> List[LIDocument]: 55 | print("Loading Google Drive docs...") 56 | # Google Drive Data Ingestion 57 | credentials_filepath = 
os.path.abspath("../credentials.json") 58 | 59 | # Sample data folder owned by apurv@gondwana.cloud https://drive.google.com/drive/u/1/folders/1Kj77oi2QGEOPKcIo_hKZPiHDJyKKFVgR 60 | gdrive_fid = "1Kj77oi2QGEOPKcIo_hKZPiHDJyKKFVgR" 61 | 62 | # File name for the admin user 63 | admin_token_filepath = "admin_access_token.json" 64 | 65 | # # Invoke Google /auth endpoint and save the token for later use 66 | # GDrive.get_and_save_access_token(credentials_filepath, admin_token_filepath, SCOPES) 67 | 68 | # load the documents and create the index 69 | print("Login to GDrive as admin...") 70 | gdrive_reader = GoogleDriveReader( 71 | folder_id=gdrive_fid, token_path=admin_token_filepath, credentials_path=credentials_filepath 72 | ) 73 | documents: List[LIDocument] = gdrive_reader.load_data(folder_id=gdrive_fid) 74 | 75 | print(f"Processing {len(documents)} docs...") 76 | 77 | # Metadata enricher library 78 | creds = Credentials.from_authorized_user_file(admin_token_filepath, SCOPES) 79 | gdrive_me = GDriveME(creds, {}) 80 | enrich_metadata(documents, [gdrive_me], reader=LIDocumentReader()) 81 | # Finish metadata enrichment 82 | 83 | return documents 84 | 85 | 86 | # Load data from Gdrive or from the disk 87 | PERSIST_DIR = "./storage/rbac/llamaindex/gdrive" 88 | if not os.path.exists(PERSIST_DIR): 89 | # Load documents 90 | documents = google_drive_read_docs() 91 | 92 | print("Create and save index...") 93 | index = VectorStoreIndex.from_documents(documents) 94 | # store it for later 95 | index.storage_context.persist(persist_dir=PERSIST_DIR) 96 | else: 97 | # load the existing index 98 | print("Loading index...") 99 | storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR) 100 | index = load_index_from_storage(storage_context) # type: ignore 101 | 102 | 103 | # Inference 104 | from pangea_multipass_llama_index import LlamaIndexGDriveProcessor, NodePostprocessorMixer 105 | 106 | # Create GDrive filter 107 | credentials_filepath = os.path.abspath("../credentials.json") 108 | print("Login to GDrive as user...") 109 | creds = GDriveAPI.get_user_credentials(credentials_filepath, scopes=SCOPES) 110 | 111 | node_processor = NodePostprocessorMixer([LlamaIndexGDriveProcessor(creds)]) 112 | metadata_filters = node_processor.get_filter() 113 | 114 | # Using filters 115 | query_engine = index.as_query_engine(similarity_top_k=10, streaming=True, filters=metadata_filters) 116 | retriever = index.as_retriever(similarity_top_k=10) 117 | 118 | 119 | # Inference pipeline 120 | while True: 121 | user_prompt = input("Enter your question:") 122 | 123 | nodes = retriever.retrieve(user_prompt) 124 | count = len(node_processor.get_unauthorized_nodes()) 125 | count_authorized = len(node_processor.get_authorized_nodes()) 126 | 127 | answer = query_engine.query(user_prompt) 128 | # print("Assistant: ", answer) 129 | answer.print_response_stream() # type: ignore 130 | 131 | print("\n=================\n") 132 | print( 133 | f"\nWarning: This answer could be inaccurate since it is missing context from {count} out of {len(nodes)} retrieved sources. It includes {count_authorized} authorized sources."
134 | ) 135 | print("\n++++++++++++++++++") 136 | -------------------------------------------------------------------------------- /examples/llama_index_examples/01-rag-LlamaIndex-gdrive-processor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | import os 5 | import warnings 6 | from typing import List 7 | 8 | from google.oauth2.credentials import Credentials 9 | from llama_index.core import Settings, StorageContext, VectorStoreIndex, load_index_from_storage 10 | from llama_index.embeddings.bedrock import BedrockEmbedding 11 | from llama_index.llms.bedrock import Bedrock 12 | from llama_index.readers.google import GoogleDriveReader 13 | from pangea_multipass import GDriveME, enrich_metadata 14 | from pangea_multipass_llama_index import LIDocument, LIDocumentReader 15 | 16 | # Suppress specific warning 17 | warnings.filterwarnings("ignore", message='Field "model_name" has conflict with protected namespace') 18 | 19 | SCOPES = [ 20 | "openid", 21 | "https://www.googleapis.com/auth/userinfo.email", 22 | "https://www.googleapis.com/auth/userinfo.profile", 23 | "https://www.googleapis.com/auth/drive.metadata.readonly", 24 | ] 25 | 26 | # import logging 27 | # import sys 28 | 29 | # logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 30 | # logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout)) 31 | 32 | 33 | # Initialize LLM, anthropic deployed on bedrock 34 | llm = Bedrock( 35 | model="anthropic.claude-3-5-sonnet-20240620-v1:0", 36 | profile_name="dev", 37 | region_name="us-west-2", 38 | temperature=0.5, 39 | max_tokens=512, 40 | ) 41 | 42 | # Initialize Embedding model, amazon titan deployed on bedrock 43 | embed_model = BedrockEmbedding(model="amazon.titan-embed-g1-text-02", region_name="us-west-2", profile_name="dev") 44 | 45 | # Set up the models 46 | Settings.llm = llm 47 | Settings.embed_model = embed_model 48 | 49 | # Set up chunking parameters 50 | Settings.chunk_size = 1000 51 | Settings.chunk_overlap = 100 52 | 53 | 54 | def google_drive_read_docs() -> List[LIDocument]: 55 | print("Loading Google Drive docs...") 56 | # Google Drive Data Ingestion 57 | credentials_filepath = os.path.abspath("../credentials.json") 58 | 59 | # Sample data folder owned by apurv@gondwana.cloud https://drive.google.com/drive/u/1/folders/1Kj77oi2QGEOPKcIo_hKZPiHDJyKKFVgR 60 | gdrive_fid = "1Kj77oi2QGEOPKcIo_hKZPiHDJyKKFVgR" 61 | 62 | # File name for the admin user 63 | admin_token_filepath = "admin_access_token.json" 64 | 65 | # # Invoke Google /auth endpoint and save the token for later use 66 | # GDrive.get_and_save_access_token(credentials_filepath, admin_token_filepath, SCOPES) 67 | 68 | # load the documents and create the index 69 | print("Login to GDrive as admin...") 70 | gdrive_reader = GoogleDriveReader( 71 | folder_id=gdrive_fid, token_path=admin_token_filepath, credentials_path=credentials_filepath 72 | ) 73 | documents: List[LIDocument] = gdrive_reader.load_data(folder_id=gdrive_fid) 74 | 75 | print(f"Processing {len(documents)} docs...") 76 | 77 | # Metadata enricher library 78 | creds = Credentials.from_authorized_user_file(admin_token_filepath, SCOPES) 79 | gdrive_me = GDriveME(creds, {}) 80 | enrich_metadata(documents, [gdrive_me], reader=LIDocumentReader()) 81 | # Finish metadata enrichment 82 | 83 | return documents 84 | 85 | 86 | # Load data from Gdrive or from the disk 87 | PERSIST_DIR = "./storage/rbac/llamaindex/gdrive" 88 | 
if not os.path.exists(PERSIST_DIR): 89 | # Load documents 90 | documents = google_drive_read_docs() 91 | 92 | print("Create and save index...") 93 | index = VectorStoreIndex.from_documents(documents) 94 | # store it for later 95 | index.storage_context.persist(persist_dir=PERSIST_DIR) 96 | else: 97 | # load the existing index 98 | print("Loading index...") 99 | storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR) 100 | index = load_index_from_storage(storage_context) # type: ignore 101 | 102 | # Inference 103 | 104 | from pangea_multipass import GDriveAPI 105 | from pangea_multipass_llama_index import LlamaIndexGDriveProcessor, NodePostprocessorMixer 106 | 107 | # Create GDrive filter 108 | credentials_filepath = os.path.abspath("../credentials.json") 109 | creds = GDriveAPI.get_user_credentials(credentials_filepath, scopes=SCOPES) 110 | 111 | print("Login to GDrive as user...") 112 | gdrive_processor = LlamaIndexGDriveProcessor(creds) 113 | node_processor = NodePostprocessorMixer([gdrive_processor]) 114 | 115 | # Using node postprocessor 116 | query_engine = index.as_query_engine( 117 | streaming=True, 118 | similarity_top_k=10, 119 | node_postprocessors=[node_processor], 120 | ) 121 | 122 | retriever = index.as_retriever(similarity_top_k=10) 123 | 124 | 125 | # Inference pipeline 126 | while True: 127 | user_prompt = input("Enter your question:") 128 | 129 | nodes = retriever.retrieve(user_prompt) 130 | count = len(node_processor.get_unauthorized_nodes()) 131 | count_authorized = len(node_processor.get_authorized_nodes()) 132 | 133 | answer = query_engine.query(user_prompt) 134 | # print("Assistant: ", answer) 135 | answer.print_response_stream() # type: ignore 136 | 137 | print("\n=================\n") 138 | print( 139 | f"\nWarning: This answer could be inaccurate since it is missing context from {count} out of {len(nodes)} retrieved sources. It includes {count_authorized} authorized sources." 140 | ) 141 | print("\n++++++++++++++++++") 142 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/github_reader.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Any, List, Optional 3 | 4 | from .core import MultipassDocument, PangeaMetadataKeys, PangeaMetadataValues, generate_id 5 | from .sources.github import GitHubClient 6 | 7 | 8 | class GitHubReader: 9 | _token: str 10 | """GitHub personal access token""" 11 | 12 | _current_file: int = 0 13 | _repo_files: Optional[List[dict]] = None 14 | _current_repository: dict = {} 15 | 16 | def __init__(self, token: str, logger_name: str = "multipass"): 17 | self._token = token 18 | self.logger = logging.getLogger(logger_name) 19 | self._client = GitHubClient(logger_name) 20 | self._restart() 21 | 22 | def load_data( 23 | self, 24 | ) -> List[MultipassDocument]: 25 | """ 26 | Load all the data from the repositories 27 | This will read all the files from all the repositories the token has access to. 28 | This process is blocking and can take a long time. If working with a large number of repositories, 29 | consider using the read_repo_files method.
30 | """ 31 | documents: List[MultipassDocument] = [] 32 | 33 | # Get repositories 34 | repos = self._client.get_user_repos(self._token) 35 | 36 | for repo in repos: 37 | owner = repo["owner"]["login"] 38 | repo_name = repo["name"] 39 | 40 | # Get all files recursively 41 | files = self._client.get_repo_files(self._token, owner, repo_name) 42 | 43 | for file in files: 44 | file_path = file["path"] 45 | download_url = file["url"] 46 | 47 | # Fetch the file content 48 | content = self._client.download_file_content(self._token, download_url) 49 | 50 | # Create metadata 51 | metadata: dict[str, Any] = { 52 | PangeaMetadataKeys.GITHUB_REPOSITORY_NAME: repo_name, 53 | PangeaMetadataKeys.GITHUB_REPOSITORY_OWNER: owner, 54 | PangeaMetadataKeys.FILE_PATH: file_path, 55 | PangeaMetadataKeys.FILE_NAME: file_path, 56 | PangeaMetadataKeys.DATA_SOURCE: PangeaMetadataValues.DATA_SOURCE_GITHUB, 57 | PangeaMetadataKeys.GITHUB_REPOSITORY_OWNER_AND_NAME: (owner, repo_name), 58 | } 59 | 60 | doc = MultipassDocument(id=generate_id(), content=content, metadata=metadata) 61 | documents.append(doc) 62 | 63 | return documents 64 | 65 | def read_repo_files(self, repository: dict, page_size: int = 100) -> List[MultipassDocument]: 66 | """ 67 | Read files from a given repository 68 | If the repository is different from the last one, it will restart the reader. 69 | If the repository is the same, it will continue reading from the last file. 70 | """ 71 | documents: List[MultipassDocument] = [] 72 | 73 | self._read_repo_files_checks(repository) 74 | if self._repo_files is None: 75 | return documents 76 | 77 | owner = self._current_repository["owner"]["login"] 78 | repo_name = self._current_repository["name"] 79 | 80 | i = 0 81 | while i < page_size and self._current_file < len(self._repo_files): 82 | file = self._repo_files[self._current_file] 83 | i += 1 84 | self._current_file += 1 85 | 86 | file_path = file["path"] 87 | download_url = file["url"] 88 | 89 | # Fetch the file content 90 | content = self._client.download_file_content(self._token, download_url) 91 | 92 | # Create metadata 93 | metadata: dict[str, Any] = { 94 | PangeaMetadataKeys.GITHUB_REPOSITORY_NAME: repo_name, 95 | PangeaMetadataKeys.GITHUB_REPOSITORY_OWNER: owner, 96 | PangeaMetadataKeys.FILE_PATH: file_path, 97 | PangeaMetadataKeys.FILE_NAME: file_path, 98 | PangeaMetadataKeys.DATA_SOURCE: PangeaMetadataValues.DATA_SOURCE_GITHUB, 99 | PangeaMetadataKeys.GITHUB_REPOSITORY_OWNER_AND_NAME: (owner, repo_name), 100 | } 101 | 102 | doc = MultipassDocument(id=generate_id(), content=content, metadata=metadata) 103 | documents.append(doc) 104 | 105 | return documents 106 | 107 | def get_repos(self) -> List[dict]: 108 | """Get all the repositories the token has access to""" 109 | return self._client.get_user_repos(self._token) 110 | 111 | @property 112 | def has_more_files(self): 113 | """Check if there are more files to read""" 114 | return self._repo_files is not None and self._current_file < len(self._repo_files) 115 | 116 | def _restart(self) -> None: 117 | self._current_file = 0 118 | self._repo_files = None 119 | self._current_repository = {} 120 | 121 | def _read_repo_files_checks(self, repository: dict) -> None: 122 | current_repo_id = self._current_repository.get("id", None) 123 | new_repo_id = repository.get("id", None) 124 | 125 | if current_repo_id is None or current_repo_id != new_repo_id: 126 | self._restart() 127 | self._current_repository = repository 128 | 129 | owner = self._current_repository["owner"]["login"] 130 | repo_name = 
self._current_repository["name"] 131 | 132 | if self._repo_files is None: 133 | self._repo_files = self._client.get_repo_files(self._token, owner, repo_name) 134 | -------------------------------------------------------------------------------- /examples/langchain_examples/02-rag-LangChain-gdrive.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | import os 5 | from io import BytesIO 6 | from pathlib import Path 7 | from typing import List 8 | 9 | import boto3 10 | from google.oauth2.credentials import Credentials 11 | from langchain_aws import BedrockEmbeddings, ChatBedrock 12 | from langchain_community.vectorstores import FAISS 13 | from langchain_core.documents import Document 14 | from langchain_google_community import GoogleDriveLoader 15 | from pangea_multipass import GDriveAPI, GDriveME, enrich_metadata 16 | from pangea_multipass_langchain import DocumentFilterMixer, LangChainDocumentReader 17 | 18 | # Initialization 19 | bedrock_client = boto3.client("bedrock-runtime", region_name="us-west-2") 20 | model_id = "anthropic.claude-3-5-sonnet-20240620-v1:0" 21 | model_kwargs = { 22 | "max_tokens": 512, 23 | "temperature": 0.5, 24 | } 25 | 26 | ## Setup the LLM parameters 27 | llm = ChatBedrock( 28 | client=bedrock_client, 29 | model_id=model_id, 30 | model_kwargs=model_kwargs, 31 | ) 32 | 33 | ## Setup the Embedding model parameters 34 | embedding_model = BedrockEmbeddings(model_id="amazon.titan-embed-g1-text-02", client=bedrock_client) 35 | 36 | 37 | class TextLoader: 38 | file: BytesIO 39 | 40 | def __init__(self, file: BytesIO): 41 | self.file = file 42 | 43 | def load(self) -> List[Document]: 44 | return [Document(page_content=self.file.read().decode("utf-8"))] 45 | 46 | 47 | ## Data ingestion pipeline 48 | 49 | PERSIST_DIR = "./storage/data/langchain/faiss_index" 50 | if not os.path.exists(PERSIST_DIR): 51 | os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "" 52 | # Google Drive Data Ingestion 53 | admin_token_filepath = "admin_access_token.json" 54 | 55 | credentials_filepath = os.path.abspath("../credentials.json") 56 | print("Login to GDrive as admin...") 57 | GDriveAPI.get_and_save_access_token( 58 | credentials_filepath, admin_token_filepath, ["https://www.googleapis.com/auth/drive.readonly"] 59 | ) 60 | 61 | loader = GoogleDriveLoader( 62 | folder_id="1Kj77oi2QGEOPKcIo_hKZPiHDJyKKFVgR", 63 | token_path=Path(admin_token_filepath), 64 | credentials_path=Path(credentials_filepath), 65 | recursive=True, 66 | load_extended_metadata=True, 67 | file_loader_cls=TextLoader, 68 | ) 69 | 70 | docs = loader.load() 71 | print(f"GDrive docs loaded: {len(docs)}.") 72 | 73 | # Metadata enricher library 74 | SCOPES = [ 75 | "openid", 76 | "https://www.googleapis.com/auth/userinfo.email", 77 | "https://www.googleapis.com/auth/userinfo.profile", 78 | "https://www.googleapis.com/auth/drive.metadata.readonly", 79 | ] 80 | 81 | creds = Credentials.from_authorized_user_file(admin_token_filepath, SCOPES) 82 | gdrive_me = GDriveME(creds, {}) 83 | enrich_metadata(docs, [gdrive_me], reader=LangChainDocumentReader()) 84 | # Finish metadata enrichement 85 | 86 | # Initialize the vector store https://faiss.ai 87 | print("Initializing vector store...") 88 | vectorstore = FAISS.from_documents(documents=docs, embedding=embedding_model) 89 | 90 | # Store to file system 91 | print("Storing vector store...") 92 | vectorstore.save_local(PERSIST_DIR) 93 | else: 94 | print("Loading vector 
store...") 95 | vectorstore = FAISS.load_local( 96 | folder_path=PERSIST_DIR, embeddings=embedding_model, allow_dangerous_deserialization=True 97 | ) 98 | 99 | 100 | ## Inference pipeline 101 | 102 | from langchain.chains.combine_documents import create_stuff_documents_chain 103 | from langchain_core.prompts import ChatPromptTemplate 104 | from pangea_multipass_langchain import LangChainGDriveFilter 105 | 106 | # Create GDrive filter 107 | credentials_filepath = os.path.abspath("../credentials.json") 108 | SCOPES = [ 109 | "openid", 110 | "https://www.googleapis.com/auth/userinfo.email", 111 | "https://www.googleapis.com/auth/userinfo.profile", 112 | "https://www.googleapis.com/auth/drive.metadata.readonly", 113 | ] 114 | 115 | print("Login to GDrive as user...") 116 | creds = GDriveAPI.get_user_credentials(credentials_filepath, scopes=SCOPES) 117 | 118 | gdrive_filter = LangChainGDriveFilter(creds) 119 | filter_mixer = DocumentFilterMixer(document_filters=[gdrive_filter]) 120 | 121 | # Use indexed store as a reteriver to create qa chain 122 | retriever = vectorstore.as_retriever() 123 | 124 | # Prompt template with System, Context and User prompt 125 | template = """"System: Answer the following question based only on the provided context: 126 | 127 | 128 | {context} 129 | 130 | 131 | Question: {input} 132 | """ 133 | prompt = ChatPromptTemplate.from_template(template) 134 | 135 | # Document chain using the LLM and prompt template 136 | qa_chain = create_stuff_documents_chain(llm, prompt) 137 | 138 | while True: 139 | user_prompt = input("Enter your question: ") 140 | similar_docs = retriever.invoke(user_prompt) 141 | 142 | print(f"similar_docs: {len(similar_docs)}") 143 | 144 | filtered_docs = filter_mixer.filter(similar_docs) 145 | print(f"filtered_docs: {len(filtered_docs)}") 146 | count = len(filter_mixer.get_unauthorized_documents()) 147 | 148 | response = qa_chain.invoke({"input": user_prompt, "context": filtered_docs}) 149 | print(f"\n{response}") 150 | print("\n=================") 151 | print( 152 | f"Warning: This answer could be inaccurate as its missing context from {count} out of {len(similar_docs)} data sources. Include {len(filtered_docs)} sources." 153 | ) 154 | print("=================\n") 155 | -------------------------------------------------------------------------------- /EXTENDING.md: -------------------------------------------------------------------------------- 1 | # Extending Pangea Multipass 2 | 3 | ## Core Components 4 | 5 | - Metadata Enrichment: Use classes like HasherSHA256 and Constant to add enriched metadata to documents. 6 | - Metadata Enrichers are applied through `enrich_metadata` method to all the documents. 7 | - DocumentReader: Implement custom readers for extracting document content. 8 | - MetadataUpdater: Apply enriched metadata to documents. 9 | - Filter Operators: Use FilterOperator for applying various metadata filters (e.g., EQ, GT, CONTAINS, etc.). 10 | 11 | ## Extend file sources 12 | 13 | In order to extend the file sources provided by this package, here it's explain how new classes should be implemented. In this case GDrive source is used as an example and documented in detail. 14 | New sources implementation should have, at least, 2 parts: 15 | 16 | - Metadata Enricher: Used on ingestion time to add metadata to the documents. 17 | - Processors/Filters: Used on inference time to process enriched medatada and filter documents based on logged user. 
18 | 19 | ## Metadata Enricher 20 | 21 | The metadata enricher for a custom source should inherit from the `MetadataEnricher` class. 22 | 23 | ```python 24 | class GDriveME(MetadataEnricher): 25 | ``` 26 | 27 | This inheritance requires that the `extract_metadata` method is implemented with this signature: 28 | 29 | ``` 30 | extract_metadata(self, doc: Any, file_content: str) -> dict[str, Any]: 31 | ``` 32 | 33 | This method receives the document itself, so it is possible to access the document's metadata and other attributes if needed, and it also receives the document content, so it is possible to process it however you need (hash it, process it with another LLM to extract further information about it, etc.) 34 | 35 | ```python 36 | def extract_metadata(self, doc: Any, file_content: str) -> dict[str, Any]: 37 | metadata: dict[str, Any] = {} 38 | 39 | # This step is to normalize some attributes across platforms 40 | # Optional: if CustomSource has an attribute related to the file name, title, etc., copy it to 41 | # PangeaMetadataKeys.FILE_NAME in order to unify this key/value across sources. 42 | metadata[PangeaMetadataKeys.FILE_NAME] = doc.metadata.get("file name", "") 43 | 44 | # Required: the metadata enricher should set the data source key as follows 45 | metadata[PangeaMetadataKeys.DATA_SOURCE] = PangeaMetadataValues.DATA_SOURCE_GDRIVE 46 | 47 | # Required: at least for this use case, this metadata enricher must set the file id, which will be used 48 | # at inference time to request this file's permissions. 49 | # In this case a helper gets the id from the metadata, because LangChain and LlamaIndex use different 50 | # key names to store it in the metadata. 51 | id = self._get_id_from_metadata(doc.metadata) 52 | if not id: 53 | raise Exception("empty doc_id") 54 | metadata[PangeaMetadataKeys.GDRIVE_FILE_ID] = id 55 | 56 | # The new metadata is returned so it can be applied by the `enrich_metadata` method implemented in the core package 57 | return metadata 58 | ``` 59 | 60 | NOTE: In this case the GDrive metadata enricher has to be initialized with admin credentials to access the data source, so it is able to request all the files and their metadata. 61 | 62 | ## Processor/Filter 63 | 64 | A custom processor should inherit from `PangeaGenericNodeProcessor` and `Generic[T]`. 65 | 66 | ```python 67 | class GDriveProcessor(PangeaGenericNodeProcessor, Generic[T]): 68 | ``` 69 | 70 | `PangeaGenericNodeProcessor` requires that the `filter` and `get_filter` methods are implemented. 71 | 72 | The `filter()` method takes care of filtering the available nodes at run time. To do this, the processor should be initialized with user credentials so it can check which files the user has access to. 73 | 74 | ```python 75 | def filter( 76 | self, 77 | nodes: List[T], 78 | ) -> List[T]: 79 | 80 | return [node for node in nodes if self._is_authorized(node)] 81 | 82 | ``` 83 | 84 | The `get_filter()` method returns a `MetadataFilter` to be used in LlamaIndex or LangChain retriever filters. It requests the permissions of every file, so it is not performant for really large datasets. It is recommended to use `filter` so that only the files relevant to the current prompt are checked.
84 | 85 | ```python 86 | def get_filter( 87 | self, 88 | ): 89 | 90 | if not self.files_ids: 91 | self.files_ids = GDriveAPI.list_all_file_ids(self.creds) 92 | 93 | return MetadataFilter(key=PangeaMetadataKeys.GDRIVE_FILE_ID, value=self.files_ids, operator=FilterOperator.IN) 94 | ``` 95 | 96 | ## API 97 | 98 | This third class simply groups all the API requests related to this particular data source. It is not required, but it is a convenient way to collect the methods used internally by the classes above. 99 | 100 | ```python 101 | class GDriveAPI: 102 | _SCOPES = [ 103 | ... 104 | ] 105 | 106 | _user_token_filepath: str = "gdrive_access_token.json" 107 | 108 | @staticmethod 109 | def get_and_save_access_token(credentials_filepath, token_filepath, scopes): 110 | pass 111 | 112 | @staticmethod 113 | def get_user_info(creds: Credentials): 114 | pass 115 | 116 | @staticmethod 117 | def get_user_credentials( 118 | credentials_filepath: str, user_token_filepath: str = _user_token_filepath, scopes=_SCOPES 119 | ): 120 | pass 121 | 122 | @staticmethod 123 | def check_file_access(creds: Credentials, file_id: str) -> bool: 124 | pass 125 | 126 | @staticmethod 127 | def list_all_file_ids(creds: Credentials) -> List[str]: 128 | pass 129 | ``` 130 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Trying Pangea Multipass 2 | 3 | Pangea Multipass is a general purpose library for checking a user's access to resources in an upstream system. While we (Pangea) originally built this for our AI/LLM apps, you can use this library independently. To see that in action, check out the `multipass_examples` folder; otherwise, explore your LLM framework of choice. 4 | 5 | Each directory has its own README to get set up, though many of the steps overlap. 6 | 7 | ## Set up the environment 8 | 9 | These are the upstream data sources the core library currently supports. Configure the ones you need and store the credentials for the examples. Most of these will require administrator access to get the credentials. 10 | 11 | 12 | ### Google Drive 13 | 14 | To use Google Drive as a source in the examples, you need to: 15 | 16 | - Download the `credentials.json` file from the Google console and save it in the `/examples/` folder. 17 | - In the example script, update the `gdrive_fid` variable with the Google Drive folder ID to process. 18 | 19 | 20 | ### Jira 21 | 22 | To use Jira as a source, you need to set some environment variables: 23 | - `JIRA_BASE_URL`: Jira project base URL. Its format is `<your-domain>.atlassian.net/`. Be sure to remove the `https://` part. 24 | - `JIRA_ADMIN_EMAIL`: Admin email used at ingestion time. The system will process all the tickets this user has access to. 25 | - `JIRA_ADMIN_TOKEN`: Access token of the admin email set above. [Learn more](https://support.atlassian.com/atlassian-account/docs/manage-api-tokens-for-your-atlassian-account/). 26 | - `JIRA_USER_EMAIL`: User email used at inference time. This email is used to validate which of the tickets returned by the LLM the user has access to. 27 | - `JIRA_USER_TOKEN`: Access token of the user email set above. [Learn more](https://support.atlassian.com/atlassian-account/docs/manage-api-tokens-for-your-atlassian-account/). 28 | - `JIRA_USER_ACCOUNT_ID`: Set it to use `JIRA_ADMIN_TOKEN` and `JIRA_ADMIN_EMAIL` at inference time to check user permissions.
130 | 
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
1 | # Trying Pangea Multipass
2 | 
3 | Pangea Multipass is a general-purpose library for checking a user's access to resources in an upstream system. While we (Pangea) originally built this for our AI/LLM apps, you can use this library independently. To see that in action, check out the `multipass_examples` folder; otherwise, explore your LLM framework of choice.
4 | 
5 | Each directory has its own README to get set up, though many of the steps overlap.
6 | 
7 | ## Set up the environment
8 | 
9 | These are the upstream data sources the core library currently supports. Configure the ones you need and store the credentials for the examples. Most of these will require administrator access to get the credentials.
10 | 
11 | 
12 | ### Google Drive
13 | 
14 | In order to use Google Drive as a source in the examples you need to:
15 | 
16 | - Download the `credentials.json` file from the Google console and save it in the `/examples/` folder.
17 | - In the example script, update the `gdrive_fid` variable value with the Google Drive folder ID to process.
18 | 
19 | 
20 | ### Jira
21 | 
22 | In order to use Jira as a source, you need to set some environment variables:
23 | - `JIRA_BASE_URL`: Jira project base URL. Its format is `<your-domain>.atlassian.net/`. Take care to remove the `https://` part.
24 | - `JIRA_ADMIN_EMAIL`: Admin email used at ingestion time. The system will process all the tickets this user has access to.
25 | - `JIRA_ADMIN_TOKEN`: Access token of the admin email set above. [Learn more](https://support.atlassian.com/atlassian-account/docs/manage-api-tokens-for-your-atlassian-account/).
26 | - `JIRA_USER_EMAIL`: User email used at inference time. This email will be used to validate which tickets returned by the LLM the user has access to.
27 | - `JIRA_USER_TOKEN`: Access token of the user email set above. [Learn more](https://support.atlassian.com/atlassian-account/docs/manage-api-tokens-for-your-atlassian-account/).
28 | - `JIRA_USER_ACCOUNT_ID`: Set it to use `JIRA_ADMIN_TOKEN` and `JIRA_ADMIN_EMAIL` at inference time to check user permissions. This way it's not necessary to set `JIRA_USER_EMAIL` and `JIRA_USER_TOKEN`.
29 | 
30 | 
31 | ### Confluence
32 | 
33 | In order to use Confluence as a source, you need to set some environment variables:
34 | - `CONFLUENCE_BASE_URL`: Confluence project base URL. Its format is `https://<your-domain>.atlassian.net/`.
35 | - `CONFLUENCE_ADMIN_EMAIL`: Admin email used at ingestion time. The system will process all the files this user has access to.
36 | - `CONFLUENCE_ADMIN_TOKEN`: Access token of the admin email set above. [Learn more](https://support.atlassian.com/atlassian-account/docs/manage-api-tokens-for-your-atlassian-account/)
37 | - `CONFLUENCE_USER_EMAIL`: User email used at inference time. This email will be used to validate which files returned by the LLM the user has access to.
38 | - `CONFLUENCE_USER_TOKEN`: Access token of the user email set above. [Learn more](https://support.atlassian.com/atlassian-account/docs/manage-api-tokens-for-your-atlassian-account/)
39 | 
40 | 
41 | ### GitHub
42 | 
43 | In order to use GitHub as a source, you need to set some environment variables:
44 | - `GITHUB_ADMIN_TOKEN`: Access token used at ingestion time. The system will process all the repositories this token has access to. [Learn more](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens#creating-a-fine-grained-personal-access-token). This could be a `Fine-grained personal access token` with access to all the repositories owned by the admin account and `repository permission` set to `read access to code and metadata`.
45 | - `GITHUB_USER_TOKEN`: (Deprecated. Use `GITHUB_USERNAME` and `GITHUB_ADMIN_TOKEN` instead.) Token used at inference time. It will be used to validate which files returned by the LLM the user has access to. [Learn more](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens#creating-a-personal-access-token-classic). This must be a `classic personal access token` with scoped access to (at least) all the `repo` items.
46 | - `GITHUB_USERNAME`: Username used at inference time. It will be used to validate which files returned by the LLM the user has access to.
47 | 
48 | 
49 | ### Slack
50 | 
51 | In order to use Slack as a source, you need to set some environment variables: `SLACK_ADMIN_TOKEN` and `SLACK_USER_TOKEN`.
52 | 
53 | To get these tokens, you must create a Slack App and generate the tokens. The default app settings are sufficient. For further instructions on how to get Slack tokens, you can [click here](https://api.slack.com/tutorials/tracks/getting-a-token).
54 | 
55 | For this particular application the token's scopes should be at least `channels:history`, `groups:history`, `users:read`, and `users:read.email`, in order to process all public and private channels and access user emails to check their permissions. A quick way to sanity-check a token is shown after the list below.
56 | 
57 | - `SLACK_ADMIN_TOKEN`: Access token used at ingestion time. The system will process all the channels this token has access to.
58 | - `SLACK_USER_TOKEN`: (Deprecated. Use `SLACK_USER_EMAIL` and `SLACK_ADMIN_TOKEN` instead.) Token used at inference time. It will be used to validate which files returned by the LLM the user has access to.
59 | - `SLACK_USER_EMAIL`: User email used at inference time. This email will be used to validate which files returned by the LLM the user has access to.
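Before running the examples, you can sanity-check a token with the `slack_sdk` package (already a dependency of the core library). This is a sketch for verification only:

```python
import os

from slack_sdk import WebClient

# Sketch only: auth_test() raises SlackApiError if the token is invalid.
client = WebClient(token=os.environ["SLACK_ADMIN_TOKEN"])
identity = client.auth_test()
print(f"Authenticated to {identity['team']} as {identity['user']}")
```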
60 | 
61 | 
62 | ### Dropbox
63 | 
64 | When using Dropbox as a source, the admin user needs to authenticate on Dropbox using the [OAuth2 protocol](https://developers.dropbox.com/oauth-guide) and allow Pangea's `pangea-multipass` Dropbox App. To do so, follow the OAuth2 flow using PKCE and the `pangea-multipass` app key, which can be stored in `DROPBOX_APP_KEY`; a sketch of this flow is shown after the list below.
65 | 
66 | In order to use Dropbox as a source, you need two environment variables:
67 | 
68 | - `DROPBOX_APP_KEY`: The identifier for the Dropbox app that Multipass will use to access your files. For testing, you can use our Pangea app with key: `hmhe1wh0koy8cv6`
69 | - `DROPBOX_USER_EMAIL`: User email used at inference time. This email will be used to validate which files returned by the LLM the user has access to.
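A minimal sketch of that PKCE flow using the official `dropbox` package could look like the following; it is illustrative only, and simply prints the resulting access token:

```python
import os

from dropbox import DropboxOAuth2FlowNoRedirect

# Sketch only: start the PKCE flow with the app key, open the printed URL in
# a browser, approve the app, then paste the authorization code back here.
flow = DropboxOAuth2FlowNoRedirect(os.environ["DROPBOX_APP_KEY"], use_pkce=True, token_access_type="offline")
print("Visit:", flow.start())
result = flow.finish(input("Enter the authorization code: "))
print("Access token:", result.access_token)
```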
45 | """ 46 | new_channel_id = channel["id"] 47 | if self._channel_id is None or new_channel_id != self._channel_id: 48 | self.logger.debug(f"Restarting reader for channel {new_channel_id}") 49 | self._restart() 50 | self._channel_id = new_channel_id 51 | 52 | if self._channel_id is None: 53 | self.logger.error("Channel ID is not set") 54 | raise Exception("Channel ID is not set") 55 | 56 | messages, latest, more_messages = self._fetch_messages(self._channel_id, page_size, self._latest_ts) 57 | self._latest_ts = latest 58 | self._has_more_messages = more_messages 59 | return self._process_messages(messages, channel) 60 | 61 | @property 62 | def has_more_messages(self) -> bool: 63 | """Check if there are more messages to read""" 64 | return self._has_more_messages 65 | 66 | def _restart(self) -> None: 67 | self._channel_id = None 68 | self._latest_ts = None 69 | self._has_more_messages = True 70 | 71 | def _process_messages(self, messages: list[dict[str, Any]], channel: dict) -> List[MultipassDocument]: 72 | """Process the messages and create the documents""" 73 | channel_id = channel["id"] 74 | channel_name = channel["name"] 75 | documents: List[MultipassDocument] = [] 76 | 77 | for message in messages: 78 | subtype = message.get("subtype", "") 79 | # Just ignore the channel join messages 80 | if subtype == "channel_join": 81 | continue 82 | user = message.get("user", "") 83 | text = message.get("text", "") 84 | ts = message.get("ts", "") 85 | metadata = { 86 | PangeaMetadataKeys.SLACK_CHANNEL_ID: channel_id, 87 | PangeaMetadataKeys.SLACK_CHANNEL_NAME: channel_name, 88 | PangeaMetadataKeys.SLACK_TIMESTAMP: ts, 89 | PangeaMetadataKeys.SLACK_USER: user, 90 | PangeaMetadataKeys.DATA_SOURCE: PangeaMetadataValues.DATA_SOURCE_SLACK, 91 | } 92 | documents.append(MultipassDocument(id=generate_id(), content=text, metadata=metadata)) # type: ignore[arg-type] 93 | 94 | return documents 95 | 96 | def _fetch_messages( 97 | self, channel_id: str, max_messages: int = 1000, latest: Optional[str] = None 98 | ) -> Tuple[List[dict[str, Any]], Optional[str], bool]: 99 | """ 100 | Fetch the messages from a given channel. 101 | """ 102 | 103 | page_size = 100 104 | page_size = page_size if page_size < max_messages else max_messages 105 | messages: List[dict[str, Any]] = [] 106 | more_messages = True 107 | 108 | try: 109 | while len(messages) < max_messages: 110 | response = self._slack_client.conversations_history(channel=channel_id, latest=latest, limit=page_size) 111 | new_messages: List[dict[str, Any]] = response.get("messages", []) 112 | messages.extend(new_messages) 113 | 114 | if not new_messages: 115 | self.logger.debug(f"No more messages to fetch for channel {channel_id}") 116 | more_messages = False 117 | break 118 | 119 | message = new_messages[-1] 120 | latest = message.get("ts", "") 121 | 122 | # We could delete this check and do another request and it should return an empty list. 123 | if len(new_messages) < page_size: 124 | self.logger.debug( 125 | f"Size of new messages is less than page size. 
No more messages to fetch for channel {channel_id}" 126 | ) 127 | more_messages = False 128 | break 129 | 130 | page_size = page_size if (max_messages - len(messages)) > page_size else (max_messages - len(messages)) 131 | 132 | except SlackApiError as e: 133 | self.logger.error(f"Error fetching messages for channel {channel_id}: {e.response['error']}") 134 | raise Exception(f"Error fetching messages for channel {channel_id}: {e.response['error']}") 135 | 136 | return (messages, latest, more_messages) 137 | -------------------------------------------------------------------------------- /examples/langchain_examples/01-rag-LangChain-all-sources.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | import os 5 | from io import BytesIO 6 | from pathlib import Path 7 | from typing import List 8 | 9 | import boto3 10 | from google.oauth2.credentials import Credentials 11 | from langchain.document_loaders import ConfluenceLoader 12 | from langchain_aws import BedrockEmbeddings, ChatBedrock 13 | from langchain_community.vectorstores import FAISS 14 | from langchain_core.documents import Document 15 | from langchain_google_community import GoogleDriveLoader 16 | from pangea_multipass import ConfluenceAuth, ConfluenceME, GDriveAPI, GDriveME, enrich_metadata 17 | from pangea_multipass_langchain import DocumentFilterMixer, LangChainConfluenceFilter, LangChainDocumentReader 18 | 19 | # Initialization 20 | bedrock_client = boto3.client("bedrock-runtime", region_name="us-west-2") 21 | model_id = "anthropic.claude-3-5-sonnet-20240620-v1:0" 22 | model_kwargs = { 23 | "max_tokens": 512, 24 | "temperature": 0.5, 25 | } 26 | 27 | ## Setup the LLM parameters 28 | llm = ChatBedrock( 29 | client=bedrock_client, 30 | model_id=model_id, 31 | model_kwargs=model_kwargs, 32 | ) 33 | 34 | ## Setup the Embedding model parameters 35 | embedding_model = BedrockEmbeddings(model_id="amazon.titan-embed-g1-text-02", client=bedrock_client) 36 | 37 | 38 | class TextLoader: 39 | file: BytesIO 40 | 41 | def __init__(self, file: BytesIO): 42 | self.file = file 43 | 44 | def load(self) -> List[Document]: 45 | return [Document(page_content=self.file.read().decode("utf-8"))] 46 | 47 | 48 | ## Data ingestion pipeline 49 | 50 | 51 | def load_gdrive_documents() -> List[Document]: 52 | os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "" 53 | # Google Drive Data Ingestion 54 | admin_token_filepath = "admin_access_token.json" 55 | 56 | credentials_filepath = os.path.abspath("../credentials.json") 57 | print("Login to GDrive as admin...") 58 | GDriveAPI.get_and_save_access_token( 59 | credentials_filepath, admin_token_filepath, ["https://www.googleapis.com/auth/drive.readonly"] 60 | ) 61 | 62 | loader = GoogleDriveLoader( 63 | folder_id="1Kj77oi2QGEOPKcIo_hKZPiHDJyKKFVgR", 64 | token_path=Path(admin_token_filepath), 65 | credentials_path=Path(credentials_filepath), 66 | recursive=True, 67 | load_extended_metadata=True, 68 | file_loader_cls=TextLoader, 69 | ) 70 | 71 | docs: List[Document] = loader.load() 72 | print(f"GDrive docs loaded: {len(docs)}.") 73 | 74 | # Metadata enricher library 75 | SCOPES = [ 76 | "openid", 77 | "https://www.googleapis.com/auth/userinfo.email", 78 | "https://www.googleapis.com/auth/userinfo.profile", 79 | "https://www.googleapis.com/auth/drive.metadata.readonly", 80 | ] 81 | creds = Credentials.from_authorized_user_file(admin_token_filepath, SCOPES) 82 | gdrive_me = GDriveME(creds, {}) 83 |
enrich_metadata(docs, [gdrive_me], reader=LangChainDocumentReader()) 84 | # Finish metadata enrichment 85 | return docs 86 | 87 | 88 | def confluence_read_docs() -> List[Document]: 89 | """Fetch all documents from Confluence using ConfluenceLoader.""" 90 | 91 | confluence_admin_token = os.getenv("CONFLUENCE_ADMIN_TOKEN") 92 | assert confluence_admin_token 93 | confluence_admin_email = os.getenv("CONFLUENCE_ADMIN_EMAIL") 94 | assert confluence_admin_email 95 | confluence_url = os.getenv("CONFLUENCE_BASE_URL") 96 | assert confluence_url 97 | 98 | confluence_space_key = "~71202041f9bfec117041348629ccf3e3c751b3" 99 | # confluence_space_id = 393230 100 | 101 | # Create a ConfluenceReader instance 102 | print("Loading Confluence docs...") 103 | loader = ConfluenceLoader( 104 | url=confluence_url, 105 | username=confluence_admin_email, 106 | api_key=confluence_admin_token, 107 | space_key=confluence_space_key, 108 | ) 109 | documents: List[Document] = loader.load() 110 | 111 | # Enrich metadata process 112 | print(f"Processing {len(documents)} Confluence docs...") 113 | confluence_me = ConfluenceME() 114 | enrich_metadata(documents, [confluence_me], reader=LangChainDocumentReader()) 115 | 116 | return documents 117 | 118 | 119 | PERSIST_DIR = "./storage/data/langchain/faiss_index" 120 | if not os.path.exists(PERSIST_DIR): 121 | gdrive_docs = load_gdrive_documents() 122 | confluence_docs = confluence_read_docs() 123 | docs = gdrive_docs + confluence_docs 124 | 125 | # Initialize the vector store https://faiss.ai 126 | print("Initializing vector store...") 127 | vectorstore = FAISS.from_documents(documents=docs, embedding=embedding_model) 128 | 129 | # Store to file system 130 | print("Storing vector store...") 131 | vectorstore.save_local(PERSIST_DIR) 132 | else: 133 | print("Loading vector store...") 134 | vectorstore = FAISS.load_local( 135 | folder_path=PERSIST_DIR, embeddings=embedding_model, allow_dangerous_deserialization=True 136 | ) 137 | 138 | 139 | ## Inference pipeline 140 | 141 | from langchain.chains.combine_documents import create_stuff_documents_chain 142 | from langchain_core.prompts import ChatPromptTemplate 143 | from pangea_multipass_langchain import LangChainGDriveFilter 144 | 145 | # Create GDrive filter 146 | credentials_filepath = os.path.abspath("../credentials.json") 147 | SCOPES = [ 148 | "openid", 149 | "https://www.googleapis.com/auth/userinfo.email", 150 | "https://www.googleapis.com/auth/userinfo.profile", 151 | "https://www.googleapis.com/auth/drive.metadata.readonly", 152 | ] 153 | print("Login to GDrive as user...") 154 | creds = GDriveAPI.get_user_credentials(credentials_filepath, scopes=SCOPES) 155 | gdrive_filter = LangChainGDriveFilter(creds) 156 | 157 | # Create Confluence filter 158 | confluence_admin_token = os.getenv("CONFLUENCE_ADMIN_TOKEN") 159 | assert confluence_admin_token 160 | confluence_admin_email = os.getenv("CONFLUENCE_ADMIN_EMAIL") 161 | assert confluence_admin_email 162 | confluence_url = os.getenv("CONFLUENCE_BASE_URL") 163 | assert confluence_url 164 | confluence_account_id = os.getenv("CONFLUENCE_USER_ACCOUNT_ID") 165 | assert confluence_account_id 166 | 167 | confluence_filter = LangChainConfluenceFilter( 168 | ConfluenceAuth(confluence_admin_email, confluence_admin_token, confluence_url), account_id=confluence_account_id 169 | ) 170 | 171 | # Create mixed filter 172 | filter_mixer = DocumentFilterMixer(document_filters=[gdrive_filter, confluence_filter]) 173 | 174 | # Use indexed store as a retriever to create qa chain 175 | retriever =
vectorstore.as_retriever() 176 | 177 | # Prompt template with System, Context and User prompt 178 | template = """System: Answer the following question based only on the provided context: 179 | 180 | 181 | {context} 182 | 183 | 184 | Question: {input} 185 | """ 186 | prompt = ChatPromptTemplate.from_template(template) 187 | 188 | # Document chain using the LLM and prompt template 189 | qa_chain = create_stuff_documents_chain(llm, prompt) 190 | 191 | while True: 192 | user_prompt = input("Enter your question: ") 193 | similar_docs = retriever.invoke(user_prompt) 194 | 195 | print(f"similar_docs: {len(similar_docs)}") 196 | 197 | filtered_docs = filter_mixer.filter(similar_docs) 198 | print(f"filtered_docs: {len(filtered_docs)}") 199 | count = len(filter_mixer.get_unauthorized_documents()) 200 | 201 | response = qa_chain.invoke({"input": user_prompt, "context": filtered_docs}) 202 | print(f"\n{response}") 203 | print("\n=================") 204 | print( 205 | f"Warning: This answer could be inaccurate as it's missing context from {count} out of {len(similar_docs)} data sources. Included {len(filtered_docs)} sources." 206 | ) 207 | print("=================\n") 208 | -------------------------------------------------------------------------------- /packages/pangea-multipass-langchain/pangea_multipass_langchain/langchain.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | from typing import Any, List, Optional 5 | 6 | from google.oauth2.credentials import Credentials 7 | from langchain_core.documents import Document 8 | from pangea_multipass import ( 9 | ConfluenceAuth, 10 | ConfluenceProcessor, 11 | DocumentReader, 12 | DropboxProcessor, 13 | FilterOperator, 14 | GDriveProcessor, 15 | GitHubProcessor, 16 | GitLabProcessor, 17 | JiraAuth, 18 | JiraProcessor, 19 | ) 20 | from pangea_multipass import MetadataFilter as PangeaMetadataFilter 21 | from pangea_multipass import ( 22 | MultipassDocument, 23 | PangeaGenericNodeProcessor, 24 | PangeaNodeProcessorMixer, 25 | SlackProcessor, 26 | ) 27 | 28 | 29 | class LangChainDocumentReader(DocumentReader): 30 | """LangChain document reader""" 31 | 32 | def read(self, doc: Document) -> str: 33 | return str(doc.page_content) 34 | 35 | 36 | def get_doc_id(doc: Document) -> str: 37 | return doc.id if doc.id is not None else "" 38 | 39 | 40 | def get_doc_metadata(doc: Document) -> dict[str, Any]: 41 | return dict(doc.metadata) 42 | 43 | 44 | def from_multipass(documents: List[MultipassDocument]) -> List[Document]: 45 | lc_documents: List[Document] = [] 46 | for doc in documents: 47 | lc_doc = Document(id=doc.id, page_content=doc.content) 48 | lc_doc.metadata = doc.metadata 49 | lc_documents.append(lc_doc) 50 | 51 | return lc_documents 52 | 53 | 54 | class LangChainJiraFilter(JiraProcessor[Document]): 55 | """Filter for Jira integration with LangChain documents. 56 | 57 | Uses Jira authentication to check document access in the LangChain. 58 | 59 | Args: 60 | auth (JiraAuth): Jira authentication credentials. 61 | account_id (Optional[str]): Jira user's account id to check issues permissions. 62 | """ 63 | 64 | def __init__(self, auth: JiraAuth, account_id: Optional[str] = None): 65 | super().__init__(auth, get_node_metadata=get_doc_metadata, account_id=account_id) 66 | 67 | 68 | class LangChainConfluenceFilter(ConfluenceProcessor[Document]): 69 | """Filter for Confluence integration with LangChain documents.
70 | 71 | Uses Confluence authentication to check document access in the LangChain. 72 | 73 | Args: 74 | auth (ConfluenceAuth): Confluence authentication credentials. 75 | space_id (Optional[int]): The space ID to filter pages by. 76 | account_id (Optional[str]): User account id to check permissions using admin token. 77 | 78 | """ 79 | 80 | def __init__(self, auth: ConfluenceAuth, space_id: Optional[int] = None, account_id: Optional[str] = None): 81 | super().__init__(auth, get_node_metadata=get_doc_metadata, space_id=space_id, account_id=account_id) 82 | 83 | 84 | class LangChainGDriveFilter(GDriveProcessor[Document]): 85 | """Filter for Google Drive integration with LangChain documents. 86 | 87 | Uses Google Drive credentials to check document access in the LangChain. 88 | 89 | Args: 90 | creds (Credentials): Google OAuth2 credentials. 91 | user_email (Optional[str]): User email to check access to files. 92 | """ 93 | 94 | def __init__(self, creds: Credentials, user_email: Optional[str] = None): 95 | super().__init__(creds, get_node_metadata=get_doc_metadata, user_email=user_email) 96 | 97 | 98 | class LangChainGitHubFilter(GitHubProcessor[Document]): 99 | """Filter for GitHub integration with LangChain documents. 100 | 101 | Uses GitHub classic token to check document access in the LangChain. 102 | 103 | Args: 104 | token (str): GitHub classic token. 105 | username (str): GitHub username to check permissions. 106 | """ 107 | 108 | def __init__(self, token: str, username: str): 109 | super().__init__(token, get_node_metadata=get_doc_metadata, username=username) 110 | 111 | 112 | class LangChainSlackFilter(SlackProcessor[Document]): 113 | """Filter for Slack integration with LangChain documents. 114 | 115 | Uses Slack token to check access to channels in the LangChain. 116 | 117 | Args: 118 | token (str): Slack token. 119 | user_email (Optional[str]): User email to check access to channels. 120 | """ 121 | 122 | def __init__(self, token: str, user_email: Optional[str] = None): 123 | super().__init__(token, get_node_metadata=get_doc_metadata, user_email=user_email) 124 | 125 | 126 | class LangChainGitLabFilter(GitLabProcessor[Document]): 127 | """Filter for GitLab integration with LangChain documents. 128 | 129 | Uses GitLab token to access nodes in the LangChain. 130 | 131 | Args: 132 | token (str): GitLab token. 133 | username (str): Username to check access to files. 134 | """ 135 | 136 | def __init__(self, admin_token: str, username: str): 137 | super().__init__(admin_token=admin_token, username=username, get_node_metadata=get_doc_metadata) 138 | 139 | 140 | class LangChainDropboxFilter(DropboxProcessor[Document]): 141 | """Filter for Dropbox integration with LangChain documents. 142 | 143 | Uses Dropbox token to check access to documents in the LangChain. 144 | 145 | Args: 146 | token (str): Dropbox token. 147 | user_email (str): User email to check access to files. 
148 | """ 149 | 150 | def __init__(self, token: str, user_email: str): 151 | super().__init__(token, user_email=user_email, get_node_metadata=get_doc_metadata) 152 | 153 | 154 | class DocumentFilterMixer: 155 | node_processor: PangeaNodeProcessorMixer[Document] = PangeaNodeProcessorMixer(get_doc_metadata, []) 156 | 157 | def __init__(self, document_filters: List[PangeaGenericNodeProcessor[Document]]): 158 | super().__init__() 159 | self.node_processor = PangeaNodeProcessorMixer[Document]( 160 | get_node_metadata=get_doc_metadata, 161 | node_processors=document_filters, 162 | ) 163 | 164 | def filter( 165 | self, 166 | documents: List[Document], 167 | ) -> List[Document]: 168 | return self.node_processor.filter(documents) 169 | 170 | def get_filter( 171 | self, 172 | ) -> dict[str, Any]: 173 | filters = [] 174 | for filter in self.node_processor.get_filters(): 175 | filters.append(_convert_metadata_filter_to_langchain(filter)) 176 | return {"$or": filters} 177 | 178 | def get_unauthorized_documents( 179 | self, 180 | ) -> List[Document]: 181 | """Retrieves documents that are unauthorized for access. 182 | 183 | Returns: 184 | List[Document]: List of unauthorized documents. 185 | """ 186 | return self.node_processor.get_unauthorized_nodes() 187 | 188 | def get_authorized_documents( 189 | self, 190 | ) -> List[Document]: 191 | """Retrieves documents that are authorized for access. 192 | 193 | Returns: 194 | List[Document]: List of authorized documents. 195 | """ 196 | return self.node_processor.get_authorized_nodes() 197 | 198 | 199 | def _convert_metadata_filter_to_langchain(input: PangeaMetadataFilter) -> dict[str, Any]: 200 | if input.operator == FilterOperator.EQ: 201 | filter = {input.key: input.value} 202 | elif input.operator == FilterOperator.IN: 203 | filter = {input.key: {"$in": input.value}} 204 | elif input.operator == FilterOperator.CONTAINS: 205 | filter = {input.key: {"$contain": input.value}} 206 | elif input.operator == FilterOperator.GT: 207 | filter = {input.key: {"$gt": input.value}} 208 | elif input.operator == FilterOperator.LT: 209 | filter = {input.key: {"$lt": input.value}} 210 | elif input.operator == FilterOperator.NE: 211 | filter = {input.key: {"$ne": input.value}} 212 | elif input.operator == FilterOperator.GTE: 213 | filter = {input.key: {"$gte": input.value}} 214 | elif input.operator == FilterOperator.LTE: 215 | filter = {input.key: {"$lte": input.value}} 216 | elif input.operator == FilterOperator.NIN: 217 | filter = {input.key: {"$nin": input.value}} 218 | else: 219 | raise TypeError(f"Invalid filter operator: {input.operator}") 220 | 221 | return filter 222 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/sources/gitlab/gitlab.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import Any, Callable, Generic, List, Optional 4 | from urllib.parse import quote 5 | 6 | import requests 7 | 8 | from pangea_multipass.core import ( 9 | FilterOperator, 10 | MetadataFilter, 11 | PangeaGenericNodeProcessor, 12 | PangeaMetadataKeys, 13 | PangeaMetadataValues, 14 | T, 15 | ) 16 | 17 | 18 | class GitLabClient: 19 | _actor = "gitlab_client" 20 | 21 | def __init__(self, logger_name: str = "multipass"): 22 | self.logger = logging.getLogger(logger_name) 23 | 24 | def get_auth_headers(self, token: str) -> dict[str, str]: 25 | """Authenticate to GitLab using a personal access token.""" 26 | return {"Authorization": 
f"Bearer {token}"} 27 | 28 | def user_has_access(self, admin_token: str, user_id: str, project_id: str) -> bool: 29 | """ 30 | Check if a specific user has access to a GitLab project using an admin token. 31 | """ 32 | url = f"https://gitlab.com/api/v4/projects/{project_id}/members/all/{user_id}" 33 | headers = self.get_auth_headers(admin_token) 34 | response = requests.get(url, headers=headers) 35 | 36 | if response.status_code == 200: 37 | return True # User has access 38 | elif response.status_code == 404: 39 | return False # User does not have access 40 | elif response.status_code == 403: 41 | self._log_error("user_has_access", url, {}, response) 42 | raise Exception("Admin token does not have sufficient permissions to check access.") 43 | else: 44 | self._log_error("user_has_access", url, {}, response) 45 | raise Exception(f"Unexpected error: {response.status_code} - {response.json()}") 46 | 47 | def get_user(self, admin_token: str, username: str) -> dict: 48 | """Get user information using an admin token.""" 49 | 50 | url = f"https://gitlab.com/api/v4/users?username={quote(username)}" 51 | response = requests.get( 52 | url, 53 | headers=self.get_auth_headers(admin_token), 54 | ) 55 | 56 | if response.status_code != 200: 57 | self._log_error("get_user", url, {}, response) 58 | 59 | response.raise_for_status() 60 | users = response.json() 61 | return users[0] if len(users) else {} 62 | 63 | def get_user_info(self, admin_token: str) -> dict: 64 | """Get user information from current token""" 65 | 66 | url = "https://gitlab.com/api/v4/user/" 67 | response = requests.get( 68 | url, 69 | headers=self.get_auth_headers(admin_token), 70 | ) 71 | 72 | if response.status_code != 200: 73 | self._log_error("get_user_info", url, {}, response) 74 | 75 | response.raise_for_status() 76 | return response.json() 77 | 78 | def get_user_projects(self, admin_token: str) -> list[dict[str, Any]]: 79 | """Fetch all projects the authenticated user has access to.""" 80 | projects = [] 81 | headers = self.get_auth_headers(admin_token) 82 | url = f"https://gitlab.com/api/v4/projects" 83 | params = {"per_page": 100, "membership": True, "simple": True} 84 | while url: 85 | response = requests.get(url, headers=headers, params=params) 86 | if response.status_code != 200: 87 | self._log_error("get_user_projects", url, params, response) 88 | raise Exception(f"Error fetching projects: {response.text}") 89 | 90 | projects.extend(response.json()) 91 | url = response.links.get("next", {}).get("url") # Pagination 92 | return projects 93 | 94 | def get_allowed_projects(self, admin_token: str, user_id: str) -> list[int]: 95 | projects = self.get_user_projects(admin_token=admin_token) 96 | user_projects = [] 97 | 98 | for project in projects: 99 | if self.user_has_access(admin_token, user_id, project["id"]): 100 | user_projects.append(project["id"]) 101 | 102 | return user_projects 103 | 104 | def download_file(self, token: str, repo_id: str, file_path: str): 105 | encoded_file_path = quote(file_path, safe="") # Encode special chars 106 | file_url = f"https://gitlab.com/api/v4/projects/{repo_id}/repository/files/{encoded_file_path}/raw" 107 | 108 | response = requests.get(file_url, headers=self.get_auth_headers(token)) 109 | if response.status_code != 200: 110 | self._log_error("download_file", file_url, {}, response) 111 | raise Exception(f"Skipping {file_path}: Could not download file") 112 | 113 | return response.content 114 | 115 | def _log_error(self, function_name: str, url: str, data: dict, response: 
requests.Response): 116 | self.logger.error( 117 | json.dumps( 118 | { 119 | "actor": GitLabClient._actor, 120 | "fn": function_name, 121 | "url": url, 122 | "data": data, 123 | "status_code": response.status_code, 124 | "reason": response.reason, 125 | "text": response.text, 126 | } 127 | ) 128 | ) 129 | 130 | 131 | class GitLabProcessor(PangeaGenericNodeProcessor[T], Generic[T]): 132 | _access_cache: dict[str, bool] = {} 133 | _token: str 134 | _username: str 135 | _user_id: Optional[str] 136 | _projects: list[int] = [] 137 | _get_node_metadata: Callable[[T], dict[str, Any]] 138 | 139 | def __init__( 140 | self, 141 | admin_token: str, 142 | username: str, 143 | get_node_metadata: Callable[[T], dict[str, Any]], 144 | logger_name: str = "multipass", 145 | ): 146 | self._token = admin_token 147 | self._username = username 148 | self._access_cache = {} 149 | self._get_node_metadata = get_node_metadata 150 | self._user_id = None 151 | self._client = GitLabClient(logger_name) 152 | 153 | def _has_access(self, metadata: dict[str, Any]) -> bool: 154 | """Check if the user has access to the given file.""" 155 | 156 | project_id = metadata.get(PangeaMetadataKeys.GITLAB_REPOSITORY_ID, None) 157 | if not project_id: 158 | raise KeyError(f"Invalid metadata key: {PangeaMetadataKeys.GITLAB_REPOSITORY_ID}") 159 | 160 | if self._user_id is None: 161 | self._load_user_id() 162 | 163 | if self._user_id is None: 164 | print("Could not load user ID") 165 | return False 166 | 167 | if project_id in self._access_cache: 168 | return self._access_cache[project_id] 169 | 170 | has_access = self._client.user_has_access(self._token, self._user_id, project_id) 171 | self._access_cache[project_id] = has_access 172 | return has_access 173 | 174 | def filter( 175 | self, 176 | nodes: List[T], 177 | ) -> List[T]: 178 | """Filter GitLab files by access permissions. 179 | 180 | Args: 181 | nodes (List[T]): List of nodes to process. 182 | 183 | Returns: 184 | List[T]: Nodes that have authorized access. 185 | """ 186 | 187 | filtered: List[T] = [] 188 | for node in nodes: 189 | if self._is_authorized(node): 190 | filtered.append(node) 191 | return filtered 192 | 193 | def get_filter( 194 | self, 195 | ) -> MetadataFilter: 196 | """Generate a filter based on accessible GitLab project IDs. 197 | 198 | Returns: 199 | MetadataFilter: Filter for GitLab project IDs. 
200 | """ 201 | 202 | if not self._projects: 203 | if self._user_id is None: 204 | self._load_user_id() 205 | 206 | if self._user_id is None: 207 | raise Exception("Could not load user ID") 208 | 209 | self._projects = self._client.get_allowed_projects(self._token, self._user_id) 210 | 211 | return MetadataFilter( 212 | key=PangeaMetadataKeys.GITLAB_REPOSITORY_ID, value=self._projects, operator=FilterOperator.IN 213 | ) 214 | 215 | def _is_authorized(self, node: T) -> bool: 216 | metadata = self._get_node_metadata(node) 217 | return metadata[PangeaMetadataKeys.DATA_SOURCE] == PangeaMetadataValues.DATA_SOURCE_GITLAB and self._has_access( 218 | metadata 219 | ) 220 | 221 | def _load_user_id(self): 222 | user = self._client.get_user(self._token, username=self._username) 223 | self._user_id = user.get("id", None) 224 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/sources/github/github.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import Any, Callable, Generic, List, Tuple 4 | 5 | import requests 6 | 7 | from pangea_multipass.core import ( 8 | FilterOperator, 9 | MetadataFilter, 10 | PangeaGenericNodeProcessor, 11 | PangeaMetadataKeys, 12 | PangeaMetadataValues, 13 | T, 14 | ) 15 | 16 | 17 | class GitHubClient: 18 | _actor = "github_client" 19 | 20 | def __init__(self, logger_name: str = "multipass"): 21 | self.logger = logging.getLogger(logger_name) 22 | 23 | def get_auth_headers(self, token: str) -> dict[str, str]: 24 | """Authenticate to GitHub using a personal access token.""" 25 | headers = { 26 | "Authorization": f"token {token}", 27 | "Accept": "application/vnd.github.v3+json", 28 | } 29 | return headers 30 | 31 | def has_access(self, token: str, owner: str, repo_name: str) -> bool: 32 | """ 33 | Check if this token has access to this particular GitHub repository 34 | """ 35 | access = False 36 | 37 | headers = self.get_auth_headers(token) 38 | url = f"https://api.github.com/repos/{owner}/{repo_name}" 39 | response = requests.get(url, headers=headers) 40 | 41 | if response.status_code == 200: 42 | access = True # User has access 43 | elif response.status_code == 404: 44 | access = False # Repository not found or no access 45 | elif response.status_code == 403: 46 | self._log_error("has_access", url, {}, response) 47 | raise Exception(f"Access forbidden. 
Check permissions or token scope.") 48 | else: 49 | self._log_error("has_access", url, {}, response) 50 | raise Exception(f"Unexpected error: {response.status_code} - {response.json()}") 51 | 52 | return access 53 | 54 | def user_has_access(self, admin_token: str, owner: str, repo_name: str, username: str) -> bool: 55 | """ 56 | Checks if a user has access to a specific GitHub repository using an admin token 57 | """ 58 | headers = self.get_auth_headers(admin_token) 59 | url = f"https://api.github.com/repos/{owner}/{repo_name}/collaborators/{username}" 60 | response = requests.get(url, headers=headers) 61 | 62 | if response.status_code == 204: 63 | return True 64 | elif response.status_code == 404: 65 | return False 66 | elif response.status_code == 403: 67 | self._log_error("user_has_access", url, {}, response) 68 | raise Exception("Admin token does not have sufficient permissions to check access.") 69 | else: 70 | self._log_error("user_has_access", url, {}, response) 71 | raise Exception(f"Unexpected error: {response.status_code} - {response.json()}") 72 | 73 | def get_user_repos(self, token: str) -> List[dict[str, Any]]: 74 | """Get all repositories the authenticated user has access to.""" 75 | 76 | headers = self.get_auth_headers(token) 77 | url = "https://api.github.com/user/repos" 78 | repos: List[dict[str, Any]] = [] 79 | page = 1 80 | 81 | while True: 82 | response = requests.get(url, headers=headers, params={"per_page": 100, "page": page}) 83 | if response.status_code != 200: 84 | self._log_error("get_user_repos", url, {"per_page": 100, "page": page}, response) 85 | raise Exception(f"Error fetching repositories: {response.json()}") 86 | 87 | data = response.json() 88 | if not data: 89 | break 90 | 91 | repos.extend(data) 92 | page += 1 93 | 94 | return repos 95 | 96 | def get_repo_files(self, token: str, owner: str, repo: str) -> List[dict[str, Any]]: 97 | """Fetch all files in a repository using the GitHub Tree API.""" 98 | 99 | headers = self.get_auth_headers(token) 100 | 101 | url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/main?recursive=1" 102 | response = requests.get(url, headers=headers) 103 | 104 | if response.status_code == 200: 105 | tree_data = response.json() 106 | return [item for item in tree_data.get("tree", []) if item["type"] == "blob"] 107 | elif response.status_code == 404: 108 | self.logger.warning(f"Repository '{repo}' not found.") 109 | return [] 110 | else: 111 | self._log_error("get_repo_files", url, {}, response) 112 | raise Exception(f"Error fetching files for repository '{repo}': {response.json()}") 113 | 114 | def download_file_content(self, token: str, url: str) -> str: 115 | """Download the content of a file from GitHub.""" 116 | 117 | headers = self.get_auth_headers(token) 118 | 119 | response = requests.get(url, headers=headers) 120 | if response.status_code == 200: 121 | return str(response.content) 122 | else: 123 | self._log_error("download_file_content", url, {}, response) 124 | raise Exception(f"Error downloading file: {response.json()}") 125 | 126 | def get_allowed_repos(self, token: str, username: str) -> List[dict]: 127 | projects = self.get_user_repos(token) 128 | user_projects = [] 129 | 130 | for project in projects: 131 | if self.user_has_access(token, project["owner"]["login"], project["name"], username): 132 | user_projects.append(project) 133 | 134 | return user_projects 135 | 136 | def _log_error(self, function_name: str, url: str, data: dict, response: requests.Response): 137 | self.logger.error( 138 | json.dumps( 139 
| { 140 | "actor": GitHubClient._actor, 141 | "fn": function_name, 142 | "url": url, 143 | "data": data, 144 | "status_code": response.status_code, 145 | "reason": response.reason, 146 | "text": response.text, 147 | } 148 | ) 149 | ) 150 | 151 | 152 | class GitHubProcessor(PangeaGenericNodeProcessor[T], Generic[T]): 153 | _access_cache: dict[Tuple[str, str], bool] = {} 154 | _token: str 155 | _repos: List[Tuple[str, str]] = [] 156 | _username: str 157 | 158 | def __init__( 159 | self, 160 | token: str, 161 | get_node_metadata: Callable[[T], dict[str, Any]], 162 | username: str, 163 | logger_name: str = "multipass", 164 | ): 165 | super().__init__() 166 | self._token = token 167 | self._access_cache = {} 168 | self.get_node_metadata = get_node_metadata 169 | self._username = username 170 | self._client = GitHubClient(logger_name) 171 | 172 | def filter( 173 | self, 174 | nodes: List[T], 175 | ) -> List[T]: 176 | """Filter GitHub files by access permissions. 177 | 178 | Args: 179 | nodes (List[T]): List of nodes to process. 180 | 181 | Returns: 182 | List[T]: Nodes that have authorized access. 183 | """ 184 | 185 | filtered: List[T] = [] 186 | for node in nodes: 187 | if self._is_authorized(node): 188 | filtered.append(node) 189 | return filtered 190 | 191 | def get_filter( 192 | self, 193 | ) -> MetadataFilter: 194 | """Generate a filter based on accessible GitHub repositories. 195 | 196 | Returns: 197 | MetadataFilter: Filter for GitHub repository owner and name pairs. 198 | """ 199 | 200 | if not self._repos: 201 | repos_info = self._client.get_allowed_repos(self._token, username=self._username) 202 | repos = [] 203 | 204 | for repo in repos_info: 205 | owner = repo["owner"]["login"] 206 | repo_name = repo["name"] 207 | repos.append((owner, repo_name)) 208 | 209 | self._repos = repos 210 | 211 | return MetadataFilter( 212 | key=PangeaMetadataKeys.GITHUB_REPOSITORY_OWNER_AND_NAME, value=self._repos, operator=FilterOperator.IN 213 | ) 214 | 215 | def _has_access(self, metadata: dict[str, Any]) -> bool: 216 | """Check if the authenticated user has access to a repository.""" 217 | 218 | repo_name = metadata.get(PangeaMetadataKeys.GITHUB_REPOSITORY_NAME, None) 219 | if repo_name is None: 220 | raise KeyError(f"Invalid metadata key: {PangeaMetadataKeys.GITHUB_REPOSITORY_NAME}") 221 | 222 | owner = metadata.get(PangeaMetadataKeys.GITHUB_REPOSITORY_OWNER, None) 223 | if owner is None: 224 | raise KeyError(f"Invalid metadata key: {PangeaMetadataKeys.GITHUB_REPOSITORY_OWNER}") 225 | 226 | access_tuple = (owner, repo_name) 227 | has_access = self._access_cache.get(access_tuple, None) 228 | if has_access is not None: 229 | return has_access 230 | 231 | if self._username: 232 | has_access = self._client.user_has_access( 233 | admin_token=self._token, owner=owner, repo_name=repo_name, username=self._username 234 | ) 235 | else: 236 | has_access = self._client.has_access(token=self._token, owner=owner, repo_name=repo_name) 237 | 238 | self._access_cache[access_tuple] = has_access 239 | return has_access 240 | 241 | def _is_authorized(self, node: T) -> bool: 242 | metadata = self.get_node_metadata(node) 243 | return metadata[PangeaMetadataKeys.DATA_SOURCE] == PangeaMetadataValues.DATA_SOURCE_GITHUB and self._has_access( 244 | metadata 245 | ) 246 | -------------------------------------------------------------------------------- /examples/llama_index_examples/02-rag-LlamaIndex-all-sources-processor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 |
# Author: Pangea Cyber Corporation 3 | 4 | import os 5 | import warnings 6 | from typing import List 7 | 8 | from google.oauth2.credentials import Credentials 9 | from llama_index.core import Document, Settings, StorageContext, VectorStoreIndex, load_index_from_storage 10 | from llama_index.embeddings.bedrock import BedrockEmbedding 11 | from llama_index.llms.bedrock import Bedrock 12 | from llama_index.readers.confluence import ConfluenceReader 13 | from llama_index.readers.google import GoogleDriveReader 14 | from llama_index.readers.jira import JiraReader 15 | from pangea_multipass import ConfluenceAuth, ConfluenceME, GDriveAPI, GDriveME, JiraAuth, JiraME, enrich_metadata 16 | from pangea_multipass_llama_index import LIDocument, LIDocumentReader 17 | 18 | # Suppress specific warning 19 | warnings.filterwarnings("ignore", message='Field "model_name" has conflict with protected namespace') 20 | 21 | SCOPES = [ 22 | "openid", 23 | "https://www.googleapis.com/auth/userinfo.email", 24 | "https://www.googleapis.com/auth/userinfo.profile", 25 | "https://www.googleapis.com/auth/drive.metadata.readonly", 26 | ] 27 | 28 | # import logging 29 | # import sys 30 | 31 | # logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 32 | # logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout)) 33 | 34 | 35 | # Initialize LLM, anthropic deployed on bedrock 36 | llm = Bedrock( 37 | model="anthropic.claude-3-5-sonnet-20240620-v1:0", 38 | profile_name="dev", 39 | region_name="us-west-2", 40 | temperature=0.5, 41 | max_tokens=512, 42 | ) 43 | 44 | # Initialize Embedding model, amazon titan deployed on bedrock 45 | embed_model = BedrockEmbedding(model="amazon.titan-embed-g1-text-02", region_name="us-west-2", profile_name="dev") 46 | 47 | # Set up the models 48 | Settings.llm = llm 49 | Settings.embed_model = embed_model 50 | 51 | # Set up chunking parameters 52 | Settings.chunk_size = 1000 53 | Settings.chunk_overlap = 100 54 | 55 | 56 | def google_drive_read_docs() -> List[LIDocument]: 57 | print("Loading Google Drive docs...") 58 | # Google Drive Data Ingestion 59 | credentials_filepath = os.path.abspath("../credentials.json") 60 | 61 | # Sample data folder owned by apurv@gondwana.cloud https://drive.google.com/drive/u/1/folders/1Kj77oi2QGEOPKcIo_hKZPiHDJyKKFVgR 62 | gdrive_fid = "1Kj77oi2QGEOPKcIo_hKZPiHDJyKKFVgR" 63 | 64 | # File name for the admin user 65 | admin_token_filepath = "admin_access_token.json" 66 | 67 | # # Invoke Google /auth endpoint and save the token for later use 68 | # GDrive.get_and_save_access_token(credentials_filepath, admin_token_filepath, SCOPES) 69 | 70 | # load the documents and create the index 71 | print("Login to GDrive as admin...") 72 | gdrive_reader = GoogleDriveReader( 73 | folder_id=gdrive_fid, token_path=admin_token_filepath, credentials_path=credentials_filepath 74 | ) 75 | documents: List[LIDocument] = gdrive_reader.load_data(folder_id=gdrive_fid) 76 | 77 | print(f"Processing {len(documents)} docs...") 78 | 79 | # Metadata enricher library 80 | creds = Credentials.from_authorized_user_file(admin_token_filepath, SCOPES) 81 | gdrive_me = GDriveME(creds, {}) 82 | enrich_metadata(documents, [gdrive_me], reader=LIDocumentReader()) 83 | # Finish metadata enrichment 84 | 85 | return documents 86 | 87 | 88 | # Fetch documents from Confluence 89 | confluence_space_key = "~71202041f9bfec117041348629ccf3e3c751b3" 90 | confluence_space_id = 393230 91 | 92 | 93 | def confluence_read_docs() -> List[LIDocument]: 94 | """Fetch all documents from Confluence
using ConfluenceReader.""" 95 | 96 | token = os.getenv("CONFLUENCE_ADMIN_TOKEN") 97 | assert token 98 | email = os.getenv("CONFLUENCE_ADMIN_EMAIL") 99 | assert email 100 | url = os.getenv("CONFLUENCE_BASE_URL") 101 | assert url 102 | 103 | # Create a ConfluenceReader instance 104 | print("Loading Confluence docs...") 105 | reader = ConfluenceReader( 106 | base_url=url, 107 | user_name=email, 108 | password=token, 109 | ) 110 | documents: List[LIDocument] = reader.load_data(space_key=confluence_space_key, include_attachments=True) 111 | 112 | # Enrich metadata process 113 | print(f"Processing {len(documents)} Confluence docs...") 114 | confluence_me = ConfluenceME() 115 | enrich_metadata(documents, [confluence_me], reader=LIDocumentReader()) 116 | 117 | return documents 118 | 119 | 120 | def jira_load_data(reader: JiraReader, query: str = "") -> List[Document]: 121 | max_results = 100 122 | start_at = 0 123 | keep_iterating = True 124 | all_documents: List[Document] = [] 125 | 126 | while keep_iterating: 127 | documents = reader.load_data(query, start_at=start_at, max_results=max_results) 128 | all_documents.extend(documents) 129 | l = len(documents) 130 | start_at = start_at + l 131 | keep_iterating = l >= max_results 132 | 133 | return all_documents 134 | 135 | 136 | def jira_read_docs() -> List[LIDocument]: 137 | # Jira credentials and base URL 138 | JIRA_BASE_URL = os.getenv("JIRA_BASE_URL") or "" 139 | assert JIRA_BASE_URL 140 | jira_admin_email = os.getenv("JIRA_ADMIN_EMAIL") or "" 141 | assert jira_admin_email 142 | jira_api_token = os.getenv("JIRA_ADMIN_TOKEN") or "" 143 | assert jira_api_token 144 | 145 | # Initialize LlamaIndex JiraReader 146 | print("Loading Jira docs...") 147 | jira_reader = JiraReader(server_url=JIRA_BASE_URL, email=jira_admin_email, api_token=jira_api_token) 148 | 149 | documents = jira_load_data(jira_reader, "") 150 | 151 | # Metadata enricher library 152 | print(f"Processing {len(documents)} Jira docs...") 153 | jira_me = JiraME(JIRA_BASE_URL, jira_admin_email, jira_api_token) 154 | enrich_metadata(documents, [jira_me], reader=LIDocumentReader()) 155 | 156 | return documents 157 | 158 | 159 | # Load data from Gdrive or from the disk 160 | PERSIST_DIR = "./storage/rbac/llamaindex/all_sources" 161 | if not os.path.exists(PERSIST_DIR): 162 | # Load documents 163 | gdrive_documents = google_drive_read_docs() 164 | confluence_documents = confluence_read_docs() 165 | jira_documents = jira_read_docs() 166 | 167 | # Combine documents 168 | documents = gdrive_documents + confluence_documents + jira_documents 169 | 170 | print("Create and save index...") 171 | index = VectorStoreIndex.from_documents(documents) 172 | # store it for later 173 | index.storage_context.persist(persist_dir=PERSIST_DIR) 174 | else: 175 | # load the existing index 176 | print("Loading index...") 177 | storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR) 178 | index = load_index_from_storage(storage_context) # type: ignore 179 | 180 | 181 | # Inference 182 | 183 | from pangea_multipass_llama_index import ( 184 | LlamaIndexConfluenceProcessor, 185 | LlamaIndexGDriveProcessor, 186 | LlamaIndexJiraProcessor, 187 | NodePostprocessorMixer, 188 | ) 189 | 190 | # Create GDrive filter 191 | credentials_filepath = os.path.abspath("../credentials.json") 192 | print("Login to GDrive as user...") 193 | creds = GDriveAPI.get_user_credentials(credentials_filepath, scopes=SCOPES) 194 | gdrive_processor = LlamaIndexGDriveProcessor(creds) 195 | 196 | # Create Confluence filter 197 | 
confluence_admin_token = os.getenv("CONFLUENCE_ADMIN_TOKEN") 198 | assert confluence_admin_token 199 | confluence_admin_email = os.getenv("CONFLUENCE_ADMIN_EMAIL") 200 | assert confluence_admin_email 201 | confluence_url = os.getenv("CONFLUENCE_BASE_URL") 202 | assert confluence_url 203 | confluence_account_id = os.getenv("CONFLUENCE_USER_ACCOUNT_ID") 204 | assert confluence_account_id 205 | confluence_processor = LlamaIndexConfluenceProcessor( 206 | ConfluenceAuth(confluence_admin_email, confluence_admin_token, confluence_url), account_id=confluence_account_id 207 | ) 208 | 209 | # Create JIRA filter 210 | JIRA_BASE_URL = os.getenv("JIRA_BASE_URL") or "" 211 | assert JIRA_BASE_URL 212 | jira_admin_email = os.getenv("JIRA_ADMIN_EMAIL") or "" 213 | assert jira_admin_email 214 | jira_admin_token = os.getenv("JIRA_ADMIN_TOKEN") or "" 215 | assert jira_admin_token 216 | jira_account_id = os.getenv("JIRA_USER_ACCOUNT_ID") or "" 217 | assert jira_account_id 218 | jira_processor = LlamaIndexJiraProcessor( 219 | JiraAuth(jira_admin_email, jira_admin_token, JIRA_BASE_URL), account_id=jira_account_id 220 | ) 221 | 222 | # Initialize query engine and the retriever to send prompts 223 | # query_engine = index.as_query_engine(similarity_top_k=10, streaming=True, filters=metadata_filters) 224 | node_processor = NodePostprocessorMixer( 225 | [ 226 | gdrive_processor, 227 | jira_processor, 228 | confluence_processor, 229 | ] 230 | ) 231 | 232 | query_engine = index.as_query_engine( 233 | streaming=True, 234 | similarity_top_k=10, 235 | node_postprocessors=[node_processor], 236 | ) 237 | 238 | retriever = index.as_retriever(similarity_top_k=10) 239 | 240 | 241 | # Inference pipeline 242 | while True: 243 | user_prompt = input("Enter your question: ") 244 | 245 | nodes = retriever.retrieve(user_prompt) 246 | count = len(node_processor.get_unauthorized_nodes()) 247 | count_authorized = len(node_processor.get_authorized_nodes()) 248 | 249 | answer = query_engine.query(user_prompt) 250 | # print("Assistant: ", answer) 251 | answer.print_response_stream() # type: ignore 252 | 253 | print("\n=================\n") 254 | print( 255 | f"\nWarning: This answer could be inaccurate as it's missing context from {count} out of {len(nodes)} data sources. Included {count_authorized} sources." 256 | ) 257 | print("\n++++++++++++++++++") 258 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/sources/slack/slack.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import Any, Callable, Generic, List, Optional 4 | 5 | import requests 6 | from slack_sdk import WebClient 7 | from slack_sdk.errors import SlackApiError 8 | 9 | from pangea_multipass.core import ( 10 | FilterOperator, 11 | MetadataFilter, 12 | PangeaGenericNodeProcessor, 13 | PangeaMetadataKeys, 14 | PangeaMetadataValues, 15 | T, 16 | ) 17 | 18 | 19 | class SlackClient: 20 | _actor = "slack_client" 21 | 22 | def __init__(self, logger_name: str = "multipass"): 23 | self.logger = logging.getLogger(logger_name) 24 | 25 | def list_channels(self, token: str) -> List[dict[str, Any]]: 26 | """ 27 | List all channels the authenticated user has access to. 28 | 29 | Args: 30 | token (str): Slack token. 31 | 32 | Returns: 33 | List of channel ids that the authenticated user has access to.
34 | """ 35 | 36 | client = WebClient(token=token) 37 | try: 38 | response = client.conversations_list(types="public_channel,private_channel") 39 | channels: List[dict[str, Any]] = response.get("channels", []) 40 | return channels 41 | except SlackApiError as e: 42 | self._log_error("list_channels", "conversations.list", {}, e.response) 43 | return [] 44 | 45 | def get_channel_members(self, token: str, channel_id: str) -> Optional[List[str]]: 46 | """ 47 | Retrieve the list of members in a Slack channel. 48 | 49 | Args: 50 | token (str): Slack token. 51 | channel_id (str): Channel id to request members. 52 | 53 | Returns: 54 | List of user IDs in the channel. 55 | """ 56 | 57 | client = WebClient(token=token) 58 | try: 59 | response = client.conversations_members(channel=channel_id) 60 | return response["members"] 61 | except SlackApiError as e: 62 | self._log_error("get_channel_members", "conversations.members", {"channel": channel_id}, e.response) 63 | return None 64 | 65 | def get_all_channels(self, token: str) -> Optional[List[str]]: 66 | """ 67 | Retrieve all channels in the workspace. 68 | 69 | Args: 70 | token (str): Slack token 71 | 72 | Returns: 73 | List of channel IDs. 74 | """ 75 | 76 | client = WebClient(token=token) 77 | channels: List[dict[str, Any]] = [] 78 | try: 79 | response = client.conversations_list(types="public_channel,private_channel", limit=1000) 80 | channels = response.get("channels", []) 81 | return [channel["id"] for channel in channels] 82 | except SlackApiError as e: 83 | self._log_error("get_all_channels", "conversations.list", {}, e.response) 84 | return None 85 | 86 | def get_user_id(self, token: str, user_email: str) -> Optional[str]: 87 | """ 88 | Retrieve the Slack user ID for a given email address. 89 | 90 | Args: 91 | token (str): Slack token. 92 | user_email (str): User email to request user id. 93 | 94 | Returns: 95 | User ID or None if the user does not exist. 96 | """ 97 | 98 | client = WebClient(token=token) 99 | try: 100 | response = client.users_lookupByEmail(email=user_email) 101 | return response["user"]["id"] 102 | except SlackApiError as e: 103 | self._log_error("get_user_id", "users.lookupByEmail", {"email": user_email}, e.response) 104 | return None 105 | 106 | def get_channels_for_user(self, token: str, user_id: str, channel_ids: List[str]) -> List[str]: 107 | """ 108 | Check which channels a user has access to. 109 | 110 | Args: 111 | token (str): Slack token. 112 | user_id (str): Slack user id. 113 | channels_ids (List[str]): Channels id to check access for user_id. 114 | 115 | Returns: 116 | List of channel IDs the user has access to. 
117 | """ 118 | client = WebClient(token=token) 119 | accessible_channels = [] 120 | for channel_id in channel_ids: 121 | try: 122 | response = client.conversations_members(channel=channel_id) 123 | members: List[str] = response.get("members", []) 124 | if user_id in members: 125 | accessible_channels.append(channel_id) 126 | except SlackApiError as e: 127 | if e.response["error"] == "not_in_channel": 128 | continue # User is not in this channel 129 | else: 130 | self._log_error( 131 | "get_channels_for_user", "conversations.members", {"channel": channel_id}, e.response 132 | ) 133 | pass 134 | return accessible_channels 135 | 136 | def _log_error(self, function_name: str, url: str, data: dict, response: requests.Response): 137 | self.logger.error( 138 | json.dumps( 139 | { 140 | "actor": SlackClient._actor, 141 | "fn": function_name, 142 | "url": url, 143 | "data": data, 144 | "status_code": response.status_code, 145 | "reason": response.reason, 146 | "text": response.text, 147 | } 148 | ) 149 | ) 150 | 151 | 152 | class SlackProcessor(PangeaGenericNodeProcessor[T], Generic[T]): 153 | _channels_id_cache: dict[str, bool] = {} 154 | _token: str 155 | _user_email: Optional[str] = None 156 | _user_id: Optional[str] = None 157 | 158 | def __init__( 159 | self, 160 | token: str, 161 | get_node_metadata: Callable[[T], dict[str, Any]], 162 | user_email: Optional[str] = None, 163 | logger_name: str = "multipass", 164 | ): 165 | super().__init__() 166 | self._token = token 167 | self._channels_id_cache = {} 168 | self.get_node_metadata = get_node_metadata 169 | self._user_email = user_email 170 | self._client = SlackClient(logger_name) 171 | 172 | def _has_access(self, metadata: dict[str, Any]) -> bool: 173 | """Check if the authenticated user has access to a channel.""" 174 | 175 | channel_id = metadata.get(PangeaMetadataKeys.SLACK_CHANNEL_ID, None) 176 | if channel_id is None: 177 | raise KeyError(f"Invalid metadata key: {PangeaMetadataKeys.SLACK_CHANNEL_ID}") 178 | 179 | if not self._channels_id_cache: 180 | self._load_channels_from_token() 181 | else: 182 | self._load_channels_with_email() 183 | 184 | return self._channels_id_cache.get(channel_id, False) 185 | 186 | def filter( 187 | self, 188 | nodes: List[T], 189 | ) -> List[T]: 190 | """Filter Slack channels by access permissions. 191 | 192 | Args: 193 | nodes (List[T]): List of nodes to process. 194 | 195 | Returns: 196 | List[Any]: Nodes that have authorized access. 197 | """ 198 | 199 | filtered: List[T] = [] 200 | for node in nodes: 201 | if self._is_authorized(node): 202 | filtered.append(node) 203 | return filtered 204 | 205 | def get_filter( 206 | self, 207 | ) -> MetadataFilter: 208 | """Generate a filter based on accessible Slack channel IDs. 209 | 210 | Returns: 211 | MetadataFilter: Filter for Slack channel IDs. 212 | """ 213 | 214 | if not self._user_email: 215 | self._load_channels_from_token() 216 | else: 217 | self._load_channels_with_email() 218 | 219 | channels = list(self._channels_id_cache.keys()) 220 | 221 | return MetadataFilter(key=PangeaMetadataKeys.SLACK_CHANNEL_ID, value=channels, operator=FilterOperator.IN) 222 | 223 | def check_user_access(self, token: str, channel_id: str, user_email: str) -> bool: 224 | """ 225 | Check if a user has access to a specific Slack channel. 226 | 227 | Args: 228 | token (str): Slack token. 229 | channel_id (srt): ID of the Slack channel. 230 | user_email (str): Email of the user to check. 231 | 232 | Returns: 233 | True if the user is a member of the channel, False otherwise. 
234 | """ 235 | 236 | user_id = self._client.get_user_id(token, user_email) 237 | if not user_id: 238 | return False 239 | 240 | channel_members = self._client.get_channel_members(token, channel_id) 241 | if channel_members is None: 242 | return False 243 | 244 | return user_id in channel_members 245 | 246 | def _load_channels_with_email(self) -> None: 247 | if self._channels_id_cache: 248 | return 249 | 250 | if not self._user_id and self._user_email is not None: 251 | self._user_id = self._client.get_user_id(self._token, self._user_email) 252 | 253 | if not self._user_id: 254 | return 255 | 256 | all_channels = self._client.get_all_channels(self._token) 257 | if all_channels is None: 258 | return 259 | 260 | channels = self._client.get_channels_for_user(self._token, user_id=self._user_id, channel_ids=all_channels) 261 | for channel in channels: 262 | self._channels_id_cache[channel] = True 263 | 264 | def _load_channels_from_token(self) -> None: 265 | if self._channels_id_cache: 266 | return 267 | 268 | for channel in self._client.list_channels(self._token): 269 | self._channels_id_cache[channel["id"]] = True 270 | 271 | def _is_authorized(self, node: T) -> bool: 272 | metadata = self.get_node_metadata(node) 273 | return metadata[PangeaMetadataKeys.DATA_SOURCE] == PangeaMetadataValues.DATA_SOURCE_SLACK and self._has_access( 274 | metadata 275 | ) 276 | -------------------------------------------------------------------------------- /packages/pangea-multipass-llama-index/pangea_multipass_llama_index/llama_index.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | from typing import Any, List, Optional 5 | 6 | from google.oauth2.credentials import Credentials 7 | from llama_index.core import Document as LIDocument 8 | from llama_index.core.postprocessor.types import BaseNodePostprocessor 9 | from llama_index.core.schema import NodeWithScore, QueryBundle 10 | from llama_index.core.vector_stores import FilterCondition, FilterOperator, MetadataFilter, MetadataFilters 11 | from pangea_multipass import ( 12 | ConfluenceAuth, 13 | ConfluenceProcessor, 14 | DocumentReader, 15 | DropboxProcessor, 16 | GDriveProcessor, 17 | GitHubProcessor, 18 | GitLabProcessor, 19 | JiraAuth, 20 | JiraProcessor, 21 | ) 22 | from pangea_multipass import MetadataFilter as PangeaMetadataFilter 23 | from pangea_multipass import ( 24 | MultipassDocument, 25 | PangeaGenericNodeProcessor, 26 | PangeaNodeProcessorMixer, 27 | SlackProcessor, 28 | ) 29 | 30 | 31 | class LIDocumentReader(DocumentReader): 32 | """Document reader for Llama Index documents. 33 | 34 | Provides methods for reading content from a Llama Index document. 35 | 36 | Methods: 37 | read(doc: LIDocument) -> str: Reads and returns the content of a Llama Index document. 38 | """ 39 | 40 | def read(self, doc: LIDocument) -> str: 41 | """Reads and returns the content of the given Llama Index document. 42 | 43 | Args: 44 | doc (LIDocument): The Llama Index document to read. 45 | 46 | Returns: 47 | str: The content of the document. 48 | """ 49 | return str(doc.get_content()) 50 | 51 | 52 | # pangea-metadata-llama-index 53 | def get_doc_id(doc: LIDocument) -> str: 54 | """Fetches the document ID from a Llama Index document. 55 | 56 | Args: 57 | doc (LIDocument): The Llama Index document. 58 | 59 | Returns: 60 | str: The document ID. 
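    Example (sketch; ``doc`` is any loaded Llama Index document):

        >>> doc_id = get_doc_id(doc)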
61 | """ 62 | return str(doc.doc_id) 63 | 64 | 65 | def get_node_metadata(node: NodeWithScore) -> dict[str, Any]: 66 | """Fetches metadata from a node with a score. 67 | 68 | Args: 69 | node (NodeWithScore): The node from which metadata is retrieved. 70 | 71 | Returns: 72 | dict[str, Any]: A dictionary containing node metadata. 73 | """ 74 | return dict(node.metadata) 75 | 76 | 77 | def from_multipass(documents: List[MultipassDocument]) -> List[LIDocument]: 78 | li_documents: List[LIDocument] = [] 79 | for doc in documents: 80 | li_doc = LIDocument(doc_id=doc.id, text=doc.content) 81 | li_doc.metadata = doc.metadata 82 | li_documents.append(li_doc) 83 | 84 | return li_documents 85 | 86 | 87 | class LlamaIndexJiraProcessor(JiraProcessor[NodeWithScore]): 88 | """Processor for Jira integration with Llama Index nodes. 89 | 90 | Uses Jira authentication to access nodes. 91 | 92 | Args: 93 | auth (JiraAuth): Jira authentication credentials. 94 | account_id (Optional[str]): Jira user's account id to check issues permissions. 95 | """ 96 | 97 | def __init__(self, auth: JiraAuth, account_id: Optional[str] = None): 98 | super().__init__(auth, get_node_metadata=get_node_metadata, account_id=account_id) 99 | 100 | 101 | class LlamaIndexConfluenceProcessor(ConfluenceProcessor[NodeWithScore]): 102 | """Processor for Confluence integration with Llama Index nodes. 103 | 104 | Uses Confluence authentication to check nodes access. 105 | 106 | Args: 107 | auth (ConfluenceAuth): Confluence authentication credentials. 108 | space_id (Optional[int]): The space ID to filter pages by. 109 | account_id (Optional[str]): User account id to check permissions using admin token. 110 | 111 | """ 112 | 113 | def __init__(self, auth: ConfluenceAuth, space_id: Optional[int] = None, account_id: Optional[str] = None): 114 | super().__init__(auth, get_node_metadata=get_node_metadata, space_id=space_id, account_id=account_id) 115 | 116 | 117 | class LlamaIndexGDriveProcessor(GDriveProcessor[NodeWithScore]): 118 | """Processor for Google Drive integration with Llama Index nodes. 119 | 120 | Uses Google Drive credentials to check nodes access. 121 | 122 | Args: 123 | creds (Credentials): Google OAuth2 credentials. 124 | user_email (Optional[str]): User email to check access to files. 125 | """ 126 | 127 | def __init__(self, creds: Credentials, user_email: Optional[str] = None): 128 | super().__init__(creds, get_node_metadata=get_node_metadata, user_email=user_email) 129 | 130 | 131 | class LlamaIndexGitHubProcessor(GitHubProcessor[NodeWithScore]): 132 | """Processor for GitHub integration with Llama Index nodes. 133 | 134 | Uses GitHub token to check node access. 135 | 136 | Args: 137 | token (str): GitHub classic token. 138 | username (str): GitHub username to check permissions. 139 | """ 140 | 141 | def __init__(self, token: str, username: str): 142 | super().__init__(token, get_node_metadata=get_node_metadata, username=username) 143 | 144 | 145 | class LlamaIndexSlackProcessor(SlackProcessor[NodeWithScore]): 146 | """Processor for Slack integration with Llama Index nodes. 147 | 148 | Uses Slack token to check node access. 149 | 150 | Args: 151 | token (str): Slack token. 152 | user_email (Optional[str]): User email to check access to files. 
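    Example (illustrative; the token and email are placeholders):

        >>> processor = LlamaIndexSlackProcessor(slack_token, user_email="jdoe@example.com")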
153 |     """
154 | 
155 |     def __init__(self, token: str, user_email: Optional[str] = None):
156 |         super().__init__(token, get_node_metadata=get_node_metadata, user_email=user_email)
157 | 
158 | 
159 | class LlamaIndexGitLabProcessor(GitLabProcessor[NodeWithScore]):
160 |     """Processor for GitLab integration with Llama Index nodes.
161 | 
162 |     Uses a GitLab token to check node access.
163 | 
164 |     Args:
165 |         admin_token (str): GitLab administrator token used to check permissions.
166 |         username (str): Username to check access to files.
167 |     """
168 | 
169 |     def __init__(self, admin_token: str, username: str):
170 |         super().__init__(admin_token=admin_token, username=username, get_node_metadata=get_node_metadata)
171 | 
172 | 
173 | class LlamaIndexDropboxProcessor(DropboxProcessor[NodeWithScore]):
174 |     """Processor for Dropbox integration with Llama Index nodes.
175 | 
176 |     Uses Dropbox token to check node access.
177 | 
178 |     Args:
179 |         token (str): Dropbox token.
180 |         user_email (str): User email to check access to files.
181 |     """
182 | 
183 |     def __init__(self, token: str, user_email: str):
184 |         super().__init__(token, user_email=user_email, get_node_metadata=get_node_metadata)
185 | 
186 | 
187 | class NodePostprocessorMixer(BaseNodePostprocessor):
188 |     """Postprocessor mixer for processing nodes with multiple processors.
189 | 
190 |     This class mixes multiple node processors and applies them to Llama Index nodes.
191 | 
192 |     Attributes:
193 |         node_processor (PangeaNodeProcessorMixer[NodeWithScore]): A mixer of node processors.
194 | 
195 |     Methods:
196 |         _postprocess_nodes(nodes: List[NodeWithScore], query_bundle: Optional[QueryBundle] = None) -> List[NodeWithScore]:
197 |             Postprocesses a list of nodes with the mixed processors.
198 |         get_filter() -> MetadataFilters: Gets the metadata filters used for processing nodes.
199 |         get_unauthorized_nodes() -> List[NodeWithScore]: Retrieves nodes that are unauthorized for access.
200 |         get_authorized_nodes() -> List[NodeWithScore]: Retrieves nodes that are authorized for access.
201 |     """
202 | 
203 |     node_processor: PangeaNodeProcessorMixer[NodeWithScore] = PangeaNodeProcessorMixer(get_node_metadata, [])
204 | 
205 |     def __init__(self, node_processors: List[PangeaGenericNodeProcessor[NodeWithScore]]):
206 |         """Initializes the NodePostprocessorMixer with a list of node processors.
207 | 
208 |         Args:
209 |             node_processors (List[PangeaGenericNodeProcessor]): List of node processors to mix and apply.
210 |         """
211 | 
212 |         super().__init__()
213 |         self.node_processor = PangeaNodeProcessorMixer[NodeWithScore](
214 |             get_node_metadata=get_node_metadata,
215 |             node_processors=node_processors,
216 |         )
217 | 
218 |     def _postprocess_nodes(
219 |         self,
220 |         nodes: List[NodeWithScore],
221 |         query_bundle: Optional[QueryBundle] = None,
222 |     ) -> List[NodeWithScore]:
223 |         """Applies postprocessing to a list of nodes using the mixed node processors.
224 | 
225 |         Args:
226 |             nodes (List[NodeWithScore]): The nodes to be postprocessed.
227 |             query_bundle (Optional[QueryBundle]): Query context for processing. Defaults to None.
228 | 
229 |         Returns:
230 |             List[NodeWithScore]: The list of postprocessed nodes.
231 |         """
232 | 
233 |         return self.node_processor.filter(nodes)
234 | 
235 |     def get_filter(
236 |         self,
237 |     ) -> MetadataFilters:
238 |         """Generates metadata filters for processing nodes.
239 | 
240 |         Returns:
241 |             MetadataFilters: A set of metadata filters with an OR condition applied.
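        Example (sketch; ``postprocessor`` is a configured NodePostprocessorMixer and
        ``index`` a Llama Index vector index):

            >>> filters = postprocessor.get_filter()
            >>> retriever = index.as_retriever(filters=filters)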
242 |         """
243 | 
244 |         filters: List[MetadataFilter | MetadataFilters] = []
245 |         for filter in self.node_processor.get_filters():
246 |             filters.append(_convert_metadata_filter_to_llama_index(filter))
247 | 
248 |         return MetadataFilters(filters=filters, condition=FilterCondition.OR)
249 | 
250 |     def get_unauthorized_nodes(
251 |         self,
252 |     ) -> List[NodeWithScore]:
253 |         """Retrieves nodes that are unauthorized for access.
254 | 
255 |         Returns:
256 |             List[NodeWithScore]: List of unauthorized nodes.
257 |         """
258 |         return self.node_processor.get_unauthorized_nodes()
259 | 
260 |     def get_authorized_nodes(
261 |         self,
262 |     ) -> List[NodeWithScore]:
263 |         """Retrieves nodes that are authorized for access.
264 | 
265 |         Returns:
266 |             List[NodeWithScore]: List of authorized nodes.
267 |         """
268 |         return self.node_processor.get_authorized_nodes()
269 | 
270 | 
271 | def _convert_metadata_filter_to_llama_index(input: PangeaMetadataFilter) -> MetadataFilter:
272 |     """Converts a Pangea metadata filter to a Llama Index-compatible filter.
273 | 
274 |     Args:
275 |         input (PangeaMetadataFilter): The Pangea metadata filter to convert.
276 | 
277 |     Returns:
278 |         MetadataFilter: The converted Llama Index metadata filter.
279 |     """
280 |     return MetadataFilter(key=input.key, value=input.value, operator=FilterOperator(input.operator))
281 | 
--------------------------------------------------------------------------------
/packages/pangea-multipass/pangea_multipass/core.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Pangea Cyber Corporation
2 | # Author: Pangea Cyber Corporation
3 | 
4 | import dataclasses
5 | import enum
6 | import hashlib
7 | from abc import ABC, abstractmethod
8 | from secrets import token_hex
9 | from typing import Any, Callable, Generic, List, Sequence, TypeVar
10 | 
11 | T = TypeVar("T")
12 | _PANGEA_METADATA_KEY_PREFIX = "_pangea_"
13 | 
14 | 
15 | def generate_id() -> str:
16 |     return token_hex(20)
17 | 
18 | 
19 | class FilterOperator(str, enum.Enum):
20 |     """Defines operators for filtering metadata."""
21 | 
22 |     IN = "in"  # In array (string or number)
23 |     CONTAINS = "contains"  # metadata array contains value (string or number)
24 |     EQ = "=="  # default operator (string, int, float)
25 |     GT = ">"  # greater than (int, float)
26 |     LT = "<"  # less than (int, float)
27 |     NE = "!="  # not equal to (string, int, float)
28 |     GTE = ">="  # greater than or equal to (int, float)
29 |     LTE = "<="  # less than or equal to (int, float)
30 |     NIN = "nin"  # Not in array (string or number)
31 |     ANY = "any"  # Contains any (array of strings)
32 |     ALL = "all"  # Contains all (array of strings)
33 |     TEXT_MATCH = "text_match"  # full text match (search for a specific substring, token, or phrase within the text field)
34 |     IS_EMPTY = "is_empty"  # the field does not exist or is empty (null or empty array)
35 | 
36 | 
37 | class PangeaMetadataKeys(str, enum.Enum):
38 |     DATA_SOURCE = f"{_PANGEA_METADATA_KEY_PREFIX}data_source"
39 |     FILE_NAME = f"{_PANGEA_METADATA_KEY_PREFIX}file_name"
40 |     FILE_PATH = f"{_PANGEA_METADATA_KEY_PREFIX}file_path"
41 |     CONFLUENCE_PAGE_ID = f"{_PANGEA_METADATA_KEY_PREFIX}confluence_page_id"
42 |     JIRA_ISSUE_ID = f"{_PANGEA_METADATA_KEY_PREFIX}jira_issue_id"
43 |     GDRIVE_FILE_ID = f"{_PANGEA_METADATA_KEY_PREFIX}gdrive_file_id"
44 |     NODE_ID = f"{_PANGEA_METADATA_KEY_PREFIX}node_id"
45 |     GITHUB_REPOSITORY_NAME = f"{_PANGEA_METADATA_KEY_PREFIX}repository_name"
46 |     GITHUB_REPOSITORY_OWNER = f"{_PANGEA_METADATA_KEY_PREFIX}repository_owner"
47 | 
GITHUB_REPOSITORY_OWNER_AND_NAME = f"{_PANGEA_METADATA_KEY_PREFIX}repository_owner_and_name" 48 | SLACK_CHANNEL_ID = f"{_PANGEA_METADATA_KEY_PREFIX}slack_channel_id" 49 | SLACK_CHANNEL_NAME = f"{_PANGEA_METADATA_KEY_PREFIX}slack_channel_name" 50 | SLACK_USER = f"{_PANGEA_METADATA_KEY_PREFIX}slack_user" 51 | SLACK_TIMESTAMP = f"{_PANGEA_METADATA_KEY_PREFIX}slack_timestamp" 52 | GITLAB_REPOSITORY_ID = f"{_PANGEA_METADATA_KEY_PREFIX}gitlab_repository_id" 53 | GITLAB_REPOSITORY_NAME = f"{_PANGEA_METADATA_KEY_PREFIX}gitlab_repository_name" 54 | GITLAB_REPOSITORY_NAMESPACE_WITH_PATH = f"{_PANGEA_METADATA_KEY_PREFIX}gitlab_repository_namespace_with_path" 55 | DROPBOX_ID = f"{_PANGEA_METADATA_KEY_PREFIX}dropbox_id" 56 | DROPBOX_PATH = f"{_PANGEA_METADATA_KEY_PREFIX}path" 57 | DROPBOX_FILE_PATH = f"{_PANGEA_METADATA_KEY_PREFIX}file_path" 58 | 59 | 60 | class PangeaMetadataValues(str, enum.Enum): 61 | DATA_SOURCE_CONFLUENCE = "confluence" 62 | DATA_SOURCE_GDRIVE = "gdrive" 63 | DATA_SOURCE_JIRA = "jira" 64 | DATA_SOURCE_GITHUB = "github" 65 | DATA_SOURCE_SLACK = "slack" 66 | DATA_SOURCE_GITLAB = "gitlab" 67 | DATA_SOURCE_DROPBOX = "dropbox" 68 | 69 | 70 | @dataclasses.dataclass 71 | class MultipassDocument: 72 | id: str 73 | content: str 74 | metadata: dict[str, Any] 75 | 76 | 77 | def get_document_metadata(doc: MultipassDocument) -> dict[str, Any]: 78 | """Fetches metadata from a multipass document. 79 | 80 | Args: 81 | doc (MultipassDocument): The doc from which metadata is retrieved. 82 | 83 | Returns: 84 | dict[str, Any]: A dictionary containing node metadata. 85 | """ 86 | return doc.metadata 87 | 88 | 89 | @dataclasses.dataclass 90 | class MetadataFilter: 91 | """Represents a filter for document metadata.""" 92 | 93 | key: str 94 | value: Any 95 | operator: FilterOperator 96 | 97 | 98 | class DocumentReader(ABC): 99 | """Interface for reading documents.""" 100 | 101 | @abstractmethod 102 | def read(self, doc: Any) -> str: 103 | """Reads and returns content of the document as a string.""" 104 | pass 105 | 106 | 107 | class PangeaGenericNodeProcessor(ABC, Generic[T]): 108 | """Abstract processor for handling nodes with filtering and processing methods.""" 109 | 110 | @abstractmethod 111 | def filter(self, nodes: List[T]) -> List[T]: 112 | """Processes nodes and applies filtering.""" 113 | pass 114 | 115 | @abstractmethod 116 | def get_filter(self) -> MetadataFilter: 117 | """Returns a filter based on the processed nodes' metadata.""" 118 | pass 119 | 120 | 121 | class MetadataEnricher(ABC): 122 | """Interface for generating additional metadata for documents.""" 123 | 124 | _key: str 125 | """Key used in the metadata dictionary for the enrichment. 
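
    Keys without the ``_pangea_`` prefix are prefixed automatically by ``__init__``;
    for example, a key of ``"sha256"`` is stored as ``"_pangea_sha256"``.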
""" 126 | 127 | def __init__(self, key: str): 128 | if not key.startswith(_PANGEA_METADATA_KEY_PREFIX): 129 | key = f"{_PANGEA_METADATA_KEY_PREFIX}{key}" 130 | 131 | self._key = key 132 | 133 | @abstractmethod 134 | def extract_metadata(self, doc: Any, file_content: str) -> dict[str, Any]: 135 | """Generates metadata based on document and its content.""" 136 | pass 137 | 138 | 139 | class MetadataUpdater(ABC): 140 | """Interface for updating document metadata.""" 141 | 142 | @abstractmethod 143 | def update_metadata(self, doc: Any, metadata: dict[str, Any]) -> None: 144 | """Updates document with provided metadata.""" 145 | pass 146 | 147 | 148 | class GenericMetadataUpdater(MetadataUpdater): 149 | """Updates metadata of a Llama Index or Lang Chain Document.""" 150 | 151 | def update_metadata(self, doc: Any, metadata: dict[str, Any]) -> None: 152 | """Updates document metadata with given key-value pairs.""" 153 | doc.metadata.update(metadata) 154 | 155 | 156 | class HasherSHA256(MetadataEnricher): 157 | """Generates SHA-256 hash for the document and adds it to metadata.""" 158 | 159 | def extract_metadata(self, doc: Any, file_content: str) -> dict[str, Any]: 160 | """Returns SHA-256 hash of the document content.""" 161 | return {self._key: hashlib.sha256(file_content.encode()).hexdigest()} 162 | 163 | 164 | class Constant(MetadataEnricher): 165 | """Sets a constant value as metadata for the document.""" 166 | 167 | value: str 168 | 169 | def __init__(self, key: str, value: str): 170 | super().__init__(f"{key}") 171 | self.value = value 172 | 173 | def extract_metadata(self, doc: Any, file_content: str) -> dict[str, Any]: 174 | """Sets a constant value in the metadata.""" 175 | return {self._key: self.value} 176 | 177 | 178 | def enrich_metadata( 179 | documents: Sequence[Any], 180 | metadata_enrichers: List[MetadataEnricher], 181 | reader: DocumentReader, 182 | updater: MetadataUpdater = GenericMetadataUpdater(), 183 | ) -> None: 184 | """Enriches metadata of documents by applying specified enrichers. 185 | 186 | Args: 187 | documents: A sequence of documents to enrich. 188 | metadata_enrichers: List of metadata enrichers to apply. 189 | reader: A reader instance to obtain document content. 190 | updater: Optional updater instance to apply metadata changes. 191 | """ 192 | 193 | for doc in documents: 194 | file_content = reader.read(doc) 195 | 196 | # Add Pangea Node Random ID 197 | updater.update_metadata(doc, {PangeaMetadataKeys.NODE_ID: generate_id()}) 198 | 199 | for enricher in metadata_enrichers: 200 | updater.update_metadata(doc, enricher.extract_metadata(doc, file_content)) 201 | 202 | reader.read(doc) 203 | 204 | 205 | class PangeaNodeProcessorMixer(Generic[T]): 206 | """Combines multiple node processors for authorization filtering. 207 | 208 | Aggregates results from various node processors to create a unified view of authorized and unauthorized nodes. 209 | 210 | Attributes: 211 | _node_processors (List[PangeaGenericNodeProcessor]): List of node processors. 212 | _get_node_metadata (Callable): Function to get node metadata. 213 | _unauthorized_nodes (List[T]): Cached list of unauthorized nodes. 214 | _authorized_nodes (List[T]): Cached list of authorized nodes. 
215 | """ 216 | 217 | _node_processors: List[PangeaGenericNodeProcessor[T]] = [] 218 | _get_node_metadata: Callable[[T], dict[str, Any]] 219 | _unauthorized_nodes: List[T] = [] 220 | _authorized_nodes: List[T] = [] 221 | 222 | def __init__( 223 | self, 224 | get_node_metadata: Callable[[T], dict[str, Any]], 225 | node_processors: List[PangeaGenericNodeProcessor[T]], 226 | ): 227 | self._node_processors = node_processors 228 | self._get_node_metadata = get_node_metadata 229 | 230 | def filter( 231 | self, 232 | nodes: List[T], 233 | ) -> List[T]: 234 | """Process nodes through each processor to filter authorized nodes. 235 | 236 | Args: 237 | nodes (List[T]): List of nodes to process. 238 | 239 | Returns: 240 | List[T]: Nodes that have been authorized across all processors. 241 | """ 242 | 243 | authorized: dict[str, T] = {} 244 | unauthorized: dict[str, T] = {} 245 | for node in nodes: 246 | id = self._get_node_metadata(node).get(PangeaMetadataKeys.NODE_ID, None) 247 | if not id: 248 | raise Exception(f"{PangeaMetadataKeys.NODE_ID} key should be set in node metadata") 249 | 250 | unauthorized[id] = node 251 | 252 | # This works as an OR operator among all node post processors 253 | for npp in self._node_processors: 254 | for node in npp.filter(list(unauthorized.values())): 255 | id = self._get_node_metadata(node).get(PangeaMetadataKeys.NODE_ID) 256 | authorized[id] = unauthorized.pop(id) # type: ignore 257 | 258 | self._unauthorized_nodes = list(unauthorized.values()) 259 | self._authorized_nodes = list(authorized.values()) 260 | return self._authorized_nodes 261 | 262 | def get_filters(self) -> List[MetadataFilter]: 263 | """Retrieve filters from all node processors. 264 | 265 | Returns: 266 | List[MetadataFilter]: List of filters from each processor. 267 | """ 268 | 269 | filters = [] 270 | for np in self._node_processors: 271 | filters.append(np.get_filter()) 272 | 273 | return filters 274 | 275 | def get_unauthorized_nodes( 276 | self, 277 | ) -> List[T]: 278 | """Retrieve nodes that were unauthorized after processing. 279 | 280 | Returns: 281 | List[T]: Unauthorized nodes. 282 | """ 283 | 284 | return self._unauthorized_nodes 285 | 286 | def get_authorized_nodes( 287 | self, 288 | ) -> List[T]: 289 | """Retrieve nodes that were authorized after processing. 290 | 291 | Returns: 292 | List[T]: Authorized nodes. 
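        Note:
            This list, like the one from ``get_unauthorized_nodes``, reflects the most
            recent call to ``filter``; before ``filter`` runs it is empty.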
293 | """ 294 | 295 | return self._authorized_nodes 296 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/sources/dropbox/dropbox.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import Any, Callable, Generic, List, Optional 4 | 5 | import requests 6 | 7 | from pangea_multipass.core import ( 8 | FilterOperator, 9 | MetadataFilter, 10 | PangeaGenericNodeProcessor, 11 | PangeaMetadataKeys, 12 | PangeaMetadataValues, 13 | T, 14 | ) 15 | 16 | 17 | class DropboxClient: 18 | _actor = "dropbox_client" 19 | 20 | AUTH_URL = "https://www.dropbox.com/oauth2/authorize" 21 | TOKEN_URL = "https://api.dropbox.com/oauth2/token" 22 | LIST_FILES_URL = "https://api.dropboxapi.com/2/files/list_folder" 23 | LIST_CONTINUE_URL = "https://api.dropboxapi.com/2/files/list_folder/continue" 24 | 25 | def __init__(self, logger_name: str = "multipass"): 26 | self.logger = logging.getLogger(logger_name) 27 | 28 | def download_file(self, token: str, file_path: str): 29 | """Download a file from Dropbox.""" 30 | 31 | headers = { 32 | "Authorization": f"Bearer {token}", 33 | "Dropbox-API-Arg": json.dumps({"path": file_path}), 34 | } 35 | 36 | url = "https://content.dropboxapi.com/2/files/download" 37 | response = requests.post(url, headers=headers, stream=True) 38 | if response.status_code != 200: 39 | self.logger.error( 40 | json.dumps( 41 | { 42 | "actor": DropboxClient._actor, 43 | "fn": "download_file", 44 | "url": url, 45 | "data": {"path": file_path}, 46 | "status_code": response.status_code, 47 | "reason": response.reason, 48 | "text": response.text, 49 | } 50 | ) 51 | ) 52 | response.raise_for_status() 53 | return response.content 54 | 55 | def check_user_access(self, token: str, file_path: str, user_email: str): 56 | """ 57 | Checks if a user has access to a specific Dropbox file. 58 | 59 | :param token: Admin OAuth token with access to all files. 60 | :param file_path: Path to the file in Dropbox (e.g., "/Documents/file.txt"). 61 | :param user_email: Email of the user whose access needs to be checked. 62 | :return: Boolean indicating whether the user has access. 63 | """ 64 | url = "https://api.dropboxapi.com/2/sharing/list_file_members" 65 | headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"} 66 | data = {"file": file_path} 67 | 68 | response = requests.post(url, json=data, headers=headers) 69 | if response.status_code != 200: 70 | self._log_error("check_user_access", url, data, response) 71 | return False 72 | 73 | response_data = response.json() 74 | self.logger.debug( 75 | json.dumps( 76 | { 77 | "actor": DropboxClient._actor, 78 | "fn": "check_user_access", 79 | "actions": "post", 80 | "url": url, 81 | "data": data, 82 | "response": response_data, 83 | } 84 | ) 85 | ) 86 | 87 | members = response_data.get("users", []) 88 | for member in members: 89 | if member.get("user", {}).get("email", "").lower() == user_email.lower(): 90 | return True 91 | 92 | return False 93 | 94 | def list_shared_folders(self, token: str, user_email: str) -> List[str]: 95 | """ 96 | Lists shared folders that a user has access to in Dropbox. 97 | 98 | :param token: Admin OAuth token with access to all files. 99 | :param user_email: Email of the user whose accessible folders need to be listed. 100 | :return: List of folder paths the user has access to. 
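        Example (illustrative; the token and email are placeholders)::

            client = DropboxClient()
            folders = client.list_shared_folders(token, "jdoe@example.com")
            # e.g., ["/Team Folder", "/Project X"]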
101 |         """
102 | 
103 |         accessible_folders: List[str] = []
104 |         has_more = True
105 |         cursor: Optional[str] = None
106 |         headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
107 | 
108 |         while has_more:
109 |             url = (
110 |                 "https://api.dropboxapi.com/2/sharing/list_folders"
111 |                 if cursor is None
112 |                 else "https://api.dropboxapi.com/2/sharing/list_folders/continue"
113 |             )
114 |             data = {} if cursor is None else {"cursor": cursor}
115 |             response = requests.post(url, json=data, headers=headers)
116 | 
117 |             if response.status_code != 200:
118 |                 self._log_error("list_shared_folders", url, data, response)
119 |                 return accessible_folders
120 | 
121 |             resp_data = response.json()
122 |             self.logger.debug(
123 |                 json.dumps(
124 |                     {
125 |                         "actor": DropboxClient._actor,
126 |                         "fn": "list_shared_folders",
127 |                         "actions": "post",
128 |                         "url": url,
129 |                         "data": data,
130 |                         "response": resp_data,
131 |                     }
132 |                 )
133 |             )
134 | 
135 |             shared_folders = resp_data.get("entries", [])
136 |             cursor = resp_data.get("cursor", None)
137 |             has_more = cursor is not None
138 | 
139 |             for folder in shared_folders:
140 |                 folder_id = folder.get("shared_folder_id")
141 |                 folder_name = folder.get("name")
142 | 
143 |                 members_url = "https://api.dropboxapi.com/2/sharing/list_folder_members"
144 |                 members_data = {"shared_folder_id": folder_id}
145 | 
146 |                 members_response = requests.post(members_url, json=members_data, headers=headers)
147 | 
148 |                 if members_response.status_code == 200:
149 |                     members = members_response.json().get("users", [])
150 |                     for member in members:
151 |                         if member.get("user", {}).get("email", "").lower() == user_email.lower():
152 |                             if not folder_name.startswith("/"):
153 |                                 folder_name = f"/{folder_name}"
154 |                             accessible_folders.append(folder_name)
155 |                             break
156 | 
157 |         return accessible_folders
158 | 
159 |     def list_subfolders(self, token: str, root: str) -> List[str]:
160 |         """
161 |         Lists all subfolders under a root path in Dropbox.
162 | 
163 |         :param token: Admin OAuth token with access to all files.
        :param root: Root folder path whose subfolders are listed ("" lists from the account root).
164 |         :return: List of folder paths found under the root.
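        Example (illustrative; the token is a placeholder)::

            client = DropboxClient()
            subfolders = client.list_subfolders(token, "/team folder")
            # e.g., ["/team folder/reports", "/team folder/reports/2024"]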
165 |         """
166 | 
167 |         folders: List[str] = []
168 |         has_more = True
169 |         cursor: Optional[str] = None
170 |         headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
171 | 
172 |         while has_more:
173 |             url = DropboxClient.LIST_FILES_URL if cursor is None else DropboxClient.LIST_CONTINUE_URL
174 |             data = {"path": root, "recursive": True, "limit": 100}
175 |             if cursor:
176 |                 data = {"cursor": cursor}
177 | 
178 |             response = requests.post(url, headers=headers, json=data)
179 | 
180 |             if response.status_code != 200:
181 |                 self._log_error("list_subfolders", url, data, response)
182 |                 return folders
183 | 
184 |             resp_data = response.json()
185 |             folder_entries = resp_data.get("entries", [])
186 |             cursor = resp_data.get("cursor", None)
187 |             has_more = resp_data.get("has_more", False)
188 | 
189 |             for entry in folder_entries:
190 |                 if entry.get(".tag") != "folder":
191 |                     continue
192 | 
193 |                 folder_path = entry.get("path_lower", "")
194 |                 folders.append(folder_path)
195 | 
196 |         return folders
197 | 
198 |     def _log_error(self, function_name: str, url: str, data: dict, response: requests.Response):
199 |         self.logger.error(
200 |             json.dumps(
201 |                 {
202 |                     "actor": DropboxClient._actor,
203 |                     "fn": function_name,
204 |                     "url": url,
205 |                     "data": data,
206 |                     "status_code": response.status_code,
207 |                     "reason": response.reason,
208 |                     "text": response.text,
209 |                 }
210 |             )
211 |         )
212 | 
213 | 
214 | class DropboxProcessor(PangeaGenericNodeProcessor[T], Generic[T]):
215 |     _access_cache: dict[str, bool] = {}
216 |     _token: str
217 |     _folders: List[str] = []
218 |     _user_email: str
219 | 
220 |     def __init__(
221 |         self,
222 |         token: str,
223 |         user_email: str,
224 |         get_node_metadata: Callable[[T], dict[str, Any]],
225 |         logger_name: str = "multipass",
226 |     ):
227 |         super().__init__()
228 |         self._token = token
229 |         self._access_cache = {}
230 |         self.get_node_metadata = get_node_metadata
231 |         self._user_email = user_email
232 |         self.logger = logging.getLogger(logger_name)
233 |         self._client = DropboxClient(logger_name)
234 | 
235 |     def _has_access(self, metadata: dict[str, Any]) -> bool:
236 |         """Check if the authenticated user has access to a file."""
237 | 
238 |         path = metadata.get(PangeaMetadataKeys.DROPBOX_FILE_PATH, "")
239 |         if not path:
240 |             raise KeyError(f"Invalid metadata key: {PangeaMetadataKeys.DROPBOX_FILE_PATH}")
241 | 
242 |         has_access = self._access_cache.get(path, None)
243 |         if has_access is not None:
244 |             return has_access
245 | 
246 |         has_access = self._client.check_user_access(token=self._token, file_path=path, user_email=self._user_email)
247 | 
248 |         self._access_cache[path] = has_access
249 |         return has_access
250 | 
251 |     def filter(
252 |         self,
253 |         nodes: List[T],
254 |     ) -> List[T]:
255 |         """Filter Dropbox files by access permissions.
256 | 
257 |         Args:
258 |             nodes (List[T]): List of nodes to process.
259 | 
260 |         Returns:
261 |             List[Any]: Nodes that have authorized access.
262 |         """
263 | 
264 |         filtered: List[T] = []
265 |         for node in nodes:
266 |             if self._is_authorized(node):
267 |                 filtered.append(node)
268 |         return filtered
269 | 
270 |     def get_filter(
271 |         self,
272 |     ) -> MetadataFilter:
273 |         """Generate a filter based on accessible Dropbox paths.
274 | 
275 |         Returns:
276 |             MetadataFilter: Filter for Dropbox paths.
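        Example (sketch; assumes a valid admin token and user email were provided at
        construction time, and ``get_metadata`` is a metadata accessor):

            >>> processor = DropboxProcessor(token, "jdoe@example.com", get_node_metadata=get_metadata)
            >>> dropbox_filter = processor.get_filter()  # IN filter over accessible folder paths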
277 | """ 278 | 279 | if not self._folders: 280 | shared_folders = self._client.list_shared_folders(self._token, self._user_email) 281 | folders = {value: True for value in shared_folders} 282 | 283 | for folder in shared_folders: 284 | subfolders = self._client.list_subfolders(self._token, folder) 285 | folders.update({value: True for value in subfolders}) 286 | 287 | self._access_cache = folders 288 | self._folders = list(folders.keys()) 289 | 290 | return MetadataFilter(key=PangeaMetadataKeys.DROPBOX_PATH, value=self._folders, operator=FilterOperator.IN) 291 | 292 | def _is_authorized(self, node: T) -> bool: 293 | metadata = self.get_node_metadata(node) 294 | return metadata[ 295 | PangeaMetadataKeys.DATA_SOURCE 296 | ] == PangeaMetadataValues.DATA_SOURCE_DROPBOX and self._has_access(metadata) 297 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/sources/jira/jira.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | import dataclasses 5 | from typing import Any, Callable, Generic, List, Optional 6 | from urllib.parse import urljoin 7 | 8 | import requests 9 | from requests.auth import HTTPBasicAuth 10 | from requests.exceptions import HTTPError 11 | 12 | from pangea_multipass.core import ( 13 | _PANGEA_METADATA_KEY_PREFIX, 14 | FilterOperator, 15 | MetadataEnricher, 16 | MetadataFilter, 17 | PangeaGenericNodeProcessor, 18 | PangeaMetadataKeys, 19 | PangeaMetadataValues, 20 | T, 21 | ) 22 | 23 | 24 | @dataclasses.dataclass 25 | class JiraAuth: 26 | """Holds authentication details for Jira API.""" 27 | 28 | email: str 29 | token: str 30 | url: str 31 | 32 | 33 | class JiraME(MetadataEnricher): 34 | """Jira Metadata Enricher. 35 | 36 | Enriches metadata for documents using data fetched from Jira, like issue assignments and reporter details. 37 | 38 | Attributes: 39 | _url (str): URL for the Jira instance. 40 | _email (str): Email for authenticating with Jira. 41 | _api_token (str): API token for Jira access. 42 | _auth (JiraAuth): Authentication details for Jira. 43 | """ 44 | 45 | _url: str 46 | _email: str 47 | _api_token: str 48 | _auth: JiraAuth 49 | 50 | def __init__(self, url: str, email: str, api_token: str): 51 | self._url = url.rstrip("/") 52 | self._email = email 53 | self._api_token = api_token 54 | self._auth = JiraAuth(email, api_token, self._url) 55 | 56 | def extract_metadata(self, doc: Any, file_content: str) -> dict[str, Any]: 57 | """Fetch Jira-related metadata for the document. 58 | 59 | Args: 60 | doc (Any): The document to enrich with metadata. 61 | file_content (str): The content of the file. 62 | 63 | Returns: 64 | dict[str, Any]: Extracted metadata including issue ID, assignee, and reporter details. 
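        Example (illustrative sketch; the domain, email, and token are placeholders):

            >>> enricher = JiraME("your-domain.atlassian.net", "admin@example.com", api_token)
            >>> metadata = enricher.extract_metadata(doc, file_content)
            >>> PangeaMetadataKeys.JIRA_ISSUE_ID in metadata
            True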
65 |         """
66 | 
67 |         metadata: dict[str, Any] = {}
68 | 
69 |         # This step is to normalize some attributes across platforms
70 |         metadata[PangeaMetadataKeys.DATA_SOURCE] = PangeaMetadataValues.DATA_SOURCE_JIRA
71 |         metadata[PangeaMetadataKeys.FILE_NAME] = doc.metadata.get("title", "")
72 | 
73 |         id = doc.metadata.get("id", "")
74 |         if not id:
75 |             raise Exception("invalid metadata: missing 'id' key")
76 | 
77 |         metadata[PangeaMetadataKeys.JIRA_ISSUE_ID] = id
78 | 
79 |         # New metadata
80 |         issue = JiraAPI.get_issue(self._auth, id)
81 |         # Sometimes a field is present but null, so handle that case as well
82 |         fields = issue.get("fields", {})
83 |         if fields is None:
84 |             fields = {}
85 |         assignee = fields.get("assignee", {})
86 |         if assignee is None:
87 |             assignee = {}
88 |         reporter = fields.get("reporter", {})
89 |         if reporter is None:
90 |             reporter = {}
91 | 
92 |         metadata[f"{_PANGEA_METADATA_KEY_PREFIX}jira_assignee_account_id"] = assignee.get("accountId", "")
93 |         metadata[f"{_PANGEA_METADATA_KEY_PREFIX}jira_assignee_name"] = assignee.get("displayName", "")
94 |         metadata[f"{_PANGEA_METADATA_KEY_PREFIX}jira_reporter_account_id"] = reporter.get("accountId", "")
95 |         metadata[f"{_PANGEA_METADATA_KEY_PREFIX}jira_reporter_name"] = reporter.get("displayName", "")
96 | 
97 |         return metadata
98 | 
99 | 
100 | class JiraProcessor(PangeaGenericNodeProcessor[T], Generic[T]):
101 |     """Processes Jira documents for access control.
102 | 
103 |     Filters Jira documents based on issue ID permissions and caches access results.
104 | 
105 |     Attributes:
106 |         auth (JiraAuth): Jira authentication details.
107 |         issue_ids_cache (dict[str, bool]): Cache of access status for Jira issue IDs.
108 |         issue_ids_list (List[str]): List of authorized Jira issue IDs.
109 |         get_node_metadata (Callable): Function to retrieve metadata for nodes.
110 |     """
111 | 
112 |     auth: JiraAuth
113 |     issue_ids_cache: dict[str, bool]
114 |     issue_ids_list: List[str]
115 |     get_node_metadata: Callable[[T], dict[str, Any]]
116 |     _account_id: Optional[str]
117 | 
118 |     def __init__(
119 |         self, auth: JiraAuth, get_node_metadata: Callable[[T], dict[str, Any]], account_id: Optional[str] = None
120 |     ):
121 |         super().__init__()
122 |         self.auth = auth
123 |         self.issue_ids_cache = {}
        self.issue_ids_list = []  # must be initialized here; get_filter() populates it lazily
124 |         self.get_node_metadata = get_node_metadata
125 |         self._account_id = account_id
126 | 
127 |     def filter(
128 |         self,
129 |         nodes: List[T],
130 |     ) -> List[Any]:
131 |         """Filter Jira nodes by access permissions.
132 | 
133 |         Args:
134 |             nodes (List[T]): List of nodes to process.
135 | 
136 |         Returns:
137 |             List[Any]: Nodes that have authorized access.
138 |         """
139 | 
140 |         filtered: List[T] = []
141 |         if not self._account_id:
142 |             for node in nodes:
143 |                 if self._is_authorized(node):
144 |                     filtered.append(node)
145 |             return filtered
146 | 
147 |         issues = []
148 |         for node in nodes:
149 |             metadata = self.get_node_metadata(node)
150 |             if metadata[PangeaMetadataKeys.DATA_SOURCE] == PangeaMetadataValues.DATA_SOURCE_JIRA:
151 |                 issues.append(int(metadata.get(PangeaMetadataKeys.JIRA_ISSUE_ID, "")))
152 |                 filtered.append(node)
153 | 
154 |         allowed_issues = JiraAPI.get_allowed_issues(self.auth, self._account_id, issues)
155 |         return list(
156 |             filter(
157 |                 lambda x: (int(self.get_node_metadata(x).get(PangeaMetadataKeys.JIRA_ISSUE_ID, ""))) in allowed_issues,
158 |                 filtered,
159 |             )
160 |         )
161 | 
162 |     def get_filter(
163 |         self,
164 |     ) -> MetadataFilter:
165 |         """Generate a filter based on accessible Jira issue IDs.
166 | 
167 |         Returns:
168 |             MetadataFilter: Filter for Jira issue IDs.
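        Example (sketch; ``auth`` is a configured JiraAuth and ``get_metadata`` a metadata accessor):

            >>> processor = JiraProcessor(auth, get_node_metadata=get_metadata)
            >>> jira_filter = processor.get_filter()  # IN filter over all visible issue IDs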
169 | """ 170 | 171 | if not self.issue_ids_list: 172 | self.issue_ids_list = JiraAPI.get_issue_ids(self.auth) 173 | return MetadataFilter( 174 | key=PangeaMetadataKeys.JIRA_ISSUE_ID, value=self.issue_ids_list, operator=FilterOperator.IN 175 | ) 176 | 177 | def _is_authorized(self, node: T) -> bool: 178 | metadata = self.get_node_metadata(node) 179 | return metadata[PangeaMetadataKeys.DATA_SOURCE] == PangeaMetadataValues.DATA_SOURCE_JIRA and self._has_access( 180 | metadata 181 | ) 182 | 183 | def _has_access(self, metadata: dict[str, Any]) -> bool: 184 | id = metadata.get(PangeaMetadataKeys.JIRA_ISSUE_ID, None) 185 | if id is None: 186 | raise KeyError("Invalid metadata key") 187 | 188 | access = self.issue_ids_cache.get(id, None) 189 | if access is not None: 190 | return access 191 | 192 | try: 193 | JiraAPI.get_issue(self.auth, id) 194 | access = True 195 | except HTTPError as e: 196 | if e.response is None or e.response.status_code == 404: 197 | access = False 198 | 199 | if access is None: 200 | return False 201 | 202 | self.issue_ids_cache[id] = access 203 | return access 204 | 205 | 206 | class JiraAPI: 207 | @staticmethod 208 | def _get(auth: JiraAuth, path: str, params: dict[str, Any] = {}) -> dict[str, Any]: 209 | """ 210 | Makes a request to the Jira API. 211 | 212 | Args: 213 | auth (JiraAuth): The authentication credentials for Jira. 214 | path (str): The API path to send the request to. 215 | params (dict, optional): The query parameters for the request. 216 | 217 | Returns: 218 | dict: The JSON response from the Jira API. 219 | """ 220 | 221 | basic_auth = HTTPBasicAuth(auth.email, auth.token) 222 | url = urljoin(f"https://{auth.url}", path) 223 | response = requests.get(url, headers={"Accept": "application/json"}, params=params, auth=basic_auth) 224 | response.raise_for_status() 225 | return response.json() 226 | 227 | @staticmethod 228 | def _post(auth: JiraAuth, path: str, body: dict[str, Any] = {}) -> dict[str, Any]: 229 | headers = {"Accept": "application/json", "Content-Type": "application/json"} 230 | 231 | basic_auth = HTTPBasicAuth(auth.email, auth.token) 232 | 233 | response = requests.request( 234 | "POST", urljoin(f"https://{auth.url}", path), json=body, headers=headers, auth=basic_auth 235 | ) 236 | 237 | response.raise_for_status() 238 | return response.json() 239 | 240 | @staticmethod 241 | def get_issue(auth: JiraAuth, issue_id: str) -> dict[str, Any]: 242 | """ 243 | Retrieves details of a specific Jira issue. 244 | 245 | Args: 246 | auth (JiraAuth): The authentication credentials for Jira. 247 | issue_id (str): The ID of the Jira issue to retrieve. 248 | 249 | Returns: 250 | dict: The JSON response containing issue details. 251 | """ 252 | 253 | return JiraAPI._get(auth, f"/rest/api/3/issue/{issue_id}") 254 | 255 | @staticmethod 256 | def myself(auth: JiraAuth) -> dict[str, Any]: 257 | """ 258 | Retrieves the profile information of the currently authenticated user in Jira. 259 | 260 | Args: 261 | auth (JiraAuth): The authentication credentials for Jira. 262 | 263 | Returns: 264 | dict: A dictionary containing the authenticated user's profile information. 265 | 266 | Raises: 267 | HTTPError: If the request to Jira fails. 268 | """ 269 | return JiraAPI._get(auth, "/rest/api/3/myself") 270 | 271 | @staticmethod 272 | def search(auth: JiraAuth, params: dict[str, Any] = {}) -> dict[str, Any]: 273 | """ 274 | Searches for issues in Jira using specified query parameters. 
275 | 
276 |         This method provides a way to search for issues in Jira, returning a paginated list
277 |         of issues based on search criteria defined in the `params` argument. The parameters
278 |         can be customized to filter issues based on various criteria such as project, status,
279 |         labels, etc.
280 | 
281 |         Args:
282 |             auth (JiraAuth): The authentication credentials for Jira.
283 |             params (dict, optional): A dictionary of query parameters for customizing the search.
284 |                 Default is an empty dictionary.
285 | 
286 |         Returns:
287 |             dict: A dictionary containing the search results, including issue details and pagination info.
288 | 
289 |         Raises:
290 |             HTTPError: If the request to Jira fails.
291 |         """
292 |         return JiraAPI._get(auth, "/rest/api/3/search", params)
293 | 
294 |     @staticmethod
295 |     def get_issue_ids(auth: JiraAuth) -> List[str]:
296 |         """
297 |         Retrieves the IDs of all issues in Jira.
298 | 
299 |         This method iterates through all issues in the Jira instance and retrieves their IDs.
300 |         It paginates through results if there are more issues than the `max_results` limit.
301 | 
302 |         Args:
303 |             auth (JiraAuth): The authentication credentials for Jira.
304 | 
305 |         Returns:
306 |             List[str]: A list of all issue IDs in the Jira instance.
307 |         """
308 | 
309 |         max_results = 50
310 |         start_at = 0
311 |         keep_iterating = True
312 |         issue_ids: List[str] = []
313 | 
314 |         while keep_iterating:
315 |             params = {
316 |                 "query": "",
317 |                 "maxResults": max_results,
318 |                 "startAt": start_at,
319 |                 "fields": ["id"],
320 |             }
321 | 
322 |             resp = JiraAPI.search(auth, params)
323 |             issues = resp.get("issues", [])
324 |             total = resp.get("total", 0)
325 | 
326 |             ids = [issue["id"] for issue in issues]
327 |             issue_ids.extend(ids)
328 | 
329 |             start_at = start_at + len(ids)
330 |             keep_iterating = start_at < total
331 | 
332 |         return issue_ids
333 | 
334 |     @staticmethod
335 |     def get_permission_check(auth: JiraAuth, account_id: str, issues: List[int]) -> dict[str, Any]:
336 |         body = {
337 |             "accountId": account_id,
338 |             "projectPermissions": [
339 |                 {
340 |                     "issues": issues,
341 |                     "permissions": ["EDIT_ISSUES"],
342 |                 }
343 |             ],
344 |         }
345 | 
346 |         return JiraAPI._post(auth=auth, path="rest/api/3/permissions/check", body=body)
347 | 
348 |     @staticmethod
349 |     def get_allowed_issues(auth: JiraAuth, account_id: str, issues: List[int]) -> List[int]:
350 |         resp = JiraAPI.get_permission_check(auth, account_id, issues)
        # Guard against an empty permissions list to avoid an IndexError
        permissions = resp.get("projectPermissions", [])
351 |         return permissions[0].get("issues", []) if permissions else []
352 | 
--------------------------------------------------------------------------------