├── .gitattributes ├── .github ├── CODEOWNERS ├── renovate.json └── workflows │ └── ci.yml ├── examples ├── llama_index_examples │ ├── __init__.py │ ├── pyproject.toml │ ├── 05-github-check-access.py │ ├── 08-confluence-check-access.py │ ├── 07-jira-check-acess.py │ ├── README.md │ ├── 02-gdrive-check-access.py │ ├── 03-rag-LlamaIndex-gdrive-filter.py │ ├── 01-rag-LlamaIndex-gdrive-processor.py │ └── 02-rag-LlamaIndex-all-sources-processor.py ├── .gitignore ├── multipass_examples │ ├── 06-gitlab-check-access.py │ ├── pyproject.toml │ ├── 01-github-check-access.py │ ├── 03-slack-check-access.py │ ├── 04-dropbox-check-access.py │ └── README.md ├── langchain_examples │ ├── pyproject.toml │ ├── 05-github-check-access.py │ ├── 02-rag-LangChain-gdrive.py │ └── 01-rag-LangChain-all-sources.py └── README.md ├── packages ├── pangea-multipass │ ├── tests │ │ ├── __init__.py │ │ └── integration │ │ │ ├── __init__.py │ │ │ ├── test_github.py │ │ │ ├── test_gitlab.py │ │ │ ├── test_slack.py │ │ │ └── test_dropbox.py │ ├── pangea_multipass │ │ ├── py.typed │ │ ├── sources │ │ │ ├── github │ │ │ │ ├── __init__.py │ │ │ │ └── github.py │ │ │ ├── gitlab │ │ │ │ ├── __init__.py │ │ │ │ └── gitlab.py │ │ │ ├── slack │ │ │ │ ├── __init__.py │ │ │ │ └── slack.py │ │ │ ├── dropbox │ │ │ │ ├── __init__.py │ │ │ │ └── dropbox.py │ │ │ ├── jira │ │ │ │ ├── __init__.py │ │ │ │ └── jira.py │ │ │ ├── gdrive │ │ │ │ └── __init__.py │ │ │ ├── confluence │ │ │ │ └── __init__.py │ │ │ └── __init__.py │ │ ├── __init__.py │ │ ├── utils.py │ │ ├── dropbox_reader.py │ │ ├── oauth.py │ │ ├── gitlab_reader.py │ │ ├── github_reader.py │ │ ├── slack_reader.py │ │ └── core.py │ ├── README.md │ ├── pyproject.toml │ └── CHANGELOG.md ├── pangea-multipass-langchain │ ├── pangea_multipass_langchain │ │ ├── py.typed │ │ ├── __init__.py │ │ └── langchain.py │ ├── CHANGELOG.md │ ├── pyproject.toml │ └── README.md ├── pangea-multipass-llama-index │ ├── pangea_multipass_llama_index │ │ ├── py.typed │ │ ├── __init__.py │ │ └── llama_index.py │ ├── CHANGELOG.md │ ├── pyproject.toml │ └── README.md └── .gitignore ├── dev ├── setup_repo.sh └── validate_tag.sh ├── LICENSE ├── .pre-commit-config.yaml ├── README.md ├── .gitignore └── EXTENDING.md /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto eol=lf 2 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @pangeacyber/sdks 2 | -------------------------------------------------------------------------------- /examples/llama_index_examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /packages/pangea-multipass/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /packages/pangea-multipass-langchain/pangea_multipass_langchain/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/packages/pangea-multipass-llama-index/pangea_multipass_llama_index/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | .env_* 3 | storage/ 4 | .mypy_cache/ 5 | *.json 6 | -------------------------------------------------------------------------------- /packages/.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | .env_* 3 | storage/ 4 | .mypy_cache/ 5 | *.json 6 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/sources/github/__init__.py: -------------------------------------------------------------------------------- 1 | from .github import GitHubClient, GitHubProcessor 2 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/sources/gitlab/__init__.py: -------------------------------------------------------------------------------- 1 | from .gitlab import GitLabClient, GitLabProcessor 2 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/sources/slack/__init__.py: -------------------------------------------------------------------------------- 1 | from .slack import SlackClient, SlackProcessor 2 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/sources/dropbox/__init__.py: -------------------------------------------------------------------------------- 1 | from .dropbox import DropboxClient, DropboxProcessor 2 | -------------------------------------------------------------------------------- /packages/pangea-multipass/tests/integration/__init__.py: -------------------------------------------------------------------------------- 1 | from .test_github import TestGitHub 2 | from .test_gitlab import TestGitLab 3 | from .test_slack import TestSlack 4 | -------------------------------------------------------------------------------- /packages/pangea-multipass-langchain/pangea_multipass_langchain/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | from .langchain import * 5 | -------------------------------------------------------------------------------- /packages/pangea-multipass-llama-index/pangea_multipass_llama_index/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | from .llama_index import * 5 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/sources/jira/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | from .jira import JiraAuth, JiraME, JiraProcessor 5 | -------------------------------------------------------------------------------- /dev/setup_repo.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env zsh 2 | 3 | # Currently this script only supports Macs/ZSH, please add more 
as needed 4 | brew install pre-commit 5 | 6 | echo "Installing pre-commit hooks" 7 | pre-commit install 8 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/sources/gdrive/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | from .gdrive import GDriveAPI, GDriveME, GDriveProcessor 5 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/sources/confluence/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | from .confluence import ConfluenceAuth, ConfluenceME, ConfluenceProcessor 5 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/sources/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | from .confluence import * 5 | from .dropbox import * 6 | from .gdrive import * 7 | from .github import * 8 | from .gitlab import * 9 | from .jira import * 10 | from .slack import * 11 | -------------------------------------------------------------------------------- /examples/multipass_examples/06-gitlab-check-access.py: -------------------------------------------------------------------------------- 1 | # Ingestion time 2 | import os 3 | 4 | from pangea_multipass import GitLabProcessor, GitLabReader, get_document_metadata 5 | 6 | token = os.getenv("GITLAB_ADMIN_TOKEN") 7 | assert token 8 | 9 | username = os.getenv("GITLAB_USERNAME") 10 | assert username 11 | 12 | reader = GitLabReader(token=token) 13 | print("Loading data...") 14 | files = reader.load_data() 15 | print(f"Loaded {len(files)} files.") 16 | 17 | 18 | # Inference time 19 | processor = GitLabProcessor(admin_token=token, username=username, get_node_metadata=get_document_metadata) 20 | 21 | authorized_files = processor.filter(files) 22 | print(f"User '{username}' has access to {len(authorized_files)} files.") 23 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | from .core import ( 5 | Constant, 6 | DocumentReader, 7 | FilterOperator, 8 | HasherSHA256, 9 | MetadataFilter, 10 | MultipassDocument, 11 | PangeaGenericNodeProcessor, 12 | PangeaMetadataKeys, 13 | PangeaMetadataValues, 14 | PangeaNodeProcessorMixer, 15 | enrich_metadata, 16 | generate_id, 17 | get_document_metadata, 18 | ) 19 | from .dropbox_reader import DropboxReader 20 | from .github_reader import GitHubReader 21 | from .gitlab_reader import GitLabReader 22 | from .oauth import OauthFlow 23 | from .slack_reader import SlackReader 24 | from .sources import * 25 | from .utils import * 26 | -------------------------------------------------------------------------------- /examples/multipass_examples/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "multipass_examples" 3 | version = "0.1.0" 4 | description = "Pangea Multipass authorization 
library examples" 5 | authors = [ 6 | { name = "Apurv Jawle", email = "apurv.jawle@pangea.cloud" } 7 | ] 8 | license = "MIT" 9 | readme = "README.md" 10 | package-mode = false 11 | requires-python = ">=3.10,<3.13" 12 | dependencies = [ 13 | "pangea-multipass", 14 | "llama-index-readers-google==0.7.2", 15 | ] 16 | 17 | [dependency-groups] 18 | dev = [ 19 | "mypy==1.19.0", 20 | "types-requests==2.32.4.20250913", 21 | ] 22 | 23 | [tool.uv.sources] 24 | pangea-multipass = { path = "../../packages/pangea-multipass", editable = true } 25 | 26 | [tool.isort] 27 | profile = "black" 28 | line_length = 120 29 | 30 | [tool.mypy] 31 | plugins = ["pydantic.mypy"] 32 | -------------------------------------------------------------------------------- /packages/pangea-multipass/README.md: -------------------------------------------------------------------------------- 1 | # Pangea Multipass: Your Authorization Helper 2 | 3 | Pangea Multipass is a Python library for checking user access to upstream data sources. 4 | 5 | In practice, you can use it to check if a specific user has access to a file in a Google Drive, a ticket in Jira, or a page in Confluence. In concept, we've built this library to be extensible to eventually support Slack channels, GitHub repositories, Salesforce opportunities, and more. 6 | 7 | We originally built this to support our customers' Retrieval-Augmented Generation (RAG) applications to mitigate data leaks. In a RAG architecture, the application inserts additional context at inference time. If you don't check the user's authorization to that context, you could inadvertently leak sensitive information. 8 | 9 | While this is useful in AI/LLM apps, we've abstracted this to work independently so you can use it in any app. 10 | -------------------------------------------------------------------------------- /examples/multipass_examples/01-github-check-access.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | import os 5 | 6 | from pangea_multipass import GitHubProcessor, GitHubReader, PangeaMetadataKeys, get_document_metadata 7 | 8 | # Ingestion time 9 | admin_token = os.getenv("GITHUB_ADMIN_TOKEN") 10 | assert admin_token 11 | 12 | reader = GitHubReader(admin_token) 13 | documents = reader.load_data() 14 | print(f"Loaded {len(documents)} docs:") 15 | 16 | for doc in documents: 17 | print(doc.metadata.get(PangeaMetadataKeys.FILE_NAME), "") 18 | 19 | # Inference time 20 | username = os.getenv("GITHUB_USERNAME") 21 | assert username 22 | 23 | processor = GitHubProcessor(admin_token, get_document_metadata, username=username) 24 | authorized_docs = processor.filter(documents) 25 | 26 | print(f"\nAuthorized docs: {len(authorized_docs)}") 27 | for doc in authorized_docs: 28 | print(doc.metadata.get(PangeaMetadataKeys.FILE_NAME), "") 29 | -------------------------------------------------------------------------------- /.github/renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json", 3 | "extends": [ 4 | "config:best-practices", 5 | "local>pangeacyber/.github:renovate-config" 6 | ], 7 | "automerge": true, 8 | "automergeStrategy": "rebase", 9 | "ignorePaths": [], 10 | "packageRules": [ 11 | { 12 | "matchManagers": ["github-actions"], 13 | "extends": [ 14 | ":semanticPrefixChore", 15 | ":semanticCommitScope(ci)" 16 | ] 17 | }, 18 | { 19 | "matchFileNames": 
["examples/**"], 20 | "extends": [ 21 | ":semanticPrefixChore", 22 | ":semanticCommitScope(examples)" 23 | ], 24 | "additionalBranchPrefix": "{{parentDir}}/" 25 | }, 26 | { 27 | "matchFileNames": ["packages/**"], 28 | "additionalBranchPrefix": "{{parentDir}}/", 29 | "semanticCommitScope": "{{parentDir}}" 30 | }, 31 | { 32 | "matchDepNames": ["python"], 33 | "enabled": false 34 | } 35 | ] 36 | } 37 | -------------------------------------------------------------------------------- /dev/validate_tag.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | if [ $# -lt 1 ]; then 6 | echo "usage: validate_tag.sh " 7 | exit 1 8 | fi 9 | 10 | GIT_TAG=$1 11 | 12 | if [[ ! $GIT_TAG == *"/"* ]]; then 13 | echo "Git tag must contain a forward slash to delimit the package name from the version number." 14 | exit 1 15 | fi 16 | 17 | PACKAGE_NAME=$(echo "$GIT_TAG" | cut -d "/" -f 1) 18 | VERSION=$(echo "$GIT_TAG" | cut -d "/" -f 2) 19 | 20 | if [[ ! "$VERSION" == *"v"* ]]; then 21 | echo "Git tag must contain a version number that's prefixed with 'v'." 22 | exit 1 23 | fi 24 | 25 | # Move to repo root. 26 | PARENT_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd -P) 27 | pushd "$PARENT_PATH/.." 28 | 29 | PYPROJECT_VERSION=v$(poetry version --directory packages/"$PACKAGE_NAME" --dry-run --short) 30 | 31 | if [[ ! "$VERSION" == "$PYPROJECT_VERSION" ]]; then 32 | echo "Git tag version '$VERSION' does not match pyproject.toml version '$PYPROJECT_VERSION'." 33 | exit 1 34 | fi 35 | 36 | popd 37 | -------------------------------------------------------------------------------- /examples/langchain_examples/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "langchain_examples" 3 | version = "0.1.0" 4 | description = "Pangea Multipass authorization library for LangChain" 5 | authors = [ 6 | { name = "Apurv Jawle", email = "apurv.jawle@pangea.cloud" } 7 | ] 8 | license = "MIT" 9 | package-mode = false 10 | requires-python = ">=3.10,<3.13" 11 | dependencies = [ 12 | "pangea-multipass-langchain (>=0.2.0)", 13 | "pangea-multipass (>=0.2.0)", 14 | "google-api-python-client==2.187.0", 15 | "google-auth-httplib2 (>=0.2.1)", 16 | "langchain-google-community==2.0.10", 17 | "lxml==6.0.2", 18 | "faiss-cpu==1.13.1", 19 | "boto3==1.42.7", 20 | "langchain-aws==0.2.35", 21 | ] 22 | 23 | [dependency-groups] 24 | dev = [ 25 | "mypy==1.19.0", 26 | ] 27 | 28 | [tool.uv.sources] 29 | pangea-multipass = { path = "../../packages/pangea-multipass", editable = true } 30 | pangea-multipass-langchain = { path = "../../packages/pangea-multipass-langchain", editable = true } 31 | 32 | [tool.isort] 33 | profile = "black" 34 | line_length = 120 35 | 36 | [tool.mypy] 37 | plugins = ["pydantic.mypy"] 38 | -------------------------------------------------------------------------------- /examples/langchain_examples/05-github-check-access.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | import os 5 | 6 | from pangea_multipass import GitHubReader, PangeaMetadataKeys 7 | from pangea_multipass_langchain import LangChainGitHubFilter, from_multipass 8 | 9 | # Ingestion time 10 | admin_token = os.getenv("GITHUB_ADMIN_TOKEN") 11 | assert admin_token 12 | 13 | reader = GitHubReader(admin_token) 14 | mp_documents = reader.load_data() 15 | print(f"Loaded {len(mp_documents)} docs:") 16 | 
17 | # Convert documents to LangChain format 18 | documents = from_multipass(mp_documents) 19 | for doc in documents: 20 | print(doc.metadata.get(PangeaMetadataKeys.FILE_NAME), "") 21 | 22 | # Inference time 23 | username = os.getenv("GITHUB_USERNAME") 24 | assert username, "GITHUB_USERNAME is not set" 25 | 26 | processor = LangChainGitHubFilter(admin_token, username=username) 27 | authorized_docs = processor.filter(documents) 28 | 29 | print(f"\nAuthorized docs: {len(authorized_docs)}") 30 | for doc in authorized_docs: 31 | print(doc.metadata.get(PangeaMetadataKeys.FILE_NAME), "") 32 | -------------------------------------------------------------------------------- /examples/multipass_examples/03-slack-check-access.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | import logging 5 | import os 6 | 7 | from pangea_multipass import SlackReader 8 | from pangea_multipass.utils import set_logger_to_stdout 9 | 10 | set_logger_to_stdout("multipass", logging.INFO) 11 | 12 | admin_token = os.getenv("SLACK_ADMIN_TOKEN") 13 | assert admin_token 14 | 15 | reader = SlackReader(token=admin_token) 16 | documents = reader.load_data(max_messages_per_channel=1000) 17 | print(f"Loaded {len(documents)} messages.") 18 | 19 | # Inference time 20 | from pangea_multipass import SlackProcessor, get_document_metadata 21 | 22 | user_email = os.getenv("SLACK_USER_EMAIL") 23 | assert user_email 24 | 25 | processor = SlackProcessor(token=admin_token, get_node_metadata=get_document_metadata, user_email=user_email) 26 | filter = processor.get_filter() 27 | print("User has access to channel ids:") 28 | for id in filter.value: 29 | print(f"\t{id}") 30 | 31 | filtered_docs = processor.filter(nodes=documents) 32 | print(f"User has access to {len(filtered_docs)} messages") 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Pangea 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /examples/llama_index_examples/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "llama_index_examples" 3 | version = "0.2.0" 4 | description = "Pangea Multipass authorization library for Llama Index" 5 | authors = [ 6 | { name = "Apurv Jawle", email = "apurv.jawle@pangea.cloud" } 7 | ] 8 | license = "MIT" 9 | readme = "README.md" 10 | package-mode = false 11 | requires-python = ">=3.10,<3.13" 12 | dependencies = [ 13 | "pangea-multipass-llama-index (>=0.2.0)", 14 | "pangea-multipass (>=0.2.0)", 15 | "llama-index-llms-bedrock (==0.4.2)", 16 | "llama-index-embeddings-bedrock (==0.7.2)", 17 | "llama-index-readers-google (==0.7.2)", 18 | "llama-index-readers-confluence (==0.6.0)", 19 | "llama-index-readers-jira (==0.5.1)", 20 | ] 21 | 22 | [dependency-groups] 23 | dev = [ 24 | "mypy==1.19.0", 25 | ] 26 | 27 | [tool.uv.sources] 28 | pangea-multipass = { path = "../../packages/pangea-multipass", editable = true } 29 | pangea-multipass-llama-index = { path = "../../packages/pangea-multipass-llama-index", editable = true } 30 | 31 | [tool.mypy] 32 | plugins = ["pydantic.mypy"] 33 | 34 | [tool.isort] 35 | profile = "black" 36 | line_length = 120 37 | -------------------------------------------------------------------------------- /examples/llama_index_examples/05-github-check-access.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | import os 5 | 6 | from pangea_multipass import GitHubReader, PangeaMetadataKeys 7 | from pangea_multipass_llama_index import LlamaIndexGitHubProcessor, from_multipass 8 | 9 | # Ingestion time 10 | admin_token = os.getenv("GITHUB_ADMIN_TOKEN") 11 | assert admin_token 12 | 13 | reader = GitHubReader(admin_token) 14 | print("Loading data...") 15 | documents = reader.load_data() 16 | print(f"Loaded {len(documents)} docs:") 17 | 18 | # Convert documents to Llama Index format 19 | documents = from_multipass(documents) # type: ignore 20 | for doc in documents: 21 | print(doc.metadata.get(PangeaMetadataKeys.FILE_NAME), "") 22 | 23 | # Inference time 24 | 25 | 26 | username = os.getenv("GITHUB_USERNAME") 27 | assert username, "GITHUB_USERNAME is not set" 28 | 29 | processor = LlamaIndexGitHubProcessor(admin_token, username=username) 30 | authorized_docs = processor.filter(documents) # type: ignore 31 | 32 | print(f"\nAuthorized docs: {len(authorized_docs)}") 33 | for d in authorized_docs: 34 | print(d.metadata.get(PangeaMetadataKeys.FILE_NAME), "") 35 | -------------------------------------------------------------------------------- /packages/pangea-multipass-langchain/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | ## Unreleased 9 | 10 | ### Added 11 | 12 | - GitLabReader and GitLabProcessor 13 | - Dropbox processor 14 | 15 | ### Fixed 16 | 17 | - `pydantic` error of `node_processor` default value. 18 | 19 | ## 0.2.0 - 2025-01-15 20 | 21 | ### Added 22 | 23 | - GitHub repository's reader and processor. 24 | - Slack channel's reader and processor. 
25 | - `account_id` support on JiraProcessor. 26 | - Check user email permissions with admin credentials in GDriveProcessor. 27 | - Check username permissions with admin token in GitHubProcessor. 28 | - Check user email permissions with admin token in SlackProcessor. 29 | - `py.typed` marker file. 30 | - Check user account id permissions with admin token in JiraProcessor. 31 | - Check user account id permission with admin token in ConfluenceProcessor. 32 | 33 | ## 0.1.0 - 2024-12-24 34 | 35 | ### Added 36 | 37 | - Metadata enricher and processor for Google Drive, Jira and Confluence data sources. 38 | -------------------------------------------------------------------------------- /packages/pangea-multipass-langchain/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "pangea-multipass-langchain" 3 | version = "0.2.0" 4 | description = "Pangea Multipass authorization library for LangChain" 5 | authors = [ 6 | { name = "Apurv Jawle", email = "apurv.jawle@pangea.cloud" } 7 | ] 8 | license = "MIT" 9 | readme = "README.md" 10 | requires-python = ">=3.10,<4.0" 11 | dependencies = [ 12 | "langchain (==0.3.27)", 13 | "pangea-multipass (>=0.2.0)", 14 | ] 15 | 16 | [dependency-groups] 17 | dev = [ 18 | "mypy==1.19.0", 19 | ] 20 | 21 | [tool.uv.sources] 22 | pangea-multipass = { path = "../pangea-multipass", editable = true } 23 | 24 | [build-system] 25 | requires = ["hatchling==1.28.0"] 26 | build-backend = "hatchling.build" 27 | 28 | [tool.black] 29 | line-length = 120 30 | 31 | [tool.isort] 32 | profile = "black" 33 | line_length = 120 34 | src_paths = ["pangea_multipass_langchain", "tests"] 35 | known_local_folder = ["pangea_multipass_langchain", "tests"] 36 | 37 | [tool.mypy] 38 | python_version = "3.10" 39 | color_output = true 40 | error_summary = true 41 | pretty = true 42 | show_column_numbers = true 43 | warn_unused_ignores = true 44 | 45 | [[tool.mypy.overrides]] 46 | module = ["pangea_multipass.*"] 47 | follow_untyped_imports = true 48 | -------------------------------------------------------------------------------- /packages/pangea-multipass-llama-index/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | ## Unreleased 9 | 10 | ### Added 11 | 12 | - GitLabReader and GitLabProcessor 13 | - Dropbox processor 14 | 15 | ### Fixed 16 | 17 | - `pydantic` error of `node_processor` default value. 18 | 19 | ## 0.2.0 - 2025-01-15 20 | 21 | ### Added 22 | 23 | - GitHub repository's reader and processor. 24 | - Slack channel's reader and processor. 25 | - `account_id` support on JiraProcessor. 26 | - Check user email permissions with admin credentials in GDriveProcessor. 27 | - Check username permissions with admin token in GitHubProcessor. 28 | - Check user email permissions with admin token in SlackProcessor. 29 | - `py.typed` marker file. 30 | - Check user account id permissions with admin token in JiraProcessor. 31 | - Check user account id permission with admin token in ConfluenceProcessor. 32 | 33 | ## 0.1.0 - 2024-12-24 34 | 35 | ### Added 36 | 37 | - Metadata enricher and processor for Google Drive, Jira and Confluence data sources. 
38 | -------------------------------------------------------------------------------- /packages/pangea-multipass-llama-index/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "pangea-multipass-llama-index" 3 | version = "0.2.0" 4 | description = "Pangea Multipass authorization library for Llama Index" 5 | authors = [ 6 | { name = "Apurv Jawle", email = "apurv.jawle@pangea.cloud" } 7 | ] 8 | license = "MIT" 9 | readme = "README.md" 10 | requires-python = ">=3.10,<4.0" 11 | dependencies = [ 12 | "llama-index (>=0.14.10)", 13 | "pangea-multipass (>=0.2.0)", 14 | ] 15 | 16 | [dependency-groups] 17 | dev = [ 18 | "mypy==1.19.0", 19 | ] 20 | 21 | [tool.uv.sources] 22 | pangea-multipass = { path = "../pangea-multipass", editable = true } 23 | 24 | [build-system] 25 | requires = ["hatchling==1.28.0"] 26 | build-backend = "hatchling.build" 27 | 28 | [tool.black] 29 | line-length = 120 30 | 31 | [tool.isort] 32 | profile = "black" 33 | line_length = 120 34 | src_paths = ["pangea_multipass_llama_index", "tests"] 35 | known_local_folder = ["pangea_multipass_llama_index", "tests"] 36 | 37 | [tool.mypy] 38 | python_version = "3.10" 39 | color_output = true 40 | error_summary = true 41 | pretty = true 42 | show_column_numbers = true 43 | warn_unused_ignores = true 44 | 45 | [[tool.mypy.overrides]] 46 | module = ["pangea_multipass.*"] 47 | follow_untyped_imports = true 48 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "pangea-multipass" 3 | version = "0.2.0" 4 | description = "Pangea Multipass authorization library" 5 | authors = [ 6 | { name = "Apurv Jawle", email = "apurv.jawle@pangea.cloud" } 7 | ] 8 | license = "MIT" 9 | readme = "README.md" 10 | requires-python = ">=3.10,<4.0" 11 | dependencies = [ 12 | "google-auth-oauthlib (>=1.2.2)", 13 | "google-auth-httplib2 (>=0.2.1)", 14 | "google-api-python-client (>=2.187.0)", 15 | "google-auth (>=2.43.0)", 16 | "openpyxl (>=3.1.5)", 17 | "slack-sdk (>=3.39.0)", 18 | ] 19 | 20 | [dependency-groups] 21 | dev = [ 22 | "mypy==1.19.0", 23 | "types-requests==2.32.4.20250913", 24 | ] 25 | 26 | [build-system] 27 | requires = ["hatchling==1.28.0"] 28 | build-backend = "hatchling.build" 29 | 30 | [tool.black] 31 | line-length = 120 32 | 33 | [tool.isort] 34 | profile = "black" 35 | line_length = 120 36 | src_paths = ["pangea_multipass", "tests"] 37 | known_local_folder = ["pangea_multipass", "tests"] 38 | 39 | [tool.mypy] 40 | python_version = "3.10" 41 | color_output = true 42 | error_summary = true 43 | pretty = true 44 | show_column_numbers = true 45 | warn_unused_ignores = true 46 | 47 | [[tool.mypy.overrides]] 48 | module = ["google_auth_oauthlib.flow.*", "googleapiclient.discovery.*"] 49 | follow_untyped_imports = true 50 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v6.0.0 4 | hooks: 5 | - id: check-json 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | exclude: .md 9 | - id: check-merge-conflict 10 | - id: debug-statements 11 | - id: detect-aws-credentials 12 | args: 13 | - --allow-missing-credentials 14 | - id: check-executables-have-shebangs 15 | - id: 
check-shebang-scripts-are-executable 16 | - id: no-commit-to-branch 17 | args: 18 | - --branch 19 | - main 20 | - repo: https://github.com/pycqa/isort 21 | rev: 7.0.0 22 | hooks: 23 | - id: isort 24 | args: 25 | - --profile=black 26 | - --line-length=120 27 | - --resolve-all-configs 28 | - repo: https://github.com/psf/black 29 | rev: 25.12.0 30 | hooks: 31 | - id: black 32 | args: 33 | - --line-length=120 34 | - repo: https://github.com/pre-commit/mirrors-mypy 35 | rev: v1.19.0 36 | hooks: 37 | - id: mypy 38 | args: 39 | - --ignore-missing-imports 40 | # - --strict 41 | - --implicit-reexport 42 | additional_dependencies: 43 | - types-Deprecated==1.2.9.3 44 | - types-python-dateutil==2.8.19.14 45 | - types-requests==2.31.0.10 46 | -------------------------------------------------------------------------------- /packages/pangea-multipass/tests/integration/test_github.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from pangea_multipass import GitHubProcessor, GitHubReader, get_document_metadata 5 | 6 | token = os.getenv("GITHUB_ADMIN_TOKEN") or "" 7 | username = os.getenv("GITHUB_USERNAME") or "" 8 | 9 | _TOTAL_FILES = 8 10 | _AUTHORIZED_FILES = 5 11 | _AUTHORIZED_PROJECTS = 2 12 | 13 | 14 | class TestGitHub(unittest.TestCase): 15 | def setUp(self) -> None: 16 | assert token 17 | assert username 18 | 19 | def test_github(self) -> None: 20 | reader = GitHubReader(token=token) 21 | files = reader.load_data() 22 | assert len(files) == _TOTAL_FILES 23 | 24 | processor = GitHubProcessor(token=token, username=username, get_node_metadata=get_document_metadata) 25 | filter = processor.get_filter() 26 | assert len(filter.value) == _AUTHORIZED_PROJECTS 27 | 28 | authorized_files = processor.filter(files) 29 | assert len(authorized_files) == _AUTHORIZED_FILES 30 | 31 | def test_github_pagination(self) -> None: 32 | reader = GitHubReader(token=token) 33 | 34 | repos = reader.get_repos() 35 | assert len(repos) == 3 36 | 37 | all_files = [] 38 | 39 | for repo in repos: 40 | has_more_files = True 41 | while has_more_files: 42 | files = reader.read_repo_files(repo, page_size=1) 43 | all_files.extend(files) 44 | has_more_files = reader.has_more_files 45 | assert len(files) == 1 46 | 47 | assert len(all_files) == _TOTAL_FILES 48 | -------------------------------------------------------------------------------- /packages/pangea-multipass/tests/integration/test_gitlab.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from pangea_multipass import GitLabProcessor, GitLabReader, get_document_metadata 5 | 6 | token = os.getenv("GITLAB_ADMIN_TOKEN") or "" 7 | username = os.getenv("GITLAB_USERNAME") or "" 8 | 9 | _TOTAL_FILES = 8 10 | _AUTHORIZED_FILES = 5 11 | _AUTHORIZED_PROJECTS = 2 12 | 13 | 14 | class TestGitLab(unittest.TestCase): 15 | def setUp(self) -> None: 16 | assert token 17 | assert username 18 | 19 | def test_gitlab(self) -> None: 20 | reader = GitLabReader(token=token) 21 | files = reader.load_data() 22 | assert len(files) == _TOTAL_FILES 23 | 24 | processor = GitLabProcessor(admin_token=token, username=username, get_node_metadata=get_document_metadata) 25 | filter = processor.get_filter() 26 | assert len(filter.value) == _AUTHORIZED_PROJECTS 27 | 28 | authorized_files = processor.filter(files) 29 | assert len(authorized_files) == _AUTHORIZED_FILES 30 | 31 | def test_gitlab_pagination(self) -> None: 32 | reader = GitLabReader(token=token) 33 | 34 | 
repos = reader.get_repos() 35 | assert len(repos) == 3 36 | 37 | all_files = [] 38 | 39 | for repo in repos: 40 | has_more_files = True 41 | while has_more_files: 42 | files = reader.read_repo_files(repo, page_size=1) 43 | all_files.extend(files) 44 | has_more_files = reader.has_more_files 45 | assert len(files) == 1 46 | 47 | assert len(all_files) == _TOTAL_FILES 48 | -------------------------------------------------------------------------------- /packages/pangea-multipass/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | ## Unreleased 9 | 10 | ### Added 11 | 12 | - GitLabReader and GitLabProcessor 13 | - Dropbox reader and processor 14 | - Pagination support on GitHubReader 15 | - Pagination support on SlackReader 16 | - Logger to `GitLabReader`, `GitHubReader`, `SlackReader`, `GitLabClient`, `GitHubClient` and `SlackClient`. 17 | 18 | ### Fixed 19 | 20 | - Handle null fields on issues in JiraME 21 | - Handle trailing slash in Jira URL 22 | - GitLabProcessor `get_filter()` 23 | 24 | ### Changed 25 | 26 | - Rename `GitLabAPI` to `GitLabClient` 27 | - Rename `GitHubAPI` to `GitHubClient` 28 | - Rename `SlackAPI` to `SlackClient` 29 | 30 | 31 | ## 0.2.0 - 2025-01-15 32 | 33 | ### Added 34 | 35 | - GitHub repository's reader and processor. 36 | - Slack channel's reader and processor. 37 | - `account_id` support on JiraProcessor. 38 | - Check user email permissions with admin credentials in GDriveProcessor. 39 | - Check username permissions with admin token in GitHubProcessor. 40 | - Check user email permissions with admin token in SlackProcessor. 41 | - `py.typed` marker file. 42 | - Check user account id permissions with admin token in JiraProcessor. 43 | - Check user account id permission with admin token in ConfluenceProcessor. 44 | 45 | ## 0.1.0 - 2024-12-24 46 | 47 | ### Added 48 | 49 | - Metadata enricher and processor for Google Drive, Jira and Confluence data sources. 50 | -------------------------------------------------------------------------------- /packages/pangea-multipass-llama-index/README.md: -------------------------------------------------------------------------------- 1 | # Pangea Multipass for Llama Index 2 | 3 | This library extends the Pangea Multipass package to integrate metadata enrichment and document processing with Llama Index. It enables seamless use of authorization checks, metadata filtering, and custom processors on documents from Google Drive, JIRA, and Confluence, utilizing Llama Index structures for Retrieval-Augmented Generation (RAG) applications. 4 | 5 | ## Features 6 | 7 | - **Document Integration**: Adapts Pangea processors and enrichers to handle Llama Index documents. 8 | - **Llama Index-Compatible Filtering**: Provides metadata filtering with operators for fine-grained document access control. 9 | - **Authorization Processing**: Aggregates and applies multiple authorization checks on Llama Index nodes with custom, combinable node processors, as shown in the sketch below. 
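For example, here is a minimal end-to-end sketch of this flow for a GitHub source, closely following `examples/llama_index_examples/05-github-check-access.py` in this repository (it assumes a GitHub admin token and a target username are available in the environment):

```python
import os

from pangea_multipass import GitHubReader
from pangea_multipass_llama_index import LlamaIndexGitHubProcessor, from_multipass

admin_token = os.environ["GITHUB_ADMIN_TOKEN"]

# Ingestion time: load documents with admin credentials and convert them
# to Llama Index documents.
documents = from_multipass(GitHubReader(admin_token).load_data())

# Inference time: keep only the documents this user is authorized to see.
processor = LlamaIndexGitHubProcessor(admin_token, username=os.environ["GITHUB_USERNAME"])
authorized_docs = processor.filter(documents)
```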
10 | 11 | ## Installation 12 | 13 | Use [Poetry](https://python-poetry.org/) to install dependencies: 14 | 15 | ```bash 16 | poetry add pangea-multipass-llama-index 17 | ``` 18 | 19 | ## Usage 20 | ### Core Components 21 | - Document Reader: LIDocumentReader reads content from Llama Index documents for enrichment. 22 | - Processors for Llama Index: 23 | - LlamaIndexJiraProcessor — Handles JIRA documents within Llama Index. 24 | - LlamaIndexConfluenceProcessor — Processes Confluence documents in Llama Index. 25 | - LlamaIndexGDriveProcessor — Manages Google Drive documents in Llama Index. 26 | - Node Postprocessor Mixer: Combines multiple processors for complex, multi-source document filtering. 27 | - Metadata Filters: Filter documents based on metadata using operators like EQ, CONTAINS, and custom metadata keys. 28 | 29 | ## License 30 | This project is licensed under the MIT License. 31 | -------------------------------------------------------------------------------- /packages/pangea-multipass/tests/integration/test_slack.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from pangea_multipass import SlackProcessor, SlackReader, get_document_metadata 5 | 6 | token = os.getenv("SLACK_ADMIN_TOKEN") or "" 7 | user_email = os.getenv("SLACK_USER_EMAIL") or "" 8 | 9 | _TOTAL_FILES = 12 10 | _AUTHORIZED_FILES = 7 11 | _TOTAL_CHANNELS = 4 12 | _AUTHORIZED_CHANNELS = 3 13 | 14 | 15 | class TestSlack(unittest.TestCase): 16 | def setUp(self) -> None: 17 | assert token 18 | assert user_email 19 | 20 | def test_slack(self) -> None: 21 | reader = SlackReader(token=token) 22 | documents = reader.load_data() 23 | assert len(documents) == _TOTAL_FILES 24 | 25 | processor = SlackProcessor(token=token, get_node_metadata=get_document_metadata, user_email=user_email) 26 | filter = processor.get_filter() 27 | assert len(filter.value) == _AUTHORIZED_CHANNELS 28 | 29 | filtered_docs = processor.filter(nodes=documents) 30 | assert len(filtered_docs) == _AUTHORIZED_FILES 31 | 32 | def test_slack_pagination(self) -> None: 33 | reader = SlackReader(token=token) 34 | 35 | channels = reader.get_channels() 36 | assert len(channels) == _TOTAL_CHANNELS 37 | 38 | documents = [] 39 | 40 | for channel in channels: 41 | has_more_messages = True 42 | while has_more_messages: 43 | docs = reader.read_messages(channel=channel, page_size=1) 44 | assert len(docs) <= 1 # Some messages are filtered out so we can't guarantee the exact number 45 | documents.extend(docs) 46 | has_more_messages = reader.has_more_messages 47 | 48 | assert len(documents) == _TOTAL_FILES 49 | -------------------------------------------------------------------------------- /examples/multipass_examples/04-dropbox-check-access.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pangea_multipass import DropboxClient, DropboxReader, OauthFlow, data_load, data_save 4 | from requests.exceptions import HTTPError 5 | 6 | app_key = os.getenv("DROPBOX_APP_KEY") 7 | assert app_key 8 | 9 | # File to store tokens 10 | DROPBOX_TOKEN_FILE = "dropbox_tokens.json" 11 | 12 | if not os.path.exists(DROPBOX_TOKEN_FILE): 13 | code_verifier, code_challenge = OauthFlow.generate_pkce_pair() 14 | 15 | flow = OauthFlow( 16 | auth_url=DropboxClient.AUTH_URL, 17 | token_url=DropboxClient.TOKEN_URL, 18 | client_id=app_key, 19 | ) 20 | tokens = flow.run_pkce(code_verifier=code_verifier, code_challenge=code_challenge) 21 | else: 22 | tokens = 
data_load(DROPBOX_TOKEN_FILE) 23 | assert tokens 24 | access_token = OauthFlow.refresh_access_token( 25 | url=DropboxClient.TOKEN_URL, refresh_token=tokens["refresh_token"], client_id=app_key 26 | ) 27 | tokens.update(access_token) 28 | 29 | data_save(DROPBOX_TOKEN_FILE, tokens) 30 | access_token = tokens["access_token"] 31 | reader = DropboxReader(access_token) 32 | documents = [] 33 | 34 | print("Loading documents from Dropbox...") 35 | try: 36 | documents = reader.load_data() 37 | 38 | except HTTPError as e: 39 | if e.response: 40 | print(e.response.text) 41 | else: 42 | print(e) 43 | 44 | print(f"Loaded {len(documents)} docs") 45 | 46 | 47 | # Inference time 48 | from pangea_multipass import DropboxProcessor, get_document_metadata 49 | 50 | user_email = os.getenv("DROPBOX_USER_EMAIL") 51 | assert user_email 52 | 53 | processor = DropboxProcessor(access_token, user_email=user_email, get_node_metadata=get_document_metadata) 54 | print("Filtering authorized documents...") 55 | authorized_docs = processor.filter(documents) 56 | 57 | print(f"Authorized docs: {len(authorized_docs)}") 58 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | from logging.handlers import TimedRotatingFileHandler 5 | from typing import Dict, Optional 6 | 7 | 8 | def data_load(filename: str) -> Optional[dict]: 9 | if not os.path.exists(filename): 10 | return None 11 | 12 | with open(filename, "r") as f: 13 | return json.load(f) 14 | 15 | 16 | def data_save(filename: str, data: dict): 17 | with open(filename, "w") as f: 18 | json.dump(data, f) 19 | 20 | 21 | _loggers: Dict[str, bool] = {} 22 | 23 | 24 | def set_logger_to_json(logger_name: str, level=logging.DEBUG): 25 | if _loggers.get(logger_name) is not None: 26 | return 27 | 28 | _loggers[logger_name] = True 29 | logger = logging.getLogger(logger_name) 30 | logger.setLevel(level) 31 | handler = TimedRotatingFileHandler( 32 | filename="multipass_logs.json", when="D", interval=1, backupCount=90, encoding="utf-8", delay=False 33 | ) 34 | handler.setLevel(level) 35 | formatter = logging.Formatter( 36 | fmt='{"time": "%(asctime)s.%(msecs)03d", "name": "%(name)s", "level": "%(levelname)s", "message": %(message)s },', 37 | datefmt="%Y-%m-%d %H:%M:%S", 38 | ) 39 | handler.setFormatter(formatter) 40 | logger.addHandler(handler) 41 | 42 | 43 | def set_logger_to_stdout(logger_name: str, level=logging.DEBUG): 44 | if _loggers.get(logger_name) is not None: 45 | return 46 | 47 | _loggers[logger_name] = True 48 | logger = logging.getLogger(logger_name) 49 | logger.setLevel(level) 50 | handler = logging.StreamHandler() 51 | handler.setLevel(level) 52 | formatter = logging.Formatter( 53 | fmt="[%(asctime)s.%(msecs)03d %(name)s %(levelname)s]: %(message)s.", 54 | datefmt="%Y-%m-%d %H:%M:%S", 55 | ) 56 | handler.setFormatter(formatter) 57 | logger.addHandler(handler) 58 | -------------------------------------------------------------------------------- /packages/pangea-multipass/tests/integration/test_dropbox.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import unittest 4 | 5 | from pangea_multipass import DropboxClient, DropboxProcessor, DropboxReader, OauthFlow, get_document_metadata 6 | 7 | _TOTAL_FILES = 10 8 | _AUTHORIZED_FILES = 5 9 | _AUTHORIZED_FOLDERS = 3 10 | 11 | 12 | class 
TestDropbox(unittest.TestCase): 13 | def setUp(self) -> None: 14 | refresh_token = os.getenv("DROPBOX_REFRESH_TOKEN") or "" 15 | assert refresh_token 16 | app_key = os.getenv("DROPBOX_APP_KEY") or "" 17 | assert app_key 18 | self.user_email = os.getenv("DROPBOX_USER_EMAIL") or "" 19 | assert self.user_email 20 | token_data = OauthFlow.refresh_access_token( 21 | url=DropboxClient.TOKEN_URL, refresh_token=refresh_token, client_id=app_key 22 | ) 23 | self.access_token = token_data["access_token"] 24 | assert self.access_token 25 | 26 | def test_dropbox(self) -> None: 27 | reader = DropboxReader(token=self.access_token) 28 | files = reader.load_data() 29 | assert len(files) == _TOTAL_FILES 30 | 31 | processor = DropboxProcessor( 32 | token=self.access_token, user_email=self.user_email, get_node_metadata=get_document_metadata 33 | ) 34 | filter = processor.get_filter() 35 | assert len(filter.value) == _AUTHORIZED_FOLDERS 36 | 37 | authorized_files = processor.filter(files) 38 | assert len(authorized_files) == _AUTHORIZED_FILES 39 | 40 | def test_dropbox_pagination(self) -> None: 41 | reader = DropboxReader(token=self.access_token) 42 | has_more_files = True 43 | all_files = [] 44 | 45 | while has_more_files: 46 | files = reader.read_page(page_size=1) 47 | all_files.extend(files) 48 | has_more_files = reader.has_more_files 49 | 50 | assert len(all_files) == _TOTAL_FILES 51 | -------------------------------------------------------------------------------- /examples/llama_index_examples/08-confluence-check-access.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | import os 5 | from typing import List 6 | 7 | from llama_index.readers.confluence import ConfluenceReader 8 | from pangea_multipass import ConfluenceAuth, ConfluenceME, PangeaMetadataKeys, enrich_metadata 9 | from pangea_multipass_llama_index import LIDocument, LIDocumentReader 10 | 11 | # Fetch documents from Confluence 12 | confluence_space_key = "~71202041f9bfec117041348629ccf3e3c751b3" 13 | confluence_space_id = 393230 14 | 15 | admin_token = os.getenv("CONFLUENCE_ADMIN_TOKEN") 16 | assert admin_token 17 | admin_email = os.getenv("CONFLUENCE_ADMIN_EMAIL") 18 | assert admin_email 19 | url = os.getenv("CONFLUENCE_BASE_URL") 20 | assert url 21 | 22 | 23 | def confluence_read_docs() -> List[LIDocument]: 24 | """Fetch all documents from Confluence using ConfluenceReader.""" 25 | 26 | # Create a ConfluenceReader instance 27 | print("Loading Confluence docs...") 28 | reader = ConfluenceReader( 29 | base_url=url, 30 | user_name=admin_email, 31 | password=admin_token, 32 | ) 33 | documents: List[LIDocument] = reader.load_data(space_key=confluence_space_key, include_attachments=True) 34 | 35 | # Enrich metadata process 36 | print(f"Processing {len(documents)} Confluence docs...") 37 | confluence_me = ConfluenceME() 38 | enrich_metadata(documents, [confluence_me], reader=LIDocumentReader()) 39 | 40 | return documents 41 | 42 | 43 | documents = confluence_read_docs() 44 | print(f"Loaded {len(documents)} pages.") 45 | 46 | # Inference 47 | from pangea_multipass_llama_index import LlamaIndexConfluenceProcessor 48 | 49 | admin_token = os.getenv("CONFLUENCE_ADMIN_TOKEN") 50 | assert admin_token 51 | admin_email = os.getenv("CONFLUENCE_ADMIN_EMAIL") 52 | assert admin_email 53 | url = os.getenv("CONFLUENCE_BASE_URL") 54 | assert url 55 | account_id = os.getenv("CONFLUENCE_USER_ACCOUNT_ID") 56 | assert account_id 57 | 58 | # 
Create Confluence filter with admin token 59 | confluence_processor = LlamaIndexConfluenceProcessor( 60 | ConfluenceAuth(admin_email, admin_token, url), account_id=account_id 61 | ) 62 | 63 | authorized_docs = confluence_processor.filter(documents) # type: ignore 64 | 65 | print(f"\nAuthorized pages: {len(authorized_docs)}") 66 | for doc in authorized_docs: 67 | print(f"\t{doc.metadata.get(PangeaMetadataKeys.CONFLUENCE_PAGE_ID, '')}") 68 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | pull_request: 9 | types: 10 | - opened 11 | - synchronize 12 | - reopened 13 | - ready_for_review 14 | 15 | merge_group: 16 | 17 | workflow_dispatch: 18 | 19 | permissions: 20 | contents: read 21 | 22 | jobs: 23 | build: 24 | runs-on: ubuntu-24.04 25 | strategy: 26 | matrix: 27 | package: [pangea-multipass, pangea-multipass-langchain, pangea-multipass-llama-index] 28 | defaults: 29 | run: 30 | working-directory: ./packages/${{ matrix.package }} 31 | steps: 32 | - name: Checkout code 33 | uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 34 | 35 | - name: Install uv 36 | uses: astral-sh/setup-uv@ed21f2f24f8dd64503750218de024bcf64c7250a # v7.1.5 37 | with: 38 | enable-cache: true 39 | 40 | - name: Setup Python 41 | uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 42 | with: 43 | python-version: 3.12 44 | 45 | - name: Install dependencies 46 | run: uv sync --all-extras --dev 47 | 48 | - name: Build 49 | run: uv build 50 | 51 | mypy: 52 | runs-on: ubuntu-24.04 53 | strategy: 54 | fail-fast: false 55 | matrix: 56 | path: 57 | - examples/langchain_examples 58 | - examples/llama_index_examples 59 | - examples/multipass_examples 60 | - packages/pangea-multipass 61 | - packages/pangea-multipass-langchain 62 | - packages/pangea-multipass-llama-index 63 | defaults: 64 | run: 65 | working-directory: ${{ matrix.path }} 66 | steps: 67 | - name: Checkout code 68 | uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 69 | 70 | - name: Install cairo 71 | run: sudo apt-get install -y libcairo2-dev 72 | 73 | - name: Install uv 74 | uses: astral-sh/setup-uv@ed21f2f24f8dd64503750218de024bcf64c7250a # v7.1.5 75 | with: 76 | enable-cache: true 77 | 78 | - name: Setup Python 79 | uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 80 | with: 81 | python-version: 3.12 82 | 83 | - name: Install dependencies 84 | run: uv sync --all-extras --dev 85 | 86 | - name: mypy 87 | run: uv run mypy . 
--ignore-missing-imports --implicit-reexport 88 | -------------------------------------------------------------------------------- /examples/llama_index_examples/07-jira-check-acess.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | import os 5 | import warnings 6 | from typing import List 7 | 8 | from llama_index.core import Document 9 | from llama_index.readers.jira import JiraReader 10 | from pangea_multipass import JiraAuth, JiraME, PangeaMetadataKeys, enrich_metadata 11 | from pangea_multipass_llama_index import LIDocument, LIDocumentReader 12 | 13 | # Suppress specific warning 14 | warnings.filterwarnings("ignore", message='Field "model_name" has conflict with protected namespace') 15 | 16 | 17 | def jira_load_data(reader: JiraReader, query: str = "") -> List[Document]: 18 | max_results = 100 19 | start_at = 0 20 | keep_iterating = True 21 | all_documents: List[Document] = [] 22 | 23 | while keep_iterating: 24 | documents = reader.load_data(query, start_at=start_at, max_results=max_results) 25 | all_documents.extend(documents) 26 | l = len(documents) 27 | start_at = start_at + l 28 | keep_iterating = l >= max_results 29 | 30 | return all_documents 31 | 32 | 33 | def jira_read_docs() -> List[LIDocument]: 34 | # Jira credentials and base URL 35 | JIRA_BASE_URL = os.getenv("JIRA_BASE_URL") or "" 36 | assert JIRA_BASE_URL 37 | jira_admin_email = os.getenv("JIRA_ADMIN_EMAIL") or "" 38 | assert jira_admin_email 39 | jira_api_token = os.getenv("JIRA_ADMIN_TOKEN") or "" 40 | assert jira_api_token 41 | 42 | # Initialize LlamaIndex JiraReader 43 | print("Loading Jira docs...") 44 | jira_reader = JiraReader(server_url=JIRA_BASE_URL, email=jira_admin_email, api_token=jira_api_token) 45 | 46 | documents = jira_load_data(jira_reader, "") 47 | 48 | # Metadata enricher library 49 | print(f"Processing {len(documents)} Jira docs...") 50 | jira_me = JiraME(JIRA_BASE_URL, jira_admin_email, jira_api_token) 51 | enrich_metadata(documents, [jira_me], reader=LIDocumentReader()) 52 | 53 | return documents 54 | 55 | 56 | documents = jira_read_docs() 57 | 58 | # Inference 59 | from pangea_multipass_llama_index import LlamaIndexJiraProcessor 60 | 61 | # Create JIRA filter 62 | jira_user_token = os.getenv("JIRA_ADMIN_TOKEN") 63 | assert jira_user_token 64 | jira_user_email = os.getenv("JIRA_ADMIN_EMAIL") 65 | assert jira_user_email 66 | jira_url = os.getenv("JIRA_BASE_URL") 67 | assert jira_url 68 | jira_account_id = os.getenv("JIRA_USER_ACCOUNT_ID") 69 | assert jira_account_id 70 | 71 | jira_processor = LlamaIndexJiraProcessor( 72 | JiraAuth(jira_user_email, jira_user_token, jira_url), account_id=jira_account_id 73 | ) 74 | authorized_docs = jira_processor.filter(documents) # type: ignore 75 | 76 | print(f"\nAuthorized issues: {len(authorized_docs)}") 77 | for doc in authorized_docs: 78 | print(f"\t{doc.metadata.get(PangeaMetadataKeys.JIRA_ISSUE_ID, '')}") 79 | -------------------------------------------------------------------------------- /packages/pangea-multipass-langchain/README.md: -------------------------------------------------------------------------------- 1 | # Pangea Multipass for LangChain 2 | 3 | The `pangea-multipass-langchain` package extends Pangea Multipass to integrate with LangChain's document processing, providing enhanced security, metadata filtering, and access control for LangChain documents. 
This package supports integrations with Google Drive, JIRA, and Confluence, leveraging metadata-based filtering and authorization to control document access. 4 | 5 | ## Features 6 | 7 | - **Document Reader**: Custom `LangChainDocumentReader` class reads content from LangChain documents, adapting to Pangea's document model. 8 | - **Integration Processors**: 9 | - `LangChainJiraFilter`: Allows JIRA integration, authenticating and processing JIRA documents in LangChain. 10 | - `LangChainConfluenceFilter`: Provides Confluence integration for document access control in LangChain. 11 | - `LangChainGDriveFilter`: Uses Google OAuth credentials to access and filter Google Drive documents in LangChain. 12 | - **Document Filter Mixer**: The `DocumentFilterMixer` aggregates multiple processors, applying customized filters for advanced access control across various sources. 13 | 14 | ## Installation 15 | 16 | Use [Poetry](https://python-poetry.org/) to install dependencies: 17 | 18 | ```bash 19 | poetry add pangea-multipass-langchain 20 | ``` 21 | 22 | ## Usage 23 | ### Core Components 24 | 25 | - LangChainDocumentReader: The LangChainDocumentReader class enables reading content from LangChain documents for authorization and metadata filtering. This class acts as a bridge between LangChain documents and Pangea's authorization model. 26 | - Processors for LangChain Integration: The package includes processors that integrate with specific data sources using authentication credentials. Each processor retrieves metadata from documents, allowing fine-grained control over document access: 27 | - LangChainJiraFilter: Authenticates with JIRA and processes JIRA documents. 28 | - LangChainConfluenceFilter: Processes Confluence documents, applying access control. 29 | - LangChainGDriveFilter: Integrates Google Drive documents into LangChain using OAuth2 credentials. 30 | - DocumentFilterMixer: The DocumentFilterMixer aggregates multiple document processors, applying filters to handle complex document access control. It retrieves authorized and unauthorized documents based on the combined filters from each processor. 31 | - Filter Documents: filter() applies filters to a list of LangChain documents. 32 | - Retrieve Unauthorized Documents: get_unauthorized_documents() retrieves documents that fail authorization checks. 33 | - Retrieve Authorized Documents: get_authorized_documents() provides access to documents meeting authorization criteria. 34 | - Metadata Filtering: The package includes metadata-based filtering, allowing users to apply filters with operators like EQ, GT, LT, CONTAINS, and more. Each filter can be customized to match document metadata for precise access control. 35 | 36 | ## License 37 | This project is licensed under the MIT License. 38 | -------------------------------------------------------------------------------- /examples/multipass_examples/README.md: -------------------------------------------------------------------------------- 1 | # Pangea Multipass Examples 2 | 3 | ## Setting up your Data Sources 4 | 5 | Check out the README in the base `examples` folder for the environment variables you need for each data source. 6 | 7 | ## Running the Code 8 | 9 | Ensure you have [uv](https://docs.astral.sh/uv/) installed for dependency 10 | management and virtual environment setup. 
11 | 12 | ### Installing Dependencies 13 | 14 | Run the following command to install all dependencies: 15 | 16 | ```bash 17 | uv sync 18 | ``` 19 | 20 | ### Running the GitHub check: 21 | 22 | After you set the GitHub environment variables in the `examples\README.md` file, run this command: 23 | 24 | ```bash 25 | uv run 01-github-check.py 26 | ``` 27 | 28 | *Note:* If your admin account has access to numerous repositories - directly or via Organizations - this may take a while. For test purposes, we recommend using a smaller test account. 29 | 30 | Sample output: 31 | 32 | ```bash 33 | Loaded 8 docs: 34 | offices.txt 35 | strategy.txt 36 | capacitor.txt 37 | folder_1/internal_architecture.txt 38 | folder_2/react.txt 39 | folder_1/salaries.txt 40 | folder_2/venture-capital.txt 41 | interest-rate.txt 42 | 43 | Authorized docs: 5 44 | offices.txt 45 | strategy.txt 46 | capacitor.txt 47 | folder_1/internal_architecture.txt 48 | folder_2/react.txt 49 | ``` 50 | 51 | ### Running the Google Drive check: 52 | 53 | After you set the Google Drive environment variables in the `examples\README.md` file, run this command: 54 | 55 | ```bash 56 | uv run 02-gdrive-check.py 57 | ``` 58 | 59 | 60 | ### Running the Slack check: 61 | 62 | After you set the Slack environment variables in the `examples\README.md` file, run this command: 63 | 64 | ```bash 65 | uv run 03-slack-check.py 66 | ``` 67 | 68 | *Note:* In order to read messages from your Slack channel, your app/bot will need to be present in the channel. This applies to both public and private channels. Any public channels that the bot is not in will generate a "not_in_channel" message. 69 | 70 | Sample output: 71 | 72 | ```bash 73 | Error fetching messages for channel C021V27F8KU: not_in_channel 74 | Error fetching messages for channel C0LNSFJ6897: not_in_channel 75 | Loaded 38 messages. 76 | User has acess to channel ids: 77 | C021V27F8KU 78 | C0LNLKJ6897 79 | C021V8CDFMZ 80 | C029J65B4KH 81 | C02PFMW465Q 82 | C087CNAQGLV 83 | C087K7JPQQ4 84 | User has access to 32 messages 85 | ``` 86 | 87 | ### Running the Dropbox check: 88 | 89 | After you set the Dropbox environment variables in the `examples\README.md` file, run this command: 90 | 91 | ```bash 92 | uv run 04-dropbox-check.py 93 | ``` 94 | 95 | The first time this runs, it will open a browser window to authorize the Dropbox application to your account and store the resulting tokens. Later runs will simply continue to the output below. 96 | 97 | Sample output: 98 | 99 | ```bash 100 | Listening for authentication response on http://localhost:8080 ... 101 | 127.0.0.1 - - [18/Feb/2025 11:18:32] "GET /?code=crD1VEFcJzAAAAAAAABLxoiqCA5-LbgQiaGWQR2R3gA HTTP/1.1" 200 - 102 | Loading documents from Dropbox... 103 | Loaded page: 1. Docs: 0 104 | Loaded page: 2. Docs: 20 105 | Loaded page: 3. Docs: 18 106 | Loaded page: 4. Docs: 23 107 | Loaded 61 docs 108 | Filtering authorized documents... 109 | Authorized docs: 22 110 | ``` 111 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pangea Multipass: Your Authorization Helper 2 | 3 | Pangea Multipass is a Python library for checking user access to upstream data sources. 4 | 5 | In practice, you can use it to check if a specific user has access to a file in a Google Drive, a ticket in Jira, or a page in Confluence. 
In concept, we've built this library to be extensible to eventually support Slack channels, GitHub repositories, Salesforce opportunities, and more. 6 | 7 | We originally built this to support our customers' Retrieval-Augmented Generation (RAG) applications to mitigate data leaks. In a RAG architecture, the application inserts additional context at inference time. If you don't check the user's authorization to that context, you could inadvertently leak sensitive information. 8 | 9 | While this is useful in AI/LLM apps, we've abstracted this to work independently so you can use it in any app. 10 | 11 | Check out the `/examples` folder for AI-specific and generic examples. 12 | 13 | ## Features 14 | 15 | - **Document Reading**: Supports document content extraction for use in processing and enrichment. 16 | - **Metadata Enrichment**: Includes enrichers for hashing, constant value setting, and custom metadata. 17 | - **Metadata Filtering**: Provides flexible operators to filter document metadata for customized queries. 18 | - **Authorization Processing**: Manages authorized and unauthorized nodes with customizable node processors. 19 | - **Extensible**: Built on abstract base classes, allowing easy extension and customization of functionality. 20 | 21 | ## Installation 22 | 23 | To install `pangea-multipass`, you can use [Poetry](https://python-poetry.org/) for dependency management: 24 | 25 | ```bash 26 | poetry add pangea-multipass 27 | ``` 28 | 29 | There are full runnable demos in the `/examples` directory, but here are the key aspects. 30 | 31 | Using a set of Google Drive credentials - following the steps in the `llama_index_examples` folder - you initialize the data source: 32 | 33 | ```python 34 | gdrive_reader = GoogleDriveReader( 35 | folder_id=gdrive_fid, token_path=admin_token_filepath, credentials_path=credentials_filepath 36 | ) 37 | documents = gdrive_reader.load_data(folder_id=gdrive_fid) 38 | ``` 39 | 40 | This gives you a list of files. You can then use the processors to filter into the authorized and unauthorized resource lists: 41 | 42 | ```python 43 | gdrive_processor = LlamaIndexGDriveProcessor(creds) 44 | node_processor = NodePostprocessorMixer([gdrive_processor]) 45 | 46 | authorized_docs = node_processor.postprocess_nodes(documents) 47 | unauthorized_docs = node_processor.get_unauthorized_nodes() 48 | ``` 49 | 50 | In general, the authorized list is the one you will use, but you may also want to notify an admin or write a log entry when a user attempts to access resources they cannot see: it could be an attempt at data theft, or a sign that the user's permissions are incomplete. 51 | 52 | ## Roadmap 53 | 54 | At release, this library supports Google Workspace, Confluence, and Jira. For new systems, our top priorities are: 55 | 56 | - Box 57 | - Dropbox 58 | - Office 365 59 | 60 | Others we plan to support, or where we welcome contributions, are: 61 | 62 | - Zoom 63 | - Salesforce 64 | - GitLab 65 | - Zendesk 66 | - Notion 67 | - SharePoint 68 | - Asana 69 | - HubSpot 70 | 71 | Check out `EXTENDING.md` for the specific structure and requirements for extending Pangea Multipass for your data sources. Pull requests are welcome.
72 | -------------------------------------------------------------------------------- /examples/llama_index_examples/README.md: -------------------------------------------------------------------------------- 1 | # Pangea Multipass Llama Index Example Apps 2 | 3 | These example apps demonstrate the integration of Pangea Multipass with LlamaIndex to perform secure document retrieval and metadata enrichment from multiple data sources. The apps connect to Google Drive, Confluence, and JIRA, retrieve documents, enrich metadata for authorization, and perform query-based searches using vector indexing. 4 | 5 | ## Configuration 6 | 7 | ### Environment Variables 8 | 9 | Set the following environment variables to configure access to Confluence and JIRA: 10 | 11 | - **Confluence**: 12 | - `CONFLUENCE_ADMIN_TOKEN`: Admin token for Confluence authentication. 13 | - `CONFLUENCE_ADMIN_EMAIL`: Admin email for Confluence authentication. 14 | - `CONFLUENCE_BASE_URL`: Base URL of the Confluence server. 15 | - `CONFLUENCE_USER_TOKEN`: User token for accessing Confluence. 16 | - `CONFLUENCE_USER_EMAIL`: User email for accessing Confluence. 17 | 18 | - **JIRA**: 19 | - `JIRA_BASE_URL`: Base URL of the JIRA server. 20 | - `JIRA_ADMIN_EMAIL`: Admin email for JIRA authentication. 21 | - `JIRA_ADMIN_TOKEN`: Admin token for JIRA authentication. 22 | - `JIRA_USER_TOKEN`: User token for accessing JIRA. 23 | - `JIRA_USER_EMAIL`: User email for accessing JIRA. 24 | 25 | The `*_ADMIN_*` environment variables are used at ingestion time; this admin account should have access to all of the cloud files/issues/pages that you want included in the vector store. 26 | The `*_USER_*` environment variables are used at inference time; this user may have restricted access to some files/issues/pages, which is checked at runtime. 27 | 28 | ### Required Config Files 29 | 30 | 1. **Google OAuth2 Credentials**: `credentials.json` 31 | This file should contain your OAuth2 credentials for Google Drive API access. Download it from your Google Cloud Console and place it in the `examples` directory. 32 | 33 | 2. **Admin Access Token for Google Drive**: `admin_access_token.json` 34 | This file stores the access token for the Google Drive admin user and is generated through OAuth2. 35 | 36 | ## Installation and Setup 37 | 38 | ### Prerequisites 39 | 40 | Ensure you have [uv](https://docs.astral.sh/uv/) installed for dependency management and virtual environment setup. 41 | 42 | ### Installing Dependencies 43 | 44 | Run the following command to install all dependencies: 45 | 46 | ```bash 47 | uv sync 48 | ``` 49 | 50 | ## Running the Application 51 | - Set Up Environment Variables: Set all required environment variables in your terminal or in a `.env` file in the root directory. 52 | - Run the App: 53 | 54 | ```bash 55 | uv run 02-rag-LlamaIndex-all-sources-processor.py 56 | ``` 57 | 58 | ## How It Works 59 | ### Document Retrieval and Enrichment 60 | The app connects to Google Drive, Confluence, and JIRA to retrieve documents based on the provided credentials and metadata filters. Documents are enriched with metadata through the Pangea Multipass framework, ensuring enhanced access control. 61 | 62 | ### Query Engine and Access Control 63 | The app uses a vector store to index documents and enables query-based retrieval. The query engine notifies users if certain sources were unauthorized, ensuring that answers reflect accessible data only.
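
For orientation, the inference side boils down to a few lines. This is a minimal sketch based on `01-rag-LlamaIndex-gdrive-processor.py`; `user_creds` and `index` are assumed to already exist, and the question is a placeholder:

```python
from pangea_multipass_llama_index import LlamaIndexGDriveProcessor, NodePostprocessorMixer

# The mixer drops retrieved nodes the current user is not authorized to see.
node_processor = NodePostprocessorMixer([LlamaIndexGDriveProcessor(user_creds)])
query_engine = index.as_query_engine(similarity_top_k=10, node_postprocessors=[node_processor])

answer = query_engine.query("What offices do we have?")  # placeholder question
unauthorized_count = len(node_processor.get_unauthorized_nodes())  # drives the warning below
```
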
64 | 65 | ### Unauthorized Document Warning 66 | If documents from any source are unauthorized for the user, a warning is displayed, indicating missing context due to access restrictions. 67 | 68 | ## License 69 | This project is licensed under the MIT License. 70 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | #.idea/ 163 | *.whl 164 | -------------------------------------------------------------------------------- /examples/llama_index_examples/02-gdrive-check-access.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | import os 5 | import sys 6 | from typing import List 7 | 8 | from google.oauth2.credentials import Credentials 9 | from llama_index.readers.google import GoogleDriveReader 10 | from pangea_multipass import GDriveAPI, GDriveME, PangeaMetadataKeys, enrich_metadata 11 | from pangea_multipass_llama_index import LIDocument, LIDocumentReader 12 | 13 | if len(sys.argv) != 2: 14 | print(f"usage: {sys.argv[0]} <gdrive_folder_id>") 15 | exit(1) 16 | 17 | 18 | SCOPES = [ 19 | "openid", 20 | "https://www.googleapis.com/auth/userinfo.email", 21 | "https://www.googleapis.com/auth/userinfo.profile", 22 | "https://www.googleapis.com/auth/drive.metadata.readonly", 23 | "https://www.googleapis.com/auth/drive.readonly", 24 | ] 25 | 26 | # Sample data folder owned by apurv@gondwana.cloud https://drive.google.com/drive/u/1/folders/1Kj77oi2QGEOPKcIo_hKZPiHDJyKKFVgR 27 | 28 | # gdrive_fid = "1Kj77oi2QGEOPKcIo_hKZPiHDJyKKFVgR" 29 | gdrive_fid = sys.argv[1] 30 | 31 | # File name for the admin user 32 | admin_token_filepath = "admin_access_token.json" 33 | 34 | 35 | def google_drive_read_docs() -> List[LIDocument]: 36 | print(f"Loading Google Drive docs. 
Folder ID: {gdrive_fid}.") 37 | # Google Drive Data Ingestion 38 | credentials_filepath = os.path.abspath("../credentials.json") 39 | 40 | # Invoke Google /auth endpoint and save the token for later use 41 | if not os.path.isfile(admin_token_filepath): 42 | print("Sign in with the admin user account:") 43 | GDriveAPI.get_and_save_access_token(credentials_filepath, admin_token_filepath, SCOPES) 44 | 45 | # load the documents and create the index 46 | print("Login to GDrive as admin...") 47 | gdrive_reader = GoogleDriveReader( 48 | folder_id=gdrive_fid, token_path=admin_token_filepath, credentials_path=credentials_filepath 49 | ) 50 | print("Loading data...") 51 | documents: List[LIDocument] = gdrive_reader.load_data(folder_id=gdrive_fid) 52 | 53 | print(f"Processing {len(documents)} docs...") 54 | 55 | # Metadata enricher library 56 | creds = Credentials.from_authorized_user_file(admin_token_filepath, SCOPES) 57 | gdrive_me = GDriveME(creds, {}) 58 | enrich_metadata(documents, [gdrive_me], reader=LIDocumentReader()) 59 | # Finish metadata enrichment 60 | 61 | return documents 62 | 63 | 64 | documents = google_drive_read_docs() 65 | 66 | # Inference 67 | from pangea_multipass import GDriveAPI 68 | from pangea_multipass_llama_index import LlamaIndexGDriveProcessor, NodePostprocessorMixer 69 | 70 | # Create GDrive filter 71 | print("Login to GDrive as user...") 72 | creds = Credentials.from_authorized_user_file(admin_token_filepath, SCOPES) 73 | 74 | # User email to check permissions 75 | user_email = "alice@gondwana.cloud" 76 | 77 | gdrive_processor = LlamaIndexGDriveProcessor(creds, user_email=user_email) 78 | node_processor = NodePostprocessorMixer([gdrive_processor]) 79 | 80 | # Process documents 81 | authorized_docs = node_processor.postprocess_nodes(documents) # type: ignore 82 | unauthorized_docs = node_processor.get_unauthorized_nodes() 83 | 84 | if len(authorized_docs): 85 | print(f"User: '{user_email}' has access to the following files in folder '{gdrive_fid}'") 86 | for doc in authorized_docs: 87 | file_id = doc.metadata.get(PangeaMetadataKeys.GDRIVE_FILE_ID, "") 88 | name = doc.metadata.get(PangeaMetadataKeys.FILE_NAME, "") 89 | print(f"id: {file_id:44} filename: {name}.") 90 | else: 91 | print(f"User '{user_email}' has NO access to any file in folder '{gdrive_fid}'") 92 | 93 | if len(unauthorized_docs): 94 | print(f"\nUser '{user_email}' has NO access to the following files in folder '{gdrive_fid}'") 95 | for doc in unauthorized_docs: 96 | file_id = doc.metadata.get(PangeaMetadataKeys.GDRIVE_FILE_ID, "") 97 | name = doc.metadata.get(PangeaMetadataKeys.FILE_NAME, "") 98 | print(f"id: {file_id:44} filename: {name}.") 99 | else: 100 | print(f"\nUser '{user_email}' has access to all the files in folder '{gdrive_fid}'") 101 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/dropbox_reader.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import List, Optional 4 | 5 | import requests 6 | 7 | from .core import MultipassDocument, PangeaMetadataKeys, PangeaMetadataValues, generate_id 8 | from .sources import DropboxClient 9 | 10 | _actor = "dropbox_reader" 11 | 12 | 13 | class DropboxReader: 14 | _token: str 15 | """Dropbox access token""" 16 | 17 | _has_more: bool 18 | _cursor: Optional[str] 19 | _folder_path: str 20 | _recursive: bool 21 | 22 | def __init__(self, token: str, folder_path: str = "", recursive: bool = True, 
logger_name: str = "multipass"): 23 | self._token = token 24 | self._folder_path = folder_path 25 | self._recursive = recursive 26 | self.logger = logging.getLogger(logger_name) 27 | self._client = DropboxClient(logger_name) 28 | self.restart() 29 | 30 | def restart(self): 31 | self._has_more = True 32 | self._cursor = None 33 | 34 | @property 35 | def has_more_files(self): 36 | """Check if there are more files to read""" 37 | return self._has_more 38 | 39 | def load_data( 40 | self, 41 | ) -> List[MultipassDocument]: 42 | """ 43 | Retrieves all files in Dropbox. 44 | It downloads and returns only files, skipping folders. 45 | """ 46 | 47 | documents: List[MultipassDocument] = [] 48 | 49 | while self._has_more: 50 | documents.extend(self.read_page(page_size=50)) 51 | 52 | return documents 53 | 54 | def read_page(self, page_size: int = 50) -> List[MultipassDocument]: 55 | """ 56 | Read a page of files from Dropbox. 57 | Page size is an approximate number of files to read. 58 | It could be more due to Dropbox API limitations, or it could be less due to folders being skipped. 59 | """ 60 | 61 | documents: List[MultipassDocument] = [] 62 | 63 | url = DropboxClient.LIST_FILES_URL if self._cursor is None else DropboxClient.LIST_CONTINUE_URL 64 | data = {"path": self._folder_path, "recursive": self._recursive, "limit": page_size} 65 | if self._cursor: 66 | data = {"cursor": self._cursor} 67 | 68 | headers = {"Authorization": f"Bearer {self._token}", "Content-Type": "application/json"} 69 | response = requests.post(url, json=data, headers=headers) 70 | if response.status_code != 200: 71 | self.logger.error( 72 | json.dumps( 73 | { 74 | "actor": _actor, 75 | "fn": "read_page", 76 | "action": "post", 77 | "url": url, 78 | "data": data, 79 | "status_code": response.status_code, 80 | "reason": response.reason, 81 | "text": response.text, 82 | } 83 | ) 84 | ) 85 | 86 | response.raise_for_status() 87 | result = response.json() 88 | entries = result.get("entries", []) 89 | 90 | for entry in entries: 91 | if entry.get(".tag", "") != "file": 92 | continue 93 | 94 | file_path: str = entry.get("path_lower", None) 95 | if not file_path: 96 | continue 97 | 98 | name = entry.get("name", "") 99 | path = file_path.removesuffix(f"/{name}") 100 | 101 | file = self._client.download_file(token=self._token, file_path=file_path) 102 | metadata: dict[str, str] = { 103 | PangeaMetadataKeys.DATA_SOURCE: PangeaMetadataValues.DATA_SOURCE_DROPBOX, 104 | PangeaMetadataKeys.DROPBOX_ID: entry.get("id", ""), 105 | PangeaMetadataKeys.DROPBOX_PATH: path, 106 | PangeaMetadataKeys.DROPBOX_FILE_PATH: file_path, 107 | PangeaMetadataKeys.FILE_PATH: file_path, 108 | PangeaMetadataKeys.FILE_NAME: name, 109 | } 110 | documents.append(MultipassDocument(id=generate_id(), content=file, metadata=metadata)) 111 | 112 | self._has_more = result.get("has_more", False) 113 | self._cursor = result.get("cursor") 114 | return documents 115 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/oauth.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import hashlib 3 | import os 4 | import threading 5 | import webbrowser 6 | from http.server import BaseHTTPRequestHandler, HTTPServer 7 | 8 | import requests 9 | 10 | 11 | class OauthFlow: 12 | # TODO: improve to be thread safe 13 | _auth_code = None 14 | 15 | auth_url: str 16 | token_url: str 17 | client_id: str 18 | host: str 19 | port: int 20 | 21 | def __init__( 22 | self, 
23 | auth_url: str, 24 | token_url: str, 25 | client_id: str, 26 | host: str = "localhost", 27 | port: int = 8080, 28 | ): 29 | self.auth_url = auth_url 30 | self.token_url = token_url 31 | self.client_id = client_id 32 | self.host = host 33 | self.port = port 34 | 35 | def run_pkce(self, code_challenge: str, code_verifier: str, code_challenge_method: str = "S256"): 36 | OauthFlow._auth_code = None  # reset the class-level code that the handler below will set 37 | redirect_uri = f"http://{self.host}:{self.port}" 38 | 39 | auth_url = ( 40 | f"{self.auth_url}?" 41 | f"client_id={self.client_id}&" 42 | "response_type=code&" 43 | "token_access_type=offline&" 44 | f"redirect_uri={redirect_uri}&" 45 | f"code_challenge={code_challenge}&" 46 | f"code_challenge_method={code_challenge_method}" 47 | ) 48 | 49 | webbrowser.open(auth_url) 50 | 51 | server_thread = threading.Thread(target=OauthFlow._run_server, args=(self.host, self.port), daemon=True) 52 | server_thread.start() 53 | 54 | while OauthFlow._auth_code is None: 55 | pass # Busy wait (can be improved with event-based handling) 56 | 57 | response = requests.post( 58 | self.token_url, 59 | data={ 60 | "client_id": self.client_id, 61 | "grant_type": "authorization_code", 62 | "code": OauthFlow._auth_code, 63 | "redirect_uri": redirect_uri, 64 | "code_verifier": code_verifier, # PKCE verification 65 | }, 66 | ) 67 | response.raise_for_status() 68 | return response.json() 69 | 70 | class OAuthHandler(BaseHTTPRequestHandler): 71 | """Handles the OAuth redirect to capture auth code automatically.""" 72 | 73 | def do_GET(self): 74 | if "code=" in self.path: 75 | OauthFlow._auth_code = self.path.split("code=")[-1].split("&")[0] 76 | self.send_response(200) 77 | self.send_header("Content-type", "text/html") 78 | self.end_headers() 79 | self.wfile.write(b"
<html><body><h1>Authorization Successful!</h1><p>You can close this tab.</p></body></html>") 80 | else: 81 | self.send_response(400) 82 | self.end_headers() 83 | self.wfile.write(b"<html><body><h1>Authorization Failed</h1></body></html>
") 84 | 85 | @staticmethod 86 | def generate_pkce_pair(): 87 | code_verifier = base64.urlsafe_b64encode(os.urandom(32)).rstrip(b"=").decode("utf-8") 88 | code_challenge = ( 89 | base64.urlsafe_b64encode(hashlib.sha256(code_verifier.encode("utf-8")).digest()) 90 | .rstrip(b"=") 91 | .decode("utf-8") 92 | ) 93 | 94 | return code_verifier, code_challenge 95 | 96 | @staticmethod 97 | def _run_server(host: str = "localhost", port: int = 8080): 98 | """Starts a simple HTTP server to listen for OAuth callback.""" 99 | 100 | redirect_uri = f"http://{host}:{port}" 101 | server = HTTPServer((host, port), OauthFlow.OAuthHandler) 102 | print(f"\n🌍 Listening for authentication response on {redirect_uri} ...") 103 | server.handle_request() # Handles only one request (closes after first login) 104 | 105 | @staticmethod 106 | def refresh_access_token(url: str, refresh_token: str, client_id: str): 107 | """Refresh the access token using the refresh token.""" 108 | 109 | # Send request to refresh the token 110 | response = requests.post( 111 | url, 112 | data={ 113 | "grant_type": "refresh_token", 114 | "refresh_token": refresh_token, 115 | "client_id": client_id, 116 | }, 117 | headers={"Content-Type": "application/x-www-form-urlencoded"}, 118 | ) 119 | response.raise_for_status() 120 | 121 | return response.json() 122 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/gitlab_reader.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Any, List, Optional 3 | 4 | import requests 5 | 6 | from .sources import GitLabClient 7 | from pangea_multipass import MultipassDocument, PangeaMetadataKeys, PangeaMetadataValues, generate_id 8 | 9 | 10 | class GitLabReader: 11 | _token: str 12 | _has_more_files: bool 13 | _next_files_page: Optional[str] 14 | _current_repository: dict 15 | _logger_name: str 16 | 17 | def __init__(self, token: str, logger_name: str = "multipass"): 18 | self._token = token 19 | self.logger = logging.getLogger(logger_name) 20 | self._client = GitLabClient(logger_name) 21 | self._restart() 22 | 23 | def get_repos(self): 24 | """Get all the repositories the token has access to""" 25 | return self._client.get_user_projects(self._token) 26 | 27 | def read_repo_files(self, repository: dict, page_size: int = 100) -> List[MultipassDocument]: 28 | """ 29 | Read files from a given repository 30 | If the repository is different from the last one, it will restart the reader. 31 | If the repository is the same, it will continue reading from the last file. 
32 | """ 33 | self._read_repo_files_checks(repository, page_size) 34 | if self._next_files_page is None: 35 | return [] 36 | 37 | response = requests.get(self._next_files_page, headers={"Authorization": f"Bearer {self._token}"}) 38 | 39 | repo_id = self._current_repository.get("id", None) 40 | if response.status_code != 200: 41 | raise Exception(f"Could not fetch file tree for repository {repo_id}") 42 | 43 | documents: List[MultipassDocument] = [] 44 | files = response.json() 45 | for file in files: 46 | if file["type"] == "blob": # Only download actual files 47 | file_path = file["path"] 48 | file_name = file["name"] 49 | repo_name = self._current_repository.get("name", "") 50 | repo_namespace_path = self._current_repository.get("path_with_namespace", "") 51 | content = self._client.download_file(self._token, repo_id, file_path) # type: ignore[arg-type] 52 | metadata: dict[str, Any] = { 53 | PangeaMetadataKeys.DATA_SOURCE: PangeaMetadataValues.DATA_SOURCE_GITLAB, 54 | PangeaMetadataKeys.GITLAB_REPOSITORY_ID: repo_id, 55 | PangeaMetadataKeys.GITLAB_REPOSITORY_NAME: repo_name, 56 | PangeaMetadataKeys.GITLAB_REPOSITORY_NAMESPACE_WITH_PATH: repo_namespace_path, 57 | PangeaMetadataKeys.FILE_PATH: file_path, 58 | PangeaMetadataKeys.FILE_NAME: file_name, 59 | } 60 | documents.append(MultipassDocument(generate_id(), content, metadata)) 61 | 62 | self._next_files_page = response.links.get("next", {}).get("url", None) # Check if pagination has next page 63 | self._has_more_files = self._next_files_page is not None 64 | 65 | return documents 66 | 67 | def load_data(self) -> List[MultipassDocument]: 68 | """ 69 | Load all the data from the repositories 70 | This will read all the files from all the repositories the token has access to. 71 | This process is blocking and can take a long time. If working with a large number of repositories, 72 | consider using the read_repo_files method.
73 | """ 74 | documents: List[MultipassDocument] = [] 75 | repos = self.get_repos() 76 | 77 | for repo in repos: 78 | has_more_files = True 79 | while has_more_files: 80 | files = self.read_repo_files(repository=repo) 81 | documents.extend(files) 82 | has_more_files = self.has_more_files 83 | 84 | return documents 85 | 86 | @property 87 | def has_more_files(self): 88 | """Check if there are more files to read""" 89 | return self._has_more_files 90 | 91 | def _restart(self): 92 | self._has_more_files = True 93 | self._next_files_page = None 94 | self._current_repository = {} 95 | 96 | def _read_repo_files_checks(self, repository: dict, page_size: int) -> None: 97 | current_repo_id = self._current_repository.get("id", None) 98 | new_repo_id = repository.get("id", None) 99 | if current_repo_id is None or current_repo_id != new_repo_id: 100 | self._restart() 101 | self._current_repository = repository 102 | 103 | if self._has_more_files is True and self._next_files_page is None: 104 | repo_id = repository.get("id", None) 105 | if repo_id is None: 106 | raise Exception("Invalid repository id") 107 | 108 | self._next_files_page = f"https://gitlab.com/api/v4/projects/{repo_id}/repository/tree?recursive=true&per_page={page_size}&pagination=keyset" 109 | else: 110 | self._has_more_files = False 111 | -------------------------------------------------------------------------------- /examples/llama_index_examples/03-rag-LlamaIndex-gdrive-filter.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | import os 5 | import warnings 6 | from typing import List 7 | 8 | from google.oauth2.credentials import Credentials 9 | from llama_index.core import Settings, StorageContext, VectorStoreIndex, load_index_from_storage 10 | from llama_index.embeddings.bedrock import BedrockEmbedding 11 | from llama_index.llms.bedrock import Bedrock 12 | from llama_index.readers.google import GoogleDriveReader 13 | from pangea_multipass import GDriveAPI, GDriveME, enrich_metadata 14 | from pangea_multipass_llama_index import LIDocument, LIDocumentReader 15 | 16 | # Suppress specific warning 17 | warnings.filterwarnings("ignore", message='Field "model_name" has conflict with protected namespace') 18 | 19 | SCOPES = [ 20 | "openid", 21 | "https://www.googleapis.com/auth/userinfo.email", 22 | "https://www.googleapis.com/auth/userinfo.profile", 23 | "https://www.googleapis.com/auth/drive.metadata.readonly", 24 | ] 25 | 26 | # import logging 27 | # import sys 28 | 29 | # logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 30 | # logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout)) 31 | 32 | 33 | # Initialize LLM, anthropic deployed on bedrock 34 | llm = Bedrock( 35 | model="anthropic.claude-3-5-sonnet-20240620-v1:0", 36 | profile_name="dev", 37 | region_name="us-west-2", 38 | temperature=0.5, 39 | max_tokens=512, 40 | ) 41 | 42 | # Initialize Embedding model, amazon titan deployed on bedrock 43 | embed_model = BedrockEmbedding(model="amazon.titan-embed-g1-text-02", region_name="us-west-2", profile_name="dev") 44 | 45 | # Set up the models 46 | Settings.llm = llm 47 | Settings.embed_model = embed_model 48 | 49 | # Set up chunking parameters 50 | Settings.chunk_size = 1000 51 | Settings.chunk_overlap = 100 52 | 53 | 54 | def google_drive_read_docs() -> List[LIDocument]: 55 | print("Loading Google Drive docs...") 56 | # Google Drive Data Ingestion 57 | credentials_filepath = 
os.path.abspath("../credentials.json") 58 | 59 | # Sample data folder owned by apurv@gondwana.cloud https://drive.google.com/drive/u/1/folders/1Kj77oi2QGEOPKcIo_hKZPiHDJyKKFVgR 60 | gdrive_fid = "1Kj77oi2QGEOPKcIo_hKZPiHDJyKKFVgR" 61 | 62 | # File name for the admin user 63 | admin_token_filepath = "admin_access_token.json" 64 | 65 | # # Invoke Google /auth endpoint and save the token for later use 66 | # GDrive.get_and_save_access_token(credentials_filepath, admin_token_filepath, SCOPES) 67 | 68 | # load the documents and create the index 69 | print("Login to GDrive as admin...") 70 | gdrive_reader = GoogleDriveReader( 71 | folder_id=gdrive_fid, token_path=admin_token_filepath, credentials_path=credentials_filepath 72 | ) 73 | documents: List[LIDocument] = gdrive_reader.load_data(folder_id=gdrive_fid) 74 | 75 | print(f"Processing {len(documents)} docs...") 76 | 77 | # Metadata enricher library 78 | creds = Credentials.from_authorized_user_file(admin_token_filepath, SCOPES) 79 | gdrive_me = GDriveME(creds, {}) 80 | enrich_metadata(documents, [gdrive_me], reader=LIDocumentReader()) 81 | # Finish metadata enrichment 82 | 83 | return documents 84 | 85 | 86 | # Load data from Gdrive or from the disk 87 | PERSIST_DIR = "./storage/rbac/llamaindex/gdrive" 88 | if not os.path.exists(PERSIST_DIR): 89 | # Load documents 90 | documents = google_drive_read_docs() 91 | 92 | print("Create and save index...") 93 | index = VectorStoreIndex.from_documents(documents) 94 | # store it for later 95 | index.storage_context.persist(persist_dir=PERSIST_DIR) 96 | else: 97 | # load the existing index 98 | print("Loading index...") 99 | storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR) 100 | index = load_index_from_storage(storage_context) # type: ignore 101 | 102 | 103 | # Inference 104 | from pangea_multipass_llama_index import LlamaIndexGDriveProcessor, NodePostprocessorMixer 105 | 106 | # Create GDrive filter 107 | credentials_filepath = os.path.abspath("../credentials.json") 108 | print("Login to GDrive as user...") 109 | creds = GDriveAPI.get_user_credentials(credentials_filepath, scopes=SCOPES) 110 | 111 | node_processor = NodePostprocessorMixer([LlamaIndexGDriveProcessor(creds)]) 112 | metadata_filters = node_processor.get_filter() 113 | 114 | # Using filters 115 | query_engine = index.as_query_engine(similarity_top_k=10, streaming=True, filters=metadata_filters) 116 | retriever = index.as_retriever(similarity_top_k=10) 117 | 118 | 119 | # Inference pipeline 120 | while True: 121 | user_prompt = input("Enter your question:") 122 | 123 | nodes = retriever.retrieve(user_prompt) 124 | count = len(node_processor.get_unauthorized_nodes()) 125 | count_authorized = len(node_processor.get_authorized_nodes()) 126 | 127 | answer = query_engine.query(user_prompt) 128 | # print("Assistant: ", answer) 129 | answer.print_response_stream() # type: ignore 130 | 131 | print("\n=================\n") 132 | print( 133 | f"\nWarning: This answer could be inaccurate since it is missing context from {count} out of {len(nodes)} retrieved sources. It includes {count_authorized} authorized sources."
134 | ) 135 | print("\n++++++++++++++++++") 136 | -------------------------------------------------------------------------------- /examples/llama_index_examples/01-rag-LlamaIndex-gdrive-processor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | import os 5 | import warnings 6 | from typing import List 7 | 8 | from google.oauth2.credentials import Credentials 9 | from llama_index.core import Settings, StorageContext, VectorStoreIndex, load_index_from_storage 10 | from llama_index.embeddings.bedrock import BedrockEmbedding 11 | from llama_index.llms.bedrock import Bedrock 12 | from llama_index.readers.google import GoogleDriveReader 13 | from pangea_multipass import GDriveME, enrich_metadata 14 | from pangea_multipass_llama_index import LIDocument, LIDocumentReader 15 | 16 | # Suppress specific warning 17 | warnings.filterwarnings("ignore", message='Field "model_name" has conflict with protected namespace') 18 | 19 | SCOPES = [ 20 | "openid", 21 | "https://www.googleapis.com/auth/userinfo.email", 22 | "https://www.googleapis.com/auth/userinfo.profile", 23 | "https://www.googleapis.com/auth/drive.metadata.readonly", 24 | ] 25 | 26 | # import logging 27 | # import sys 28 | 29 | # logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 30 | # logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout)) 31 | 32 | 33 | # Initialize LLM, anthropic deployed on bedrock 34 | llm = Bedrock( 35 | model="anthropic.claude-3-5-sonnet-20240620-v1:0", 36 | profile_name="dev", 37 | region_name="us-west-2", 38 | temperature=0.5, 39 | max_tokens=512, 40 | ) 41 | 42 | # Initialize Embedding model, amazon titan deployed on bedrock 43 | embed_model = BedrockEmbedding(model="amazon.titan-embed-g1-text-02", region_name="us-west-2", profile_name="dev") 44 | 45 | # Set up the models 46 | Settings.llm = llm 47 | Settings.embed_model = embed_model 48 | 49 | # Set up chunking parameters 50 | Settings.chunk_size = 1000 51 | Settings.chunk_overlap = 100 52 | 53 | 54 | def google_drive_read_docs() -> List[LIDocument]: 55 | print("Loading Google Drive docs...") 56 | # Google Drive Data Ingestion 57 | credentials_filepath = os.path.abspath("../credentials.json") 58 | 59 | # Sample data folder owned by apurv@gondwana.cloud https://drive.google.com/drive/u/1/folders/1Kj77oi2QGEOPKcIo_hKZPiHDJyKKFVgR 60 | gdrive_fid = "1Kj77oi2QGEOPKcIo_hKZPiHDJyKKFVgR" 61 | 62 | # File name for the admin user 63 | admin_token_filepath = "admin_access_token.json" 64 | 65 | # # Invoke Google /auth endpoint and save the token for later use 66 | # GDrive.get_and_save_access_token(credentials_filepath, admin_token_filepath, SCOPES) 67 | 68 | # load the documents and create the index 69 | print("Login to GDrive as admin...") 70 | gdrive_reader = GoogleDriveReader( 71 | folder_id=gdrive_fid, token_path=admin_token_filepath, credentials_path=credentials_filepath 72 | ) 73 | documents: List[LIDocument] = gdrive_reader.load_data(folder_id=gdrive_fid) 74 | 75 | print(f"Processing {len(documents)} docs...") 76 | 77 | # Metadata enricher library 78 | creds = Credentials.from_authorized_user_file(admin_token_filepath, SCOPES) 79 | gdrive_me = GDriveME(creds, {}) 80 | enrich_metadata(documents, [gdrive_me], reader=LIDocumentReader()) 81 | # Finish metadata enrichment 82 | 83 | return documents 84 | 85 | 86 | # Load data from Gdrive or from the disk 87 | PERSIST_DIR = "./storage/rbac/llamaindex/gdrive" 88 | 
if not os.path.exists(PERSIST_DIR): 89 | # Load documents 90 | documents = google_drive_read_docs() 91 | 92 | print("Create and save index...") 93 | index = VectorStoreIndex.from_documents(documents) 94 | # store it for later 95 | index.storage_context.persist(persist_dir=PERSIST_DIR) 96 | else: 97 | # load the existing index 98 | print("Loading index...") 99 | storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR) 100 | index = load_index_from_storage(storage_context) # type: ignore 101 | 102 | # Inference 103 | 104 | from pangea_multipass import GDriveAPI 105 | from pangea_multipass_llama_index import LlamaIndexGDriveProcessor, NodePostprocessorMixer 106 | 107 | # Create GDrive filter 108 | credentials_filepath = os.path.abspath("../credentials.json") 109 | creds = GDriveAPI.get_user_credentials(credentials_filepath, scopes=SCOPES) 110 | 111 | print("Login to GDrive as user...") 112 | gdrive_processor = LlamaIndexGDriveProcessor(creds) 113 | node_processor = NodePostprocessorMixer([gdrive_processor]) 114 | 115 | # Using node postprocessor 116 | query_engine = index.as_query_engine( 117 | streaming=True, 118 | similarity_top_k=10, 119 | node_postprocessors=[node_processor], 120 | ) 121 | 122 | retriever = index.as_retriever(similarity_top_k=10) 123 | 124 | 125 | # Inference pipeline 126 | while True: 127 | user_prompt = input("Enter your question:") 128 | 129 | nodes = retriever.retrieve(user_prompt) 130 | count = len(node_processor.get_unauthorized_nodes()) 131 | count_authorized = len(node_processor.get_authorized_nodes()) 132 | 133 | answer = query_engine.query(user_prompt) 134 | # print("Assistant: ", answer) 135 | answer.print_response_stream() # type: ignore 136 | 137 | print("\n=================\n") 138 | print( 139 | f"\nWarning: This answer could be inaccurate since it is missing context from {count} out of {len(nodes)} retrieved sources. It includes {count_authorized} authorized sources." 140 | ) 141 | print("\n++++++++++++++++++") 142 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/github_reader.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Any, List, Optional 3 | 4 | from .core import MultipassDocument, PangeaMetadataKeys, PangeaMetadataValues, generate_id 5 | from .sources.github import GitHubClient 6 | 7 | 8 | class GitHubReader: 9 | _token: str 10 | """GitHub personal access token""" 11 | 12 | _current_file: int = 0 13 | _repo_files: Optional[List[dict]] = None 14 | _current_repository: dict = {} 15 | 16 | def __init__(self, token: str, logger_name: str = "multipass"): 17 | self._token = token 18 | self.logger = logging.getLogger(logger_name) 19 | self._client = GitHubClient(logger_name) 20 | self._restart() 21 | 22 | def load_data( 23 | self, 24 | ) -> List[MultipassDocument]: 25 | """ 26 | Load all the data from the repositories 27 | This will read all the files from all the repositories the token has access to. 28 | This process is blocking and can take a long time. If working with a large number of repositories, 29 | consider using the read_repo_files method.
30 | """ 31 | documents: List[MultipassDocument] = [] 32 | 33 | # Get repositories 34 | repos = self._client.get_user_repos(self._token) 35 | 36 | for repo in repos: 37 | owner = repo["owner"]["login"] 38 | repo_name = repo["name"] 39 | 40 | # Get all files recursively 41 | files = self._client.get_repo_files(self._token, owner, repo_name) 42 | 43 | for file in files: 44 | file_path = file["path"] 45 | download_url = file["url"] 46 | 47 | # Fetch the file content 48 | content = self._client.download_file_content(self._token, download_url) 49 | 50 | # Create metadata 51 | metadata: dict[str, Any] = { 52 | PangeaMetadataKeys.GITHUB_REPOSITORY_NAME: repo_name, 53 | PangeaMetadataKeys.GITHUB_REPOSITORY_OWNER: owner, 54 | PangeaMetadataKeys.FILE_PATH: file_path, 55 | PangeaMetadataKeys.FILE_NAME: file_path, 56 | PangeaMetadataKeys.DATA_SOURCE: PangeaMetadataValues.DATA_SOURCE_GITHUB, 57 | PangeaMetadataKeys.GITHUB_REPOSITORY_OWNER_AND_NAME: (owner, repo_name), 58 | } 59 | 60 | doc = MultipassDocument(id=generate_id(), content=content, metadata=metadata) 61 | documents.append(doc) 62 | 63 | return documents 64 | 65 | def read_repo_files(self, repository: dict, page_size: int = 100) -> List[MultipassDocument]: 66 | """ 67 | Read files from a given repository 68 | If the repository is different from the last one, it will restart the reader. 69 | If the repository is the same, it will continue reading from the last file. 70 | """ 71 | documents: List[MultipassDocument] = [] 72 | 73 | self._read_repo_files_checks(repository) 74 | if self._repo_files is None: 75 | return documents 76 | 77 | owner = self._current_repository["owner"]["login"] 78 | repo_name = self._current_repository["name"] 79 | 80 | i = 0 81 | while i < page_size and self._current_file < len(self._repo_files): 82 | file = self._repo_files[self._current_file] 83 | i += 1 84 | self._current_file += 1 85 | 86 | file_path = file["path"] 87 | download_url = file["url"] 88 | 89 | # Fetch the file content 90 | content = self._client.download_file_content(self._token, download_url) 91 | 92 | # Create metadata 93 | metadata: dict[str, Any] = { 94 | PangeaMetadataKeys.GITHUB_REPOSITORY_NAME: repo_name, 95 | PangeaMetadataKeys.GITHUB_REPOSITORY_OWNER: owner, 96 | PangeaMetadataKeys.FILE_PATH: file_path, 97 | PangeaMetadataKeys.FILE_NAME: file_path, 98 | PangeaMetadataKeys.DATA_SOURCE: PangeaMetadataValues.DATA_SOURCE_GITHUB, 99 | PangeaMetadataKeys.GITHUB_REPOSITORY_OWNER_AND_NAME: (owner, repo_name), 100 | } 101 | 102 | doc = MultipassDocument(id=generate_id(), content=content, metadata=metadata) 103 | documents.append(doc) 104 | 105 | return documents 106 | 107 | def get_repos(self) -> List[dict]: 108 | """Get all the repositories the token has access to""" 109 | return self._client.get_user_repos(self._token) 110 | 111 | @property 112 | def has_more_files(self): 113 | """Check if there are more files to read""" 114 | return self._repo_files is not None and self._current_file < len(self._repo_files) 115 | 116 | def _restart(self) -> None: 117 | self._current_file = 0 118 | self._repo_files = None 119 | self._current_repository = {} 120 | 121 | def _read_repo_files_checks(self, repository: dict) -> None: 122 | current_repo_id = self._current_repository.get("id", None) 123 | new_repo_id = repository.get("id", None) 124 | 125 | if current_repo_id is None or current_repo_id != new_repo_id: 126 | self._restart() 127 | self._current_repository = repository 128 | 129 | owner = self._current_repository["owner"]["login"] 130 | repo_name = 
self._current_repository["name"] 131 | 132 | if self._repo_files is None: 133 | self._repo_files = self._client.get_repo_files(self._token, owner, repo_name) 134 | -------------------------------------------------------------------------------- /examples/langchain_examples/02-rag-LangChain-gdrive.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | import os 5 | from io import BytesIO 6 | from pathlib import Path 7 | from typing import List 8 | 9 | import boto3 10 | from google.oauth2.credentials import Credentials 11 | from langchain_aws import BedrockEmbeddings, ChatBedrock 12 | from langchain_community.vectorstores import FAISS 13 | from langchain_core.documents import Document 14 | from langchain_google_community import GoogleDriveLoader 15 | from pangea_multipass import GDriveAPI, GDriveME, enrich_metadata 16 | from pangea_multipass_langchain import DocumentFilterMixer, LangChainDocumentReader 17 | 18 | # Initialization 19 | bedrock_client = boto3.client("bedrock-runtime", region_name="us-west-2") 20 | model_id = "anthropic.claude-3-5-sonnet-20240620-v1:0" 21 | model_kwargs = { 22 | "max_tokens": 512, 23 | "temperature": 0.5, 24 | } 25 | 26 | ## Setup the LLM parameters 27 | llm = ChatBedrock( 28 | client=bedrock_client, 29 | model_id=model_id, 30 | model_kwargs=model_kwargs, 31 | ) 32 | 33 | ## Setup the Embedding model parameters 34 | embedding_model = BedrockEmbeddings(model_id="amazon.titan-embed-g1-text-02", client=bedrock_client) 35 | 36 | 37 | class TextLoader: 38 | file: BytesIO 39 | 40 | def __init__(self, file: BytesIO): 41 | self.file = file 42 | 43 | def load(self) -> List[Document]: 44 | return [Document(page_content=self.file.read().decode("utf-8"))] 45 | 46 | 47 | ## Data ingestion pipeline 48 | 49 | PERSIST_DIR = "./storage/data/langchain/faiss_index" 50 | if not os.path.exists(PERSIST_DIR): 51 | os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "" 52 | # Google Drive Data Ingestion 53 | admin_token_filepath = "admin_access_token.json" 54 | 55 | credentials_filepath = os.path.abspath("../credentials.json") 56 | print("Login to GDrive as admin...") 57 | GDriveAPI.get_and_save_access_token( 58 | credentials_filepath, admin_token_filepath, ["https://www.googleapis.com/auth/drive.readonly"] 59 | ) 60 | 61 | loader = GoogleDriveLoader( 62 | folder_id="1Kj77oi2QGEOPKcIo_hKZPiHDJyKKFVgR", 63 | token_path=Path(admin_token_filepath), 64 | credentials_path=Path(credentials_filepath), 65 | recursive=True, 66 | load_extended_metadata=True, 67 | file_loader_cls=TextLoader, 68 | ) 69 | 70 | docs = loader.load() 71 | print(f"GDrive docs loaded: {len(docs)}.") 72 | 73 | # Metadata enricher library 74 | SCOPES = [ 75 | "openid", 76 | "https://www.googleapis.com/auth/userinfo.email", 77 | "https://www.googleapis.com/auth/userinfo.profile", 78 | "https://www.googleapis.com/auth/drive.metadata.readonly", 79 | ] 80 | 81 | creds = Credentials.from_authorized_user_file(admin_token_filepath, SCOPES) 82 | gdrive_me = GDriveME(creds, {}) 83 | enrich_metadata(docs, [gdrive_me], reader=LangChainDocumentReader()) 84 | # Finish metadata enrichement 85 | 86 | # Initialize the vector store https://faiss.ai 87 | print("Initializing vector store...") 88 | vectorstore = FAISS.from_documents(documents=docs, embedding=embedding_model) 89 | 90 | # Store to file system 91 | print("Storing vector store...") 92 | vectorstore.save_local(PERSIST_DIR) 93 | else: 94 | print("Loading vector 
store...") 95 | vectorstore = FAISS.load_local( 96 | folder_path=PERSIST_DIR, embeddings=embedding_model, allow_dangerous_deserialization=True 97 | ) 98 | 99 | 100 | ## Inference pipeline 101 | 102 | from langchain.chains.combine_documents import create_stuff_documents_chain 103 | from langchain_core.prompts import ChatPromptTemplate 104 | from pangea_multipass_langchain import LangChainGDriveFilter 105 | 106 | # Create GDrive filter 107 | credentials_filepath = os.path.abspath("../credentials.json") 108 | SCOPES = [ 109 | "openid", 110 | "https://www.googleapis.com/auth/userinfo.email", 111 | "https://www.googleapis.com/auth/userinfo.profile", 112 | "https://www.googleapis.com/auth/drive.metadata.readonly", 113 | ] 114 | 115 | print("Login to GDrive as user...") 116 | creds = GDriveAPI.get_user_credentials(credentials_filepath, scopes=SCOPES) 117 | 118 | gdrive_filter = LangChainGDriveFilter(creds) 119 | filter_mixer = DocumentFilterMixer(document_filters=[gdrive_filter]) 120 | 121 | # Use indexed store as a reteriver to create qa chain 122 | retriever = vectorstore.as_retriever() 123 | 124 | # Prompt template with System, Context and User prompt 125 | template = """"System: Answer the following question based only on the provided context: 126 | 127 | 128 | {context} 129 | 130 | 131 | Question: {input} 132 | """ 133 | prompt = ChatPromptTemplate.from_template(template) 134 | 135 | # Document chain using the LLM and prompt template 136 | qa_chain = create_stuff_documents_chain(llm, prompt) 137 | 138 | while True: 139 | user_prompt = input("Enter your question: ") 140 | similar_docs = retriever.invoke(user_prompt) 141 | 142 | print(f"similar_docs: {len(similar_docs)}") 143 | 144 | filtered_docs = filter_mixer.filter(similar_docs) 145 | print(f"filtered_docs: {len(filtered_docs)}") 146 | count = len(filter_mixer.get_unauthorized_documents()) 147 | 148 | response = qa_chain.invoke({"input": user_prompt, "context": filtered_docs}) 149 | print(f"\n{response}") 150 | print("\n=================") 151 | print( 152 | f"Warning: This answer could be inaccurate as its missing context from {count} out of {len(similar_docs)} data sources. Include {len(filtered_docs)} sources." 153 | ) 154 | print("=================\n") 155 | -------------------------------------------------------------------------------- /EXTENDING.md: -------------------------------------------------------------------------------- 1 | # Extending Pangea Multipass 2 | 3 | ## Core Components 4 | 5 | - Metadata Enrichment: Use classes like HasherSHA256 and Constant to add enriched metadata to documents. 6 | - Metadata Enrichers are applied through `enrich_metadata` method to all the documents. 7 | - DocumentReader: Implement custom readers for extracting document content. 8 | - MetadataUpdater: Apply enriched metadata to documents. 9 | - Filter Operators: Use FilterOperator for applying various metadata filters (e.g., EQ, GT, CONTAINS, etc.). 10 | 11 | ## Extend file sources 12 | 13 | In order to extend the file sources provided by this package, here it's explain how new classes should be implemented. In this case GDrive source is used as an example and documented in detail. 14 | New sources implementation should have, at least, 2 parts: 15 | 16 | - Metadata Enricher: Used on ingestion time to add metadata to the documents. 17 | - Processors/Filters: Used on inference time to process enriched medatada and filter documents based on logged user. 
18 | 19 | ## Metadata Enricher 20 | 21 | The metadata enricher for a custom source should inherit from the `MetadataEnricher` class. 22 | 23 | ```python 24 | class GDriveME(MetadataEnricher): 25 | ``` 26 | 27 | This inheritance requires that the `extract_metadata` method is implemented with this signature: 28 | 29 | ``` 30 | extract_metadata(self, doc: Any, file_content: str) -> dict[str, Any]: 31 | ``` 32 | 33 | This method receives the document itself, so it is possible to access the document's metadata and other attributes if needed, and it also receives the document content, so it is possible to process it however you need (hash it, process it with another LLM to extract further information about it, etc.) 34 | 35 | ```python 36 | def extract_metadata(self, doc: Any, file_content: str) -> dict[str, Any]: 37 | metadata: dict[str, Any] = {} 38 | 39 | # This step is to normalize some attributes across platforms 40 | # Optional: if CustomSource has an attribute related to the file name, title, etc., copy it to 41 | # PangeaMetadataKeys.FILE_NAME in order to unify this key/value across sources. 42 | metadata[PangeaMetadataKeys.FILE_NAME] = doc.metadata.get("file name", "") 43 | 44 | # Required: the metadata enricher should set the data source key as follows 45 | metadata[PangeaMetadataKeys.DATA_SOURCE] = PangeaMetadataValues.DATA_SOURCE_GDRIVE 46 | 47 | # Required: at least for this use case, this metadata enricher must set the file id, which will be used 48 | # at inference time to request this file's permissions. 49 | # In this case a helper gets the id from the metadata, because LangChain and LlamaIndex use different 50 | # key names to store it in the metadata. 51 | id = self._get_id_from_metadata(doc.metadata) 52 | if not id: 53 | raise Exception("empty doc_id") 54 | metadata[PangeaMetadataKeys.GDRIVE_FILE_ID] = id 55 | 56 | # The new metadata is returned so it can be applied by the `enrich_metadata` method implemented in the core package 57 | return metadata 58 | ``` 59 | 60 | NOTE: In this case the GDrive metadata enricher has to be initialized with admin credentials to access the data source, so it is able to request all the files and their metadata. 61 | 62 | ## Processor/Filter 63 | 64 | A custom processor should inherit from `PangeaGenericNodeProcessor` and `Generic[T]`. 65 | 66 | ```python 67 | class GDriveProcessor(PangeaGenericNodeProcessor, Generic[T]): 68 | ``` 69 | 70 | `PangeaGenericNodeProcessor` requires that the `filter` and `get_filter` methods are implemented. 71 | 72 | The `filter()` method takes care of filtering the available nodes at run time. To do this, the processor should be initialized with user credentials so it can check which files the user has access to. 73 | 74 | ```python 75 | def filter( 76 | self, 77 | nodes: List[T], 78 | ) -> List[T]: 79 | 80 | return [node for node in nodes if self._is_authorized(node)] 81 | 82 | ``` 83 | 84 | The `get_filter()` method returns a `MetadataFilter` to be used in LlamaIndex or LangChain retriever filters. It requests the permissions of every file, so it is not performant for really large datasets. It is recommended to use `filter` so that only the files relevant to the current prompt are checked.
84 | 85 | ```python 86 | def get_filter( 87 | self, 88 | ): 89 | 90 | if not self.files_ids: 91 | self.files_ids = GDriveAPI.list_all_file_ids(self.creds) 92 | 93 | return MetadataFilter(key=PangeaMetadataKeys.GDRIVE_FILE_ID, value=self.files_ids, operator=FilterOperator.IN) 94 | ``` 95 | 96 | ## API 97 | 98 | This third class simply groups all the API requests related to this particular data source. It is not required, but it is a convenient way to collect the methods used internally by the classes above. 99 | 100 | ```python 101 | class GDriveAPI: 102 | _SCOPES = [ 103 | ... 104 | ] 105 | 106 | _user_token_filepath: str = "gdrive_access_token.json" 107 | 108 | @staticmethod 109 | def get_and_save_access_token(credentials_filepath, token_filepath, scopes): 110 | pass 111 | 112 | @staticmethod 113 | def get_user_info(creds: Credentials): 114 | pass 115 | 116 | @staticmethod 117 | def get_user_credentials( 118 | credentials_filepath: str, user_token_filepath: str = _user_token_filepath, scopes=_SCOPES 119 | ): 120 | pass 121 | 122 | @staticmethod 123 | def check_file_access(creds: Credentials, file_id: str) -> bool: 124 | pass 125 | 126 | @staticmethod 127 | def list_all_file_ids(creds: Credentials) -> List[str]: 128 | pass 129 | ``` 130 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Trying Pangea Multipass 2 | 3 | Pangea Multipass is a general purpose library for checking a user's access to resources in an upstream system. While we (Pangea) originally built this for our AI/LLM apps, you can use this library independently. To see that in action, check out the `multipass_examples` folder; otherwise, explore your LLM framework of choice. 4 | 5 | Each directory has its own README to get set up, though many of the steps overlap. 6 | 7 | ## Set up the environment 8 | 9 | These are the upstream data sources the core library currently supports. Configure the ones you need and store the credentials for the examples. Most of these will require administrator access to get the credentials. 10 | 11 | 12 | ### Google Drive 13 | 14 | To use Google Drive as a source in the examples, you need to: 15 | 16 | - Download the `credentials.json` file from the Google console and save it in the `/examples/` folder. 17 | - In the example script, update the `gdrive_fid` variable with the Google Drive folder ID to process. 18 | 19 | 20 | ### Jira 21 | 22 | To use Jira as a source, you need to set some environment variables: 23 | - `JIRA_BASE_URL`: Jira project base URL. Its format is `<your-domain>.atlassian.net/`. Be sure to remove the `https://` part. 24 | - `JIRA_ADMIN_EMAIL`: Admin email used at ingestion time. The system will process all the tickets this user has access to. 25 | - `JIRA_ADMIN_TOKEN`: Access token of the admin email set above. [Learn more](https://support.atlassian.com/atlassian-account/docs/manage-api-tokens-for-your-atlassian-account/). 26 | - `JIRA_USER_EMAIL`: User email used at inference time. This email is used to validate which of the tickets returned by the LLM the user has access to. 27 | - `JIRA_USER_TOKEN`: Access token of the user email set above. [Learn more](https://support.atlassian.com/atlassian-account/docs/manage-api-tokens-for-your-atlassian-account/). 28 | - `JIRA_USER_ACCOUNT_ID`: Set it to use `JIRA_ADMIN_TOKEN` and `JIRA_ADMIN_EMAIL` at inference time to check user permissions.
130 | 
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
1 | # Trying Pangea Multipass
2 | 
3 | Pangea Multipass is a general-purpose library for checking a user's access to resources in an upstream system. While we (Pangea) originally built this for our AI/LLM apps, you can use this library independently. To see that in action, check out the `multipass_examples` folder; otherwise, explore your LLM framework of choice.
4 | 
5 | Each directory has its own README to get set up, though many of the steps overlap.
6 | 
7 | ## Set up the environment
8 | 
9 | These are the upstream data sources the core library currently supports. Configure the ones you need and store the credentials for the examples. Most of these will require administrator access to get the credentials.
10 | 
11 | 
12 | ### Google Drive
13 | 
14 | In order to use Google Drive as a source in the examples you need to:
15 | 
16 | - Download the `credentials.json` file from the Google console and save it in the `/examples/` folder.
17 | - In the example script, update the `gdrive_fid` variable value with the Google Drive folder ID to process.
18 | 
19 | 
20 | ### Jira
21 | 
22 | In order to use Jira as a source, you need to set some environment variables:
23 | - `JIRA_BASE_URL`: Jira project base URL. Its format is `<your-domain>.atlassian.net/`. Take care to remove the `https://` part.
24 | - `JIRA_ADMIN_EMAIL`: Admin email used at ingestion time. The system will process all the tickets this user has access to.
25 | - `JIRA_ADMIN_TOKEN`: Access token of the admin email set above. [Learn more](https://support.atlassian.com/atlassian-account/docs/manage-api-tokens-for-your-atlassian-account/).
26 | - `JIRA_USER_EMAIL`: User email used at inference time. This email will be used to validate which tickets returned by the LLM the user has access to.
27 | - `JIRA_USER_TOKEN`: Access token of the user email set above. [Learn more](https://support.atlassian.com/atlassian-account/docs/manage-api-tokens-for-your-atlassian-account/).
28 | - `JIRA_USER_ACCOUNT_ID`: Set it to use `JIRA_ADMIN_TOKEN` and `JIRA_ADMIN_EMAIL` at inference time to check user permissions. This way it's not necessary to set `JIRA_USER_EMAIL` and `JIRA_USER_TOKEN`.
29 | 
30 | 
31 | ### Confluence
32 | 
33 | In order to use Confluence as a source, you need to set some environment variables:
34 | - `CONFLUENCE_BASE_URL`: Confluence project base URL. Its format is `https://<your-domain>.atlassian.net/`.
35 | - `CONFLUENCE_ADMIN_EMAIL`: Admin email used at ingestion time. The system will process all the files this user has access to.
36 | - `CONFLUENCE_ADMIN_TOKEN`: Access token of the admin email set above. [Learn more](https://support.atlassian.com/atlassian-account/docs/manage-api-tokens-for-your-atlassian-account/)
37 | - `CONFLUENCE_USER_EMAIL`: User email used at inference time. This email will be used to validate which files returned by the LLM the user has access to.
38 | - `CONFLUENCE_USER_TOKEN`: Access token of the user email set above. [Learn more](https://support.atlassian.com/atlassian-account/docs/manage-api-tokens-for-your-atlassian-account/)
39 | 
40 | 
41 | ### GitHub
42 | 
43 | In order to use GitHub as a source, you need to set some environment variables:
44 | - `GITHUB_ADMIN_TOKEN`: Access token used at ingestion time. The system will process all the repositories this token has access to. [Learn more](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens#creating-a-fine-grained-personal-access-token). This could be a `Fine-grained personal access token` with access to all the repositories owned by the admin account and `repository permission` set to `read access to code and metadata`.
45 | - `GITHUB_USER_TOKEN`: (Deprecated. Use `GITHUB_USERNAME` and `GITHUB_ADMIN_TOKEN` instead.) Token used at inference time. It will be used to validate which files returned by the LLM the user has access to. [Learn more](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens#creating-a-personal-access-token-classic). This must be a `classic personal access token` with scoped access to (at least) all the `repo` items.
46 | - `GITHUB_USERNAME`: Username used at inference time. It will be used to validate which files returned by the LLM the user has access to.
47 | 
48 | 
49 | ### Slack
50 | 
51 | In order to use Slack as a source, you need to set some environment variables: `SLACK_ADMIN_TOKEN` and `SLACK_USER_TOKEN`.
52 | 
53 | To get these tokens, you must create a Slack App and generate the tokens. The default app settings are sufficient. For further instructions on how to get Slack tokens, you can [click here](https://api.slack.com/tutorials/tracks/getting-a-token).
54 | 
55 | For this particular application the token's scopes should be at least `channels:history`, `groups:history`, `users:read`, and `users:read.email`, in order to process all public and private channels and access user emails to check their permissions. A quick way to sanity-check a token is shown after the list below.
56 | 
57 | - `SLACK_ADMIN_TOKEN`: Access token used at ingestion time. The system will process all the channels this token has access to.
58 | - `SLACK_USER_TOKEN`: (Deprecated. Use `SLACK_USER_EMAIL` and `SLACK_ADMIN_TOKEN` instead.) Token used at inference time. It will be used to validate which files returned by the LLM the user has access to.
59 | - `SLACK_USER_EMAIL`: User email used at inference time. This email will be used to validate which files returned by the LLM the user has access to.
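Before running the examples, you can sanity-check a token with the `slack_sdk` package (already a dependency of the core library). This is a sketch for verification only:

```python
import os

from slack_sdk import WebClient

# Sketch only: auth_test() raises SlackApiError if the token is invalid.
client = WebClient(token=os.environ["SLACK_ADMIN_TOKEN"])
identity = client.auth_test()
print(f"Authenticated to {identity['team']} as {identity['user']}")
```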
60 | 
61 | 
62 | ### Dropbox
63 | 
64 | When using Dropbox as a source, the admin user needs to authenticate on Dropbox using the [OAuth2 protocol](https://developers.dropbox.com/oauth-guide) and allow Pangea's `pangea-multipass` Dropbox App. To do so, follow the OAuth2 flow using PKCE and the `pangea-multipass` app key, which can be stored in `DROPBOX_APP_KEY`; a sketch of this flow is shown after the list below.
65 | 
66 | In order to use Dropbox as a source, you need two environment variables:
67 | 
68 | - `DROPBOX_APP_KEY`: The identifier for the Dropbox app that Multipass will use to access your files. For testing, you can use our Pangea app with key: `hmhe1wh0koy8cv6`
69 | - `DROPBOX_USER_EMAIL`: User email used at inference time. This email will be used to validate which files returned by the LLM the user has access to.
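A minimal sketch of that PKCE flow using the official `dropbox` package could look like the following; it is illustrative only, and simply prints the resulting access token:

```python
import os

from dropbox import DropboxOAuth2FlowNoRedirect

# Sketch only: start the PKCE flow with the app key, open the printed URL in
# a browser, approve the app, then paste the authorization code back here.
flow = DropboxOAuth2FlowNoRedirect(os.environ["DROPBOX_APP_KEY"], use_pkce=True, token_access_type="offline")
print("Visit:", flow.start())
result = flow.finish(input("Enter the authorization code: "))
print("Access token:", result.access_token)
```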
45 | """ 46 | new_channel_id = channel["id"] 47 | if self._channel_id is None or new_channel_id != self._channel_id: 48 | self.logger.debug(f"Restarting reader for channel {new_channel_id}") 49 | self._restart() 50 | self._channel_id = new_channel_id 51 | 52 | if self._channel_id is None: 53 | self.logger.error("Channel ID is not set") 54 | raise Exception("Channel ID is not set") 55 | 56 | messages, latest, more_messages = self._fetch_messages(self._channel_id, page_size, self._latest_ts) 57 | self._latest_ts = latest 58 | self._has_more_messages = more_messages 59 | return self._process_messages(messages, channel) 60 | 61 | @property 62 | def has_more_messages(self) -> bool: 63 | """Check if there are more messages to read""" 64 | return self._has_more_messages 65 | 66 | def _restart(self) -> None: 67 | self._channel_id = None 68 | self._latest_ts = None 69 | self._has_more_messages = True 70 | 71 | def _process_messages(self, messages: list[dict[str, Any]], channel: dict) -> List[MultipassDocument]: 72 | """Process the messages and create the documents""" 73 | channel_id = channel["id"] 74 | channel_name = channel["name"] 75 | documents: List[MultipassDocument] = [] 76 | 77 | for message in messages: 78 | subtype = message.get("subtype", "") 79 | # Just ignore the channel join messages 80 | if subtype == "channel_join": 81 | continue 82 | user = message.get("user", "") 83 | text = message.get("text", "") 84 | ts = message.get("ts", "") 85 | metadata = { 86 | PangeaMetadataKeys.SLACK_CHANNEL_ID: channel_id, 87 | PangeaMetadataKeys.SLACK_CHANNEL_NAME: channel_name, 88 | PangeaMetadataKeys.SLACK_TIMESTAMP: ts, 89 | PangeaMetadataKeys.SLACK_USER: user, 90 | PangeaMetadataKeys.DATA_SOURCE: PangeaMetadataValues.DATA_SOURCE_SLACK, 91 | } 92 | documents.append(MultipassDocument(id=generate_id(), content=text, metadata=metadata)) # type: ignore[arg-type] 93 | 94 | return documents 95 | 96 | def _fetch_messages( 97 | self, channel_id: str, max_messages: int = 1000, latest: Optional[str] = None 98 | ) -> Tuple[List[dict[str, Any]], Optional[str], bool]: 99 | """ 100 | Fetch the messages from a given channel. 101 | """ 102 | 103 | page_size = 100 104 | page_size = page_size if page_size < max_messages else max_messages 105 | messages: List[dict[str, Any]] = [] 106 | more_messages = True 107 | 108 | try: 109 | while len(messages) < max_messages: 110 | response = self._slack_client.conversations_history(channel=channel_id, latest=latest, limit=page_size) 111 | new_messages: List[dict[str, Any]] = response.get("messages", []) 112 | messages.extend(new_messages) 113 | 114 | if not new_messages: 115 | self.logger.debug(f"No more messages to fetch for channel {channel_id}") 116 | more_messages = False 117 | break 118 | 119 | message = new_messages[-1] 120 | latest = message.get("ts", "") 121 | 122 | # We could delete this check and do another request and it should return an empty list. 123 | if len(new_messages) < page_size: 124 | self.logger.debug( 125 | f"Size of new messages is less than page size. 
No more messages to fetch for channel {channel_id}" 126 | ) 127 | more_messages = False 128 | break 129 | 130 | page_size = page_size if (max_messages - len(messages)) > page_size else (max_messages - len(messages)) 131 | 132 | except SlackApiError as e: 133 | self.logger.error(f"Error fetching messages for channel {channel_id}: {e.response['error']}") 134 | raise Exception(f"Error fetching messages for channel {channel_id}: {e.response['error']}") 135 | 136 | return (messages, latest, more_messages) 137 | -------------------------------------------------------------------------------- /examples/langchain_examples/01-rag-LangChain-all-sources.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | import os 5 | from io import BytesIO 6 | from pathlib import Path 7 | from typing import List 8 | 9 | import boto3 10 | from google.oauth2.credentials import Credentials 11 | from langchain.document_loaders import ConfluenceLoader 12 | from langchain_aws import BedrockEmbeddings, ChatBedrock 13 | from langchain_community.vectorstores import FAISS 14 | from langchain_core.documents import Document 15 | from langchain_google_community import GoogleDriveLoader 16 | from pangea_multipass import ConfluenceAuth, ConfluenceME, GDriveAPI, GDriveME, enrich_metadata 17 | from pangea_multipass_langchain import DocumentFilterMixer, LangChainConfluenceFilter, LangChainDocumentReader 18 | 19 | # Initialization 20 | bedrock_client = boto3.client("bedrock-runtime", region_name="us-west-2") 21 | model_id = "anthropic.claude-3-5-sonnet-20240620-v1:0" 22 | model_kwargs = { 23 | "max_tokens": 512, 24 | "temperature": 0.5, 25 | } 26 | 27 | ## Setup the LLM parameters 28 | llm = ChatBedrock( 29 | client=bedrock_client, 30 | model_id=model_id, 31 | model_kwargs=model_kwargs, 32 | ) 33 | 34 | ## Setup the Embedding model parameters 35 | embedding_model = BedrockEmbeddings(model_id="amazon.titan-embed-g1-text-02", client=bedrock_client) 36 | 37 | 38 | class TextLoader: 39 | file: BytesIO 40 | 41 | def __init__(self, file: BytesIO): 42 | self.file = file 43 | 44 | def load(self) -> List[Document]: 45 | return [Document(page_content=self.file.read().decode("utf-8"))] 46 | 47 | 48 | ## Data ingestion pipeline 49 | 50 | 51 | def load_gdrive_documents() -> List[Document]: 52 | os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "" 53 | # Google Drive Data Ingestion 54 | admin_token_filepath = "admin_access_token.json" 55 | 56 | credentials_filepath = os.path.abspath("../credentials.json") 57 | print("Login to GDrive as admin...") 58 | GDriveAPI.get_and_save_access_token( 59 | credentials_filepath, admin_token_filepath, ["https://www.googleapis.com/auth/drive.readonly"] 60 | ) 61 | 62 | loader = GoogleDriveLoader( 63 | folder_id="1Kj77oi2QGEOPKcIo_hKZPiHDJyKKFVgR", 64 | token_path=Path(admin_token_filepath), 65 | credentials_path=Path(credentials_filepath), 66 | recursive=True, 67 | load_extended_metadata=True, 68 | file_loader_cls=TextLoader, 69 | ) 70 | 71 | docs: List[Document] = loader.load() 72 | print(f"GDrive docs loaded: {len(docs)}.") 73 | 74 | # Metadata enricher library 75 | SCOPES = [ 76 | "openid", 77 | "https://www.googleapis.com/auth/userinfo.email", 78 | "https://www.googleapis.com/auth/userinfo.profile", 79 | "https://www.googleapis.com/auth/drive.metadata.readonly", 80 | ] 81 | creds = Credentials.from_authorized_user_file(admin_token_filepath, SCOPES) 82 | gdrive_me = GDriveME(creds, {}) 83 |
enrich_metadata(docs, [gdrive_me], reader=LangChainDocumentReader()) 84 | # Finish metadata enrichment 85 | return docs 86 | 87 | 88 | def confluence_read_docs() -> List[Document]: 89 | """Fetch all documents from Confluence using ConfluenceLoader.""" 90 | 91 | confluence_admin_token = os.getenv("CONFLUENCE_ADMIN_TOKEN") 92 | assert confluence_admin_token 93 | confluence_admin_email = os.getenv("CONFLUENCE_ADMIN_EMAIL") 94 | assert confluence_admin_email 95 | confluence_url = os.getenv("CONFLUENCE_BASE_URL") 96 | assert confluence_url 97 | 98 | confluence_space_key = "~71202041f9bfec117041348629ccf3e3c751b3" 99 | # confluence_space_id = 393230 100 | 101 | # Create a ConfluenceReader instance 102 | print("Loading Confluence docs...") 103 | loader = ConfluenceLoader( 104 | url=confluence_url, 105 | username=confluence_admin_email, 106 | api_key=confluence_admin_token, 107 | space_key=confluence_space_key, 108 | ) 109 | documents: List[Document] = loader.load() 110 | 111 | # Enrich metadata process 112 | print(f"Processing {len(documents)} Confluence docs...") 113 | confluence_me = ConfluenceME() 114 | enrich_metadata(documents, [confluence_me], reader=LangChainDocumentReader()) 115 | 116 | return documents 117 | 118 | 119 | PERSIST_DIR = "./storage/data/langchain/faiss_index" 120 | if not os.path.exists(PERSIST_DIR): 121 | gdrive_docs = load_gdrive_documents() 122 | confluence_docs = confluence_read_docs() 123 | docs = gdrive_docs + confluence_docs 124 | 125 | # Initialize the vector store https://faiss.ai 126 | print("Initializing vector store...") 127 | vectorstore = FAISS.from_documents(documents=docs, embedding=embedding_model) 128 | 129 | # Store to file system 130 | print("Storing vector store...") 131 | vectorstore.save_local(PERSIST_DIR) 132 | else: 133 | print("Loading vector store...") 134 | vectorstore = FAISS.load_local( 135 | folder_path=PERSIST_DIR, embeddings=embedding_model, allow_dangerous_deserialization=True 136 | ) 137 | 138 | 139 | ## Inference pipeline 140 | 141 | from langchain.chains.combine_documents import create_stuff_documents_chain 142 | from langchain_core.prompts import ChatPromptTemplate 143 | from pangea_multipass_langchain import LangChainGDriveFilter 144 | 145 | # Create GDrive filter 146 | credentials_filepath = os.path.abspath("../credentials.json") 147 | SCOPES = [ 148 | "openid", 149 | "https://www.googleapis.com/auth/userinfo.email", 150 | "https://www.googleapis.com/auth/userinfo.profile", 151 | "https://www.googleapis.com/auth/drive.metadata.readonly", 152 | ] 153 | print("Login to GDrive as user...") 154 | creds = GDriveAPI.get_user_credentials(credentials_filepath, scopes=SCOPES) 155 | gdrive_filter = LangChainGDriveFilter(creds) 156 | 157 | # Create Confluence filter 158 | confluence_admin_token = os.getenv("CONFLUENCE_ADMIN_TOKEN") 159 | assert confluence_admin_token 160 | confluence_admin_email = os.getenv("CONFLUENCE_ADMIN_EMAIL") 161 | assert confluence_admin_email 162 | confluence_url = os.getenv("CONFLUENCE_BASE_URL") 163 | assert confluence_url 164 | confluence_account_id = os.getenv("CONFLUENCE_USER_ACCOUNT_ID") 165 | assert confluence_account_id 166 | 167 | confluence_filter = LangChainConfluenceFilter( 168 | ConfluenceAuth(confluence_admin_email, confluence_admin_token, confluence_url), account_id=confluence_account_id 169 | ) 170 | 171 | # Create mixed filter 172 | filter_mixer = DocumentFilterMixer(document_filters=[gdrive_filter, confluence_filter]) 173 | 174 | # Use indexed store as a retriever to create qa chain 175 | retriever =
vectorstore.as_retriever() 176 | 177 | # Prompt template with System, Context and User prompt 178 | template = """System: Answer the following question based only on the provided context: 179 | 180 | 181 | {context} 182 | 183 | 184 | Question: {input} 185 | """ 186 | prompt = ChatPromptTemplate.from_template(template) 187 | 188 | # Document chain using the LLM and prompt template 189 | qa_chain = create_stuff_documents_chain(llm, prompt) 190 | 191 | while True: 192 | user_prompt = input("Enter your question: ") 193 | similar_docs = retriever.invoke(user_prompt) 194 | 195 | print(f"similar_docs: {len(similar_docs)}") 196 | 197 | filtered_docs = filter_mixer.filter(similar_docs) 198 | print(f"filtered_docs: {len(filtered_docs)}") 199 | count = len(filter_mixer.get_unauthorized_documents()) 200 | 201 | response = qa_chain.invoke({"input": user_prompt, "context": filtered_docs}) 202 | print(f"\n{response}") 203 | print("\n=================") 204 | print( 205 | f"Warning: This answer could be inaccurate as it's missing context from {count} out of {len(similar_docs)} data sources. Included {len(filtered_docs)} sources." 206 | ) 207 | print("=================\n") 208 | -------------------------------------------------------------------------------- /packages/pangea-multipass-langchain/pangea_multipass_langchain/langchain.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | from typing import Any, List, Optional 5 | 6 | from google.oauth2.credentials import Credentials 7 | from langchain_core.documents import Document 8 | from pangea_multipass import ( 9 | ConfluenceAuth, 10 | ConfluenceProcessor, 11 | DocumentReader, 12 | DropboxProcessor, 13 | FilterOperator, 14 | GDriveProcessor, 15 | GitHubProcessor, 16 | GitLabProcessor, 17 | JiraAuth, 18 | JiraProcessor, 19 | ) 20 | from pangea_multipass import MetadataFilter as PangeaMetadataFilter 21 | from pangea_multipass import ( 22 | MultipassDocument, 23 | PangeaGenericNodeProcessor, 24 | PangeaNodeProcessorMixer, 25 | SlackProcessor, 26 | ) 27 | 28 | 29 | class LangChainDocumentReader(DocumentReader): 30 | """LangChain document reader""" 31 | 32 | def read(self, doc: Document) -> str: 33 | return str(doc.page_content) 34 | 35 | 36 | def get_doc_id(doc: Document) -> str: 37 | return doc.id if doc.id is not None else "" 38 | 39 | 40 | def get_doc_metadata(doc: Document) -> dict[str, Any]: 41 | return dict(doc.metadata) 42 | 43 | 44 | def from_multipass(documents: List[MultipassDocument]) -> List[Document]: 45 | lc_documents: List[Document] = [] 46 | for doc in documents: 47 | lc_doc = Document(id=doc.id, page_content=doc.content) 48 | lc_doc.metadata = doc.metadata 49 | lc_documents.append(lc_doc) 50 | 51 | return lc_documents 52 | 53 | 54 | class LangChainJiraFilter(JiraProcessor[Document]): 55 | """Filter for Jira integration with LangChain documents. 56 | 57 | Uses Jira authentication to check document access in the LangChain. 58 | 59 | Args: 60 | auth (JiraAuth): Jira authentication credentials. 61 | account_id (Optional[str]): Jira user's account id to check issues permissions. 62 | """ 63 | 64 | def __init__(self, auth: JiraAuth, account_id: Optional[str] = None): 65 | super().__init__(auth, get_node_metadata=get_doc_metadata, account_id=account_id) 66 | 67 | 68 | class LangChainConfluenceFilter(ConfluenceProcessor[Document]): 69 | """Filter for Confluence integration with LangChain documents.
70 | 71 | Uses Confluence authentication to check document access in the LangChain. 72 | 73 | Args: 74 | auth (ConfluenceAuth): Confluence authentication credentials. 75 | space_id (Optional[int]): The space ID to filter pages by. 76 | account_id (Optional[str]): User account id to check permissions using admin token. 77 | 78 | """ 79 | 80 | def __init__(self, auth: ConfluenceAuth, space_id: Optional[int] = None, account_id: Optional[str] = None): 81 | super().__init__(auth, get_node_metadata=get_doc_metadata, space_id=space_id, account_id=account_id) 82 | 83 | 84 | class LangChainGDriveFilter(GDriveProcessor[Document]): 85 | """Filter for Google Drive integration with LangChain documents. 86 | 87 | Uses Google Drive credentials to check document access in the LangChain. 88 | 89 | Args: 90 | creds (Credentials): Google OAuth2 credentials. 91 | user_email (Optional[str]): User email to check access to files. 92 | """ 93 | 94 | def __init__(self, creds: Credentials, user_email: Optional[str] = None): 95 | super().__init__(creds, get_node_metadata=get_doc_metadata, user_email=user_email) 96 | 97 | 98 | class LangChainGitHubFilter(GitHubProcessor[Document]): 99 | """Filter for GitHub integration with LangChain documents. 100 | 101 | Uses GitHub classic token to check document access in the LangChain. 102 | 103 | Args: 104 | token (str): GitHub classic token. 105 | username (str): GitHub username to check permissions. 106 | """ 107 | 108 | def __init__(self, token: str, username: str): 109 | super().__init__(token, get_node_metadata=get_doc_metadata, username=username) 110 | 111 | 112 | class LangChainSlackFilter(SlackProcessor[Document]): 113 | """Filter for Slack integration with LangChain documents. 114 | 115 | Uses Slack token to check access to channels in the LangChain. 116 | 117 | Args: 118 | token (str): Slack token. 119 | user_email (Optional[str]): User email to check access to channels. 120 | """ 121 | 122 | def __init__(self, token: str, user_email: Optional[str] = None): 123 | super().__init__(token, get_node_metadata=get_doc_metadata, user_email=user_email) 124 | 125 | 126 | class LangChainGitLabFilter(GitLabProcessor[Document]): 127 | """Filter for GitLab integration with LangChain documents. 128 | 129 | Uses GitLab token to access nodes in the LangChain. 130 | 131 | Args: 132 | token (str): GitLab token. 133 | username (str): Username to check access to files. 134 | """ 135 | 136 | def __init__(self, admin_token: str, username: str): 137 | super().__init__(admin_token=admin_token, username=username, get_node_metadata=get_doc_metadata) 138 | 139 | 140 | class LangChainDropboxFilter(DropboxProcessor[Document]): 141 | """Filter for Dropbox integration with LangChain documents. 142 | 143 | Uses Dropbox token to check access to documents in the LangChain. 144 | 145 | Args: 146 | token (str): Dropbox token. 147 | user_email (str): User email to check access to files. 
148 | """ 149 | 150 | def __init__(self, token: str, user_email: str): 151 | super().__init__(token, user_email=user_email, get_node_metadata=get_doc_metadata) 152 | 153 | 154 | class DocumentFilterMixer: 155 | node_processor: PangeaNodeProcessorMixer[Document] = PangeaNodeProcessorMixer(get_doc_metadata, []) 156 | 157 | def __init__(self, document_filters: List[PangeaGenericNodeProcessor[Document]]): 158 | super().__init__() 159 | self.node_processor = PangeaNodeProcessorMixer[Document]( 160 | get_node_metadata=get_doc_metadata, 161 | node_processors=document_filters, 162 | ) 163 | 164 | def filter( 165 | self, 166 | documents: List[Document], 167 | ) -> List[Document]: 168 | return self.node_processor.filter(documents) 169 | 170 | def get_filter( 171 | self, 172 | ) -> dict[str, Any]: 173 | filters = [] 174 | for filter in self.node_processor.get_filters(): 175 | filters.append(_convert_metadata_filter_to_langchain(filter)) 176 | return {"$or": filters} 177 | 178 | def get_unauthorized_documents( 179 | self, 180 | ) -> List[Document]: 181 | """Retrieves documents that are unauthorized for access. 182 | 183 | Returns: 184 | List[Document]: List of unauthorized documents. 185 | """ 186 | return self.node_processor.get_unauthorized_nodes() 187 | 188 | def get_authorized_documents( 189 | self, 190 | ) -> List[Document]: 191 | """Retrieves documents that are authorized for access. 192 | 193 | Returns: 194 | List[Document]: List of authorized documents. 195 | """ 196 | return self.node_processor.get_authorized_nodes() 197 | 198 | 199 | def _convert_metadata_filter_to_langchain(input: PangeaMetadataFilter) -> dict[str, Any]: 200 | if input.operator == FilterOperator.EQ: 201 | filter = {input.key: input.value} 202 | elif input.operator == FilterOperator.IN: 203 | filter = {input.key: {"$in": input.value}} 204 | elif input.operator == FilterOperator.CONTAINS: 205 | filter = {input.key: {"$contain": input.value}} 206 | elif input.operator == FilterOperator.GT: 207 | filter = {input.key: {"$gt": input.value}} 208 | elif input.operator == FilterOperator.LT: 209 | filter = {input.key: {"$lt": input.value}} 210 | elif input.operator == FilterOperator.NE: 211 | filter = {input.key: {"$ne": input.value}} 212 | elif input.operator == FilterOperator.GTE: 213 | filter = {input.key: {"$gte": input.value}} 214 | elif input.operator == FilterOperator.LTE: 215 | filter = {input.key: {"$lte": input.value}} 216 | elif input.operator == FilterOperator.NIN: 217 | filter = {input.key: {"$nin": input.value}} 218 | else: 219 | raise TypeError(f"Invalid filter operator: {input.operator}") 220 | 221 | return filter 222 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/sources/gitlab/gitlab.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import Any, Callable, Generic, List, Optional 4 | from urllib.parse import quote 5 | 6 | import requests 7 | 8 | from pangea_multipass.core import ( 9 | FilterOperator, 10 | MetadataFilter, 11 | PangeaGenericNodeProcessor, 12 | PangeaMetadataKeys, 13 | PangeaMetadataValues, 14 | T, 15 | ) 16 | 17 | 18 | class GitLabClient: 19 | _actor = "gitlab_client" 20 | 21 | def __init__(self, logger_name: str = "multipass"): 22 | self.logger = logging.getLogger(logger_name) 23 | 24 | def get_auth_headers(self, token: str) -> dict[str, str]: 25 | """Authenticate to GitLab using a personal access token.""" 26 | return {"Authorization": 
f"Bearer {token}"} 27 | 28 | def user_has_access(self, admin_token: str, user_id: str, project_id: str) -> bool: 29 | """ 30 | Check if a specific user has access to a GitLab project using an admin token. 31 | """ 32 | url = f"https://gitlab.com/api/v4/projects/{project_id}/members/all/{user_id}" 33 | headers = self.get_auth_headers(admin_token) 34 | response = requests.get(url, headers=headers) 35 | 36 | if response.status_code == 200: 37 | return True # User has access 38 | elif response.status_code == 404: 39 | return False # User does not have access 40 | elif response.status_code == 403: 41 | self._log_error("user_has_access", url, {}, response) 42 | raise Exception("Admin token does not have sufficient permissions to check access.") 43 | else: 44 | self._log_error("user_has_access", url, {}, response) 45 | raise Exception(f"Unexpected error: {response.status_code} - {response.json()}") 46 | 47 | def get_user(self, admin_token: str, username: str) -> dict: 48 | """Get user information using an admin token.""" 49 | 50 | url = f"https://gitlab.com/api/v4/users?username={quote(username)}" 51 | response = requests.get( 52 | url, 53 | headers=self.get_auth_headers(admin_token), 54 | ) 55 | 56 | if response.status_code != 200: 57 | self._log_error("get_user", url, {}, response) 58 | 59 | response.raise_for_status() 60 | users = response.json() 61 | return users[0] if len(users) else {} 62 | 63 | def get_user_info(self, admin_token: str) -> dict: 64 | """Get user information from current token""" 65 | 66 | url = "https://gitlab.com/api/v4/user/" 67 | response = requests.get( 68 | url, 69 | headers=self.get_auth_headers(admin_token), 70 | ) 71 | 72 | if response.status_code != 200: 73 | self._log_error("get_user_info", url, {}, response) 74 | 75 | response.raise_for_status() 76 | return response.json() 77 | 78 | def get_user_projects(self, admin_token: str) -> list[dict[str, Any]]: 79 | """Fetch all projects the authenticated user has access to.""" 80 | projects = [] 81 | headers = self.get_auth_headers(admin_token) 82 | url = f"https://gitlab.com/api/v4/projects" 83 | params = {"per_page": 100, "membership": True, "simple": True} 84 | while url: 85 | response = requests.get(url, headers=headers, params=params) 86 | if response.status_code != 200: 87 | self._log_error("get_user_projects", url, params, response) 88 | raise Exception(f"Error fetching projects: {response.text}") 89 | 90 | projects.extend(response.json()) 91 | url = response.links.get("next", {}).get("url") # Pagination 92 | return projects 93 | 94 | def get_allowed_projects(self, admin_token: str, user_id: str) -> list[int]: 95 | projects = self.get_user_projects(admin_token=admin_token) 96 | user_projects = [] 97 | 98 | for project in projects: 99 | if self.user_has_access(admin_token, user_id, project["id"]): 100 | user_projects.append(project["id"]) 101 | 102 | return user_projects 103 | 104 | def download_file(self, token: str, repo_id: str, file_path: str): 105 | encoded_file_path = quote(file_path, safe="") # Encode special chars 106 | file_url = f"https://gitlab.com/api/v4/projects/{repo_id}/repository/files/{encoded_file_path}/raw" 107 | 108 | response = requests.get(file_url, headers=self.get_auth_headers(token)) 109 | if response.status_code != 200: 110 | self._log_error("download_file", file_url, {}, response) 111 | raise Exception(f"Skipping {file_path}: Could not download file") 112 | 113 | return response.content 114 | 115 | def _log_error(self, function_name: str, url: str, data: dict, response: 
requests.Response): 116 | self.logger.error( 117 | json.dumps( 118 | { 119 | "actor": GitLabClient._actor, 120 | "fn": function_name, 121 | "url": url, 122 | "data": data, 123 | "status_code": response.status_code, 124 | "reason": response.reason, 125 | "text": response.text, 126 | } 127 | ) 128 | ) 129 | 130 | 131 | class GitLabProcessor(PangeaGenericNodeProcessor[T], Generic[T]): 132 | _access_cache: dict[str, bool] = {} 133 | _token: str 134 | _username: str 135 | _user_id: Optional[str] 136 | _projects: list[int] = [] 137 | _get_node_metadata: Callable[[T], dict[str, Any]] 138 | 139 | def __init__( 140 | self, 141 | admin_token: str, 142 | username: str, 143 | get_node_metadata: Callable[[T], dict[str, Any]], 144 | logger_name: str = "multipass", 145 | ): 146 | self._token = admin_token 147 | self._username = username 148 | self._access_cache = {} 149 | self._get_node_metadata = get_node_metadata 150 | self._user_id = None 151 | self._client = GitLabClient(logger_name) 152 | 153 | def _has_access(self, metadata: dict[str, Any]) -> bool: 154 | """Check if the user has access to the given file.""" 155 | 156 | project_id = metadata.get(PangeaMetadataKeys.GITLAB_REPOSITORY_ID, None) 157 | if not project_id: 158 | raise KeyError(f"Invalid metadata key: {PangeaMetadataKeys.GITLAB_REPOSITORY_ID}") 159 | 160 | if self._user_id is None: 161 | self._load_user_id() 162 | 163 | if self._user_id is None: 164 | print("Could not load user ID") 165 | return False 166 | 167 | if project_id in self._access_cache: 168 | return self._access_cache[project_id] 169 | 170 | has_access = self._client.user_has_access(self._token, self._user_id, project_id) 171 | self._access_cache[project_id] = has_access 172 | return has_access 173 | 174 | def filter( 175 | self, 176 | nodes: List[T], 177 | ) -> List[T]: 178 | """Filter GitLab files by access permissions. 179 | 180 | Args: 181 | nodes (List[T]): List of nodes to process. 182 | 183 | Returns: 184 | List[T]: Nodes that have authorized access. 185 | """ 186 | 187 | filtered: List[T] = [] 188 | for node in nodes: 189 | if self._is_authorized(node): 190 | filtered.append(node) 191 | return filtered 192 | 193 | def get_filter( 194 | self, 195 | ) -> MetadataFilter: 196 | """Generate a filter based on accessible GitLab project IDs. 197 | 198 | Returns: 199 | MetadataFilter: Filter for GitLab project IDs. 
200 | """ 201 | 202 | if not self._projects: 203 | if self._user_id is None: 204 | self._load_user_id() 205 | 206 | if self._user_id is None: 207 | raise Exception("Could not load user ID") 208 | 209 | self._projects = self._client.get_allowed_projects(self._token, self._user_id) 210 | 211 | return MetadataFilter( 212 | key=PangeaMetadataKeys.GITLAB_REPOSITORY_ID, value=self._projects, operator=FilterOperator.IN 213 | ) 214 | 215 | def _is_authorized(self, node: T) -> bool: 216 | metadata = self._get_node_metadata(node) 217 | return metadata[PangeaMetadataKeys.DATA_SOURCE] == PangeaMetadataValues.DATA_SOURCE_GITLAB and self._has_access( 218 | metadata 219 | ) 220 | 221 | def _load_user_id(self): 222 | user = self._client.get_user(self._token, username=self._username) 223 | self._user_id = user.get("id", None) 224 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/sources/github/github.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import Any, Callable, Generic, List, Tuple 4 | 5 | import requests 6 | 7 | from pangea_multipass.core import ( 8 | FilterOperator, 9 | MetadataFilter, 10 | PangeaGenericNodeProcessor, 11 | PangeaMetadataKeys, 12 | PangeaMetadataValues, 13 | T, 14 | ) 15 | 16 | 17 | class GitHubClient: 18 | _actor = "github_client" 19 | 20 | def __init__(self, logger_name: str = "multipass"): 21 | self.logger = logging.getLogger(logger_name) 22 | 23 | def get_auth_headers(self, token: str) -> dict[str, str]: 24 | """Authenticate to GitHub using a personal access token.""" 25 | headers = { 26 | "Authorization": f"token {token}", 27 | "Accept": "application/vnd.github.v3+json", 28 | } 29 | return headers 30 | 31 | def has_access(self, token: str, owner: str, repo_name: str) -> bool: 32 | """ 33 | Check if this token has access to this particular GitHub repository 34 | """ 35 | access = False 36 | 37 | headers = self.get_auth_headers(token) 38 | url = f"https://api.github.com/repos/{owner}/{repo_name}" 39 | response = requests.get(url, headers=headers) 40 | 41 | if response.status_code == 200: 42 | access = True # User has access 43 | elif response.status_code == 404: 44 | access = False # Repository not found or no access 45 | elif response.status_code == 403: 46 | self._log_error("has_access", url, {}, response) 47 | raise Exception(f"Access forbidden. 
Check permissions or token scope.") 48 | else: 49 | self._log_error("has_access", url, {}, response) 50 | raise Exception(f"Unexpected error: {response.status_code} - {response.json()}") 51 | 52 | return access 53 | 54 | def user_has_access(self, admin_token: str, owner: str, repo_name: str, username: str) -> bool: 55 | """ 56 | Checks if a user has access to a specific GitHub repository using an admin token 57 | """ 58 | headers = self.get_auth_headers(admin_token) 59 | url = f"https://api.github.com/repos/{owner}/{repo_name}/collaborators/{username}" 60 | response = requests.get(url, headers=headers) 61 | 62 | if response.status_code == 204: 63 | return True 64 | elif response.status_code == 404: 65 | return False 66 | elif response.status_code == 403: 67 | self._log_error("user_has_access", url, {}, response) 68 | raise Exception("Admin token does not have sufficient permissions to check access.") 69 | else: 70 | self._log_error("user_has_access", url, {}, response) 71 | raise Exception(f"Unexpected error: {response.status_code} - {response.json()}") 72 | 73 | def get_user_repos(self, token: str) -> List[dict[str, Any]]: 74 | """Get all repositories the authenticated user has access to.""" 75 | 76 | headers = self.get_auth_headers(token) 77 | url = "https://api.github.com/user/repos" 78 | repos: List[dict[str, Any]] = [] 79 | page = 1 80 | 81 | while True: 82 | response = requests.get(url, headers=headers, params={"per_page": 100, "page": page}) 83 | if response.status_code != 200: 84 | self._log_error("get_user_repos", url, {"per_page": 100, "page": page}, response) 85 | raise Exception(f"Error fetching repositories: {response.json()}") 86 | 87 | data = response.json() 88 | if not data: 89 | break 90 | 91 | repos.extend(data) 92 | page += 1 93 | 94 | return repos 95 | 96 | def get_repo_files(self, token: str, owner: str, repo: str) -> List[dict[str, Any]]: 97 | """Fetch all files in a repository using the GitHub Tree API.""" 98 | 99 | headers = self.get_auth_headers(token) 100 | 101 | url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/main?recursive=1" 102 | response = requests.get(url, headers=headers) 103 | 104 | if response.status_code == 200: 105 | tree_data = response.json() 106 | return [item for item in tree_data.get("tree", []) if item["type"] == "blob"] 107 | elif response.status_code == 404: 108 | self.logger.warning(f"Repository '{repo}' not found.") 109 | return [] 110 | else: 111 | self._log_error("get_repo_files", url, {}, response) 112 | raise Exception(f"Error fetching files for repository '{repo}': {response.json()}") 113 | 114 | def download_file_content(self, token: str, url: str) -> str: 115 | """Download the content of a file from GitHub.""" 116 | 117 | headers = self.get_auth_headers(token) 118 | 119 | response = requests.get(url, headers=headers) 120 | if response.status_code == 200: 121 | return str(response.content) 122 | else: 123 | self._log_error("download_file_content", url, {}, response) 124 | raise Exception(f"Error downloading file: {response.json()}") 125 | 126 | def get_allowed_repos(self, token: str, username: str) -> List[dict]: 127 | projects = self.get_user_repos(token) 128 | user_projects = [] 129 | 130 | for project in projects: 131 | if self.user_has_access(token, project["owner"]["login"], project["name"], username): 132 | user_projects.append(project) 133 | 134 | return user_projects 135 | 136 | def _log_error(self, function_name: str, url: str, data: dict, response: requests.Response): 137 | self.logger.error( 138 | json.dumps( 139 
| { 140 | "actor": GitHubClient._actor, 141 | "fn": function_name, 142 | "url": url, 143 | "data": data, 144 | "status_code": response.status_code, 145 | "reason": response.reason, 146 | "text": response.text, 147 | } 148 | ) 149 | ) 150 | 151 | 152 | class GitHubProcessor(PangeaGenericNodeProcessor[T], Generic[T]): 153 | _access_cache: dict[Tuple[str, str], bool] = {} 154 | _token: str 155 | _repos: List[Tuple[str, str]] = [] 156 | _username: str 157 | 158 | def __init__( 159 | self, 160 | token: str, 161 | get_node_metadata: Callable[[T], dict[str, Any]], 162 | username: str, 163 | logger_name: str = "multipass", 164 | ): 165 | super().__init__() 166 | self._token = token 167 | self._access_cache = {} 168 | self.get_node_metadata = get_node_metadata 169 | self._username = username 170 | self._client = GitHubClient(logger_name) 171 | 172 | def filter( 173 | self, 174 | nodes: List[T], 175 | ) -> List[T]: 176 | """Filter GitHub files by access permissions. 177 | 178 | Args: 179 | nodes (List[T]): List of nodes to process. 180 | 181 | Returns: 182 | List[T]: Nodes that have authorized access. 183 | """ 184 | 185 | filtered: List[T] = [] 186 | for node in nodes: 187 | if self._is_authorized(node): 188 | filtered.append(node) 189 | return filtered 190 | 191 | def get_filter( 192 | self, 193 | ) -> MetadataFilter: 194 | """Generate a filter based on accessible GitHub repositories. 195 | 196 | Returns: 197 | MetadataFilter: Filter for GitHub repository owner and name pairs. 198 | """ 199 | 200 | if not self._repos: 201 | repos_info = self._client.get_allowed_repos(self._token, username=self._username) 202 | repos = [] 203 | 204 | for repo in repos_info: 205 | owner = repo["owner"]["login"] 206 | repo_name = repo["name"] 207 | repos.append((owner, repo_name)) 208 | 209 | self._repos = repos 210 | 211 | return MetadataFilter( 212 | key=PangeaMetadataKeys.GITHUB_REPOSITORY_OWNER_AND_NAME, value=self._repos, operator=FilterOperator.IN 213 | ) 214 | 215 | def _has_access(self, metadata: dict[str, Any]) -> bool: 216 | """Check if the authenticated user has access to a repository.""" 217 | 218 | repo_name = metadata.get(PangeaMetadataKeys.GITHUB_REPOSITORY_NAME, None) 219 | if repo_name is None: 220 | raise KeyError(f"Invalid metadata key: {PangeaMetadataKeys.GITHUB_REPOSITORY_NAME}") 221 | 222 | owner = metadata.get(PangeaMetadataKeys.GITHUB_REPOSITORY_OWNER, None) 223 | if owner is None: 224 | raise KeyError(f"Invalid metadata key: {PangeaMetadataKeys.GITHUB_REPOSITORY_OWNER}") 225 | 226 | access_tuple = (owner, repo_name) 227 | has_access = self._access_cache.get(access_tuple, None) 228 | if has_access is not None: 229 | return has_access 230 | 231 | if self._username: 232 | has_access = self._client.user_has_access( 233 | admin_token=self._token, owner=owner, repo_name=repo_name, username=self._username 234 | ) 235 | else: 236 | has_access = self._client.has_access(token=self._token, owner=owner, repo_name=repo_name) 237 | 238 | self._access_cache[access_tuple] = has_access 239 | return has_access 240 | 241 | def _is_authorized(self, node: T) -> bool: 242 | metadata = self.get_node_metadata(node) 243 | return metadata[PangeaMetadataKeys.DATA_SOURCE] == PangeaMetadataValues.DATA_SOURCE_GITHUB and self._has_access( 244 | metadata 245 | ) 246 | -------------------------------------------------------------------------------- /examples/llama_index_examples/02-rag-LlamaIndex-all-sources-processor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 |
# Author: Pangea Cyber Corporation 3 | 4 | import os 5 | import warnings 6 | from typing import List 7 | 8 | from google.oauth2.credentials import Credentials 9 | from llama_index.core import Document, Settings, StorageContext, VectorStoreIndex, load_index_from_storage 10 | from llama_index.embeddings.bedrock import BedrockEmbedding 11 | from llama_index.llms.bedrock import Bedrock 12 | from llama_index.readers.confluence import ConfluenceReader 13 | from llama_index.readers.google import GoogleDriveReader 14 | from llama_index.readers.jira import JiraReader 15 | from pangea_multipass import ConfluenceAuth, ConfluenceME, GDriveAPI, GDriveME, JiraAuth, JiraME, enrich_metadata 16 | from pangea_multipass_llama_index import LIDocument, LIDocumentReader 17 | 18 | # Suppress specific warning 19 | warnings.filterwarnings("ignore", message='Field "model_name" has conflict with protected namespace') 20 | 21 | SCOPES = [ 22 | "openid", 23 | "https://www.googleapis.com/auth/userinfo.email", 24 | "https://www.googleapis.com/auth/userinfo.profile", 25 | "https://www.googleapis.com/auth/drive.metadata.readonly", 26 | ] 27 | 28 | # import logging 29 | # import sys 30 | 31 | # logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 32 | # logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout)) 33 | 34 | 35 | # Initialize LLM, anthropic deployed on bedrock 36 | llm = Bedrock( 37 | model="anthropic.claude-3-5-sonnet-20240620-v1:0", 38 | profile_name="dev", 39 | region_name="us-west-2", 40 | temperature=0.5, 41 | max_tokens=512, 42 | ) 43 | 44 | # Initialize Embedding model, amazon titan deployed on bedrock 45 | embed_model = BedrockEmbedding(model="amazon.titan-embed-g1-text-02", region_name="us-west-2", profile_name="dev") 46 | 47 | # Set up the models 48 | Settings.llm = llm 49 | Settings.embed_model = embed_model 50 | 51 | # Set up chunking parameters 52 | Settings.chunk_size = 1000 53 | Settings.chunk_overlap = 100 54 | 55 | 56 | def google_drive_read_docs() -> List[LIDocument]: 57 | print("Loading Google Drive docs...") 58 | # Google Drive Data Ingestion 59 | credentials_filepath = os.path.abspath("../credentials.json") 60 | 61 | # Sample data folder owned by apurv@gondwana.cloud https://drive.google.com/drive/u/1/folders/1Kj77oi2QGEOPKcIo_hKZPiHDJyKKFVgR 62 | gdrive_fid = "1Kj77oi2QGEOPKcIo_hKZPiHDJyKKFVgR" 63 | 64 | # File name for the admin user 65 | admin_token_filepath = "admin_access_token.json" 66 | 67 | # # Invoke Google /auth endpoint and save the token for later use 68 | # GDrive.get_and_save_access_token(credentials_filepath, admin_token_filepath, SCOPES) 69 | 70 | # load the documents and create the index 71 | print("Login to GDrive as admin...") 72 | gdrive_reader = GoogleDriveReader( 73 | folder_id=gdrive_fid, token_path=admin_token_filepath, credentials_path=credentials_filepath 74 | ) 75 | documents: List[LIDocument] = gdrive_reader.load_data(folder_id=gdrive_fid) 76 | 77 | print(f"Processing {len(documents)} docs...") 78 | 79 | # Metadata enricher library 80 | creds = Credentials.from_authorized_user_file(admin_token_filepath, SCOPES) 81 | gdrive_me = GDriveME(creds, {}) 82 | enrich_metadata(documents, [gdrive_me], reader=LIDocumentReader()) 83 | # Finish metadata enrichment 84 | 85 | return documents 86 | 87 | 88 | # Fetch documents from Confluence 89 | confluence_space_key = "~71202041f9bfec117041348629ccf3e3c751b3" 90 | confluence_space_id = 393230 91 | 92 | 93 | def confluence_read_docs() -> List[LIDocument]: 94 | """Fetch all documents from Confluence
using ConfluenceReader.""" 95 | 96 | token = os.getenv("CONFLUENCE_ADMIN_TOKEN") 97 | assert token 98 | email = os.getenv("CONFLUENCE_ADMIN_EMAIL") 99 | assert email 100 | url = os.getenv("CONFLUENCE_BASE_URL") 101 | assert url 102 | 103 | # Create a ConfluenceReader instance 104 | print("Loading Confluence docs...") 105 | reader = ConfluenceReader( 106 | base_url=url, 107 | user_name=email, 108 | password=token, 109 | ) 110 | documents: List[LIDocument] = reader.load_data(space_key=confluence_space_key, include_attachments=True) 111 | 112 | # Enrich metadata process 113 | print(f"Processing {len(documents)} Confluence docs...") 114 | confluence_me = ConfluenceME() 115 | enrich_metadata(documents, [confluence_me], reader=LIDocumentReader()) 116 | 117 | return documents 118 | 119 | 120 | def jira_load_data(reader: JiraReader, query: str = "") -> List[Document]: 121 | max_results = 100 122 | start_at = 0 123 | keep_iterating = True 124 | all_documents: List[Document] = [] 125 | 126 | while keep_iterating: 127 | documents = reader.load_data(query, start_at=start_at, max_results=max_results) 128 | all_documents.extend(documents) 129 | l = len(documents) 130 | start_at = start_at + l 131 | keep_iterating = l >= max_results 132 | 133 | return all_documents 134 | 135 | 136 | def jira_read_docs() -> List[LIDocument]: 137 | # Jira credentials and base URL 138 | JIRA_BASE_URL = os.getenv("JIRA_BASE_URL") or "" 139 | assert JIRA_BASE_URL 140 | jira_admin_email = os.getenv("JIRA_ADMIN_EMAIL") or "" 141 | assert jira_admin_email 142 | jira_api_token = os.getenv("JIRA_ADMIN_TOKEN") or "" 143 | assert jira_api_token 144 | 145 | # Initialize LlamaIndex JiraReader 146 | print("Loading Jira docs...") 147 | jira_reader = JiraReader(server_url=JIRA_BASE_URL, email=jira_admin_email, api_token=jira_api_token) 148 | 149 | documents = jira_load_data(jira_reader, "") 150 | 151 | # Metadata enricher library 152 | print(f"Processing {len(documents)} Jira docs...") 153 | jira_me = JiraME(JIRA_BASE_URL, jira_admin_email, jira_api_token) 154 | enrich_metadata(documents, [jira_me], reader=LIDocumentReader()) 155 | 156 | return documents 157 | 158 | 159 | # Load data from Gdrive or from the disk 160 | PERSIST_DIR = "./storage/rbac/llamaindex/all_sources" 161 | if not os.path.exists(PERSIST_DIR): 162 | # Load documents 163 | gdrive_documents = google_drive_read_docs() 164 | confluence_documents = confluence_read_docs() 165 | jira_documents = jira_read_docs() 166 | 167 | # Combine documents 168 | documents = gdrive_documents + confluence_documents + jira_documents 169 | 170 | print("Create and save index...") 171 | index = VectorStoreIndex.from_documents(documents) 172 | # store it for later 173 | index.storage_context.persist(persist_dir=PERSIST_DIR) 174 | else: 175 | # load the existing index 176 | print("Loading index...") 177 | storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR) 178 | index = load_index_from_storage(storage_context) # type: ignore 179 | 180 | 181 | # Inference 182 | 183 | from pangea_multipass_llama_index import ( 184 | LlamaIndexConfluenceProcessor, 185 | LlamaIndexGDriveProcessor, 186 | LlamaIndexJiraProcessor, 187 | NodePostprocessorMixer, 188 | ) 189 | 190 | # Create GDrive filter 191 | credentials_filepath = os.path.abspath("../credentials.json") 192 | print("Login to GDrive as user...") 193 | creds = GDriveAPI.get_user_credentials(credentials_filepath, scopes=SCOPES) 194 | gdrive_processor = LlamaIndexGDriveProcessor(creds) 195 | 196 | # Create Confluence filter 197 | 
confluence_admin_token = os.getenv("CONFLUENCE_ADMIN_TOKEN") 198 | assert confluence_admin_token 199 | confluence_admin_email = os.getenv("CONFLUENCE_ADMIN_EMAIL") 200 | assert confluence_admin_email 201 | confluence_url = os.getenv("CONFLUENCE_BASE_URL") 202 | assert confluence_url 203 | confluence_account_id = os.getenv("CONFLUENCE_USER_ACCOUNT_ID") 204 | assert confluence_account_id 205 | confluence_processor = LlamaIndexConfluenceProcessor( 206 | ConfluenceAuth(confluence_admin_email, confluence_admin_token, confluence_url), account_id=confluence_account_id 207 | ) 208 | 209 | # Create JIRA filter 210 | JIRA_BASE_URL = os.getenv("JIRA_BASE_URL") or "" 211 | assert JIRA_BASE_URL 212 | jira_admin_email = os.getenv("JIRA_ADMIN_EMAIL") or "" 213 | assert jira_admin_email 214 | jira_admin_token = os.getenv("JIRA_ADMIN_TOKEN") or "" 215 | assert jira_admin_token 216 | jira_account_id = os.getenv("JIRA_USER_ACCOUNT_ID") or "" 217 | assert jira_account_id 218 | jira_processor = LlamaIndexJiraProcessor( 219 | JiraAuth(jira_admin_email, jira_admin_token, JIRA_BASE_URL), account_id=jira_account_id 220 | ) 221 | 222 | # Initialize query engine and the retriever to send prompts 223 | # query_engine = index.as_query_engine(similarity_top_k=10, streaming=True, filters=metadata_filters) 224 | node_processor = NodePostprocessorMixer( 225 | [ 226 | gdrive_processor, 227 | jira_processor, 228 | confluence_processor, 229 | ] 230 | ) 231 | 232 | query_engine = index.as_query_engine( 233 | streaming=True, 234 | similarity_top_k=10, 235 | node_postprocessors=[node_processor], 236 | ) 237 | 238 | retriever = index.as_retriever(similarity_top_k=10) 239 | 240 | 241 | # Inference pipeline 242 | while True: 243 | user_prompt = input("Enter your question: ") 244 | 245 | nodes = retriever.retrieve(user_prompt) 246 | count = len(node_processor.get_unauthorized_nodes()) 247 | count_authorized = len(node_processor.get_authorized_nodes()) 248 | 249 | answer = query_engine.query(user_prompt) 250 | # print("Assistant: ", answer) 251 | answer.print_response_stream() # type: ignore 252 | 253 | print("\n=================\n") 254 | print( 255 | f"\nWarning: This answer could be inaccurate as it's missing context from {count} out of {len(nodes)} data sources. Included {count_authorized} sources." 256 | ) 257 | print("\n++++++++++++++++++") 258 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/sources/slack/slack.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import Any, Callable, Generic, List, Optional 4 | 5 | import requests 6 | from slack_sdk import WebClient 7 | from slack_sdk.errors import SlackApiError 8 | 9 | from pangea_multipass.core import ( 10 | FilterOperator, 11 | MetadataFilter, 12 | PangeaGenericNodeProcessor, 13 | PangeaMetadataKeys, 14 | PangeaMetadataValues, 15 | T, 16 | ) 17 | 18 | 19 | class SlackClient: 20 | _actor = "slack_client" 21 | 22 | def __init__(self, logger_name: str = "multipass"): 23 | self.logger = logging.getLogger(logger_name) 24 | 25 | def list_channels(self, token: str) -> List[dict[str, Any]]: 26 | """ 27 | List all channels the authenticated user has access to. 28 | 29 | Args: 30 | token (str): Slack token. 31 | 32 | Returns: 33 | List of channel ids that the authenticated user has access to.
34 | """ 35 | 36 | client = WebClient(token=token) 37 | try: 38 | response = client.conversations_list(types="public_channel,private_channel") 39 | channels: List[dict[str, Any]] = response.get("channels", []) 40 | return channels 41 | except SlackApiError as e: 42 | self._log_error("list_channels", "conversations.list", {}, e.response) 43 | return [] 44 | 45 | def get_channel_members(self, token: str, channel_id: str) -> Optional[List[str]]: 46 | """ 47 | Retrieve the list of members in a Slack channel. 48 | 49 | Args: 50 | token (str): Slack token. 51 | channel_id (str): Channel id to request members. 52 | 53 | Returns: 54 | List of user IDs in the channel. 55 | """ 56 | 57 | client = WebClient(token=token) 58 | try: 59 | response = client.conversations_members(channel=channel_id) 60 | return response["members"] 61 | except SlackApiError as e: 62 | self._log_error("get_channel_members", "conversations.members", {"channel": channel_id}, e.response) 63 | return None 64 | 65 | def get_all_channels(self, token: str) -> Optional[List[str]]: 66 | """ 67 | Retrieve all channels in the workspace. 68 | 69 | Args: 70 | token (str): Slack token 71 | 72 | Returns: 73 | List of channel IDs. 74 | """ 75 | 76 | client = WebClient(token=token) 77 | channels: List[dict[str, Any]] = [] 78 | try: 79 | response = client.conversations_list(types="public_channel,private_channel", limit=1000) 80 | channels = response.get("channels", []) 81 | return [channel["id"] for channel in channels] 82 | except SlackApiError as e: 83 | self._log_error("get_all_channels", "conversations.list", {}, e.response) 84 | return None 85 | 86 | def get_user_id(self, token: str, user_email: str) -> Optional[str]: 87 | """ 88 | Retrieve the Slack user ID for a given email address. 89 | 90 | Args: 91 | token (str): Slack token. 92 | user_email (str): User email to request user id. 93 | 94 | Returns: 95 | User ID or None if the user does not exist. 96 | """ 97 | 98 | client = WebClient(token=token) 99 | try: 100 | response = client.users_lookupByEmail(email=user_email) 101 | return response["user"]["id"] 102 | except SlackApiError as e: 103 | self._log_error("get_user_id", "users.lookupByEmail", {"email": user_email}, e.response) 104 | return None 105 | 106 | def get_channels_for_user(self, token: str, user_id: str, channel_ids: List[str]) -> List[str]: 107 | """ 108 | Check which channels a user has access to. 109 | 110 | Args: 111 | token (str): Slack token. 112 | user_id (str): Slack user id. 113 | channels_ids (List[str]): Channels id to check access for user_id. 114 | 115 | Returns: 116 | List of channel IDs the user has access to. 
117 | """ 118 | client = WebClient(token=token) 119 | accessible_channels = [] 120 | for channel_id in channel_ids: 121 | try: 122 | response = client.conversations_members(channel=channel_id) 123 | members: List[str] = response.get("members", []) 124 | if user_id in members: 125 | accessible_channels.append(channel_id) 126 | except SlackApiError as e: 127 | if e.response["error"] == "not_in_channel": 128 | continue # User is not in this channel 129 | else: 130 | self._log_error( 131 | "get_channels_for_user", "conversations.members", {"channel": channel_id}, e.response 132 | ) 133 | pass 134 | return accessible_channels 135 | 136 | def _log_error(self, function_name: str, url: str, data: dict, response: requests.Response): 137 | self.logger.error( 138 | json.dumps( 139 | { 140 | "actor": SlackClient._actor, 141 | "fn": function_name, 142 | "url": url, 143 | "data": data, 144 | "status_code": response.status_code, 145 | "reason": response.reason, 146 | "text": response.text, 147 | } 148 | ) 149 | ) 150 | 151 | 152 | class SlackProcessor(PangeaGenericNodeProcessor[T], Generic[T]): 153 | _channels_id_cache: dict[str, bool] = {} 154 | _token: str 155 | _user_email: Optional[str] = None 156 | _user_id: Optional[str] = None 157 | 158 | def __init__( 159 | self, 160 | token: str, 161 | get_node_metadata: Callable[[T], dict[str, Any]], 162 | user_email: Optional[str] = None, 163 | logger_name: str = "multipass", 164 | ): 165 | super().__init__() 166 | self._token = token 167 | self._channels_id_cache = {} 168 | self.get_node_metadata = get_node_metadata 169 | self._user_email = user_email 170 | self._client = SlackClient(logger_name) 171 | 172 | def _has_access(self, metadata: dict[str, Any]) -> bool: 173 | """Check if the authenticated user has access to a channel.""" 174 | 175 | channel_id = metadata.get(PangeaMetadataKeys.SLACK_CHANNEL_ID, None) 176 | if channel_id is None: 177 | raise KeyError(f"Invalid metadata key: {PangeaMetadataKeys.SLACK_CHANNEL_ID}") 178 | 179 | if not self._channels_id_cache: 180 | self._load_channels_from_token() 181 | else: 182 | self._load_channels_with_email() 183 | 184 | return self._channels_id_cache.get(channel_id, False) 185 | 186 | def filter( 187 | self, 188 | nodes: List[T], 189 | ) -> List[T]: 190 | """Filter Slack channels by access permissions. 191 | 192 | Args: 193 | nodes (List[T]): List of nodes to process. 194 | 195 | Returns: 196 | List[Any]: Nodes that have authorized access. 197 | """ 198 | 199 | filtered: List[T] = [] 200 | for node in nodes: 201 | if self._is_authorized(node): 202 | filtered.append(node) 203 | return filtered 204 | 205 | def get_filter( 206 | self, 207 | ) -> MetadataFilter: 208 | """Generate a filter based on accessible Slack channel IDs. 209 | 210 | Returns: 211 | MetadataFilter: Filter for Slack channel IDs. 212 | """ 213 | 214 | if not self._user_email: 215 | self._load_channels_from_token() 216 | else: 217 | self._load_channels_with_email() 218 | 219 | channels = list(self._channels_id_cache.keys()) 220 | 221 | return MetadataFilter(key=PangeaMetadataKeys.SLACK_CHANNEL_ID, value=channels, operator=FilterOperator.IN) 222 | 223 | def check_user_access(self, token: str, channel_id: str, user_email: str) -> bool: 224 | """ 225 | Check if a user has access to a specific Slack channel. 226 | 227 | Args: 228 | token (str): Slack token. 229 | channel_id (srt): ID of the Slack channel. 230 | user_email (str): Email of the user to check. 231 | 232 | Returns: 233 | True if the user is a member of the channel, False otherwise. 
234 | """ 235 | 236 | user_id = self._client.get_user_id(token, user_email) 237 | if not user_id: 238 | return False 239 | 240 | channel_members = self._client.get_channel_members(token, channel_id) 241 | if channel_members is None: 242 | return False 243 | 244 | return user_id in channel_members 245 | 246 | def _load_channels_with_email(self) -> None: 247 | if self._channels_id_cache: 248 | return 249 | 250 | if not self._user_id and self._user_email is not None: 251 | self._user_id = self._client.get_user_id(self._token, self._user_email) 252 | 253 | if not self._user_id: 254 | return 255 | 256 | all_channels = self._client.get_all_channels(self._token) 257 | if all_channels is None: 258 | return 259 | 260 | channels = self._client.get_channels_for_user(self._token, user_id=self._user_id, channel_ids=all_channels) 261 | for channel in channels: 262 | self._channels_id_cache[channel] = True 263 | 264 | def _load_channels_from_token(self) -> None: 265 | if self._channels_id_cache: 266 | return 267 | 268 | for channel in self._client.list_channels(self._token): 269 | self._channels_id_cache[channel["id"]] = True 270 | 271 | def _is_authorized(self, node: T) -> bool: 272 | metadata = self.get_node_metadata(node) 273 | return metadata[PangeaMetadataKeys.DATA_SOURCE] == PangeaMetadataValues.DATA_SOURCE_SLACK and self._has_access( 274 | metadata 275 | ) 276 | -------------------------------------------------------------------------------- /packages/pangea-multipass-llama-index/pangea_multipass_llama_index/llama_index.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | from typing import Any, List, Optional 5 | 6 | from google.oauth2.credentials import Credentials 7 | from llama_index.core import Document as LIDocument 8 | from llama_index.core.postprocessor.types import BaseNodePostprocessor 9 | from llama_index.core.schema import NodeWithScore, QueryBundle 10 | from llama_index.core.vector_stores import FilterCondition, FilterOperator, MetadataFilter, MetadataFilters 11 | from pangea_multipass import ( 12 | ConfluenceAuth, 13 | ConfluenceProcessor, 14 | DocumentReader, 15 | DropboxProcessor, 16 | GDriveProcessor, 17 | GitHubProcessor, 18 | GitLabProcessor, 19 | JiraAuth, 20 | JiraProcessor, 21 | ) 22 | from pangea_multipass import MetadataFilter as PangeaMetadataFilter 23 | from pangea_multipass import ( 24 | MultipassDocument, 25 | PangeaGenericNodeProcessor, 26 | PangeaNodeProcessorMixer, 27 | SlackProcessor, 28 | ) 29 | 30 | 31 | class LIDocumentReader(DocumentReader): 32 | """Document reader for Llama Index documents. 33 | 34 | Provides methods for reading content from a Llama Index document. 35 | 36 | Methods: 37 | read(doc: LIDocument) -> str: Reads and returns the content of a Llama Index document. 38 | """ 39 | 40 | def read(self, doc: LIDocument) -> str: 41 | """Reads and returns the content of the given Llama Index document. 42 | 43 | Args: 44 | doc (LIDocument): The Llama Index document to read. 45 | 46 | Returns: 47 | str: The content of the document. 48 | """ 49 | return str(doc.get_content()) 50 | 51 | 52 | # pangea-metadata-llama-index 53 | def get_doc_id(doc: LIDocument) -> str: 54 | """Fetches the document ID from a Llama Index document. 55 | 56 | Args: 57 | doc (LIDocument): The Llama Index document. 58 | 59 | Returns: 60 | str: The document ID. 
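    Example (sketch; ``doc`` is any loaded Llama Index document):

        >>> doc_id = get_doc_id(doc)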
61 | """ 62 | return str(doc.doc_id) 63 | 64 | 65 | def get_node_metadata(node: NodeWithScore) -> dict[str, Any]: 66 | """Fetches metadata from a node with a score. 67 | 68 | Args: 69 | node (NodeWithScore): The node from which metadata is retrieved. 70 | 71 | Returns: 72 | dict[str, Any]: A dictionary containing node metadata. 73 | """ 74 | return dict(node.metadata) 75 | 76 | 77 | def from_multipass(documents: List[MultipassDocument]) -> List[LIDocument]: 78 | li_documents: List[LIDocument] = [] 79 | for doc in documents: 80 | li_doc = LIDocument(doc_id=doc.id, text=doc.content) 81 | li_doc.metadata = doc.metadata 82 | li_documents.append(li_doc) 83 | 84 | return li_documents 85 | 86 | 87 | class LlamaIndexJiraProcessor(JiraProcessor[NodeWithScore]): 88 | """Processor for Jira integration with Llama Index nodes. 89 | 90 | Uses Jira authentication to access nodes. 91 | 92 | Args: 93 | auth (JiraAuth): Jira authentication credentials. 94 | account_id (Optional[str]): Jira user's account id to check issues permissions. 95 | """ 96 | 97 | def __init__(self, auth: JiraAuth, account_id: Optional[str] = None): 98 | super().__init__(auth, get_node_metadata=get_node_metadata, account_id=account_id) 99 | 100 | 101 | class LlamaIndexConfluenceProcessor(ConfluenceProcessor[NodeWithScore]): 102 | """Processor for Confluence integration with Llama Index nodes. 103 | 104 | Uses Confluence authentication to check nodes access. 105 | 106 | Args: 107 | auth (ConfluenceAuth): Confluence authentication credentials. 108 | space_id (Optional[int]): The space ID to filter pages by. 109 | account_id (Optional[str]): User account id to check permissions using admin token. 110 | 111 | """ 112 | 113 | def __init__(self, auth: ConfluenceAuth, space_id: Optional[int] = None, account_id: Optional[str] = None): 114 | super().__init__(auth, get_node_metadata=get_node_metadata, space_id=space_id, account_id=account_id) 115 | 116 | 117 | class LlamaIndexGDriveProcessor(GDriveProcessor[NodeWithScore]): 118 | """Processor for Google Drive integration with Llama Index nodes. 119 | 120 | Uses Google Drive credentials to check nodes access. 121 | 122 | Args: 123 | creds (Credentials): Google OAuth2 credentials. 124 | user_email (Optional[str]): User email to check access to files. 125 | """ 126 | 127 | def __init__(self, creds: Credentials, user_email: Optional[str] = None): 128 | super().__init__(creds, get_node_metadata=get_node_metadata, user_email=user_email) 129 | 130 | 131 | class LlamaIndexGitHubProcessor(GitHubProcessor[NodeWithScore]): 132 | """Processor for GitHub integration with Llama Index nodes. 133 | 134 | Uses GitHub token to check node access. 135 | 136 | Args: 137 | token (str): GitHub classic token. 138 | username (str): GitHub username to check permissions. 139 | """ 140 | 141 | def __init__(self, token: str, username: str): 142 | super().__init__(token, get_node_metadata=get_node_metadata, username=username) 143 | 144 | 145 | class LlamaIndexSlackProcessor(SlackProcessor[NodeWithScore]): 146 | """Processor for Slack integration with Llama Index nodes. 147 | 148 | Uses Slack token to check node access. 149 | 150 | Args: 151 | token (str): Slack token. 152 | user_email (Optional[str]): User email to check access to files. 
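    Example (illustrative; the token and email are placeholders):

        >>> processor = LlamaIndexSlackProcessor(slack_token, user_email="jdoe@example.com")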
153 |     """
154 | 
155 |     def __init__(self, token: str, user_email: Optional[str] = None):
156 |         super().__init__(token, get_node_metadata=get_node_metadata, user_email=user_email)
157 | 
158 | 
159 | class LlamaIndexGitLabProcessor(GitLabProcessor[NodeWithScore]):
160 |     """Processor for GitLab integration with Llama Index nodes.
161 | 
162 |     Uses a GitLab token to check node access.
163 | 
164 |     Args:
165 |         admin_token (str): GitLab administrator token used to check permissions.
166 |         username (str): Username to check access to files.
167 |     """
168 | 
169 |     def __init__(self, admin_token: str, username: str):
170 |         super().__init__(admin_token=admin_token, username=username, get_node_metadata=get_node_metadata)
171 | 
172 | 
173 | class LlamaIndexDropboxProcessor(DropboxProcessor[NodeWithScore]):
174 |     """Processor for Dropbox integration with Llama Index nodes.
175 | 
176 |     Uses Dropbox token to check node access.
177 | 
178 |     Args:
179 |         token (str): Dropbox token.
180 |         user_email (str): User email to check access to files.
181 |     """
182 | 
183 |     def __init__(self, token: str, user_email: str):
184 |         super().__init__(token, user_email=user_email, get_node_metadata=get_node_metadata)
185 | 
186 | 
187 | class NodePostprocessorMixer(BaseNodePostprocessor):
188 |     """Postprocessor mixer for processing nodes with multiple processors.
189 | 
190 |     This class mixes multiple node processors and applies them to Llama Index nodes.
191 | 
192 |     Attributes:
193 |         node_processor (PangeaNodeProcessorMixer[NodeWithScore]): A mixer of node processors.
194 | 
195 |     Methods:
196 |         _postprocess_nodes(nodes: List[NodeWithScore], query_bundle: Optional[QueryBundle] = None) -> List[NodeWithScore]:
197 |             Postprocesses a list of nodes with the mixed processors.
198 |         get_filter() -> MetadataFilters: Gets the metadata filters used for processing nodes.
199 |         get_unauthorized_nodes() -> List[NodeWithScore]: Retrieves nodes that are unauthorized for access.
200 |         get_authorized_nodes() -> List[NodeWithScore]: Retrieves nodes that are authorized for access.
201 |     """
202 | 
203 |     node_processor: PangeaNodeProcessorMixer[NodeWithScore] = PangeaNodeProcessorMixer(get_node_metadata, [])
204 | 
205 |     def __init__(self, node_processors: List[PangeaGenericNodeProcessor[NodeWithScore]]):
206 |         """Initializes the NodePostprocessorMixer with a list of node processors.
207 | 
208 |         Args:
209 |             node_processors (List[PangeaGenericNodeProcessor]): List of node processors to mix and apply.
210 |         """
211 | 
212 |         super().__init__()
213 |         self.node_processor = PangeaNodeProcessorMixer[NodeWithScore](
214 |             get_node_metadata=get_node_metadata,
215 |             node_processors=node_processors,
216 |         )
217 | 
218 |     def _postprocess_nodes(
219 |         self,
220 |         nodes: List[NodeWithScore],
221 |         query_bundle: Optional[QueryBundle] = None,
222 |     ) -> List[NodeWithScore]:
223 |         """Applies postprocessing to a list of nodes using the mixed node processors.
224 | 
225 |         Args:
226 |             nodes (List[NodeWithScore]): The nodes to be postprocessed.
227 |             query_bundle (Optional[QueryBundle]): Query context for processing. Defaults to None.
228 | 
229 |         Returns:
230 |             List[NodeWithScore]: The list of postprocessed nodes.
231 |         """
232 | 
233 |         return self.node_processor.filter(nodes)
234 | 
235 |     def get_filter(
236 |         self,
237 |     ) -> MetadataFilters:
238 |         """Generates metadata filters for processing nodes.
239 | 
240 |         Returns:
241 |             MetadataFilters: A set of metadata filters with an OR condition applied.
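        Example (sketch; ``postprocessor`` is a configured NodePostprocessorMixer and
        ``index`` a Llama Index vector index):

            >>> filters = postprocessor.get_filter()
            >>> retriever = index.as_retriever(filters=filters)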
242 |         """
243 | 
244 |         filters: List[MetadataFilter | MetadataFilters] = []
245 |         for filter in self.node_processor.get_filters():
246 |             filters.append(_convert_metadata_filter_to_llama_index(filter))
247 | 
248 |         return MetadataFilters(filters=filters, condition=FilterCondition.OR)
249 | 
250 |     def get_unauthorized_nodes(
251 |         self,
252 |     ) -> List[NodeWithScore]:
253 |         """Retrieves nodes that are unauthorized for access.
254 | 
255 |         Returns:
256 |             List[NodeWithScore]: List of unauthorized nodes.
257 |         """
258 |         return self.node_processor.get_unauthorized_nodes()
259 | 
260 |     def get_authorized_nodes(
261 |         self,
262 |     ) -> List[NodeWithScore]:
263 |         """Retrieves nodes that are authorized for access.
264 | 
265 |         Returns:
266 |             List[NodeWithScore]: List of authorized nodes.
267 |         """
268 |         return self.node_processor.get_authorized_nodes()
269 | 
270 | 
271 | def _convert_metadata_filter_to_llama_index(input: PangeaMetadataFilter) -> MetadataFilter:
272 |     """Converts a Pangea metadata filter to a Llama Index-compatible filter.
273 | 
274 |     Args:
275 |         input (PangeaMetadataFilter): The Pangea metadata filter to convert.
276 | 
277 |     Returns:
278 |         MetadataFilter: The converted Llama Index metadata filter.
279 |     """
280 |     return MetadataFilter(key=input.key, value=input.value, operator=FilterOperator(input.operator))
281 | 
--------------------------------------------------------------------------------
/packages/pangea-multipass/pangea_multipass/core.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Pangea Cyber Corporation
2 | # Author: Pangea Cyber Corporation
3 | 
4 | import dataclasses
5 | import enum
6 | import hashlib
7 | from abc import ABC, abstractmethod
8 | from secrets import token_hex
9 | from typing import Any, Callable, Generic, List, Sequence, TypeVar
10 | 
11 | T = TypeVar("T")
12 | _PANGEA_METADATA_KEY_PREFIX = "_pangea_"
13 | 
14 | 
15 | def generate_id() -> str:
16 |     return token_hex(20)
17 | 
18 | 
19 | class FilterOperator(str, enum.Enum):
20 |     """Defines operators for filtering metadata."""
21 | 
22 |     IN = "in"  # In array (string or number)
23 |     CONTAINS = "contains"  # metadata array contains value (string or number)
24 |     EQ = "=="  # default operator (string, int, float)
25 |     GT = ">"  # greater than (int, float)
26 |     LT = "<"  # less than (int, float)
27 |     NE = "!="  # not equal to (string, int, float)
28 |     GTE = ">="  # greater than or equal to (int, float)
29 |     LTE = "<="  # less than or equal to (int, float)
30 |     NIN = "nin"  # Not in array (string or number)
31 |     ANY = "any"  # Contains any (array of strings)
32 |     ALL = "all"  # Contains all (array of strings)
33 |     TEXT_MATCH = "text_match"  # full text match (search for a specific substring, token, or phrase within the text field)
34 |     IS_EMPTY = "is_empty"  # the field does not exist or is empty (null or empty array)
35 | 
36 | 
37 | class PangeaMetadataKeys(str, enum.Enum):
38 |     DATA_SOURCE = f"{_PANGEA_METADATA_KEY_PREFIX}data_source"
39 |     FILE_NAME = f"{_PANGEA_METADATA_KEY_PREFIX}file_name"
40 |     FILE_PATH = f"{_PANGEA_METADATA_KEY_PREFIX}file_path"
41 |     CONFLUENCE_PAGE_ID = f"{_PANGEA_METADATA_KEY_PREFIX}confluence_page_id"
42 |     JIRA_ISSUE_ID = f"{_PANGEA_METADATA_KEY_PREFIX}jira_issue_id"
43 |     GDRIVE_FILE_ID = f"{_PANGEA_METADATA_KEY_PREFIX}gdrive_file_id"
44 |     NODE_ID = f"{_PANGEA_METADATA_KEY_PREFIX}node_id"
45 |     GITHUB_REPOSITORY_NAME = f"{_PANGEA_METADATA_KEY_PREFIX}repository_name"
46 |     GITHUB_REPOSITORY_OWNER = f"{_PANGEA_METADATA_KEY_PREFIX}repository_owner"
47 | 
GITHUB_REPOSITORY_OWNER_AND_NAME = f"{_PANGEA_METADATA_KEY_PREFIX}repository_owner_and_name" 48 | SLACK_CHANNEL_ID = f"{_PANGEA_METADATA_KEY_PREFIX}slack_channel_id" 49 | SLACK_CHANNEL_NAME = f"{_PANGEA_METADATA_KEY_PREFIX}slack_channel_name" 50 | SLACK_USER = f"{_PANGEA_METADATA_KEY_PREFIX}slack_user" 51 | SLACK_TIMESTAMP = f"{_PANGEA_METADATA_KEY_PREFIX}slack_timestamp" 52 | GITLAB_REPOSITORY_ID = f"{_PANGEA_METADATA_KEY_PREFIX}gitlab_repository_id" 53 | GITLAB_REPOSITORY_NAME = f"{_PANGEA_METADATA_KEY_PREFIX}gitlab_repository_name" 54 | GITLAB_REPOSITORY_NAMESPACE_WITH_PATH = f"{_PANGEA_METADATA_KEY_PREFIX}gitlab_repository_namespace_with_path" 55 | DROPBOX_ID = f"{_PANGEA_METADATA_KEY_PREFIX}dropbox_id" 56 | DROPBOX_PATH = f"{_PANGEA_METADATA_KEY_PREFIX}path" 57 | DROPBOX_FILE_PATH = f"{_PANGEA_METADATA_KEY_PREFIX}file_path" 58 | 59 | 60 | class PangeaMetadataValues(str, enum.Enum): 61 | DATA_SOURCE_CONFLUENCE = "confluence" 62 | DATA_SOURCE_GDRIVE = "gdrive" 63 | DATA_SOURCE_JIRA = "jira" 64 | DATA_SOURCE_GITHUB = "github" 65 | DATA_SOURCE_SLACK = "slack" 66 | DATA_SOURCE_GITLAB = "gitlab" 67 | DATA_SOURCE_DROPBOX = "dropbox" 68 | 69 | 70 | @dataclasses.dataclass 71 | class MultipassDocument: 72 | id: str 73 | content: str 74 | metadata: dict[str, Any] 75 | 76 | 77 | def get_document_metadata(doc: MultipassDocument) -> dict[str, Any]: 78 | """Fetches metadata from a multipass document. 79 | 80 | Args: 81 | doc (MultipassDocument): The doc from which metadata is retrieved. 82 | 83 | Returns: 84 | dict[str, Any]: A dictionary containing node metadata. 85 | """ 86 | return doc.metadata 87 | 88 | 89 | @dataclasses.dataclass 90 | class MetadataFilter: 91 | """Represents a filter for document metadata.""" 92 | 93 | key: str 94 | value: Any 95 | operator: FilterOperator 96 | 97 | 98 | class DocumentReader(ABC): 99 | """Interface for reading documents.""" 100 | 101 | @abstractmethod 102 | def read(self, doc: Any) -> str: 103 | """Reads and returns content of the document as a string.""" 104 | pass 105 | 106 | 107 | class PangeaGenericNodeProcessor(ABC, Generic[T]): 108 | """Abstract processor for handling nodes with filtering and processing methods.""" 109 | 110 | @abstractmethod 111 | def filter(self, nodes: List[T]) -> List[T]: 112 | """Processes nodes and applies filtering.""" 113 | pass 114 | 115 | @abstractmethod 116 | def get_filter(self) -> MetadataFilter: 117 | """Returns a filter based on the processed nodes' metadata.""" 118 | pass 119 | 120 | 121 | class MetadataEnricher(ABC): 122 | """Interface for generating additional metadata for documents.""" 123 | 124 | _key: str 125 | """Key used in the metadata dictionary for the enrichment. 
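
    Keys without the ``_pangea_`` prefix are prefixed automatically by ``__init__``;
    for example, a key of ``"sha256"`` is stored as ``"_pangea_sha256"``.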
""" 126 | 127 | def __init__(self, key: str): 128 | if not key.startswith(_PANGEA_METADATA_KEY_PREFIX): 129 | key = f"{_PANGEA_METADATA_KEY_PREFIX}{key}" 130 | 131 | self._key = key 132 | 133 | @abstractmethod 134 | def extract_metadata(self, doc: Any, file_content: str) -> dict[str, Any]: 135 | """Generates metadata based on document and its content.""" 136 | pass 137 | 138 | 139 | class MetadataUpdater(ABC): 140 | """Interface for updating document metadata.""" 141 | 142 | @abstractmethod 143 | def update_metadata(self, doc: Any, metadata: dict[str, Any]) -> None: 144 | """Updates document with provided metadata.""" 145 | pass 146 | 147 | 148 | class GenericMetadataUpdater(MetadataUpdater): 149 | """Updates metadata of a Llama Index or Lang Chain Document.""" 150 | 151 | def update_metadata(self, doc: Any, metadata: dict[str, Any]) -> None: 152 | """Updates document metadata with given key-value pairs.""" 153 | doc.metadata.update(metadata) 154 | 155 | 156 | class HasherSHA256(MetadataEnricher): 157 | """Generates SHA-256 hash for the document and adds it to metadata.""" 158 | 159 | def extract_metadata(self, doc: Any, file_content: str) -> dict[str, Any]: 160 | """Returns SHA-256 hash of the document content.""" 161 | return {self._key: hashlib.sha256(file_content.encode()).hexdigest()} 162 | 163 | 164 | class Constant(MetadataEnricher): 165 | """Sets a constant value as metadata for the document.""" 166 | 167 | value: str 168 | 169 | def __init__(self, key: str, value: str): 170 | super().__init__(f"{key}") 171 | self.value = value 172 | 173 | def extract_metadata(self, doc: Any, file_content: str) -> dict[str, Any]: 174 | """Sets a constant value in the metadata.""" 175 | return {self._key: self.value} 176 | 177 | 178 | def enrich_metadata( 179 | documents: Sequence[Any], 180 | metadata_enrichers: List[MetadataEnricher], 181 | reader: DocumentReader, 182 | updater: MetadataUpdater = GenericMetadataUpdater(), 183 | ) -> None: 184 | """Enriches metadata of documents by applying specified enrichers. 185 | 186 | Args: 187 | documents: A sequence of documents to enrich. 188 | metadata_enrichers: List of metadata enrichers to apply. 189 | reader: A reader instance to obtain document content. 190 | updater: Optional updater instance to apply metadata changes. 191 | """ 192 | 193 | for doc in documents: 194 | file_content = reader.read(doc) 195 | 196 | # Add Pangea Node Random ID 197 | updater.update_metadata(doc, {PangeaMetadataKeys.NODE_ID: generate_id()}) 198 | 199 | for enricher in metadata_enrichers: 200 | updater.update_metadata(doc, enricher.extract_metadata(doc, file_content)) 201 | 202 | reader.read(doc) 203 | 204 | 205 | class PangeaNodeProcessorMixer(Generic[T]): 206 | """Combines multiple node processors for authorization filtering. 207 | 208 | Aggregates results from various node processors to create a unified view of authorized and unauthorized nodes. 209 | 210 | Attributes: 211 | _node_processors (List[PangeaGenericNodeProcessor]): List of node processors. 212 | _get_node_metadata (Callable): Function to get node metadata. 213 | _unauthorized_nodes (List[T]): Cached list of unauthorized nodes. 214 | _authorized_nodes (List[T]): Cached list of authorized nodes. 
215 | """ 216 | 217 | _node_processors: List[PangeaGenericNodeProcessor[T]] = [] 218 | _get_node_metadata: Callable[[T], dict[str, Any]] 219 | _unauthorized_nodes: List[T] = [] 220 | _authorized_nodes: List[T] = [] 221 | 222 | def __init__( 223 | self, 224 | get_node_metadata: Callable[[T], dict[str, Any]], 225 | node_processors: List[PangeaGenericNodeProcessor[T]], 226 | ): 227 | self._node_processors = node_processors 228 | self._get_node_metadata = get_node_metadata 229 | 230 | def filter( 231 | self, 232 | nodes: List[T], 233 | ) -> List[T]: 234 | """Process nodes through each processor to filter authorized nodes. 235 | 236 | Args: 237 | nodes (List[T]): List of nodes to process. 238 | 239 | Returns: 240 | List[T]: Nodes that have been authorized across all processors. 241 | """ 242 | 243 | authorized: dict[str, T] = {} 244 | unauthorized: dict[str, T] = {} 245 | for node in nodes: 246 | id = self._get_node_metadata(node).get(PangeaMetadataKeys.NODE_ID, None) 247 | if not id: 248 | raise Exception(f"{PangeaMetadataKeys.NODE_ID} key should be set in node metadata") 249 | 250 | unauthorized[id] = node 251 | 252 | # This works as an OR operator among all node post processors 253 | for npp in self._node_processors: 254 | for node in npp.filter(list(unauthorized.values())): 255 | id = self._get_node_metadata(node).get(PangeaMetadataKeys.NODE_ID) 256 | authorized[id] = unauthorized.pop(id) # type: ignore 257 | 258 | self._unauthorized_nodes = list(unauthorized.values()) 259 | self._authorized_nodes = list(authorized.values()) 260 | return self._authorized_nodes 261 | 262 | def get_filters(self) -> List[MetadataFilter]: 263 | """Retrieve filters from all node processors. 264 | 265 | Returns: 266 | List[MetadataFilter]: List of filters from each processor. 267 | """ 268 | 269 | filters = [] 270 | for np in self._node_processors: 271 | filters.append(np.get_filter()) 272 | 273 | return filters 274 | 275 | def get_unauthorized_nodes( 276 | self, 277 | ) -> List[T]: 278 | """Retrieve nodes that were unauthorized after processing. 279 | 280 | Returns: 281 | List[T]: Unauthorized nodes. 282 | """ 283 | 284 | return self._unauthorized_nodes 285 | 286 | def get_authorized_nodes( 287 | self, 288 | ) -> List[T]: 289 | """Retrieve nodes that were authorized after processing. 290 | 291 | Returns: 292 | List[T]: Authorized nodes. 
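        Note:
            This list, like the one from ``get_unauthorized_nodes``, reflects the most
            recent call to ``filter``; before ``filter`` runs it is empty.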
293 | """ 294 | 295 | return self._authorized_nodes 296 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/sources/dropbox/dropbox.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import Any, Callable, Generic, List, Optional 4 | 5 | import requests 6 | 7 | from pangea_multipass.core import ( 8 | FilterOperator, 9 | MetadataFilter, 10 | PangeaGenericNodeProcessor, 11 | PangeaMetadataKeys, 12 | PangeaMetadataValues, 13 | T, 14 | ) 15 | 16 | 17 | class DropboxClient: 18 | _actor = "dropbox_client" 19 | 20 | AUTH_URL = "https://www.dropbox.com/oauth2/authorize" 21 | TOKEN_URL = "https://api.dropbox.com/oauth2/token" 22 | LIST_FILES_URL = "https://api.dropboxapi.com/2/files/list_folder" 23 | LIST_CONTINUE_URL = "https://api.dropboxapi.com/2/files/list_folder/continue" 24 | 25 | def __init__(self, logger_name: str = "multipass"): 26 | self.logger = logging.getLogger(logger_name) 27 | 28 | def download_file(self, token: str, file_path: str): 29 | """Download a file from Dropbox.""" 30 | 31 | headers = { 32 | "Authorization": f"Bearer {token}", 33 | "Dropbox-API-Arg": json.dumps({"path": file_path}), 34 | } 35 | 36 | url = "https://content.dropboxapi.com/2/files/download" 37 | response = requests.post(url, headers=headers, stream=True) 38 | if response.status_code != 200: 39 | self.logger.error( 40 | json.dumps( 41 | { 42 | "actor": DropboxClient._actor, 43 | "fn": "download_file", 44 | "url": url, 45 | "data": {"path": file_path}, 46 | "status_code": response.status_code, 47 | "reason": response.reason, 48 | "text": response.text, 49 | } 50 | ) 51 | ) 52 | response.raise_for_status() 53 | return response.content 54 | 55 | def check_user_access(self, token: str, file_path: str, user_email: str): 56 | """ 57 | Checks if a user has access to a specific Dropbox file. 58 | 59 | :param token: Admin OAuth token with access to all files. 60 | :param file_path: Path to the file in Dropbox (e.g., "/Documents/file.txt"). 61 | :param user_email: Email of the user whose access needs to be checked. 62 | :return: Boolean indicating whether the user has access. 63 | """ 64 | url = "https://api.dropboxapi.com/2/sharing/list_file_members" 65 | headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"} 66 | data = {"file": file_path} 67 | 68 | response = requests.post(url, json=data, headers=headers) 69 | if response.status_code != 200: 70 | self._log_error("check_user_access", url, data, response) 71 | return False 72 | 73 | response_data = response.json() 74 | self.logger.debug( 75 | json.dumps( 76 | { 77 | "actor": DropboxClient._actor, 78 | "fn": "check_user_access", 79 | "actions": "post", 80 | "url": url, 81 | "data": data, 82 | "response": response_data, 83 | } 84 | ) 85 | ) 86 | 87 | members = response_data.get("users", []) 88 | for member in members: 89 | if member.get("user", {}).get("email", "").lower() == user_email.lower(): 90 | return True 91 | 92 | return False 93 | 94 | def list_shared_folders(self, token: str, user_email: str) -> List[str]: 95 | """ 96 | Lists shared folders that a user has access to in Dropbox. 97 | 98 | :param token: Admin OAuth token with access to all files. 99 | :param user_email: Email of the user whose accessible folders need to be listed. 100 | :return: List of folder paths the user has access to. 
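        Example (illustrative; the token and email are placeholders)::

            client = DropboxClient()
            folders = client.list_shared_folders(token, "jdoe@example.com")
            # e.g., ["/Team Folder", "/Project X"]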
101 |         """
102 | 
103 |         accessible_folders: List[str] = []
104 |         has_more = True
105 |         cursor: Optional[str] = None
106 |         headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
107 | 
108 |         while has_more:
109 |             url = (
110 |                 "https://api.dropboxapi.com/2/sharing/list_folders"
111 |                 if cursor is None
112 |                 else "https://api.dropboxapi.com/2/sharing/list_folders/continue"
113 |             )
114 |             data = {} if cursor is None else {"cursor": cursor}
115 |             response = requests.post(url, json=data, headers=headers)
116 | 
117 |             if response.status_code != 200:
118 |                 self._log_error("list_shared_folders", url, data, response)
119 |                 return accessible_folders
120 | 
121 |             resp_data = response.json()
122 |             self.logger.debug(
123 |                 json.dumps(
124 |                     {
125 |                         "actor": DropboxClient._actor,
126 |                         "fn": "list_shared_folders",
127 |                         "actions": "post",
128 |                         "url": url,
129 |                         "data": data,
130 |                         "response": resp_data,
131 |                     }
132 |                 )
133 |             )
134 | 
135 |             shared_folders = resp_data.get("entries", [])
136 |             cursor = resp_data.get("cursor", None)
137 |             has_more = cursor is not None
138 | 
139 |             for folder in shared_folders:
140 |                 folder_id = folder.get("shared_folder_id")
141 |                 folder_name = folder.get("name")
142 | 
143 |                 members_url = "https://api.dropboxapi.com/2/sharing/list_folder_members"
144 |                 members_data = {"shared_folder_id": folder_id}
145 | 
146 |                 members_response = requests.post(members_url, json=members_data, headers=headers)
147 | 
148 |                 if members_response.status_code == 200:
149 |                     members = members_response.json().get("users", [])
150 |                     for member in members:
151 |                         if member.get("user", {}).get("email", "").lower() == user_email.lower():
152 |                             if not folder_name.startswith("/"):
153 |                                 folder_name = f"/{folder_name}"
154 |                             accessible_folders.append(folder_name)
155 |                             break
156 | 
157 |         return accessible_folders
158 | 
159 |     def list_subfolders(self, token: str, root: str) -> List[str]:
160 |         """
161 |         Lists all subfolders under a root path in Dropbox.
162 | 
163 |         :param token: Admin OAuth token with access to all files.
        :param root: Root folder path whose subfolders are listed ("" lists from the account root).
164 |         :return: List of folder paths found under the root.
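        Example (illustrative; the token is a placeholder)::

            client = DropboxClient()
            subfolders = client.list_subfolders(token, "/team folder")
            # e.g., ["/team folder/reports", "/team folder/reports/2024"]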
165 |         """
166 | 
167 |         folders: List[str] = []
168 |         has_more = True
169 |         cursor: Optional[str] = None
170 |         headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
171 | 
172 |         while has_more:
173 |             url = DropboxClient.LIST_FILES_URL if cursor is None else DropboxClient.LIST_CONTINUE_URL
174 |             data = {"path": root, "recursive": True, "limit": 100}
175 |             if cursor:
176 |                 data = {"cursor": cursor}
177 | 
178 |             response = requests.post(url, headers=headers, json=data)
179 | 
180 |             if response.status_code != 200:
181 |                 self._log_error("list_subfolders", url, data, response)
182 |                 return folders
183 | 
184 |             resp_data = response.json()
185 |             folder_entries = resp_data.get("entries", [])
186 |             cursor = resp_data.get("cursor", None)
187 |             has_more = resp_data.get("has_more", False)
188 | 
189 |             for entry in folder_entries:
190 |                 if entry.get(".tag") != "folder":
191 |                     continue
192 | 
193 |                 folder_path = entry.get("path_lower", "")
194 |                 folders.append(folder_path)
195 | 
196 |         return folders
197 | 
198 |     def _log_error(self, function_name: str, url: str, data: dict, response: requests.Response):
199 |         self.logger.error(
200 |             json.dumps(
201 |                 {
202 |                     "actor": DropboxClient._actor,
203 |                     "fn": function_name,
204 |                     "url": url,
205 |                     "data": data,
206 |                     "status_code": response.status_code,
207 |                     "reason": response.reason,
208 |                     "text": response.text,
209 |                 }
210 |             )
211 |         )
212 | 
213 | 
214 | class DropboxProcessor(PangeaGenericNodeProcessor[T], Generic[T]):
215 |     _access_cache: dict[str, bool] = {}
216 |     _token: str
217 |     _folders: List[str] = []
218 |     _user_email: str
219 | 
220 |     def __init__(
221 |         self,
222 |         token: str,
223 |         user_email: str,
224 |         get_node_metadata: Callable[[T], dict[str, Any]],
225 |         logger_name: str = "multipass",
226 |     ):
227 |         super().__init__()
228 |         self._token = token
229 |         self._access_cache = {}
230 |         self.get_node_metadata = get_node_metadata
231 |         self._user_email = user_email
232 |         self.logger = logging.getLogger(logger_name)
233 |         self._client = DropboxClient(logger_name)
234 | 
235 |     def _has_access(self, metadata: dict[str, Any]) -> bool:
236 |         """Check if the authenticated user has access to a file."""
237 | 
238 |         path = metadata.get(PangeaMetadataKeys.DROPBOX_FILE_PATH, "")
239 |         if not path:
240 |             raise KeyError(f"Invalid metadata key: {PangeaMetadataKeys.DROPBOX_FILE_PATH}")
241 | 
242 |         has_access = self._access_cache.get(path, None)
243 |         if has_access is not None:
244 |             return has_access
245 | 
246 |         has_access = self._client.check_user_access(token=self._token, file_path=path, user_email=self._user_email)
247 | 
248 |         self._access_cache[path] = has_access
249 |         return has_access
250 | 
251 |     def filter(
252 |         self,
253 |         nodes: List[T],
254 |     ) -> List[T]:
255 |         """Filter Dropbox files by access permissions.
256 | 
257 |         Args:
258 |             nodes (List[T]): List of nodes to process.
259 | 
260 |         Returns:
261 |             List[Any]: Nodes that have authorized access.
262 |         """
263 | 
264 |         filtered: List[T] = []
265 |         for node in nodes:
266 |             if self._is_authorized(node):
267 |                 filtered.append(node)
268 |         return filtered
269 | 
270 |     def get_filter(
271 |         self,
272 |     ) -> MetadataFilter:
273 |         """Generate a filter based on accessible Dropbox paths.
274 | 
275 |         Returns:
276 |             MetadataFilter: Filter for Dropbox paths.
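        Example (sketch; assumes a valid admin token and user email were provided at
        construction time, and ``get_metadata`` is a metadata accessor):

            >>> processor = DropboxProcessor(token, "jdoe@example.com", get_node_metadata=get_metadata)
            >>> dropbox_filter = processor.get_filter()  # IN filter over accessible folder paths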
277 | """ 278 | 279 | if not self._folders: 280 | shared_folders = self._client.list_shared_folders(self._token, self._user_email) 281 | folders = {value: True for value in shared_folders} 282 | 283 | for folder in shared_folders: 284 | subfolders = self._client.list_subfolders(self._token, folder) 285 | folders.update({value: True for value in subfolders}) 286 | 287 | self._access_cache = folders 288 | self._folders = list(folders.keys()) 289 | 290 | return MetadataFilter(key=PangeaMetadataKeys.DROPBOX_PATH, value=self._folders, operator=FilterOperator.IN) 291 | 292 | def _is_authorized(self, node: T) -> bool: 293 | metadata = self.get_node_metadata(node) 294 | return metadata[ 295 | PangeaMetadataKeys.DATA_SOURCE 296 | ] == PangeaMetadataValues.DATA_SOURCE_DROPBOX and self._has_access(metadata) 297 | -------------------------------------------------------------------------------- /packages/pangea-multipass/pangea_multipass/sources/jira/jira.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Pangea Cyber Corporation 2 | # Author: Pangea Cyber Corporation 3 | 4 | import dataclasses 5 | from typing import Any, Callable, Generic, List, Optional 6 | from urllib.parse import urljoin 7 | 8 | import requests 9 | from requests.auth import HTTPBasicAuth 10 | from requests.exceptions import HTTPError 11 | 12 | from pangea_multipass.core import ( 13 | _PANGEA_METADATA_KEY_PREFIX, 14 | FilterOperator, 15 | MetadataEnricher, 16 | MetadataFilter, 17 | PangeaGenericNodeProcessor, 18 | PangeaMetadataKeys, 19 | PangeaMetadataValues, 20 | T, 21 | ) 22 | 23 | 24 | @dataclasses.dataclass 25 | class JiraAuth: 26 | """Holds authentication details for Jira API.""" 27 | 28 | email: str 29 | token: str 30 | url: str 31 | 32 | 33 | class JiraME(MetadataEnricher): 34 | """Jira Metadata Enricher. 35 | 36 | Enriches metadata for documents using data fetched from Jira, like issue assignments and reporter details. 37 | 38 | Attributes: 39 | _url (str): URL for the Jira instance. 40 | _email (str): Email for authenticating with Jira. 41 | _api_token (str): API token for Jira access. 42 | _auth (JiraAuth): Authentication details for Jira. 43 | """ 44 | 45 | _url: str 46 | _email: str 47 | _api_token: str 48 | _auth: JiraAuth 49 | 50 | def __init__(self, url: str, email: str, api_token: str): 51 | self._url = url.rstrip("/") 52 | self._email = email 53 | self._api_token = api_token 54 | self._auth = JiraAuth(email, api_token, self._url) 55 | 56 | def extract_metadata(self, doc: Any, file_content: str) -> dict[str, Any]: 57 | """Fetch Jira-related metadata for the document. 58 | 59 | Args: 60 | doc (Any): The document to enrich with metadata. 61 | file_content (str): The content of the file. 62 | 63 | Returns: 64 | dict[str, Any]: Extracted metadata including issue ID, assignee, and reporter details. 
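        Example (illustrative sketch; the domain, email, and token are placeholders):

            >>> enricher = JiraME("your-domain.atlassian.net", "admin@example.com", api_token)
            >>> metadata = enricher.extract_metadata(doc, file_content)
            >>> PangeaMetadataKeys.JIRA_ISSUE_ID in metadata
            True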
65 |         """
66 | 
67 |         metadata: dict[str, Any] = {}
68 | 
69 |         # This step is to normalize some attributes across platforms
70 |         metadata[PangeaMetadataKeys.DATA_SOURCE] = PangeaMetadataValues.DATA_SOURCE_JIRA
71 |         metadata[PangeaMetadataKeys.FILE_NAME] = doc.metadata.get("title", "")
72 | 
73 |         id = doc.metadata.get("id", "")
74 |         if not id:
75 |             raise Exception("invalid metadata: missing 'id' key")
76 | 
77 |         metadata[PangeaMetadataKeys.JIRA_ISSUE_ID] = id
78 | 
79 |         # New metadata
80 |         issue = JiraAPI.get_issue(self._auth, id)
81 |         # Sometimes a field is present but null, so handle that case as well
82 |         fields = issue.get("fields", {})
83 |         if fields is None:
84 |             fields = {}
85 |         assignee = fields.get("assignee", {})
86 |         if assignee is None:
87 |             assignee = {}
88 |         reporter = fields.get("reporter", {})
89 |         if reporter is None:
90 |             reporter = {}
91 | 
92 |         metadata[f"{_PANGEA_METADATA_KEY_PREFIX}jira_assignee_account_id"] = assignee.get("accountId", "")
93 |         metadata[f"{_PANGEA_METADATA_KEY_PREFIX}jira_assignee_name"] = assignee.get("displayName", "")
94 |         metadata[f"{_PANGEA_METADATA_KEY_PREFIX}jira_reporter_account_id"] = reporter.get("accountId", "")
95 |         metadata[f"{_PANGEA_METADATA_KEY_PREFIX}jira_reporter_name"] = reporter.get("displayName", "")
96 | 
97 |         return metadata
98 | 
99 | 
100 | class JiraProcessor(PangeaGenericNodeProcessor[T], Generic[T]):
101 |     """Processes Jira documents for access control.
102 | 
103 |     Filters Jira documents based on issue ID permissions and caches access results.
104 | 
105 |     Attributes:
106 |         auth (JiraAuth): Jira authentication details.
107 |         issue_ids_cache (dict[str, bool]): Cache of access status for Jira issue IDs.
108 |         issue_ids_list (List[str]): List of authorized Jira issue IDs.
109 |         get_node_metadata (Callable): Function to retrieve metadata for nodes.
110 |     """
111 | 
112 |     auth: JiraAuth
113 |     issue_ids_cache: dict[str, bool]
114 |     issue_ids_list: List[str]
115 |     get_node_metadata: Callable[[T], dict[str, Any]]
116 |     _account_id: Optional[str]
117 | 
118 |     def __init__(
119 |         self, auth: JiraAuth, get_node_metadata: Callable[[T], dict[str, Any]], account_id: Optional[str] = None
120 |     ):
121 |         super().__init__()
122 |         self.auth = auth
123 |         self.issue_ids_cache = {}
        self.issue_ids_list = []  # must be initialized here; get_filter() populates it lazily
124 |         self.get_node_metadata = get_node_metadata
125 |         self._account_id = account_id
126 | 
127 |     def filter(
128 |         self,
129 |         nodes: List[T],
130 |     ) -> List[Any]:
131 |         """Filter Jira nodes by access permissions.
132 | 
133 |         Args:
134 |             nodes (List[T]): List of nodes to process.
135 | 
136 |         Returns:
137 |             List[Any]: Nodes that have authorized access.
138 |         """
139 | 
140 |         filtered: List[T] = []
141 |         if not self._account_id:
142 |             for node in nodes:
143 |                 if self._is_authorized(node):
144 |                     filtered.append(node)
145 |             return filtered
146 | 
147 |         issues = []
148 |         for node in nodes:
149 |             metadata = self.get_node_metadata(node)
150 |             if metadata[PangeaMetadataKeys.DATA_SOURCE] == PangeaMetadataValues.DATA_SOURCE_JIRA:
151 |                 issues.append(int(metadata.get(PangeaMetadataKeys.JIRA_ISSUE_ID, "")))
152 |                 filtered.append(node)
153 | 
154 |         allowed_issues = JiraAPI.get_allowed_issues(self.auth, self._account_id, issues)
155 |         return list(
156 |             filter(
157 |                 lambda x: (int(self.get_node_metadata(x).get(PangeaMetadataKeys.JIRA_ISSUE_ID, ""))) in allowed_issues,
158 |                 filtered,
159 |             )
160 |         )
161 | 
162 |     def get_filter(
163 |         self,
164 |     ) -> MetadataFilter:
165 |         """Generate a filter based on accessible Jira issue IDs.
166 | 
167 |         Returns:
168 |             MetadataFilter: Filter for Jira issue IDs.
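        Example (sketch; ``auth`` is a configured JiraAuth and ``get_metadata`` a metadata accessor):

            >>> processor = JiraProcessor(auth, get_node_metadata=get_metadata)
            >>> jira_filter = processor.get_filter()  # IN filter over all visible issue IDs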
169 | """ 170 | 171 | if not self.issue_ids_list: 172 | self.issue_ids_list = JiraAPI.get_issue_ids(self.auth) 173 | return MetadataFilter( 174 | key=PangeaMetadataKeys.JIRA_ISSUE_ID, value=self.issue_ids_list, operator=FilterOperator.IN 175 | ) 176 | 177 | def _is_authorized(self, node: T) -> bool: 178 | metadata = self.get_node_metadata(node) 179 | return metadata[PangeaMetadataKeys.DATA_SOURCE] == PangeaMetadataValues.DATA_SOURCE_JIRA and self._has_access( 180 | metadata 181 | ) 182 | 183 | def _has_access(self, metadata: dict[str, Any]) -> bool: 184 | id = metadata.get(PangeaMetadataKeys.JIRA_ISSUE_ID, None) 185 | if id is None: 186 | raise KeyError("Invalid metadata key") 187 | 188 | access = self.issue_ids_cache.get(id, None) 189 | if access is not None: 190 | return access 191 | 192 | try: 193 | JiraAPI.get_issue(self.auth, id) 194 | access = True 195 | except HTTPError as e: 196 | if e.response is None or e.response.status_code == 404: 197 | access = False 198 | 199 | if access is None: 200 | return False 201 | 202 | self.issue_ids_cache[id] = access 203 | return access 204 | 205 | 206 | class JiraAPI: 207 | @staticmethod 208 | def _get(auth: JiraAuth, path: str, params: dict[str, Any] = {}) -> dict[str, Any]: 209 | """ 210 | Makes a request to the Jira API. 211 | 212 | Args: 213 | auth (JiraAuth): The authentication credentials for Jira. 214 | path (str): The API path to send the request to. 215 | params (dict, optional): The query parameters for the request. 216 | 217 | Returns: 218 | dict: The JSON response from the Jira API. 219 | """ 220 | 221 | basic_auth = HTTPBasicAuth(auth.email, auth.token) 222 | url = urljoin(f"https://{auth.url}", path) 223 | response = requests.get(url, headers={"Accept": "application/json"}, params=params, auth=basic_auth) 224 | response.raise_for_status() 225 | return response.json() 226 | 227 | @staticmethod 228 | def _post(auth: JiraAuth, path: str, body: dict[str, Any] = {}) -> dict[str, Any]: 229 | headers = {"Accept": "application/json", "Content-Type": "application/json"} 230 | 231 | basic_auth = HTTPBasicAuth(auth.email, auth.token) 232 | 233 | response = requests.request( 234 | "POST", urljoin(f"https://{auth.url}", path), json=body, headers=headers, auth=basic_auth 235 | ) 236 | 237 | response.raise_for_status() 238 | return response.json() 239 | 240 | @staticmethod 241 | def get_issue(auth: JiraAuth, issue_id: str) -> dict[str, Any]: 242 | """ 243 | Retrieves details of a specific Jira issue. 244 | 245 | Args: 246 | auth (JiraAuth): The authentication credentials for Jira. 247 | issue_id (str): The ID of the Jira issue to retrieve. 248 | 249 | Returns: 250 | dict: The JSON response containing issue details. 251 | """ 252 | 253 | return JiraAPI._get(auth, f"/rest/api/3/issue/{issue_id}") 254 | 255 | @staticmethod 256 | def myself(auth: JiraAuth) -> dict[str, Any]: 257 | """ 258 | Retrieves the profile information of the currently authenticated user in Jira. 259 | 260 | Args: 261 | auth (JiraAuth): The authentication credentials for Jira. 262 | 263 | Returns: 264 | dict: A dictionary containing the authenticated user's profile information. 265 | 266 | Raises: 267 | HTTPError: If the request to Jira fails. 268 | """ 269 | return JiraAPI._get(auth, "/rest/api/3/myself") 270 | 271 | @staticmethod 272 | def search(auth: JiraAuth, params: dict[str, Any] = {}) -> dict[str, Any]: 273 | """ 274 | Searches for issues in Jira using specified query parameters. 
275 | 
276 |         This method provides a way to search for issues in Jira, returning a paginated list
277 |         of issues based on search criteria defined in the `params` argument. The parameters
278 |         can be customized to filter issues based on various criteria such as project, status,
279 |         labels, etc.
280 | 
281 |         Args:
282 |             auth (JiraAuth): The authentication credentials for Jira.
283 |             params (dict, optional): A dictionary of query parameters for customizing the search.
284 |                 Default is an empty dictionary.
285 | 
286 |         Returns:
287 |             dict: A dictionary containing the search results, including issue details and pagination info.
288 | 
289 |         Raises:
290 |             HTTPError: If the request to Jira fails.
291 |         """
292 |         return JiraAPI._get(auth, "/rest/api/3/search", params)
293 | 
294 |     @staticmethod
295 |     def get_issue_ids(auth: JiraAuth) -> List[str]:
296 |         """
297 |         Retrieves the IDs of all issues in Jira.
298 | 
299 |         This method iterates through all issues in the Jira instance and retrieves their IDs.
300 |         It paginates through results if there are more issues than the `max_results` limit.
301 | 
302 |         Args:
303 |             auth (JiraAuth): The authentication credentials for Jira.
304 | 
305 |         Returns:
306 |             List[str]: A list of all issue IDs in the Jira instance.
307 |         """
308 | 
309 |         max_results = 50
310 |         start_at = 0
311 |         keep_iterating = True
312 |         issue_ids: List[str] = []
313 | 
314 |         while keep_iterating:
315 |             params = {
316 |                 "query": "",
317 |                 "maxResults": max_results,
318 |                 "startAt": start_at,
319 |                 "fields": ["id"],
320 |             }
321 | 
322 |             resp = JiraAPI.search(auth, params)
323 |             issues = resp.get("issues", [])
324 |             total = resp.get("total", 0)
325 | 
326 |             ids = [issue["id"] for issue in issues]
327 |             issue_ids.extend(ids)
328 | 
329 |             start_at = start_at + len(ids)
330 |             keep_iterating = start_at < total
331 | 
332 |         return issue_ids
333 | 
334 |     @staticmethod
335 |     def get_permission_check(auth: JiraAuth, account_id: str, issues: List[int]) -> dict[str, Any]:
336 |         body = {
337 |             "accountId": account_id,
338 |             "projectPermissions": [
339 |                 {
340 |                     "issues": issues,
341 |                     "permissions": ["EDIT_ISSUES"],
342 |                 }
343 |             ],
344 |         }
345 | 
346 |         return JiraAPI._post(auth=auth, path="rest/api/3/permissions/check", body=body)
347 | 
348 |     @staticmethod
349 |     def get_allowed_issues(auth: JiraAuth, account_id: str, issues: List[int]) -> List[int]:
350 |         resp = JiraAPI.get_permission_check(auth, account_id, issues)
        # Guard against an empty permissions list to avoid an IndexError
        permissions = resp.get("projectPermissions", [])
351 |         return permissions[0].get("issues", []) if permissions else []
352 | 
--------------------------------------------------------------------------------