├── .gitattributes ├── .gitignore ├── .pre-commit-config.yaml ├── .python-version ├── LICENSE ├── README.md ├── __main__.py ├── git-frontend ├── .gitignore ├── README.md ├── package-lock.json ├── package.json ├── public │ ├── favicon.ico │ ├── index.html │ ├── logo192.png │ ├── logo512.png │ ├── manifest.json │ └── robots.txt └── src │ ├── App.css │ ├── App.js │ ├── App.test.js │ ├── index.css │ ├── index.js │ ├── logo.svg │ ├── reportWebVitals.js │ └── setupTests.js ├── images └── self_portrait.jpg ├── main.py ├── measure_process_template_diffusion.ipynb ├── papers ├── 2203.15544.pdf └── README.md ├── pyproject.toml ├── random.ipynb ├── requirements.txt ├── setup.cfg ├── src ├── __init__.py ├── alembic.ini ├── alembic │ └── env.py ├── ast_parsers │ └── python_ast_parser.py ├── backend │ ├── config.py │ ├── database.py │ └── memory_tree_manager.py ├── community_detection │ └── extract_connected_components.py ├── config.py ├── configs │ ├── config1.yaml │ └── config2.yaml ├── dynamic_import.py ├── fetch_repo.py ├── ml_models │ ├── __init__.py │ ├── graph_networks │ │ ├── gnn_head.py │ │ ├── kernels.py │ │ └── message_passing.py │ ├── topic_modeling │ │ └── topic_model.py │ └── transformers │ │ ├── category_decomposition.py │ │ └── semantic_graph_context_generator.py ├── notebooks │ ├── bottom_up_vs_top_down.ipynb │ ├── community_structure.ipynb │ ├── github_repo_parser.ipynb │ └── inference_me.md ├── pipeline.py └── system_prompts │ └── format_system_prompts.py └── test_config.yaml /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto eol=lf 2 | *.ipynb linguist-documentation 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .venv* 2 | venv 3 | .env 4 | *.pyc 5 | *.swp 6 | *.egg-info 7 | __pycache__ 8 | .DS_Store 9 | work 10 | context 11 | 12 | # ignore jupyter notebook checkpoints 13 | .ipynb_checkpoints 14 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # WARNING! 2 | # 3 | # When making changes to auto-formatters used in pre-commit hooks, you are 4 | # likely to cause merge conflicts with main and/or other pull requests. 5 | # Fixing them might revert other people's work. Expect pain! 6 | # To avoid accidental reversions and keep it easy to review, please make sure 7 | # that changes here are in a pull request by themselves, that it consists of 8 | # two commits: 9 | # 10 | # 1. The changes to this file 11 | # 2. Changes made by running `python3 -m pre_commit run --all-files`. 12 | # 13 | # Then each time your pull request is blocked by a merge conflict, do the 14 | # following steps: 15 | # 16 | # git reset HEAD^1 && git checkout -f # discard the change commit 17 | # git rebase main # re-apply other people's changes 18 | # python3 -m pre_commit run --all-files # re-run the rules 19 | # git add . # add the newly changed files 20 | # git commit -m 'apply pre-commit' # commit it 21 | # git push -f # force push back to your branch 22 | # 23 | # Keep in mind you may have to do this a few times, as changes here may impact 24 | # other pull requests. Try to keep it up-to-date so they can go in when it'll 25 | # cause least disruption. 26 | # 27 | # /WARNING! 
28 | 29 | exclude: build|stubs|^bot/templates/$|openassistant/templates|docs/docs/api/openapi.json 30 | 31 | repos: 32 | - repo: https://github.com/pre-commit/pre-commit-hooks 33 | rev: v4.4.0 34 | hooks: 35 | - id: trailing-whitespace 36 | - id: check-ast 37 | - id: check-yaml 38 | # Always check YAML but skip a few YAML files that are auto-generated 39 | # and which break the standard YAML check. The alternative would be to 40 | # skip any unsafe errors (and thus break YAML compatibility) or use 41 | # some other checker that may not work in general. 42 | exclude: ^copilot/.*/addons/.*$ 43 | - id: check-json 44 | - id: check-case-conflict 45 | - id: detect-private-key 46 | - id: fix-encoding-pragma 47 | args: [--remove] 48 | - id: forbid-submodules 49 | - id: mixed-line-ending 50 | - id: requirements-txt-fixer 51 | - id: check-executables-have-shebangs 52 | - id: check-shebang-scripts-are-executable 53 | - id: check-byte-order-marker 54 | - id: check-symlinks 55 | - id: check-merge-conflict 56 | - id: check-added-large-files 57 | args: [--maxkb=1024] 58 | - id: end-of-file-fixer 59 | 60 | - repo: https://github.com/psf/black 61 | rev: 22.12.0 62 | hooks: 63 | - id: black-jupyter 64 | 65 | - repo: https://github.com/pycqa/flake8 66 | rev: 6.0.0 67 | hooks: 68 | - id: flake8 69 | exclude: "^src/(ml_models/topic_modeling/topic_model|__init__)\\.py$" 70 | 71 | - repo: https://github.com/pycqa/isort 72 | rev: 5.12.0 73 | hooks: 74 | - id: isort 75 | 76 | - repo: https://github.com/pre-commit/mirrors-prettier 77 | rev: v2.7.1 78 | hooks: 79 | - id: prettier 80 | args: 81 | [ 82 | --prose-wrap=always, 83 | --write, 84 | --ignore-path=./website/.prettierignore, 85 | ] 86 | 87 | - repo: local 88 | hooks: 89 | - id: next-lint-website 90 | name: Lint website 91 | files: ^website/ 92 | exclude: ^website/node_modules/ 93 | types_or: [javascript, jsx, ts, tsx] 94 | language: node 95 | pass_filenames: false 96 | entry: website/next-lint.js 97 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.10.8 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # GitModel (Graph Intelligence Tensor)
2 | 
3 | ![Self Portrait](images/self_portrait.jpg)
4 | 
5 | GitModel is a pipeline that generates high-quality topic models of GitHub repos using GPT, GNN
6 | message-passing layers, and BERTopic.
7 | 
8 | ⚠️ Warning: not under active development. A more expansive system is being actively developed at https://github.com/Neural-Dragon-AI (currently private).
9 | A preview from the org, including the openbugger and babydragon packages, can be seen here: https://www.youtube.com/live/BbUOoGfpjvQ?feature=share&t=18645
10 | - Generate documentation for Python repos and create enriched subgraphs for your
11 |   codebases.
12 | - Swap system-prompt tasks (bug hunting, TODO extraction, documentation labeling, etc.) for
13 |   enriching the semantic graph and building datasets.
14 | - The generated data is saved to the `context` folder.
15 |   - System prompts are formatted in `src/system_prompts/format_system_prompts.py`; the "with tree"
16 |     variant works but currently requires changing one line of code by hand (to be fixed).
17 | - Pre-commit hooks for isort, prettier, private-key detection, black, flake8, and more.
18 | - GNN message passing and a topic-modeling pipeline as an inductive bias (GRNN); a minimal sketch
19 |   appears under the first example below.
20 | - BERTopic is highly customizable and can compose different clustering, embedding, vectorizer,
21 |   bag-of-words, and dimensionality-reduction techniques (also sketched below).
22 | 
23 | 
24 | 
25 | - Contributing guidelines reference: https://github.com/huggingface/diffusers/blob/main/CONTRIBUTING.md
26 | 
27 | ## How to Use
28 | 
29 | ```bash
30 | python3.10 -m venv venv && source venv/bin/activate && pip install -U pip setuptools wheel
31 | pip install -r requirements.txt
32 | python __main__.py
33 | ```
34 | 
35 | **main.py**
36 | 
37 | ```python
38 | import argparse
39 | from getpass import getpass
40 | 
41 | import openai
42 | 
43 | from src import Pipeline
44 | 
45 | if __name__ == "__main__":
46 |     argsparse = argparse.ArgumentParser()
47 |     argsparse.add_argument("--config", type=str, default="./test_config.yaml")
48 |     argsparse.add_argument("--repo", type=str, default="https://github.com/danielpatrickhug/GitModel.git")
49 |     argsparse.add_argument("--repo_name", type=str, default="gitmodel")
50 | 
51 |     args = argsparse.parse_args()
52 | 
53 |     openai_secret = getpass("Enter the secret key: ")
54 |     # Set up OpenAI API credentials
55 |     openai.api_key = openai_secret
56 | 
57 |     print("starting pipeline")
58 |     pipeline = Pipeline.from_yaml(args.config)
59 |     gnn_head_outputs, topic_model_outputs = pipeline.run(args.repo, args.repo_name)
60 |     for i, topic_model_output in enumerate(topic_model_outputs):
61 |         topic_model_output["data"].to_csv(f"context/{args.repo_name}_topic_model_outputs_{i}.csv")
62 |         topic_model_output["topic_info"].to_csv(f"context/{args.repo_name}_topic_info_{i}.csv")
63 |         with open(f"context/{args.repo_name}_tree_{i}.txt", "w", encoding="utf-8") as f:
64 |             f.write(topic_model_output["tree"])
65 | ```
66 | 
67 | ## Topic model your dependencies
68 | 
69 | If you have enough patience (or the money for more than one computer), run GitModel on `/venv/lib/python3.10/site-packages`.
70 | 
71 | 
72 | ## Examples
73 | 
74 | ### GitModel
75 | 
76 | - https://github.com/danielpatrickhug/GitModel
77 | - Uses DeepMind's CLRS topic tree in the system prompt during semantic graph
78 |   generation.
79 | 
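Before the generated tree, two short sketches of the steps its topics keep referring to: the kernel/message-passing utilities (`src/ml_models/graph_networks`) and the composed BERTopic stage (`src/ml_models/topic_modeling`). These are illustrative only; the helper names (`cosine_adjacency`, `k_hop_message_passing`) and all parameter values are assumptions, not the repo's exact API.

A thresholded cosine-similarity kernel over the node embeddings yields an adjacency matrix, and k-hop message passing then mixes each node's features with those of its neighbours:

```python
# Minimal sketch of the kernel + k-hop message-passing idea (hypothetical helper names).
import numpy as np


def cosine_adjacency(embeddings: np.ndarray, threshold: float = 0.65) -> np.ndarray:
    """Pairwise cosine similarity between row embeddings, zeroed below `threshold`."""
    normed = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    sim = normed @ normed.T
    adj = np.where(sim >= threshold, sim, 0.0)
    np.fill_diagonal(adj, 0.0)  # drop self-loops
    return adj


def k_hop_message_passing(adj: np.ndarray, features: np.ndarray, k: int = 2) -> np.ndarray:
    """Apply k rounds of degree-normalised neighbour aggregation to the node features."""
    out = features.astype(float)
    degree = np.maximum(adj.sum(axis=1, keepdims=True), 1e-12)  # avoid division by zero
    for _ in range(k):
        out = (adj @ out) / degree
    return out
```

The enriched documents then go into a composed BERTopic model. The combination below (SentenceTransformer embeddings, UMAP, HDBSCAN, CountVectorizer) is one plausible configuration rather than the pipeline's fixed one, and it uses a public stand-in corpus so the snippet runs end to end; in GitModel the documents would instead be the parsed code chunks and their GPT-generated summaries. `get_topic_tree` renders ASCII trees like the ones shown in these examples.

```python
# Sketch of a composed BERTopic stage; component choices and the corpus are stand-ins.
from bertopic import BERTopic
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

# Stand-in corpus; the real pipeline feeds repo-derived documents instead.
docs = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes")).data[:2000]

topic_model = BERTopic(
    embedding_model=SentenceTransformer("all-MiniLM-L6-v2"),
    umap_model=UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric="cosine"),
    hdbscan_model=HDBSCAN(min_cluster_size=15, prediction_data=True),
    vectorizer_model=CountVectorizer(stop_words="english"),
)
topics, probs = topic_model.fit_transform(docs)
print(topic_model.get_topic_info())
print(topic_model.get_topic_tree(topic_model.hierarchical_topics(docs)))
```

80 | ```
81 | 
82 | .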
├─Function description and comparison including Gaussian kernel and sparse 83 | matrices\_**\_ │ ├─■──Understanding the Purpose and Handling of a Function for 84 | Sparse Matrices with Inputs, Outputs, and P ── Topic: 9 │ └─Understanding 85 | kernels and functions in the 'kernels.py' file for estimating PDF and computing 86 | simila │ ├─■──Purpose and functions of kernel-related functions in kernels.py 87 | file of a Python program, including ── Topic: 22 │ └─■──Understanding the 88 | cos_sim and cos_sim_torch functions in kernels.py file\_\_** ── Topic: 25 89 | └─Graph message passing and adjacency matrix computation using embeddings\_**\_ 90 | ├─k-hop message passing and cosine similarity kernel computation for graph 91 | embeddings\_\_** │ ├─k-hop message passing with adjacency matrix and node 92 | features\_**\_ │ │ ├─Computation of Gaussian Kernel Matrix between Two Sets of 93 | Embeddings using PyTorch\_\_** │ │ │ ├─■──Cosine Similarity with PyTorch Tensors 94 | and Functional.\_**_ ── Topic: 1 │ │ │ └─■──Function to compute adjacency matrix 95 | for embeddings using specified kernel type and threshold value_ ── Topic: 19 │ │ 96 | └─Message Passing and K-hop Aggregation in Graphs using Sparse Matrices and Node 97 | Features\_\_** │ │ ├─■──Document pruning and adjacency matrix recomputation 98 | using embeddings and thresholding\_**\_ ── Topic: 11 │ │ └─k-hop message passing 99 | and adjacency matrix computation in sparse graphs.\_\_** │ │ ├─■──Computing 100 | graph laplacian and degree matrix from pairwise distances using a given 101 | function.\_**\_ ── Topic: 7 │ │ └─■──Message Passing with K-hop Adjacency and 102 | Aggregated Features in Sparse Matrices\_\_** ── Topic: 8 │ └─"Outlier Reduction 103 | Using Count-TF-IDF and OpenAI Representation Model"\_**\_ │ ├─Topic Modeling and 104 | Outlier Reduction in Natural Language Processing (NLP)\_\_** │ │ ├─Understanding 105 | the compose\*inference function in a chatbot system.\_**\_ │ │ │ ├─■──Processing 106 | conversation transcripts with Python functions\_\_** ── Topic: 18 │ │ │ 107 | └─Understanding the compose_inference function in a chatbot conversation with 108 | message templates\_\*\*\* │ │ │ ├─■──Understanding the `compose_inference` 109 | Function in Chatbot Conversation Generation with OpenAI GPT\_** ── Topic: 2 │ │ 110 | │ └─■──Function to create prompt message template with role and text input 111 | parameters and validation of rol ── Topic: 17 │ │ └─Outlier Reduction with 112 | Machine Learning Models\_**\_ │ │ ├─Document processing and reduction techniques 113 | for topic modeling with various machine learning models │ │ │ ├─MiniLM language 114 | model for sentence embedding\_\_** │ │ │ │ ├─■──Embedding sentences using MiniLM 115 | language model with multiprocessing and GPU acceleration\_**\_ ── Topic: 15 │ │ 116 | │ │ └─■──Embedding Sentences using Pre-Trained Language Model with 117 | SentenceTransformer Library\_\_** ── Topic: 23 │ │ │ └─■──Topic modeling 118 | algorithms and document reduction techniques\_**\_ ── Topic: 0 │ │ └─SQLalchemy 119 | migrations in online mode with engine configuration\_\_** │ │ ├─■──Probability 120 | Density Estimation with Gaussian Kernel Density Estimator\_**\_ ── Topic: 12 │ │ 121 | └─Running database migrations with SQLAlchemy and Alembic\_\_** │ │ ├─■──Graph 122 | network message passing & Mobile App Navigation System Design\_**\_ ── Topic: 21 123 | │ │ └─■──Running migrations with SQLAlchemy and Alembic in online mode\_\_** ── 124 | Topic: 6 │ └─Class Settings definition using 
BaseSettings and its purpose for 125 | managing configuration in a third-p │ ├─■──Empty class definition for managing 126 | application settings using Pydantic's BaseSettings\_**\_ ── Topic: 3 │ 127 | └─■──MemoryTreeManager class implementation\_\_** ── Topic: 16 └─Codebase 128 | decomposition and analysis with Git repository and AST nodes.\_**\_ ├─Code 129 | decomposition and processing in Git repositories.\_\_** │ ├─■──Python code 130 | parsing and analysis\_**\_ ── Topic: 4 │ └─Code decomposition in a Git 131 | repository\_\_** │ ├─■──Decomposing Git Repositories with System Prompts.\_**\_ 132 | ── Topic: 10 │ └─Parsing and pruning files in a GitHub repository\_\_** │ 133 | ├─■──parsing and pruning files in a local Git repository\_**\_ ── Topic: 5 │ 134 | └─■──purpose of `get_repo_contents` function in `repo_graph_generation.py` for 135 | retrieving and pruning Git ── Topic: 24 └─Analyzing chatbot main capabilities in 136 | a codebase using natural language processing and notable fram ├─■──summarizing 137 | code in a GitHub repository using ChatGPT\_\_** ── Topic: 14 └─Understanding 138 | Codebase Structure and Functionality with Hierarchical Trees and 139 | Frameworks\_**\_ ├─■──Analyzing codebase structure and functionalities using a 140 | hierarchical topic tree\_\_\*\* ── Topic: 13 └─■──Understanding the difference 141 | between format_system_prompts and format_system_prompts_with_tree in a ── Topic: 142 | 20 143 | 144 | ``` 145 | 146 | w/ graph code bert embeddings 147 | 148 | ``` 149 | 150 | . ├─"The Pipeline Class and Its Methods in GitModel Project"\_**\_ │ 151 | ├─Probability Density Estimation using Gaussian KDE in SciPy\_\_** │ │ 152 | ├─Probability density function estimation using Gaussian kernel density 153 | estimation\_**\_ │ │ │ ├─■──Probability density estimation with Gaussian 154 | kernel\_\_** ── Topic: 16 │ │ │ └─■──Understanding cos\*sim_torch function and 155 | configuring context with URL and target metadata\_**\_ ── Topic: 14 │ │ └─Empty 156 | class definition for MessageTreeManagerConfiguration in Python\_\_** │ │ ├─Empty 157 | class definition in MessageTreeManagerConfiguration with BaseModel 158 | inheritance.\_**\_ │ │ │ ├─■──Questions about bug fixing with system prompts in 159 | kernel computation with tensors and matrices.\_\_** ── Topic: 13 │ │ │ └─Empty 160 | class definitions and inability to determine expected behavior of 161 | MemoryTreeManager class\_**\_ │ │ │ ├─■──Purpose of run_migrations_online in 162 | Alembic environment file\_\_** ── Topic: 12 │ │ │ └─■──Empty class definition of 163 | MessageTreeManagerConfiguration inheriting from BaseModel\_\*\*\* ── Topic: 25 │ 164 | │ └─Understanding the purpose of SemanticGraphContextGenerator and TopicModel 165 | classes in the codebase\_** │ │ ├─■──Purpose of Pipeline class in codebase with 166 | SemanticGraphContextGenerator, MessageTreeManagerConfigur ── Topic: 15 │ │ 167 | └─■──Understanding the purpose and usage of TopicModel class in dimensional 168 | tensors and input shape setti ── Topic: 20 │ └─GitModel Pipeline class with 169 | find_files_with_substring method\_**\_ │ ├─GitModel Pipeline Class and 170 | find_files_with_substring Method Description\_\_** │ │ ├─■──Understanding the 171 | `clone_and_create_context_folder` Function\_**\_ ── Topic: 4 │ │ └─GitModel 172 | Pipeline class and methods for searching files with substring\_\_** │ │ 173 | ├─GitModel Pipeline class and methods for file searching\_**\_ │ │ │ ├─■──Python 174 | class for loading and initializing configuration values from a 
YAML file with 175 | dynamic imports ── Topic: 9 │ │ │ └─■──The Pipeline class and its methods in 176 | GitModel project configuration and file searching.\_\_** ── Topic: 10 │ │ 177 | └─■──Python Pipeline Class for Generating a Semantic Graph Context for Git 178 | Repository Data Processing\_**\_ ── Topic: 8 │ └─■──Cloning and Storing 179 | Repository in "Work" Folder with Custom Name using Python Function\_\_** ── 180 | Topic: 22 └─Understanding the purpose and input of a Pipeline class in a project 181 | involving semantic graphs and e ├─Topic Modeling with Hierarchical Topics and 182 | Outlier Reduction Strategies in Python\_**\_ │ ├─Working with context folders 183 | and creating directories using os module.\_\_** │ │ ├─■──Creating a work folder 184 | and cloning a repository to create a context folder in Python\_**\_ ── Topic: 18 185 | │ │ └─■──Working with context and folder paths in Python\_\_** ── Topic: 3 │ 186 | └─■──Topic modeling and representation using hierarchical and ctfidf 187 | models\_**\_ ── Topic: 5 └─PyTorch function for computing Gaussian kernel matrix 188 | and k-hop message passing on an adjacency matr ├─Compute k-hop adjacency matrix 189 | and aggregated features using message passing in graph analysis.\_\_** │ ├─k-hop 190 | message passing with adjacency matrix and node features\_**\_ │ │ ├─■──Document 191 | Pruning and Adjacency Matrix Recomputation\_\_** ── Topic: 23 │ │ └─Computing 192 | k-hop adjacency matrix with message passing in graph neural networks.\_**\_ │ │ 193 | ├─■──Computing k-hop adjacency matrix and aggregated features using message 194 | passing\_\_** ── Topic: 0 │ │ └─■──GNNHead class for computing kernel matrix 195 | with node features in numpy array\_**\_ ── Topic: 1 │ └─Data Migrations in 196 | Offline Mode.\_\_** │ ├─■──Degree matrix computation using adjacency distance 197 | matrix and pairwise distances in Python\_**\_ ── Topic: 21 │ └─■──SQLAlchemy 198 | migration in 'offline' mode\_\_** ── Topic: 11 └─Understanding code inputs and 199 | purpose in a Pipeline class\_**\_ ├─Parsing Python files using AST module and 200 | extracting specific information\_\_** │ ├─■──Cosine Similarity Computation using 201 | PyTorch and NumPy\_**\_ ── Topic: 6 │ └─■──Python code parsing and data 202 | extraction using AST\_\_** ── Topic: 17 └─Code Structure and Purpose of Pipeline 203 | Class with Config and Semantic Graph Context Generator in Pyt ├─Code for a 204 | Pipeline with Semantic Graph Context Generator\_**\_ │ ├─■──Understanding 205 | Pipeline Class and Semantic Graph Context Generation in Python Code\_\_** ── 206 | Topic: 24 │ └─■──Summarizing code in a GitHub repository using ChatGPT\_**\_ ── 207 | Topic: 2 └─Semantic Graph Context Generator Class and Methods\_\_** 208 | ├─■──Semantic Graph Context Generation for Git Repositories.\_**\_ ── Topic: 19 209 | └─■──Implementation of class instantiation using configuration and dictionary 210 | mapping.\_\_\*\* ── Topic: 7 211 | 212 | ``` 213 | 214 | ### DeepMind CLRS 215 | 216 | - https://github.com/deepmind/clrs 217 | 218 | ``` 219 | 220 | . 
├─Purpose and Attributes of the `Net` Class in Graph Neural Networks\_**\_ │ 221 | ├─Graph Attention Networks and DAG Shortest Paths in JAX.\_\_** │ │ ├─Graph 222 | Attention Networks (GAT and GATv2) code implementation\_**\_ │ │ │ ├─Code for 223 | DAG shortest path and depth-first search algorithms\_\_** │ │ │ │ ├─■──String 224 | Matching and Maximum Subarray\_**\_ ── Topic: 10 │ │ │ │ └─Depth-First Search 225 | and DAG Shortest Path Algorithms implemented in Python\_\_** │ │ │ │ 226 | ├─■──Description of string probe functions in probing.py file for Hash Table 227 | probing.\_**\_ ── Topic: 1 │ │ │ │ └─Graph Algorithms - DFS and DAG Shortest 228 | Paths\_\_** │ │ │ │ ├─■──Graph algorithms (DFS and DAG shortest path) in 229 | Python\_**\_ ── Topic: 0 │ │ │ │ └─■──Functions for decoding diff and graph 230 | features in PyTorch graph neural networks.\_\_** ── Topic: 6 │ │ │ └─■──Graph 231 | Attention Networks (GAT and GATv2)\_**\_ ── Topic: 20 │ │ └─■──Message Passing 232 | with \_MessagePassingScanState, \_MessagePassingOutputChunked and 233 | MessagePassingStateC ── Topic: 17 │ └─Implementing a Baseline Model with 234 | Selectable Message Passing Algorithm and its Dataset Sampler.\_\_** │ ├─Handling 235 | of untrained parameters in optimization updates\_**\_ │ │ ├─■──Updating 236 | parameters with filtered gradients from multiple algorithms.\_\_** ── Topic: 8 │ 237 | │ └─■──Processing trajectory hints with variable-length time dimension using 238 | batching.\_**\_ ── Topic: 9 │ └─Processing time-chunked data with batched 239 | samplers and message passing nets.\_\_** │ ├─Model processing of time-chunked 240 | data with dataset sampling and batch processing\_**\_ │ │ ├─■──CLRS dataset 241 | download and URL retrieval on Google Cloud Platform\_\_** ── Topic: 13 │ │ 242 | └─Chunked data and dataset sampling with JAX.\_**\_ │ │ ├─■──JAX functions for 243 | reshaping and restacking data for pmap computation\_\_** ── Topic: 4 │ │ └─Data 244 | chunking with batched sampling and message passing in neural networks.\_**\_ │ │ 245 | ├─Processing time-chunked data with batch samplers and a NetChunked class\_\_** 246 | │ │ │ ├─■──Time-chunked data processing using BaselineModelChunked and 247 | NetChunked in TensorFlow.\_**\_ ── Topic: 2 │ │ │ └─■──Creating samplers for 248 | training data.\_\_** ── Topic: 11 │ │ └─■──Documented code for sampling 249 | algorithms using randomized position generation.\_**\_ ── Topic: 3 │ └─■──Point 250 | Sampling and Convex Hull Computation\_\_** ── Topic: 18 └─Loss functions for 251 | training with time-chunked data\_**\_ ├─Loss calculation for time-chunked and 252 | full-sample training.\_\_** │ ├─Code functions for evaluating predictions using 253 | permutations and masking\_**\_ │ │ ├─■──Functions for Evaluating Predictions in 254 | Probing Tasks.\_\_** ── Topic: 7 │ │ └─■──permutation pointer manipulation and 255 | reduction in predictions\_**\_ ── Topic: 16 │ └─Loss calculation and decoder 256 | output postprocessing in neural networks.\_\_** │ ├─■──Postprocessing with 257 | Sinkhorn operator in log space\_**\_ ── Topic: 15 │ └─■──Loss calculation 258 | methods for training with time and full samples\_\_** ── Topic: 19 └─Functions 259 | for expanding and broadcasting JAX arrays\_**\_ ├─Description and input/output 260 | parameters of \_expand_to and \_is_not_done_broadcast functions\_\_** │ 261 | ├─■──Array expansion and broadcasting techniques\_**\_ ── Topic: 21 │ 262 | └─■──Purpose and Functionality of \_is_not_done_broadcast Function\_\_** ── 263 | Topic: 14 
└─Sampler classes and associated data generation types\_**\_ 264 | ├─■──Understanding Parameters and Expected Input/Output of Various Functions 265 | (including mst_prim, floyd_w ── Topic: 5 └─■──Sampling classes and their data 266 | generation purpose\_\_** ── Topic: 12 267 | 268 | ``` 269 | 270 | - recurrent generation augmented with the above topic tree in system prompt 271 | 272 | ``` 273 | 274 | . ├─DFS and DAG Shortest Paths Algorithm Implementation with Probing\_**\_ │ 275 | ├─■──Bipartite matching-based flow networks\_\_** ── Topic: 34 │ └─Search and 276 | Shortest Path Algorithms\_**\_ │ ├─DAG shortest path algorithm with probing and 277 | initialization\_\_** │ │ ├─■──Strongly Connected Components Algorithm with 278 | Kosaraju's Implementation\_**\_ ── Topic: 37 │ │ └─Graph Sampling and DAG 279 | Shortest Path Algorithm\_\_** │ │ ├─■──Bipartite matching using Edmonds-Karp 280 | algorithm\_**\_ ── Topic: 18 │ │ └─■──Random graph generation using Bellman-Ford 281 | algorithm in Python\_\_** ── Topic: 0 │ └─Graham scan convex hull algorithm 282 | implementation in Python\_**\_ │ ├─■──Maximum subarray algorithm 283 | implementation\_\_** ── Topic: 6 │ └─■──Graham scan convex hull algorithm 284 | implementation\_**\_ ── Topic: 12 └─Postprocessing Decoder Output for Chunked 285 | Data Processing Net\_\_** ├─Postprocessing Decoder Output with Chunked Data in 286 | JAX\_**\_ │ ├─Functions and Files in Probing.py Explained\_\_** │ │ ├─Functions 287 | and techniques for data splitting and replication in probing and pmap 288 | computation.\_**\_ │ │ │ ├─Understanding the strings_pair_cat function and 289 | split_stages function in probing.py file\_\_** │ │ │ │ ├─TFDS CLRSDataset 290 | Command-Line Tool for Sampling Datasets\_**\_ │ │ │ │ │ ├─■──CLRS30 dataset and 291 | related functions explanation\_\_** ── Topic: 5 │ │ │ │ │ └─■──TFDS CLRSDataset 292 | Builder Implementation\_**\_ ── Topic: 16 │ │ │ │ └─Functions and Probing in 293 | Python Code\_\_** │ │ │ │ ├─Purpose of the `split_stages` function in 294 | `probing.py` and related functions for evaluating 295 | `ProbesD │ │ │ │ │ ├─Functions for evaluating hint and output predictions using permutation objects and dictionaries.____ │ │ │ │ │ │ ├─Processing randomized `pos`input in a sampler with pointers and permutations.____ │ │ │ │ │ │ │ ├─■──Process randomization of`pos`input in algorithms including string algorithms____ ── Topic: 29 │ │ │ │ │ │ │ └─■──A function to replace should-be permutations with proper permutation pointers using a sample iterato ── Topic: 19 │ │ │ │ │ │ └─Function for Evaluating Permutation Predictions using Hint Data____ │ │ │ │ │ │ ├─■──Function to Reduce Permutations in a Dictionary of Result Objects____ ── Topic: 11 │ │ │ │ │ │ └─■──Function to evaluate hint predictions with tuple and list inputs____ ── Topic: 17 │ │ │ │ │ └─Understanding probing functions in Hash Table implementation____ │ │ │ │ │ ├─Hash Table Probing Functions in probing.py File____ │ │ │ │ │ │ ├─■──Splitting ProbesDict into DataPoints by stage in Python____ ── Topic: 14 │ │ │ │ │ │ └─■──Understanding Hash Table Probing Functions (strings_pi, strings_pos, strings_pair_cat) in Python's` 296 | ── Topic: 1 │ │ │ │ │ └─■──Functions for Checking Input Dimensions in Machine 297 | Learning Models\_**_ ── Topic: 15 │ │ │ │ └─JAX pmap reshaping and computation 298 | functions (\_pmap_reshape, \_maybe_pmap_reshape, \_maybe_pmap_data)_ │ │ │ │ 299 | ├─JAX pmap computation and pytree reshaping\_\_** │ │ │ │ │ ├─■──Purpose and 300 | attributes of the 
Stage and OutputClass classes\_**\_ ── Topic: 22 │ │ │ │ │ 301 | └─■──JAX tree reshaping for pmap computation with \_pmap_reshape and 302 | \_maybe_pmap_reshape functions\_\_** ── Topic: 3 │ │ │ │ └─Numpy array copying 303 | functions with assertions\_**\_ │ │ │ │ ├─■──Functions for copying data between 304 | numpy arrays in Python\_\_** ── Topic: 21 │ │ │ │ └─■──Function Purpose and 305 | Parameters Analysis in Codebase\_**\_ ── Topic: 9 │ │ │ └─Trajectory Batching 306 | with Variable-Length Time Dimension\_\_** │ │ │ ├─■──Trajectory Batching and 307 | Concatenation\_**\_ ── Topic: 35 │ │ │ └─■──Batch processing of variable-length 308 | hint trajectories.\_\_** ── Topic: 31 │ │ └─Understanding the 309 | `_is_not_done_broadcast` function and its input/output parameters.\_**\_ │ │ 310 | ├─■──Understanding the \_is_not_done_broadcast function in JAX array for 311 | sequence completion.\_\_** ── Topic: 8 │ │ └─■──Array broadcasting and expansion 312 | with \_expand_and_broadcast_to and \_expand_to functions\_**\_ ── Topic: 27 │ 313 | └─Postprocessing Decoder Output with Sinkhorn Algorithm and Hard 314 | Categorization\_\_** │ ├─Node Feature Decoding with Encoders and Decoders\_**\_ 315 | │ │ ├─■──Position Encoding Function for Natural Language Processing\_\_** ── 316 | Topic: 23 │ │ └─Node feature decoding using decoders and edge features\_**\_ │ │ 317 | ├─■──Creating Encoders with Xavier Initialization and Truncated Normal 318 | Distribution for Encoding Categori ── Topic: 33 │ │ └─Node feature decoding with 319 | decoders and edge features\_\_** │ │ ├─■──Node feature decoding and encoding 320 | with decoders and edge features\_**\_ ── Topic: 2 │ │ └─■──Graph diff 321 | decoders\_\_** ── Topic: 32 │ └─Postprocessing of decoder output in graph neural 322 | networks.\_**\_ │ ├─Decoder Output Postprocessing with Sinkhorn Algorithm and 323 | Cross-Entropy Loss\_\_** │ │ ├─Message Passing Net with Time-Chunked Data 324 | Processing\_**\_ │ │ │ ├─■──Python Class for Message Passing Model with 325 | Selectable Algorithm\_\_** ── Topic: 26 │ │ │ └─■──NetChunked message passing 326 | operation with LSTM states for time-chunked data\_**\_ ── Topic: 7 │ │ └─Loss 327 | calculation for time-chunked training with scalar truth data.\_\_** │ │ ├─Loss 328 | calculation function for time-chunked training with scalar truth data.\_**\_ │ │ 329 | │ ├─■──Loss calculation for time-chunked training data\_\_** ── Topic: 4 │ │ │ 330 | └─■──Logarithmic Sinkhorn Operator for Permutation Pointer Logits\_**\_ ── 331 | Topic: 10 │ │ └─■──Decoder postprocessing with Sinkhorn operator\_\_** ── Topic: 332 | 28 │ └─Gradient Filtering for Optimizer Updates\_**\_ │ ├─■──Filtering processor 333 | parameters in Haiku models\_\_** ── Topic: 30 │ └─■──Filtering null gradients 334 | for untrained parameters during optimization.\_**\_ ── Topic: 24 └─PGN with Jax 335 | implementation and NeurIPS 2020 paper\_\_** ├─Message-Passing Neural Network 336 | (MPNN) for Graph Convolutional Networks (GCNs)\_**\_ │ ├─■──"Applying Triplet 337 | Message Passing with HK Transforms in MPNN for Graph Neural Networks"\_\_** ── 338 | Topic: 20 │ └─■──Implementation of Deep Sets (Zaheer et al., NeurIPS 2017) using 339 | adjacency matrices and memory networ ── Topic: 13 └─GATv2 Graph Attention 340 | Network with adjustable sizes of multi-head attention and residual connections 341 | ├─■──Graph Attention Network v2 architecture with adjustable head number and 342 | output size.\_**\_ ── Topic: 36 └─■──Processor factory with various models and 343 | 
configurations\_\_** ── Topic: 25 344 | 345 | ``` 346 | 347 | ## Langchain 348 | 349 | - https://github.com/hwchase17/langchain 350 | 351 | ``` 352 | 353 | . ├─Combining documents with different chain types and LLM chains\_**\_ │ 354 | ├─MapReduce Chain Loading and Combining\_\_** │ │ ├─Question answering chain 355 | with sources loading and combining\_**\_ │ │ │ ├─■──Loading question answering 356 | with sources chain with multiple loader mappings and chains.\_\_** ── Topic: 53 357 | │ │ │ └─■──Loading and Combining Documents with Language Models for Summarizing 358 | and QA\_**\_ ── Topic: 71 │ │ └─Map Reduce Chain Loading Function\_\_** │ │ 359 | ├─Document Refinement using LLM Chains\_**\_ │ │ │ ├─■──Combining Documents with 360 | Stuffing and LLM Chain in Python\_\_** ── Topic: 97 │ │ │ 361 | └─BaseQAWithSourcesChain document handling and processing.\_**\_ │ │ │ 362 | ├─■──Question Answering with Sources over Documents Chain\_\_** ── Topic: 60 │ │ 363 | │ └─■──Python class for chatbot with vector database and question 364 | generation\_**\_ ── Topic: 16 │ │ └─MapReduce chain implementation\_\_** │ │ 365 | ├─■──MapReduceDocumentsChain document combination with chaining and 366 | mapping\_**\_ ── Topic: 12 │ │ └─■──MapReduce Chain Loading Function\_\_** ── 367 | Topic: 95 │ └─LLMBashChain document examples and related keywords\_**\_ │ ├─Bash 368 | operations and language modeling chain implementation\_\_** │ │ 369 | ├─LLMSummarizationCheckerChain document samples\_**\_ │ │ │ ├─■──Working with 370 | SQL databases in Python using SQLDatabaseChain\_\_** ── Topic: 46 │ │ │ 371 | └─Document processing with LLMSummarizationCheckerChain\_**\_ │ │ │ 372 | ├─■──Implementation of Program-Aided Language Models with PALChain class and 373 | related prompts and assertio ── Topic: 31 │ │ │ 374 | └─■──LLMSummarizationCheckerChain class and its functionality\_\_** ── Topic: 93 375 | │ │ └─LLMBashChain - interpreting prompts and executing bash code\_**\_ │ │ 376 | ├─■──LLMMathChain - Python code execution for math prompts\_\_** ── Topic: 92 │ 377 | │ └─■──Bash execution with LLMBashChain\_**\_ ── Topic: 80 │ └─■──MRKLChain 378 | implementation with ChainConfig and API integration\_\_** ── Topic: 59 └─Code 379 | organization and structure in Python including several classes related to 380 | self-hosted embedding ├─Code organization and improvement suggestions for a 381 | class definition.\_**\_ │ ├─Code Loading and Organization Best Practices\_\_** │ 382 | │ ├─Web scraping Hacker News webpage titles\_**\_ │ │ │ ├─Loading files using 383 | unstructured in Python\_\_** │ │ │ │ ├─Unstructured file loading with retry and 384 | partitioning capabilities.\_**\_ │ │ │ │ │ ├─■──Retry Decorator for OpenAI API 385 | Calls\_\_** ── Topic: 45 │ │ │ │ │ └─Unstructured File Loading and 386 | Partitioning\_**\_ │ │ │ │ │ ├─■──Unstructured File Loader for Partitioning 387 | Files in Various Formats\_\_** ── Topic: 25 │ │ │ │ │ └─■──Loading files with 388 | Unstructured package in different modes (Python code).\_**\_ ── Topic: 26 │ │ │ 389 | │ └─PDF manipulation in Python with pypdf, pdfminer, fitz and pymupdf 390 | libraries\_\_** │ │ │ │ ├─■──PDF file loading and text extraction using PyMuPDF 391 | and PDFMiner\_**\_ ── Topic: 69 │ │ │ │ └─■──Extracting Text from Paged PDF 392 | using PyPDF and PDFMiner\_\_** ── Topic: 96 │ │ │ └─Extracting Hacker News 393 | Webpage Information using WebBaseLoader and BeautifulSoup.\_**\_ │ │ │ ├─■──Web 394 | scraping Hacker News with BeautifulSoup and WebBaseLoader\_\_** ── Topic: 21 
│ │ 395 | │ └─■──Web Scraping for College Confidential and Lyrics Websites\_**\_ ── Topic: 396 | 76 │ │ └─Code organization and structure in various Python modules\_\_** │ │ 397 | ├─Compliments on clear and structured codebase with good use of type hints for 398 | memory handling and con │ │ │ ├─Implementation of ReAct paper using ReActChain 399 | with examples in Python\_**\_ │ │ │ │ ├─■──Implementation of ReAct paper in 400 | ReActChain agent with OpenAI LLC model and tools\_\_** ── Topic: 101 │ │ │ │ 401 | └─In-memory Docstore for Efficient Lookup and Exploration\_**\_ │ │ │ │ 402 | ├─■──Document Store Exploration with DocstoreExplorer\_\_** ── Topic: 87 │ │ │ │ 403 | └─■──InMemoryDocstore for Storing and Searching Documents with 404 | AddableMixin\_**\_ ── Topic: 61 │ │ │ └─Compliments on Code Readability and 405 | Organization in Python Codebase.\_\_** │ │ │ ├─Memory Handling and Conversation 406 | Management\_**\_ │ │ │ │ ├─Memory Conversation Summarizer Implementation\_\_** │ 407 | │ │ │ │ ├─Memory and Conversation Summarization in AI-assisted dialogues.\_**\_ 408 | │ │ │ │ │ │ ├─■──Purpose of ChatPromptValue class in chat.py\_\_** ── Topic: 30 409 | │ │ │ │ │ │ └─■──Memory management and conversation summarization in AI chatbot 410 | system.\_**\_ ── Topic: 6 │ │ │ │ │ └─■──Implementation of Chain class with 411 | CallbackManager and Memory attributes.\_\_** ── Topic: 52 │ │ │ │ └─Potential 412 | bugs and suggestions for loading LLM, few-shot prompts, and examples from JSON 413 | and YAML fi │ │ │ │ ├─Code structure and organization tips for loading examples 414 | and templates from files in Python.\_**\_ │ │ │ │ │ ├─Compliments on code 415 | structure and organization\_\_** │ │ │ │ │ │ ├─■──Loading few-shot prompts from 416 | config with prefix and suffix templates\_**\_ ── Topic: 34 │ │ │ │ │ │ └─Code 417 | organization and structure for creating chat prompt templates\_\_** │ │ │ │ │ │ 418 | ├─■──Chat prompt template and message prompt templates for generating chatbot 419 | prompts.\_**\_ ── Topic: 8 │ │ │ │ │ │ └─■──Purpose of `_load_prompt_from_file` 420 | function in loading.py module.\_\_** ── Topic: 13 │ │ │ │ │ └─■──Function for 421 | Loading a Chain of LLM Checkers from a Configuration Dictionary.\_**\_ ── Topic: 422 | 3 │ │ │ │ └─Documented class definitions for tools used in handling API 423 | requests, including OpenSearchVectorSear │ │ │ │ ├─Handling API requests using 424 | tools such as RequestsPostTool and OpenSearchVectorSearch\_\_** │ │ │ │ │ 425 | ├─Python requests wrapper for making HTTP requests with various tools and 426 | methods\_**\_ │ │ │ │ │ │ ├─■──DeepInfra API token and text generation model 427 | wrapper\_\_** ── Topic: 41 │ │ │ │ │ │ └─RequestsWrapper and BaseRequestsTool 428 | for making HTTP requests (POST, GET, PATCH, DELETE) to API endp │ │ │ │ │ │ 429 | ├─■──Checking Validity of Template Strings with Input Variables and Formatter 430 | Mapping\_**\_ ── Topic: 14 │ │ │ │ │ │ └─■──Requests tools for making HTTP 431 | requests with Python\_\_** ── Topic: 10 │ │ │ │ │ └─Code organization and 432 | positive feedback\_**\_ │ │ │ │ │ ├─Bing Search API Wrapper and Handler 433 | Classes\_\_** │ │ │ │ │ │ ├─■──Langchain callback manager and codebase 434 | organization\_**\_ ── Topic: 2 │ │ │ │ │ │ └─■──Bing Search API Wrapper and SERP 435 | API Usage in Python\_\_** ── Topic: 1 │ │ │ │ │ └─Handling iFixit devices with 436 | models and remote hardware\_**\_ │ │ │ │ │ ├─■──Loading iFixit repair guides and 437 | device wikis with transformer model 
inference.\_\_** ── Topic: 0 │ │ │ │ │ 438 | └─■──Potential Issues with Modifying Input Dictionary in a Prompt Loading 439 | Function\_**\_ ── Topic: 9 │ │ │ │ └─Implementation and Usage of 440 | SearxSearchWrapper with Environment Variables and SSL Support\_\_** │ │ │ │ 441 | ├─Python Libraries for API Wrappers and Search Engines\_**\_ │ │ │ │ │ ├─Python 442 | packages for integrating with search engines: SearxSearchWrapper and 443 | QdrantClient.\_\_** │ │ │ │ │ │ ├─■──Implementation of Searx API Wrapper 444 | (SearxSearchWrapper) using Python's BaseModel with QdrantClient ── Topic: 33 │ │ 445 | │ │ │ │ └─■──Handling environment variables and dictionaries with 446 | get*from_dict_or_env function\_**\_ ── Topic: 72 │ │ │ │ │ └─Purpose and Issues 447 | with `print_text` Function in `langchain` Repository's `input.py` File\_\_** │ │ 448 | │ │ │ ├─■──Printing Highlighted Text with Options in Python\_**\_ ── Topic: 51 │ 449 | │ │ │ │ └─■──Converting Python Objects to String Representation with Nested 450 | Structures and Joining on Newline Cha ── Topic: 66 │ │ │ │ └─GitbookLoader class 451 | and its methods\_\_** │ │ │ │ ├─■──Handling newlines recursively in data 452 | structures using pandas\_**\_ ── Topic: 29 │ │ │ │ └─GitBookLoader class for 453 | loading web pages with options to load all or single pages\_\_** │ │ │ │ 454 | ├─■──GitbookLoader class for loading single or multiple pages from GitBook with 455 | relative paths in the nav ── Topic: 28 │ │ │ │ └─■──Length-Based Example 456 | Selection and Text Length Calculation\_**\_ ── Topic: 57 │ │ │ └─Ngram overlap 457 | score using sentence_bleu and method1 smoothing function\_\_** │ │ │ ├─Ngram 458 | overlap score using sentence_bleu method1 smoothing function and auto 459 | reweighting\_**\_ │ │ │ │ ├─■──Code structure and organization in langchain 460 | document loaders with support for parsing comma-separat ── Topic: 70 │ │ │ │ 461 | └─Ngram overlap score using sentence_bleu and method1 smoothing function with 462 | auto reweighting in nltk │ │ │ │ ├─■──Compliments on well-structured and 463 | organized code in different classes and methods\_\_** ── Topic: 65 │ │ │ │ 464 | └─■──Sentence BLEU score and ngram overlap computation with method1 smoothing 465 | function and auto reweighti ── Topic: 49 │ │ │ └─Model Definition and 466 | Experimentation with Datetime and UTCNow Attributes\_**\_ │ │ │ ├─■──Data 467 | Modeling with Time Zones in Python\_\_** ── Topic: 91 │ │ │ └─■──Constitutional 468 | Principles and Tracing in Python\_**\_ ── Topic: 68 │ │ └─Text splitting for 469 | knowledge triple extraction\_\_** │ │ ├─Text Splitting Toolkit\_**\_ │ │ │ 470 | ├─Text splitting interface and implementation\_\_** │ │ │ │ ├─Python REPL Tool 471 | and AST Implementation\_**\_ │ │ │ │ │ ├─Python REPL Tool Implementation\_\_** │ 472 | │ │ │ │ │ ├─SQL database metadata retrieval tool\_**\_ │ │ │ │ │ │ │ ├─■──Python 473 | function to concatenate cell information for AI and human usage\_\_** ── Topic: 474 | 44 │ │ │ │ │ │ │ └─SQL database metadata tool for listing table schema and 475 | metadata\_**\_ │ │ │ │ │ │ │ ├─■──SQL database metadata extraction tool for 476 | specified tables\_\_** ── Topic: 75 │ │ │ │ │ │ │ └─■──JSON and SQL database 477 | tools for listing and getting values\_**\_ ── Topic: 15 │ │ │ │ │ │ └─Python 478 | REPL Tool using AST and Coroutine\_\_** │ │ │ │ │ │ ├─■──Tool implementation 479 | with direct function or coroutine input and error handling.\_**\_ ── Topic: 99 │ 480 | │ │ │ │ │ └─■──Python REPL Tool with AST and version 
validation\_\_** ── Topic: 481 | 74 │ │ │ │ │ └─Implementing API wrappers for news, movie information, and 482 | weather using APIChain\_**\_ │ │ │ │ │ ├─Implementing APIs for News, Weather, 483 | and Movie Information in LangChain's Load Tools Module\_\_** │ │ │ │ │ │ 484 | ├─■──Language model for reasoning about position and color attributes of objects 485 | in weather forecasting w ── Topic: 73 │ │ │ │ │ │ └─Implementing APIs for 486 | fetching news and movies using Python\_**\_ │ │ │ │ │ │ ├─■──well-structured and 487 | readable implementation of API initialization functions in load_tools.py for Too 488 | ── Topic: 85 │ │ │ │ │ │ └─■──Working with API authentication and chaining for 489 | news and movie information retrieval (using news_ap ── Topic: 100 │ │ │ │ │ 490 | └─■──Wolfram Alpha SDK querying using WolframAlphaQueryRun class and api_wrapper 491 | attribute\_\_** ── Topic: 89 │ │ │ │ └─TextSplitting for Vector Storage with 492 | Overlapping Chunks\_**\_ │ │ │ │ ├─Python's StrictFormatter class and its 493 | check_unused_args method for formatting and validation of inp │ │ │ │ │ ├─L2 494 | distance search using ndarray in Python\_\_** │ │ │ │ │ │ ├─■──L2 search for 495 | nearest neighbors with np.linalg.norm\_**\_ ── Topic: 32 │ │ │ │ │ │ 496 | └─■──Parsing and Organizing Notes with Hashing and Embeddings\_\_** ── Topic: 67 497 | │ │ │ │ │ └─Python Class for Strict Formatter with Check on Unused Args\_**\_ │ 498 | │ │ │ │ ├─Vector Store Toolkit and Deployment\_\_** │ │ │ │ │ │ ├─■──Vector 499 | Store Toolkit and Deployment with OpenAI LLM\_**\_ ── Topic: 35 │ │ │ │ │ │ 500 | └─■──Working with AirbyteJSONLoader to load local Airbyte JSON files\_\_** ── 501 | Topic: 47 │ │ │ │ │ └─Python Formatter class with check_unused_args method and 502 | strict validation\_**\_ │ │ │ │ │ ├─Python's StrictFormatter class and its 503 | check_unused_args method for validating unused and extra argu │ │ │ │ │ │ 504 | ├─■──Finding TODO Tasks in Code Snippets\_\_** ── Topic: 4 │ │ │ │ │ │ └─Python 505 | Formatter and StrictFormatter with check_unused_args method\_**\_ │ │ │ │ │ │ 506 | ├─■──Color Mapping Function for Prompt Inputs with Exclusions\_\_** ── Topic: 88 507 | │ │ │ │ │ │ └─■──Implementing strict checking of unused and extra keys in a 508 | subclass of formatter\_**\_ ── Topic: 48 │ │ │ │ │ └─Python module for loading 509 | and manipulating language chain data with verbosity control.\_\_** │ │ │ │ │ 510 | ├─■──Python function for getting verbosity from language chaining with Azure 511 | OpenAI and difference from O ── Topic: 64 │ │ │ │ │ └─■──Purpose of functions in 512 | loading.py and csv toolkit of langchain repository\_**\_ ── Topic: 42 │ │ │ │ 513 | └─Text splitting using chunk size and overlap with various libraries and 514 | interfaces.\_\_** │ │ │ │ ├─Text splitting and chunking with overlap and length 515 | functions\_**\_ │ │ │ │ │ ├─■──Developing and Maintaining Docker Compose Modules 516 | in Python\_\_** ── Topic: 79 │ │ │ │ │ └─Text splitting and chunking using 517 | TextSplitter interface\_**\_ │ │ │ │ │ ├─Text Splitting Interface and 518 | Implementation\_\_** │ │ │ │ │ │ ├─■──Text splitting using TokenTextSplitter 519 | class.\_**\_ ── Topic: 7 │ │ │ │ │ │ └─■──Document Loading and Splitting with 520 | Text Splitting and Callback Management.\_\_** ── Topic: 84 │ │ │ │ │ └─■──Python 521 | code for initializing an agent with various optional arguments\_**\_ ── Topic: 522 | 18 │ │ │ │ └─Loading Google Docs from Google Drive using Credentials and Tokens 523 | with Python\_\_** 
│ │ │ │ ├─Document Loading from Cloud Storage (GCS and S3) 524 | using BaseLoader Class\_**\_ │ │ │ │ │ ├─■──Online PDF loading and caching using 525 | SQLite and temporary directories\_\_** ── Topic: 98 │ │ │ │ │ └─■──Loading 526 | documents from cloud storage using GCSFileLoader and S3FileLoader classes.\_**\_ 527 | ── Topic: 36 │ │ │ │ └─■──Google Drive Loader and Credentials for Loading Google 528 | Docs\_\_** ── Topic: 86 │ │ │ └─StreamlitCallbackHandler for logging to 529 | streamlit in Python code\_**\_ │ │ │ ├─Streaming with LLMs and Callback 530 | Handlers\_\_** │ │ │ │ ├─Networkx wrapper for entity graph operations with Redis 531 | caching.\_**\_ │ │ │ │ │ ├─NetworkX Entity Graph with Missing Tables and 532 | Callback Manager\_\_** │ │ │ │ │ │ ├─■──Graph Index Creation and Operations 533 | using NetworkX Library in Python\_**\_ ── Topic: 58 │ │ │ │ │ │ 534 | └─■──NetworkxEntityGraph and entity graph operations.\_\_** ── Topic: 20 │ │ │ │ 535 | │ └─Redis cache implementation in Python\_**\_ │ │ │ │ │ ├─■──Implementing a 536 | SQAlchemy-based cache system with missing and existing prompts for better 537 | performance ── Topic: 17 │ │ │ │ │ └─■──Implementation of a Redis cache as a 538 | backend in Python\_\_** ── Topic: 39 │ │ │ │ └─Python Callback Handler for 539 | Streamlit Logging\_**\_ │ │ │ │ ├─■──Callback handlers for printing to standard 540 | output.\_\_** ── Topic: 43 │ │ │ │ └─■──StreamlitCallbackHandler for logging 541 | prompts and actions to Streamlit\_**\_ ── Topic: 90 │ │ │ └─ZeroShotAgent class 542 | and observation prefix property in Python\_\_** │ │ │ ├─Creating a JSON agent 543 | using a toolkit for zeroshot agent execution with format instructions and inpu │ 544 | │ │ │ ├─■──Creating Pandas DataFrames using Agent Scratchpad and Python AST REPL 545 | Tool.\_**\_ ── Topic: 82 │ │ │ │ └─Creating a JSON agent with toolkit, format 546 | instructions, and prefix/suffix\_\_** │ │ │ │ ├─■──SQL agent creation with 547 | SQLDatabaseToolkit, BaseLLM and BaseCallbackManager\_**\_ ── Topic: 11 │ │ │ │ 548 | └─■──Creating a JSON agent with OpenAPI toolkit and interacting with it using 549 | JSON tools\_\_** ── Topic: 56 │ │ │ └─Classes for language model-driven decision 550 | making and use of "agent_scratchpad" in LLMChain prompts* │ │ │ ├─■──Agent class 551 | and entity extraction using "agent*scratchpad" variable\_**\_ ── Topic: 38 │ │ │ 552 | └─■──Code for a text-based game-playing agent using self-ask-with-search 553 | approach in TextWorld environmen ── Topic: 102 │ │ └─Text Mapping for 554 | Approximate k-NN Search using nmslib in Python\_\_** │ │ ├─Script Scoring with 555 | KNN Search\_**\_ │ │ │ ├─■──Document bulk-ingest function for embeddings in 556 | Elasticsearch index\_\_** ── Topic: 23 │ │ │ └─■──Script Scoring Search with 557 | Cosine Similarity and k-Nearest Neighbors (k-NN) Algorithm\_**\_ ── Topic: 19 │ 558 | │ └─Default text mapping for Approximate k-NN Search in dense vector fields 559 | using NMSLIB engine\_\_** │ │ ├─■──Default Mapping for Approximate k-NN Search 560 | using NMSLIB Engine\_**\_ ── Topic: 81 │ │ └─■──Elasticsearch indexing and 561 | scripting with default mappings and painless scripting\_\_** ── Topic: 94 │ 562 | └─Tracing and Recording Runs with SharedTracer and TracerStack\_**\_ │ ├─Python 563 | classes ToolRun and ChainRun in schemas.py file with additional attributes and 564 | their purpose. 
│ │ ├─■──Extracting information about ElementInViewPort instances 565 | in chainrun toolrun runs.\_\_** ── Topic: 77 │ │ └─■──Purpose and attributes of 566 | the ChainRun class in schemas.py file\_**\_ ── Topic: 78 │ └─Tracing and 567 | thread-safe execution with SharedTracer Singleton class\_\_** │ ├─■──Tracing 568 | Execution Order with BaseTracer in a Thread-Safe Manner\_**\_ ── Topic: 55 │ 569 | └─■──TracerStack and SharedTracer Implementation in Python\_\_** ── Topic: 63 570 | └─Python wrapper for OpenAI and Hugging Face language models\_**\_ ├─Self-Hosted 571 | Hugging Face Instructor Embedding Models on Remote Hardware\_\_** │ 572 | ├─HuggingFace and Sentence-Transformers Embeddings for Cohere\_**\_ │ │ 573 | ├─■──Output parsing using regular expressions and the BaseOutputParser 574 | class\_\_** ── Topic: 54 │ │ └─NLP Embeddings using Hugging Face and Sentence 575 | Transformers\_**\_ │ │ ├─■──Neural Embeddings with Hugging Face and Cohere 576 | API\_\_** ── Topic: 24 │ │ └─■──Loading sentence embedding model with 577 | sentence_transformers library.\_**\_ ── Topic: 27 │ └─Self-hosted HuggingFace 578 | pipeline API for running models on remote hardware\_\_** │ ├─Self-hosted 579 | HuggingFace pipeline for remote GPU hardware inference with autolaunched 580 | instances on va │ │ ├─■──Self-hosted HuggingFace pipeline for remote hardware 581 | with HuggingFace Transformers and AutoTokenizer ── Topic: 40 │ │ 582 | └─■──Self-hosted embeddings for sentence_transformers with remote hardware 583 | support.\_**\_ ── Topic: 22 │ └─■──Self-hosted embeddings for running custom 584 | embedding models on remote hardware\_\_** ── Topic: 62 └─Python wrapper for 585 | OpenAI language model with API key authentication and model parameters 586 | configurat ├─OpenAI Language Model Wrapper Class with API Key Authentication and 587 | Model Parameters Configuration\_\_ │ ├─■──StochasticAI Wrapper for Large 588 | Language Models with Environment Key Validation and PDF Partitioning* ── Topic: 589 | 50 │ └─Integration of OpenAI Language Model with GooseAI class for Text 590 | Generation\_**\_ │ ├─■──OpenAI Chat Model Implementation\_\_** ── Topic: 37 │ 591 | └─■──Python Wrapper for OpenAI Language Models\_**\_ ── Topic: 5 └─■──Anthropic 592 | Large Language Models and API Usage in AtlasDB Project Management\_\_** ── 593 | Topic: 83 594 | 595 | ``` 596 | 597 | ## Pyknotid 598 | 599 | - https://pyknotid.readthedocs.io/en/latest/ 600 | 601 | ``` 602 | 603 | └─Calculation of Alexander polynomial for knots in Python and Mathematica.\_**\_ 604 | ├─Alexander polynomial calculation using Mathematica process and knot routing 605 | algorithm with various p │ ├─Calculation of Alexander polynomial for knots using 606 | Python and Mathematica representations\_\_** │ │ ├─CellKnot object 607 | initialization and properties with sin, cos, linspace, phi, psi, theta, 608 | rotation, pe │ │ │ ├─Mollweide projection and spherical coordinates\_**\_ │ │ │ 609 | │ ├─Rotation of Spheres using Rotation Matrices\_\_** │ │ │ │ │ ├─■──Rotation of 610 | sphere to align given positions at the top\_**\_ ── Topic: 41 │ │ │ │ │ 611 | └─■──Rotation matrix computation and manipulation using iterable angles.\_\_** 612 | ── Topic: 18 │ │ │ │ └─Mollweide projection and conversion of spherical 613 | coordinates\_**\_ │ │ │ │ ├─Mollweide projection and spherical coordinates 614 | conversion\_\_** │ │ │ │ │ ├─■──Vector magnitude calculation, Mollweide 615 | projection, and well-written code in Python.\_**\_ ── Topic: 51 │ │ │ │ │ 616 | 
└─■──"Mollweide projection and spherical coordinate conversion"\_\_** ── Topic: 617 | 30 │ │ │ │ └─■──Verbose printing function for Pyknotid counters.\_**\_ ── Topic: 618 | 10 │ │ │ └─CellKnot class and points folding\_\_** │ │ │ ├─CellKnot and Knot 619 | Folding\_**\_ │ │ │ │ ├─■──Understanding the "cell_trefoil" function and the 620 | "aperiodic_trefoil" function for creating interpol ── Topic: 37 │ │ │ │ 621 | └─■──CellKnot class and related methods\_\_** ── Topic: 33 │ │ │ └─3D geometric 622 | scaling with numpy and crossing signs\_**\_ │ │ │ ├─Geometric Transformation 623 | with Crossing Signs\_\_** │ │ │ │ ├─■──Numpy arrays for creating and perturbing 624 | a simple link using sin and cos in Python code.\_**\_ ── Topic: 15 │ │ │ │ 625 | └─■──Geometric transformation with crossing signs and np array\_\_** ── Topic: 3 626 | │ │ │ └─■──3D point scaling helper functions in p4_3**1, p4_4**1, p4_5**1_false, 627 | p5_3**1 and p5_4**1.\_\_** ── Topic: 47 │ │ └─Knot representations and 628 | calculation of Alexander polynomial using Python and Mathematica\_**\_ │ │ 629 | ├─Line Segment Open by Distance Generator\_\_** │ │ │ ├─Issues with missing 630 | function definitions and potential bugs in serialisation functions.\_**\_ │ │ │ 631 | │ ├─■──JSON and polynomial serialisation with potential implementation 632 | issues\_\_** ── Topic: 17 │ │ │ │ └─■──Issues with incomplete function for 633 | serialising Jones polynomials in Python\_**\_ ── Topic: 36 │ │ │ └─Line vectors 634 | open by distance fraction with seed and number of segments as parameters.\_\_** 635 | │ │ │ ├─Line segment manipulation and generation\_**\_ │ │ │ │ ├─Line Segments 636 | and Open/Closed Loop Detection\_\_** │ │ │ │ │ ├─■──Open and closed line 637 | segments generation with distance constraint.\_**\_ ── Topic: 5 │ │ │ │ │ 638 | └─■──Writing Mathematica code to file and running it using MathKernel\_\_** ── 639 | Topic: 28 │ │ │ │ └─Loading and manipulating CSV files with Pandas and saving to 640 | JSON.\_**\_ │ │ │ │ ├─■──Writing and loading data in json format with numpy and 641 | handling file paths (filenotfounderror explan ── Topic: 14 │ │ │ │ └─■──Parsing 642 | CSV data using pandas in Python\_\_** ── Topic: 19 │ │ │ └─Downloading Knots 643 | Database with Pyknotid Library.\_**\_ │ │ │ ├─Knots database download and 644 | management\_\_** │ │ │ │ ├─■──Downloading Knots Database using Pyknotid 645 | Library\_**\_ ── Topic: 23 │ │ │ │ └─■──Deleting old versions of database files 646 | in specific format using Python.\_\_** ── Topic: 44 │ │ │ └─■──Recursive file 647 | inclusion using fnmatch patterns in Python\_**\_ ── Topic: 43 │ │ └─Alexander 648 | polynomial computation using Mathematica for knot representations\_\_** │ │ 649 | ├─Calculation of Alexander polynomial using Python and Mathematica code 650 | snippets.\_**\_ │ │ │ ├─MeshCollectionVisual class and associated methods for 651 | vertex colors and shading in mesh visualizatio │ │ │ │ ├─Code Refactoring and 652 | Todo Tasks with Comments and Unit Tests\_\_** │ │ │ │ │ ├─■──Classes and 653 | functionality for handling periodic boundary conditions in a 2D space.\_**\_ ── 654 | Topic: 39 │ │ │ │ │ └─■──Code Refactoring and Unit Testing\_\_** ── Topic: 4 │ │ 655 | │ │ └─MeshCollectionVisual class and vertex colors in 3D mesh 656 | visualization.\_**\_ │ │ │ │ ├─Signal Smoothing with Window Functions\_\_** │ │ 657 | │ │ │ ├─■──Testing vector intersection in a dp/dq region using 658 | do_vectors_intersect function and obtaining bool ── Topic: 20 │ │ │ │ │ └─signal 659 
| smoothing with different windows and sizes\_**\_ │ │ │ │ │ ├─■──Signal Smoothing 660 | using Different Windows\_\_** ── Topic: 49 │ │ │ │ │ └─■──Code organization and 661 | readability of periodic_vassiliev_degree_2 function in adherence with PEP 8 gu 662 | ── Topic: 26 │ │ │ │ └─MeshCollectionVisual class and related methods\_**\_ │ │ 663 | │ │ ├─■──MeshCollectionVisual class and its methods for mesh visualization and 664 | handling vertex colors and sha ── Topic: 9 │ │ │ │ └─■──Cell object for lines 665 | with periodic boundary conditions\_\_** ── Topic: 45 │ │ │ └─Alexander 666 | polynomial calculation using Mathematica\_**\_ │ │ │ ├─Calculating the Alexander 667 | polynomial of knots using various representations\_\_** │ │ │ │ ├─Gauss code 668 | conversion to crossing indices\_**\_ │ │ │ │ │ ├─Recommendations for the "mag" 669 | function implementation in periodic.py\_\_** │ │ │ │ │ │ ├─■──Implementing 670 | vector magnitude using dot product in Pyknotid\_**\_ ── Topic: 8 │ │ │ │ │ │ 671 | └─■──Improving code with imports and using numpy.zeros instead of n.zeros.\_\_** 672 | ── Topic: 50 │ │ │ │ │ └─■──Converting Gauss code to crossings in a crossing 673 | object\_**\_ ── Topic: 22 │ │ │ │ └─Calculation of Alexander polynomial using 674 | Mathematica for knot representations\_\_** │ │ │ │ ├─Knot theory and Alexander 675 | polynomial calculation using Mathematica\_**\_ │ │ │ │ │ ├─■──BoundingBox class 676 | implementation in Python with numpy and axis manipulation\_\_** ── Topic: 1 │ │ 677 | │ │ │ └─■──Calculation of Alexander polynomial for knot representations using 678 | Mathematica\_**\_ ── Topic: 0 │ │ │ │ └─3D sphere plotting with Mollweide 679 | projection using VisPy\_\_** │ │ │ │ ├─■──"3D visualization of spherical data 680 | using VisPy and Mollweide projection"\_**\_ ── Topic: 2 │ │ │ │ └─■──Class 681 | definition of MeshCollectionVisual that creates a mesh by concatenating visuals' 682 | vertices, ind ── Topic: 12 │ │ │ └─Database objects matching invariants using 683 | Python code\_\_** │ │ │ ├─Database objects and invariants in knot theory\_**\_ │ 684 | │ │ │ ├─■──"Database storage and manipulation of knots using Peewee and 685 | optimized Cython routines"\_\_** ── Topic: 35 │ │ │ │ └─■──Database searching 686 | with invariants in Python\_**\_ ── Topic: 7 │ │ │ └─■──OpenKnot class for 687 | holding vertices of an open curve in spacecurves module\_\_** ── Topic: 38 │ │ 688 | └─Parsing data files and visualizing with matplotlib and mayavi/vispy.\_**\_ │ │ 689 | ├─Code for drawing bounding boxes in 3D using VisPy\_\_** │ │ │ ├─Plotting 690 | Lissajous Conformation with Vispy and Mayavi Toolkits\_**\_ │ │ │ │ 691 | ├─■──Plotting Lissajous conformations with plot_cell using Vispy and Mayavi 692 | toolkits\_\_** ── Topic: 13 │ │ │ │ └─■──Understanding the plot_line() function 693 | in pyknotid's visualise.py and its 3D plotting toolkits (Maya ── Topic: 27 │ │ │ 694 | └─■──Bounding box visualization with VisPy\_**\_ ── Topic: 32 │ │ └─Analyzing 695 | knot type of curve in a data file using argparse parser and VisPy canvas 696 | plotting.\_\_** │ │ ├─■──Plotting 2D projections with optional markers in Python 697 | using pyplot\_**\_ ── Topic: 48 │ │ └─Analysis of knot types in data files using 698 | argparse and vispy_canvas.\_\_** │ │ ├─■──Working with VisPy Canvas and Scene 699 | objects\_**\_ ── Topic: 40 │ │ └─■──Parsing and analyzing knot types in data 700 | files using argparse\_\_** ── Topic: 42 │ └─Alternative Periodic Vassiliev 701 | Function with Conway Notation 
(Degree 4, Z4 Coefficients) and Core Cr │ 702 | ├─■──Calculation of writhing numbers using Arrow diagrams and Gauss codes\_**\_ 703 | ── Topic: 16 │ └─Alternative periodic Vassiliev function for Conway notation 704 | with z4 coefficients and related invaria │ ├─■──Arnold's invariants and their 705 | calculation by transforming representation into an unknot\_\_** ── Topic: 6 │ 706 | └─■──Alternative periodic Vassiliev function in pyknotid with Conway notation 707 | and Z4 coefficients\_**\_ ── Topic: 11 └─"Calculating higher order writhe 708 | integrals using numpy and cython"\_\_** ├─Calculation of Higher Order Writhe 709 | Integral with NumPy and Cython Implementation.\_**\_ │ ├─Calculation of higher 710 | order writhe integral using points and order contributions\_\_** │ │ 711 | ├─■──higher_order_writhe_integral function implementation with numpy\_**\_ ── 712 | Topic: 29 │ │ └─■──Writhing matrix and coefficient calculations for points in 713 | 3-dimensional space\_\_** ── Topic: 46 │ └─■──Calculation of Writhe and Average 714 | Crossing Number using Integral\_**\_ ── Topic: 21 └─■──Distance Quantity 715 | Calculation from Curve Integral\_\_** ── Topic: 31 716 | 717 | ``` 718 | 719 | ## PyReason 720 | 721 | - https://github.com/lab-v2/pyreason 722 | 723 | ``` 724 | 725 | . ├─Updating Interpretations Graph with Nodes and Edges\_**\_ │ ├─Analysis of 726 | Python code implementing a graph data structure and functions to add nodes and 727 | edges, us │ │ ├─■──Code Refactoring and Commenting, Handling None Values in 728 | Python Code\_\_** ── Topic: 10 │ │ └─Code analysis and review of a function for 729 | adding edges and nodes to a graph, including checking for │ │ ├─■──Positive 730 | aspects of a codebase with clear and descriptive function names and variable 731 | names.\_**\_ ── Topic: 4 │ │ └─■──Methods for adding edges to a graph with 732 | considerations for existing nodes and labels.\_\_** ── Topic: 11 │ └─Updating 733 | nodes and edges in a graph with rule trace and atom trace, using Numba for 734 | optimization\_**\_ │ ├─Python functions for updating rule traces with graph 735 | attributes\_\_** │ │ ├─■──Updating nodes with rules and traces in a converging 736 | system.\_**\_ ── Topic: 15 │ │ └─■──Interpretation of Graph Labels Using Numba 737 | in Python\_\_** ── Topic: 5 │ └─analysis of profiling data for program 738 | optimization\_**\_ │ ├─■──Parsing YAML to create a list of rules\_\_** ── Topic: 739 | 9 │ └─■──Parsing and Profiling Data from CSV files in Python\_**\_ ── Topic: 1 740 | └─Python Object Getter Implementation for Fact Objects\_\_** ├─Python code 741 | structure and implementation in pyreason's numba_wrapper lib\_**\_ │ ├─Functions 742 | and objects in pyreason's numba_wrapper module that return nested functions and 743 | implement │ │ ├─■──Function for getting time lower and upper bounds from a fact 744 | object\_\_** ── Topic: 14 │ │ └─Higher-order functions in rule_type.py for 745 | accessing fields of a rule object in pyreason/pyreason/sc │ │ ├─Python classes 746 | and object-oriented programming concepts with "Fact" class examples.\_**\_ │ │ │ 747 | ├─■──Nested function type checking with isinstance in Python\_\_** ── Topic: 8 │ 748 | │ │ └─■──Class Fact and its attributes and methods.\_**\_ ── Topic: 7 │ │ 749 | └─Numba implementation functions for label and world types in PyReason\_\_** │ │ 750 | ├─Higher-order functions for getting attributes of rules in PyReason's 751 | numba_types module\_**\_ │ │ │ ├─■──Code structure and naming conventions in 752 | 
pyreason's numba_wrapper/numpy_types directory\_\_** ── Topic: 6 │ │ │ 753 | └─■──Implementation details of the `get_target_criteria` function and 754 | `unbox_rule` in rule_type.py\_**\_ ── Topic: 0 │ │ └─■──Implementation of 755 | interval methods in pyreason using numba_wrapper\_\_** ── Topic: 2 │ 756 | └─Compliments on codebase functions for calculating minimum, maximum, average, 757 | and average lower using │ ├─■──Implementing a reset function to update the 758 | bounds of an interval object in Python\_**\_ ── Topic: 18 │ └─■──Compliments on 759 | Function Names and Docstrings for Array Calculation Functions\_\_** ── Topic: 13 760 | └─Working with pyapi and objects in Python code\_**\_ ├─Understanding a Python 761 | code snippet for unboxing facts with structmodels\_\_** │ ├─■──Object Unboxing 762 | and Attribute Retrieval in Python with PyAPI\_**\_ ── Topic: 17 │ └─Numba code 763 | for creating and boxing a struct model instance\_\_** │ ├─■──Code snippet for 764 | creating a struct proxy and boxing its components in Numba.\_**\_ ── Topic: 3 │ 765 | └─■──Python class RuleModel with labeled attributes and types\_\_** ── Topic: 12 766 | └─■──Functions for creating and boxing objects in a Python-C++ interface\_\_\_\_ 767 | ── Topic: 16 768 | 769 | ``` 770 | 771 | ``` 772 | -------------------------------------------------------------------------------- /__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from getpass import getpass 3 | 4 | import openai 5 | 6 | from src import Pipeline 7 | 8 | if __name__ == "__main__": 9 | argsparse = argparse.ArgumentParser() 10 | argsparse.add_argument("--config", type=str, default="./test_config.yaml") 11 | argsparse.add_argument("--repo", type=str, default="https://github.com/LAION-AI/Open-Assistant.git") 12 | argsparse.add_argument("--repo_name", type=str, default="OA") 13 | 14 | args = argsparse.parse_args() 15 | 16 | openai_secret = getpass("Enter the secret key: ") 17 | # Set up OpenAI API credentials 18 | openai.api_key = openai_secret 19 | 20 | print("starting pipeline") 21 | pipeline = Pipeline.from_yaml(args.config) 22 | gnn_head_outputs, topic_model_outputs = pipeline.run(args.repo, args.repo_name) 23 | for i, topic_model_output in enumerate(topic_model_outputs): 24 | topic_model_output["data"].to_csv(f"context/{args.repo_name}_topic_model_outputs_{i}.csv") 25 | topic_model_output["topic_info"].to_csv(f"context/{args.repo_name}_topic_info_{i}.csv") 26 | with open(f"context/{args.repo_name}_tree_{i}.txt", "w", encoding="utf-8") as f: 27 | f.write(topic_model_output["tree"]) 28 | -------------------------------------------------------------------------------- /git-frontend/.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 
2 | 3 | # dependencies 4 | /node_modules 5 | /.pnp 6 | .pnp.js 7 | 8 | # testing 9 | /coverage 10 | 11 | # production 12 | /build 13 | 14 | # misc 15 | .DS_Store 16 | .env.local 17 | .env.development.local 18 | .env.test.local 19 | .env.production.local 20 | 21 | npm-debug.log* 22 | yarn-debug.log* 23 | yarn-error.log* 24 | -------------------------------------------------------------------------------- /git-frontend/README.md: -------------------------------------------------------------------------------- 1 | # Getting Started with Create React App 2 | 3 | This project was bootstrapped with 4 | [Create React App](https://github.com/facebook/create-react-app). 5 | 6 | ## Available Scripts 7 | 8 | In the project directory, you can run: 9 | 10 | ### `npm start` 11 | 12 | Runs the app in the development mode.\ 13 | Open [http://localhost:3000](http://localhost:3000) to view it in your browser. 14 | 15 | The page will reload when you make changes.\ 16 | You may also see any lint errors in the console. 17 | 18 | ### `npm test` 19 | 20 | Launches the test runner in the interactive watch mode.\ 21 | See the section about [running tests](https://facebook.github.io/create-react-app/docs/running-tests) 22 | for more information. 23 | 24 | ### `npm run build` 25 | 26 | Builds the app for production to the `build` folder.\ 27 | It correctly bundles React in production mode and optimizes the build for the best 28 | performance. 29 | 30 | The build is minified and the filenames include the hashes.\ 31 | Your app is ready to be deployed! 32 | 33 | See the section about 34 | [deployment](https://facebook.github.io/create-react-app/docs/deployment) for 35 | more information. 36 | 37 | ### `npm run eject` 38 | 39 | **Note: this is a one-way operation. Once you `eject`, you can't go back!** 40 | 41 | If you aren't satisfied with the build tool and configuration choices, you can 42 | `eject` at any time. This command will remove the single build dependency from 43 | your project. 44 | 45 | Instead, it will copy all the configuration files and the transitive 46 | dependencies (webpack, Babel, ESLint, etc) right into your project so you have 47 | full control over them. All of the commands except `eject` will still work, but 48 | they will point to the copied scripts so you can tweak them. At this point 49 | you're on your own. 50 | 51 | You don't have to ever use `eject`. The curated feature set is suitable for 52 | small and middle deployments, and you shouldn't feel obligated to use this 53 | feature. However we understand that this tool wouldn't be useful if you couldn't 54 | customize it when you are ready for it. 55 | 56 | ## Learn More 57 | 58 | You can learn more in the 59 | [Create React App documentation](https://facebook.github.io/create-react-app/docs/getting-started). 60 | 61 | To learn React, check out the [React documentation](https://reactjs.org/). 
62 | 63 | ### Code Splitting 64 | 65 | This section has moved here: 66 | [https://facebook.github.io/create-react-app/docs/code-splitting](https://facebook.github.io/create-react-app/docs/code-splitting) 67 | 68 | ### Analyzing the Bundle Size 69 | 70 | This section has moved here: 71 | [https://facebook.github.io/create-react-app/docs/analyzing-the-bundle-size](https://facebook.github.io/create-react-app/docs/analyzing-the-bundle-size) 72 | 73 | ### Making a Progressive Web App 74 | 75 | This section has moved here: 76 | [https://facebook.github.io/create-react-app/docs/making-a-progressive-web-app](https://facebook.github.io/create-react-app/docs/making-a-progressive-web-app) 77 | 78 | ### Advanced Configuration 79 | 80 | This section has moved here: 81 | [https://facebook.github.io/create-react-app/docs/advanced-configuration](https://facebook.github.io/create-react-app/docs/advanced-configuration) 82 | 83 | ### Deployment 84 | 85 | This section has moved here: 86 | [https://facebook.github.io/create-react-app/docs/deployment](https://facebook.github.io/create-react-app/docs/deployment) 87 | 88 | ### `npm run build` fails to minify 89 | 90 | This section has moved here: 91 | [https://facebook.github.io/create-react-app/docs/troubleshooting#npm-run-build-fails-to-minify](https://facebook.github.io/create-react-app/docs/troubleshooting#npm-run-build-fails-to-minify) 92 | -------------------------------------------------------------------------------- /git-frontend/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "git-frontend", 3 | "version": "0.1.0", 4 | "private": true, 5 | "dependencies": { 6 | "@emotion/react": "^11.10.6", 7 | "@emotion/styled": "^11.10.6", 8 | "@mui/icons-material": "^5.11.11", 9 | "@mui/material": "^5.11.14", 10 | "@testing-library/jest-dom": "^5.16.5", 11 | "@testing-library/react": "^13.4.0", 12 | "@testing-library/user-event": "^13.5.0", 13 | "react": "^18.2.0", 14 | "react-dom": "^18.2.0", 15 | "react-scripts": "5.0.1", 16 | "web-vitals": "^2.1.4" 17 | }, 18 | "scripts": { 19 | "preinstall": "npx npm-force-resolutions", 20 | "start": "react-scripts start", 21 | "build": "react-scripts build", 22 | "test": "react-scripts test", 23 | "eject": "react-scripts eject" 24 | }, 25 | "eslintConfig": { 26 | "extends": [ 27 | "react-app", 28 | "react-app/jest" 29 | ] 30 | }, 31 | "browserslist": { 32 | "production": [ 33 | ">0.2%", 34 | "not dead", 35 | "not op_mini all" 36 | ], 37 | "development": [ 38 | "last 1 chrome version", 39 | "last 1 firefox version", 40 | "last 1 safari version" 41 | ] 42 | }, 43 | "resolutions": { 44 | "nth-check": ">=2.0.1" 45 | }, 46 | "devDependencies": { 47 | "npm-force-resolutions": "0.0.10" 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /git-frontend/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielpatrickhug/GitModel/3764c938f659d0dc3b51ab3b4394aa5974fcc49e/git-frontend/public/favicon.ico -------------------------------------------------------------------------------- /git-frontend/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 12 | 13 | 17 | 18 | 27 | React App 28 | 29 | 30 | 31 |
32 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /git-frontend/public/logo192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielpatrickhug/GitModel/3764c938f659d0dc3b51ab3b4394aa5974fcc49e/git-frontend/public/logo192.png -------------------------------------------------------------------------------- /git-frontend/public/logo512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielpatrickhug/GitModel/3764c938f659d0dc3b51ab3b4394aa5974fcc49e/git-frontend/public/logo512.png -------------------------------------------------------------------------------- /git-frontend/public/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "short_name": "React App", 3 | "name": "Create React App Sample", 4 | "icons": [ 5 | { 6 | "src": "favicon.ico", 7 | "sizes": "64x64 32x32 24x24 16x16", 8 | "type": "image/x-icon" 9 | }, 10 | { 11 | "src": "logo192.png", 12 | "type": "image/png", 13 | "sizes": "192x192" 14 | }, 15 | { 16 | "src": "logo512.png", 17 | "type": "image/png", 18 | "sizes": "512x512" 19 | } 20 | ], 21 | "start_url": ".", 22 | "display": "standalone", 23 | "theme_color": "#000000", 24 | "background_color": "#ffffff" 25 | } 26 | -------------------------------------------------------------------------------- /git-frontend/public/robots.txt: -------------------------------------------------------------------------------- 1 | # https://www.robotstxt.org/robotstxt.html 2 | User-agent: * 3 | Disallow: 4 | -------------------------------------------------------------------------------- /git-frontend/src/App.css: -------------------------------------------------------------------------------- 1 | body { 2 | background-color: #f5f5f5; 3 | } 4 | 5 | .App-header { 6 | background-color: #3f51b5; 7 | padding: 1rem 0; 8 | color: white; 9 | } 10 | 11 | .results-container { 12 | margin-top: 2rem; 13 | padding: 1rem; 14 | background-color: white; 15 | border-radius: 8px; 16 | box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); 17 | } 18 | -------------------------------------------------------------------------------- /git-frontend/src/App.js: -------------------------------------------------------------------------------- 1 | import React, { useState } from "react"; 2 | import { 3 | AppBar, 4 | Box, 5 | Button, 6 | Container, 7 | Grid, 8 | Paper, 9 | TextField, 10 | Toolbar, 11 | Typography, 12 | } from "@mui/material"; 13 | import { GitHub } from "@mui/icons-material"; 14 | import "./App.css"; 15 | 16 | function App() { 17 | const [repoUrl, setRepoUrl] = useState(""); 18 | const [openaiSecret, setOpenaiSecret] = useState(""); 19 | const [results, setResults] = useState(null); 20 | 21 | const handleRepoUrlChange = (event) => { 22 | setRepoUrl(event.target.value); 23 | }; 24 | 25 | const handleOpenaiSecretChange = (event) => { 26 | setOpenaiSecret(event.target.value); 27 | }; 28 | 29 | const handleSubmit = async () => { 30 | // Replace this URL with the actual API endpoint of your FastAPI backend 31 | const apiUrl = "http://localhost:8000/api/topic-modeling"; 32 | 33 | try { 34 | const response = await fetch(apiUrl, { 35 | method: "POST", 36 | headers: { 37 | "Content-Type": "application/json", 38 | }, 39 | body: JSON.stringify({ 40 | config: "./test_config.yaml", // Update this as needed 41 | repo: repoUrl, 42 | repo_name: "gitmodel", // Update this as needed 43 | 
openai_secret: openaiSecret, 44 | }), 45 | }); 46 | 47 | const data = await response.json(); 48 | setResults(data); 49 | } catch (error) { 50 | console.error("Error fetching data:", error); 51 | } 52 | }; 53 | 54 | return ( 55 |
56 | 57 | 58 | 59 | 64 | GitHub Repo Topic Modeler 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 80 | 81 | 82 | 90 | 91 | 92 | 100 | 101 | 102 | 103 | 104 | {results && ( 105 | 106 | 107 | Topic Modeling Results 108 | 109 | 110 | {/* Display the results in a desired format */} 111 |
{JSON.stringify(results, null, 2)}
112 |
113 | )} 114 |
115 |
116 | ); 117 | } 118 | 119 | export default App; 120 | -------------------------------------------------------------------------------- /git-frontend/src/App.test.js: -------------------------------------------------------------------------------- 1 | import { render, screen } from "@testing-library/react"; 2 | import App from "./App"; 3 | 4 | test("renders learn react link", () => { 5 | render(); 6 | const linkElement = screen.getByText(/learn react/i); 7 | expect(linkElement).toBeInTheDocument(); 8 | }); 9 | -------------------------------------------------------------------------------- /git-frontend/src/index.css: -------------------------------------------------------------------------------- 1 | body { 2 | margin: 0; 3 | font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", "Roboto", "Oxygen", 4 | "Ubuntu", "Cantarell", "Fira Sans", "Droid Sans", "Helvetica Neue", 5 | sans-serif; 6 | -webkit-font-smoothing: antialiased; 7 | -moz-osx-font-smoothing: grayscale; 8 | } 9 | 10 | code { 11 | font-family: source-code-pro, Menlo, Monaco, Consolas, "Courier New", 12 | monospace; 13 | } 14 | -------------------------------------------------------------------------------- /git-frontend/src/index.js: -------------------------------------------------------------------------------- 1 | import React from "react"; 2 | import ReactDOM from "react-dom/client"; 3 | import "./index.css"; 4 | import App from "./App"; 5 | import reportWebVitals from "./reportWebVitals"; 6 | 7 | const root = ReactDOM.createRoot(document.getElementById("root")); 8 | root.render( 9 | 10 | 11 | 12 | ); 13 | 14 | // If you want to start measuring performance in your app, pass a function 15 | // to log results (for example: reportWebVitals(console.log)) 16 | // or send to an analytics endpoint. Learn more: https://bit.ly/CRA-vitals 17 | reportWebVitals(); 18 | -------------------------------------------------------------------------------- /git-frontend/src/logo.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /git-frontend/src/reportWebVitals.js: -------------------------------------------------------------------------------- 1 | const reportWebVitals = (onPerfEntry) => { 2 | if (onPerfEntry && onPerfEntry instanceof Function) { 3 | import("web-vitals").then(({ getCLS, getFID, getFCP, getLCP, getTTFB }) => { 4 | getCLS(onPerfEntry); 5 | getFID(onPerfEntry); 6 | getFCP(onPerfEntry); 7 | getLCP(onPerfEntry); 8 | getTTFB(onPerfEntry); 9 | }); 10 | } 11 | }; 12 | 13 | export default reportWebVitals; 14 | -------------------------------------------------------------------------------- /git-frontend/src/setupTests.js: -------------------------------------------------------------------------------- 1 | // jest-dom adds custom jest matchers for asserting on DOM nodes. 
2 | // allows you to do things like: 3 | // expect(element).toHaveTextContent(/react/i) 4 | // learn more: https://github.com/testing-library/jest-dom 5 | import "@testing-library/jest-dom"; 6 | -------------------------------------------------------------------------------- /images/self_portrait.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielpatrickhug/GitModel/3764c938f659d0dc3b51ab3b4394aa5974fcc49e/images/self_portrait.jpg -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import openai 2 | from fastapi import FastAPI, HTTPException 3 | from fastapi.middleware.cors import CORSMiddleware 4 | from fastapi.responses import JSONResponse 5 | from pydantic import BaseModel 6 | 7 | from src import Pipeline 8 | 9 | app = FastAPI() 10 | 11 | # Add the CORS middleware 12 | app.add_middleware( 13 | CORSMiddleware, 14 | allow_origins=["*"], 15 | allow_credentials=True, 16 | allow_methods=["*"], 17 | allow_headers=["*"], 18 | ) 19 | 20 | 21 | class AnalyzeRepoInput(BaseModel): 22 | config: str 23 | repo: str 24 | repo_name: str 25 | openai_secret: str 26 | 27 | 28 | @app.post("/api/topic-modeling") 29 | async def analyze_repo(input_data: AnalyzeRepoInput): 30 | openai_secret = input_data.openai_secret 31 | openai.api_key = openai_secret 32 | 33 | pipeline = Pipeline.from_yaml(input_data.config) 34 | try: 35 | gnn_head_outputs, topic_model_outputs = pipeline.run(input_data.repo, input_data.repo_name) 36 | if topic_model_outputs: 37 | topic_model_output = topic_model_outputs[0] 38 | return JSONResponse(content=topic_model_output["tree"]) 39 | else: 40 | raise HTTPException(status_code=400, detail="Error processing topic model") 41 | except Exception as e: 42 | raise HTTPException(status_code=500, detail=str(e)) 43 | 44 | 45 | if __name__ == "__main__": 46 | import uvicorn 47 | 48 | uvicorn.run(app, host="0.0.0.0", port=8000) 49 | -------------------------------------------------------------------------------- /papers/2203.15544.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielpatrickhug/GitModel/3764c938f659d0dc3b51ab3b4394aa5974fcc49e/papers/2203.15544.pdf -------------------------------------------------------------------------------- /papers/README.md: -------------------------------------------------------------------------------- 1 | # Essays 2 | 3 | ## Chat GPT 3.5 4 | 5 | Prompt: 6 | 7 | - Write an essay on code repository that can topic model any python repo even 8 | its own. what are the implications on a global scale. touch on topic modeling 9 | operating systems 10 | 11 | > Code repositories are integral to the software development process as they 12 | > enable developers to collaborate, share and maintain code across different 13 | > platforms. One significant advancement in code repository technology is the 14 | > ability to perform topic modeling on Python repositories. Topic modeling 15 | > refers to the process of identifying patterns and topics within a given text 16 | > corpus, which can be useful for tasks such as categorizing documents, 17 | > clustering similar documents, and even identifying latent semantic 18 | > relationships between different code files. 
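The kind of topic modeling described above can be tried directly with `bertopic`, which this project already lists in `requirements.txt`. The snippet below is only a rough sketch under simple assumptions, not part of this repository's pipeline: it treats every Python file under the current directory as one document and lets BERTopic cluster them into topics. BERTopic needs at least a few dozen documents to produce meaningful clusters, so results on a tiny repo will be thin.

```python
# Rough sketch: topic model the Python files of whatever repository you run
# this in. Assumes bertopic is installed (it is listed in requirements.txt)
# and that the repo contains enough .py files for clustering to succeed.
from pathlib import Path

from bertopic import BERTopic

# Treat each Python source file as one "document".
docs = [p.read_text(encoding="utf-8", errors="ignore") for p in Path(".").rglob("*.py")]

topic_model = BERTopic(min_topic_size=2)
topics, probs = topic_model.fit_transform(docs)

print(topic_model.get_topic_info().head())  # one row per discovered topic
print(topic_model.get_topic(0))             # top terms for topic 0
```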
19 | 20 | > One exciting aspect of the ability to topic model Python repositories is that 21 | > it can be applied to any repository, including the repository of the tool 22 | > performing the analysis. This means that the tool itself can be continuously 23 | > improved as it analyzes new data, allowing it to better identify trends and 24 | > topics in the codebase. This continuous learning can lead to more accurate and 25 | > relevant insights, as the tool can evolve alongside the code it analyzes. 26 | 27 | > The implications of this technology on a global scale are significant. As more 28 | > and more Python code is produced and shared globally, the ability to 29 | > efficiently categorize, search and discover relevant code becomes increasingly 30 | > important. Topic modeling can enable better search and discovery of code, 31 | > allowing developers to more easily find code that is relevant to their needs. 32 | > This can lead to increased collaboration and faster development cycles, as 33 | > developers can more easily share code and build on top of existing work. 34 | 35 | > Furthermore, topic modeling can help identify trends and best practices in 36 | > code development. For example, if a topic model identifies that a particular 37 | > function or library is being used extensively across different projects, it 38 | > may indicate that this function or library is particularly useful or 39 | > efficient. This can help inform decisions on which libraries to use, which 40 | > functions to write, and even which design patterns to employ. 41 | 42 | > Finally, it is worth noting that topic modeling is not limited to Python 43 | > repositories. Operating systems themselves can be analyzed using topic 44 | > modeling techniques, allowing us to better understand the structure and 45 | > organization of operating system code. This can help improve the performance 46 | > and reliability of operating systems, which are critical components of modern 47 | > computing. In conclusion, the ability to topic model Python repositories has 48 | > significant implications for the development and maintenance of software on a 49 | > global scale. It can enable better search and discovery of code, identify 50 | > trends and best practices, and even inform decisions on which libraries to use 51 | > and which functions to write. As the volume of Python code continues to grow, 52 | > the ability to efficiently analyze and understand this code becomes 53 | > increasingly important. 54 | 55 | Prompt: 56 | 57 | - Write a section on how generating semantic content over a code repository with 58 | self instruct system prompts can then be passed into graph neural networks and 59 | graph attention networks with the code it was generated on can be used to 60 | topic model repositories and its dependencies 61 | 62 | > Once semantic content has been generated over a code repository using 63 | > self-instruct system prompts, the next step is to use this data to develop a 64 | > topic model for the repository and its dependencies. One powerful technique 65 | > for accomplishing this is through the use of graph neural networks (GNNs) and 66 | > graph attention networks (GATs). 67 | 68 | > GNNs are neural networks that can operate directly on graphs, allowing them to 69 | > model complex relationships between different nodes within the graph. 
In the 70 | > context of code repositories, each file or function within the repository can 71 | > be represented as a node in a graph, with edges connecting nodes that have 72 | > some form of relationship. For example, a function that calls another function 73 | > would be connected to the function it calls, while a file that imports another 74 | > file would be connected to the file it imports. 75 | 76 | > By passing the semantic content generated over a code repository into a GNN, 77 | > we can train the network to learn relationships between different nodes within 78 | > the graph. For example, the GNN may learn that certain functions tend to be 79 | > used together, or that certain files are commonly imported together. This 80 | > information can then be used to identify different topics or clusters within 81 | > the repository, based on the relationships between different nodes. 82 | 83 | > GATs are a variant of GNNs that use attention mechanisms to selectively focus 84 | > on specific parts of the graph when making predictions. This can be useful in 85 | > the context of code repositories, as it allows the network to identify the 86 | > most relevant nodes when identifying topics or clusters. For example, if the 87 | > network is trying to identify a particular library that is commonly used 88 | > within the repository, it may focus on nodes that are related to that library, 89 | > such as files that import the library or functions that use it extensively. 90 | 91 | > By combining semantic content generated over a code repository with GNNs and 92 | > GATs, we can develop a powerful tool for topic modeling repositories and their 93 | > dependencies. This can be useful for a range of tasks, including identifying 94 | > the most commonly used libraries, identifying clusters of related functions or 95 | > files, and even identifying potential code quality issues or security 96 | > vulnerabilities within the repository. 97 | 98 | > In summary, the combination of semantic content generation, GNNs and GATs can 99 | > enable powerful topic modeling of code repositories and their 100 | > dependencies. This can help developers more easily navigate and understand 101 | > large code bases, and can even lead to improved code quality and security. 102 | 103 | ## GPT4 104 | 105 | #### Prompt: 106 | 107 | - Write an essay on code repository that can topic model any python repo even 108 | its own. what are the implications on a global scale. touch on topic modeling 109 | operating systems, and using GNN to message pass between repos 110 | 111 | > Title: Topic Modeling and Code Repositories: A Future of Global Collaboration 112 | > and Innovation 113 | 114 | #### Introduction 115 | 116 | > Code repositories have revolutionized the software development process by 117 | > providing a centralized platform for developers to store, manage, and share 118 | > their code. With the rapid growth of open-source software, there is an 119 | > increasing need to leverage advanced machine learning techniques to better 120 | > understand and organize the vast amount of code available. One such method is 121 | > topic modeling, which can be applied to any Python repository, including its 122 | > own. This essay will explore the implications of such a system on a global 123 | > scale, discussing its potential impact on operating systems and the use of 124 | > Graph Neural Networks (GNN) for message passing between repositories.
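The graph-based approach described in the Chat GPT 3.5 answer above (files and functions as nodes, calls and imports as edges, GNN/GAT layers on top of generated summaries) can be made concrete in a few lines of Python. The snippet below is an illustrative sketch only, not this repository's actual pipeline: the file names, edges, and summaries are invented, `networkx`, `numpy`, and `sentence-transformers` (all in `requirements.txt`) supply the graph and the embeddings, and a single row-normalised adjacency multiplication stands in for a trained GNN or GAT layer.

```python
# Minimal sketch of "semantic content + code graph + message passing".
# All node names, edges, and summaries below are made up for illustration.
import networkx as nx
import numpy as np
from sentence_transformers import SentenceTransformer

# Hypothetical code graph: nodes are files, edges mean "imports/uses"
# (treated as undirected here for simplicity).
G = nx.Graph()
G.add_edges_from([
    ("pipeline.py", "fetch_repo.py"),
    ("pipeline.py", "topic_model.py"),
    ("topic_model.py", "gnn_head.py"),
])

# Hypothetical LLM-generated summaries attached to each node.
summaries = {
    "pipeline.py": "Orchestrates repository fetching, embedding and topic modeling.",
    "fetch_repo.py": "Clones a git repository and walks its Python files.",
    "topic_model.py": "Clusters embedded code chunks into topics.",
    "gnn_head.py": "Aggregates node embeddings over the code graph.",
}

model = SentenceTransformer("all-MiniLM-L6-v2")
nodes = list(G.nodes())
H = model.encode([summaries[n] for n in nodes])  # (num_nodes, dim) node features

A = nx.to_numpy_array(G, nodelist=nodes) + np.eye(len(nodes))  # adjacency + self-loops
A_hat = A / A.sum(axis=1, keepdims=True)                       # row-normalise
H_next = A_hat @ H                                             # one message-passing step

# H_next mixes each node's summary with its neighbours'; these smoothed vectors
# can then be clustered or handed to a topic model.
print(H_next.shape)
```

A trained GAT would replace the fixed row-normalised weights with learned attention coefficients, but the overall data flow is the same.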
125 | 126 | #### Topic Modeling and Code Repositories 127 | 128 | > Topic modeling is a machine learning technique used to automatically identify 129 | > and group related themes or topics within a large collection of documents. In 130 | > the context of code repositories, this technique can be employed to analyze 131 | > the underlying structure and organization of the codebase, making it easier 132 | > for developers to navigate, understand, and contribute to the project. By 133 | > applying topic modeling to any Python repository, including its own, a new 134 | > level of abstraction and organization can be achieved, allowing for more 135 | > efficient collaboration and innovation. 136 | 137 | #### Global Implications 138 | 139 | > The ability to automatically topic model any Python repository, even its own, 140 | > has far-reaching implications on a global scale. First and foremost, this 141 | > technique can lead to better collaboration between developers from diverse 142 | > backgrounds and skill levels. By providing a more organized and understandable 143 | > view of a codebase, developers can more easily find relevant sections of the 144 | > code and identify where their expertise can be applied. 145 | 146 | > Furthermore, topic modeling can facilitate the identification and reuse of 147 | > existing code, promoting more efficient software development practices. By 148 | > identifying and categorizing similar code snippets across different 149 | > repositories, developers can more easily leverage pre-existing solutions to 150 | > common problems, reducing the need for redundant work and improving overall 151 | > software quality. 152 | 153 | #### Topic Modeling Operating Systems 154 | 155 | > Operating systems are complex and consist of a vast amount of code. The 156 | > application of topic modeling to operating systems can significantly improve 157 | > their organization, maintainability, and extensibility. By grouping related 158 | > code modules, developers can more easily understand the interactions between 159 | > different components of the system, allowing them to more effectively debug, 160 | > optimize, and enhance the operating system. 161 | 162 | > Additionally, topic modeling operating systems can lead to more modular 163 | > designs, enabling better cross-platform compatibility and easier integration 164 | > of new features. This, in turn, can foster greater innovation and 165 | > collaboration between different operating system communities, ultimately 166 | > benefiting the global development ecosystem. 167 | 168 | #### Graph Neural Networks for Message Passing between Repositories 169 | 170 | > Graph Neural Networks (GNN) provide a powerful tool for message passing 171 | > between repositories, enabling more efficient collaboration and knowledge 172 | > sharing. By representing code repositories as nodes in a graph and connecting 173 | > them based on their topical similarity, GNNs can be used to propagate 174 | > information and learn high-level representations of the code. 175 | 176 | > This approach can help identify and bridge gaps in knowledge between different 177 | > repositories, allowing developers to leverage insights from related projects 178 | > more effectively. Moreover, the use of GNNs for message passing between 179 | > repositories can facilitate the transfer of best practices and code patterns 180 | > across different development communities, promoting more consistent and 181 | > high-quality software development on a global scale. 
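The message passing described in the preceding section can also be lifted from individual files to whole repositories. The sketch below is again illustrative rather than anything implemented here: each repository is assumed to have already been reduced to a single embedding vector (random placeholders stand in for real embeddings), and a softmax over cosine similarities plays the role of a learned graph-attention weighting.

```python
# Illustrative sketch: one round of attention-weighted message passing between
# repository-level embeddings. The repo names echo examples used elsewhere in
# this project, but the vectors are random placeholders, not real embeddings.
import numpy as np

repos = ["langchain", "pyknotid", "pyreason"]
rng = np.random.default_rng(0)
R = rng.normal(size=(len(repos), 8))  # placeholder repo embeddings, shape (3, 8)

# Cosine similarity between repositories.
R_norm = R / np.linalg.norm(R, axis=1, keepdims=True)
S = R_norm @ R_norm.T

# Softmax over similarities: a hand-rolled stand-in for learned GAT attention.
W = np.exp(S) / np.exp(S).sum(axis=1, keepdims=True)

# One propagation step: each repository's representation becomes an
# attention-weighted mix of every repository's representation.
R_next = W @ R
print(np.round(R_next, 2))
```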
182 | 183 | #### Conclusion 184 | 185 | > The application of topic modeling to code repositories, including the ability 186 | > to model any Python repository, even its own, has the potential to 187 | > revolutionize software development on a global scale. By improving code 188 | > organization, facilitating collaboration, and fostering the reuse of existing 189 | > solutions, topic modeling can contribute to more efficient and innovative 190 | > software development practices. The integration of topic modeling with 191 | > operating systems and the use of Graph Neural Networks for message passing 192 | > between repositories further enhances the potential for global collaboration, 193 | > knowledge sharing, and the advancement of the software development ecosystem. 194 | 195 | #### Prompt: 196 | 197 | - Write an story about a person that builds a code repository that can topic 198 | model any python repo even its own. 199 | 200 | > Once upon a time in the quaint little town of Codeville, a programmer named 201 | > Alex was known for their extraordinary coding skills. Having spent years 202 | > mastering the intricacies of Python, Alex was well-respected and admired by 203 | > their peers. Yet, they weren't satisfied with just writing code—they aspired 204 | > to create something groundbreaking, a testament to their talents and a gift to 205 | > the programming world. 206 | 207 | > One day, while reading about machine learning, Alex stumbled upon an 208 | > interesting idea: topic modeling. It was a powerful technique to discover 209 | > hidden patterns in large collections of text documents. Alex's eyes lit up 210 | > with excitement, and an ambitious plan took shape in their mind—a code 211 | > repository that could perform topic modeling on any Python repository, even 212 | > its own! 213 | 214 | > And so, Alex embarked on a journey to create the ultimate code repository. 215 | > They named their project "PythonThinker," as a tribute to the famed 216 | > philosopher and mathematician, Pythagoras. They dedicated countless hours to 217 | > researching natural language processing, machine learning algorithms, and 218 | > various Python libraries. With every passing day, the idea of PythonThinker 219 | > became more real, as Alex poured their heart and soul into the project. 220 | 221 | > Alex began by designing a system to extract and analyze code from any Python 222 | > repository. This involved tokenizing the code, removing stop words, and 223 | > normalizing the data. They then implemented a state-of-the-art topic modeling 224 | > algorithm that could adapt to the structure and complexity of any codebase. 225 | > This, in turn, allowed PythonThinker to reveal hidden topics, relationships, 226 | > and patterns within the code. 227 | 228 | > As the project progressed, Alex realized that PythonThinker had the potential 229 | > to revolutionize software development. By understanding the underlying topics 230 | > within codebases, developers could gain insights into the organization and 231 | > structure of projects, allowing them to quickly identify areas for improvement 232 | > and optimization. 233 | 234 | > After months of tireless work, PythonThinker was finally complete. It was a 235 | > beautifully crafted piece of software, capable of ingesting any Python 236 | > repository and spewing out valuable insights. What truly set it apart, 237 | > however, was its ability to introspect—to topic model its very own codebase. 
238 | 239 | > With great anticipation, Alex fed PythonThinker its own codebase as input. As 240 | > the algorithm whirred to life, it analyzed every line of code, every function, 241 | > and every module it contained. The moment of truth was finally here. Alex 242 | > nervously awaited the results, hoping their masterpiece would live up to its 243 | > promise. 244 | 245 | > And it did. PythonThinker was able to successfully topic model its own code, 246 | > revealing a beautiful tapestry of interconnected topics and dependencies. Alex 247 | > was elated. Their creation was not only a testament to their programming 248 | > prowess, but it also demonstrated the power of self-aware software—a code 249 | > repository capable of understanding and analyzing itself. 250 | 251 | > The news of PythonThinker's success spread like wildfire across Codeville and 252 | > beyond. Developers from around the world flocked to the project, eager to 253 | > harness its power to improve their own codebases. And as Alex watched their 254 | > creation soar to new heights, they felt an immense sense of pride and 255 | > accomplishment. 256 | 257 | > PythonThinker continued to evolve, becoming an indispensable tool for 258 | > developers everywhere. Alex's creation had transformed the world of 259 | > programming, proving that with hard work, passion, and a touch of genius, even 260 | > the wildest dreams can come true. And so, the legend of Alex and PythonThinker 261 | > lived on, inspiring countless programmers to push the boundaries of what is 262 | > possible with code. 263 | 264 | #### Prompt: 265 | 266 | - Write a story about a 26-year-old AI developer from long island, that builds a 267 | code repository that can topic model any python repo even its own. 268 | 269 | > Once upon a time on the picturesque shores of Long Island, there lived a 270 | > brilliant 26-year-old AI developer named Charlie. Known for his remarkable 271 | > coding skills and knack for innovation, Charlie was already a highly respected 272 | > member of the tech community. His passion for programming and machine learning 273 | > seemed boundless, but there was one challenge that consumed his thoughts: 274 | > creating a code repository that could perform topic modeling on any Python 275 | > repository, including its very own. 276 | 277 | > Charlie had long been intrigued by topic modeling—a powerful technique that 278 | > uncovers hidden patterns and relationships within large collections of text 279 | > documents. He believed that such an algorithm could revolutionize the software 280 | > development process, offering valuable insights and fostering collaboration 281 | > among developers. 282 | 283 | > And so, Charlie embarked on a quest to create the ultimate code repository, 284 | > one that would stand as a testament to his ingenuity and determination. He 285 | > named his ambitious project "AthenaCode," inspired by the Greek goddess of 286 | > wisdom and strategy, Athena. 287 | 288 | > Charlie dedicated himself to the task with unmatched intensity. He spent 289 | > countless hours poring over research papers, studying natural language 290 | > processing, and experimenting with cutting-edge machine learning algorithms. 291 | > He even reached out to experts from around the world, seeking their guidance 292 | > and insights. 293 | 294 | > AthenaCode's foundation was built on a system that could efficiently extract 295 | > and analyze code from any Python repository. 
Charlie designed it to tokenize 296 | > the code, remove stop words, and normalize the data. Once this was done, he 297 | > implemented a state-of-the-art topic modeling algorithm capable of adapting to 298 | > the unique structure and complexity of any given codebase. 299 | 300 | > Charlie's hard work soon bore fruit. AthenaCode began to reveal hidden topics, 301 | > relationships, and patterns within codebases, providing valuable insights into 302 | > the organization and structure of projects. Developers could now quickly 303 | > identify areas for improvement and optimization, making their work more 304 | > efficient and effective. 305 | 306 | > But the true test of AthenaCode's power lay in its ability to introspect—to 307 | > topic model its own codebase. With a mix of excitement and trepidation, 308 | > Charlie fed AthenaCode's code into its algorithm. As it processed each line of 309 | > code, every function, and every module, Charlie watched in awe as his creation 310 | > came to life. 311 | 312 | > The results were nothing short of astonishing. AthenaCode had successfully 313 | > topic modeled its own codebase, revealing a web of interconnected topics and 314 | > dependencies that even Charlie had not anticipated. Overwhelmed with pride, he 315 | > knew his creation was destined to change the world of programming. 316 | 317 | > Word of AthenaCode's success spread like wildfire across the tech community. 318 | > Developers from around the globe clamored to use Charlie's revolutionary tool 319 | > to optimize their own codebases. As he watched his creation gain momentum, 320 | > Charlie couldn't help but feel a deep sense of satisfaction and 321 | > accomplishment. 322 | 323 | > With time, AthenaCode continued to evolve and grow, becoming an indispensable 324 | > resource for developers everywhere. Charlie's groundbreaking creation had 325 | > transformed the software development landscape, demonstrating that with 326 | > determination, hard work, and a touch of genius, even the most ambitious 327 | > dreams can be realized. 328 | 329 | > And so, the story of Charlie and AthenaCode lived on, inspiring countless 330 | > programmers to reach for the stars and push the boundaries of what is possible 331 | > with code. And as the sun set over the shores of Long Island, the tale of a 332 | > young AI developer and his remarkable creation continued to spread, leaving an 333 | > indelible mark on the world of programming. 334 | 335 | #### Prompt: Write an story about a person that builds a code repository that can topic model any python repo even its own. the main focus is 336 | 337 | message passing between code bases from different branches of physics and the 338 | quest for a unifying framework 339 | 340 | > In the bustling city of Scientia, a curious programmer named Dr. Ada Smith was 341 | > known for her relentless pursuit of knowledge. Adept at Python programming and 342 | > passionate about both machine learning and the mysteries of the universe, Ada 343 | > sought to understand the complex relationships between different branches of 344 | > physics. Her dream was to uncover a unifying framework that could bridge the 345 | > gaps between these fields, and she believed that the key to unlocking this 346 | > puzzle lay in the code repositories of the world's leading physicists. 347 | 348 | > In her quest, Ada conceived an ambitious plan: to build a code repository that 349 | > could topic model any Python repo—even its own. 
She envisioned a system that 350 | > could not only analyze and understand the intricate patterns within individual 351 | > codebases, but also facilitate message passing between these codebases, 352 | > allowing for seamless collaboration and the exchange of knowledge between 353 | > physicists working in diverse fields. 354 | 355 | > And so, with unwavering determination, Ada embarked on her groundbreaking 356 | > project, which she named "QuantumWeaver." The name was inspired by the way her 357 | > creation would weave together the fabric of knowledge from various branches of 358 | > physics, much like the interconnected threads of a tapestry. 359 | 360 | > Ada devoted herself to the task, meticulously studying natural language 361 | > processing, machine learning algorithms, and various Python libraries to 362 | > create a state-of-the-art topic modeling system. QuantumWeaver's primary 363 | > function was to analyze code from Python repositories, identifying hidden 364 | > topics and relationships within the codebases of physicists working in fields 365 | > such as quantum mechanics, relativity, and thermodynamics. 366 | 367 | > As QuantumWeaver began to take shape, Ada focused on implementing message 368 | > passing capabilities between the codebases. This would allow researchers to 369 | > share insights, methods, and algorithms across different branches of physics, 370 | > fostering a collaborative environment that could lead to the discovery of the 371 | > long-sought unifying framework. 372 | 373 | > After months of tireless work, Ada's masterpiece was finally complete. It was 374 | > time to test QuantumWeaver's ability to topic model its own codebase—a true 375 | > demonstration of the power and versatility of her creation. 376 | 377 | > With bated breath, Ada fed QuantumWeaver its own code as input. The algorithm 378 | > sprang to life, analyzing every line of code, every function, and every module 379 | > it contained. As the results poured in, Ada's eyes widened with 380 | > amazement—QuantumWeaver had successfully topic modeled its own code, a 381 | > testament to its extraordinary capabilities. 382 | 383 | > The news of QuantumWeaver's success spread rapidly throughout the scientific 384 | > community. Physicists from around the world eagerly adopted the system, using 385 | > it to analyze their codebases and exchange knowledge between seemingly 386 | > disparate fields. As they shared insights, algorithms, and methodologies, it 387 | > became clear that QuantumWeaver was on the verge of revolutionizing the field 388 | > of physics. 389 | 390 | > As the years passed, the collaborative environment fostered by QuantumWeaver 391 | > led to numerous breakthroughs and discoveries. Researchers found unexpected 392 | > connections between various branches of physics, paving the way for a deeper 393 | > understanding of the universe. 394 | 395 | > And one fateful day, Ada's dream was realized. A brilliant young physicist, 396 | > armed with the insights and knowledge gleaned from the QuantumWeaver network, 397 | > unveiled a groundbreaking unifying framework—a beautiful theory that elegantly 398 | > connected the diverse branches of physics under a single, coherent paradigm. 399 | 400 | > QuantumWeaver had not only transformed the way physicists collaborated and 401 | > shared knowledge, but it had also played a pivotal role in uncovering the 402 | > elusive unifying framework that had captivated the scientific community for 403 | > centuries. 
Ada's creation would be remembered as a turning point in the 404 | > history of science, and her relentless pursuit of knowledge would inspire 405 | > generations of physicists and programmers to reach for the stars and uncover 406 | > the mysteries of the universe. 407 | 408 | #### Prompt 409 | 410 | - Write a formal essay on this topic 411 | 412 | ##### Title: QuantumWeaver: The Role of Topic Modeling and Message Passing in Unifying the Branches of Physics 413 | 414 | ##### Abstract 415 | 416 | > This essay explores the development and impact of QuantumWeaver, a 417 | > groundbreaking code repository designed to perform topic modeling on any 418 | > Python repo, including its own. We examine how the system's message passing 419 | > capabilities fostered collaboration between researchers in different branches 420 | > of physics and ultimately led to the discovery of a unifying framework. By 421 | > highlighting the transformative potential of advanced algorithms and 422 | > interdisciplinary collaboration, we underscore the significance of 423 | > QuantumWeaver as a turning point in the history of science. 424 | 425 | ##### Introduction 426 | 427 | > The quest for a unifying framework in physics—a theory that could bridge the 428 | > gaps between distinct branches such as quantum mechanics, relativity, and 429 | > thermodynamics—has captivated scientists for centuries. One such visionary, 430 | > Dr. Ada Smith, believed that the key to discovering this elusive framework lay 431 | > in the code repositories of leading physicists. Her groundbreaking project, 432 | > QuantumWeaver, sought to facilitate seamless collaboration and the exchange of 433 | > knowledge between researchers by leveraging topic modeling and message passing 434 | > between codebases. 435 | 436 | ##### QuantumWeaver: A Groundbreaking Code Repository 437 | 438 | > Dr. Smith's QuantumWeaver was designed to analyze code from Python 439 | > repositories and identify hidden topics and relationships within the codebases 440 | > of physicists working in diverse fields. By incorporating state-of-the-art 441 | > natural language processing and machine learning algorithms, the system could 442 | > efficiently perform topic modeling on any Python repo—even its own. 443 | 444 | > The true innovation of QuantumWeaver, however, lay in its message passing 445 | > capabilities. This feature allowed researchers to share insights, methods, and 446 | > algorithms across different branches of physics, fostering a collaborative 447 | > environment that could potentially reveal the unifying framework. 448 | 449 | ##### Impact on the Scientific Community 450 | 451 | > The introduction of QuantumWeaver led to a paradigm shift in the way 452 | > physicists collaborated and shared knowledge. The interconnected network of 453 | > researchers facilitated by the system's message passing capabilities enabled 454 | > the exchange of ideas and methodologies between seemingly disparate fields. 455 | > This led to unexpected connections and insights, paving the way for a deeper 456 | > understanding of the universe. 457 | 458 | > As QuantumWeaver continued to be adopted by physicists around the world, the 459 | > collaborative environment it fostered resulted in numerous breakthroughs and 460 | > discoveries. 
Ultimately, the insights and knowledge gleaned from this 461 | > interconnected network of researchers culminated in the unveiling of a 462 | > groundbreaking unifying framework—a theory that elegantly connected the 463 | > diverse branches of physics under a single, coherent paradigm. 464 | 465 | #Conclusion 466 | 467 | > QuantumWeaver, as an innovative code repository and collaborative platform, 468 | > played a pivotal role in uncovering the long-sought unifying framework in 469 | > physics. By harnessing the power of topic modeling and message passing between 470 | > codebases, the system revolutionized the way physicists collaborated, shared 471 | > knowledge, and approached the exploration of the universe. The story of 472 | > QuantumWeaver and Dr. Ada Smith serves as a testament to the transformative 473 | > potential of advanced algorithms and interdisciplinary collaboration, 474 | > inspiring future generations of scientists and programmers to push the 475 | > boundaries of human understanding. 476 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "GitModel" 3 | version = "0.1.0" 4 | description = "GitModel is a Python library for working with Git repositories as a database for RAG." 5 | authors = ["Daniel Hug "] 6 | 7 | [tool.isort] 8 | profile = "black" 9 | filter_files = true 10 | multi_line_output = 3 11 | include_trailing_comma = true 12 | force_grid_wrap = 0 13 | use_parentheses = true 14 | ensure_newline_before_comments = true 15 | line_length = 120 16 | 17 | [tool.black] 18 | line-length = 120 19 | target-version = ['py310'] 20 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | astor 2 | bertopic 3 | GitPython 4 | networkx 5 | numpy 6 | openai 7 | pandas 8 | pre-commit 9 | scipy 10 | sentence-transformers 11 | torch 12 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | extend-ignore = E203, W503, E501, E741 4 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | from src.config import * 2 | from src.pipeline import Pipeline 3 | from src.system_prompts.format_system_prompts import * 4 | -------------------------------------------------------------------------------- /src/alembic.ini: -------------------------------------------------------------------------------- 1 | # A generic, single database configuration. 2 | 3 | [alembic] 4 | # path to migration scripts 5 | script_location = %(here)s/alembic 6 | 7 | # template used to generate migration file names; The default value is %%(rev)s_%%(slug)s 8 | # Uncomment the line below if you want the files to be prepended with date and time 9 | # see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file 10 | # for all available tokens 11 | file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s 12 | 13 | # sys.path path, will be prepended to sys.path if present. 14 | # defaults to the current working directory. 15 | prepend_sys_path = . 
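# Typical migration workflow against the sqlalchemy.url configured further down
# in this file (assumes the alembic CLI is installed; it is not pinned in
# requirements.txt):
#   alembic -c src/alembic.ini revision --autogenerate -m "describe change"
#   alembic -c src/alembic.ini upgrade head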
16 | 17 | # timezone to use when rendering the date within the migration file 18 | # as well as the filename. 19 | # If specified, requires the python-dateutil library that can be 20 | # installed by adding `alembic[tz]` to the pip requirements 21 | # string value is passed to dateutil.tz.gettz() 22 | # leave blank for localtime 23 | # timezone = 24 | 25 | # max length of characters to apply to the 26 | # "slug" field 27 | # truncate_slug_length = 40 28 | 29 | # set to 'true' to run the environment during 30 | # the 'revision' command, regardless of autogenerate 31 | # revision_environment = false 32 | 33 | # set to 'true' to allow .pyc and .pyo files without 34 | # a source .py file to be detected as revisions in the 35 | # versions/ directory 36 | # sourceless = false 37 | 38 | # version location specification; This defaults 39 | # to alembic/versions. When using multiple version 40 | # directories, initial revisions must be specified with --version-path. 41 | # The path separator used here should be the separator specified by "version_path_separator" below. 42 | # version_locations = %(here)s/bar:%(here)s/bat:alembic/versions 43 | 44 | # version path separator; As mentioned above, this is the character used to split 45 | # version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. 46 | # If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. 47 | # Valid values for version_path_separator are: 48 | # 49 | # version_path_separator = : 50 | # version_path_separator = ; 51 | # version_path_separator = space 52 | version_path_separator = os # Use os.pathsep. Default configuration used for new projects. 53 | 54 | # the output encoding used when revision files 55 | # are written from script.py.mako 56 | # output_encoding = utf-8 57 | 58 | # sqlalchemy.url = postgresql://:@/ 59 | sqlalchemy.url = postgresql://postgres:postgres@localhost:5432/postgres 60 | 61 | [post_write_hooks] 62 | # post_write_hooks defines scripts or Python functions that are run 63 | # on newly generated revision scripts. See the documentation for further 64 | # detail and examples 65 | 66 | # format using "black" - use the console_scripts runner, against the "black" entrypoint 67 | hooks = black 68 | black.type = console_scripts 69 | black.entrypoint = black 70 | # black.options = -l 79 REVISION_SCRIPT_FILENAME 71 | 72 | # Logging configuration 73 | [loggers] 74 | keys = root,sqlalchemy,alembic 75 | 76 | [handlers] 77 | keys = console 78 | 79 | [formatters] 80 | keys = generic 81 | 82 | [logger_root] 83 | level = WARN 84 | handlers = console 85 | qualname = 86 | 87 | [logger_sqlalchemy] 88 | level = WARN 89 | handlers = 90 | qualname = sqlalchemy.engine 91 | 92 | [logger_alembic] 93 | level = INFO 94 | handlers = 95 | qualname = alembic 96 | 97 | [handler_console] 98 | class = StreamHandler 99 | args = (sys.stderr,) 100 | level = NOTSET 101 | formatter = generic 102 | 103 | [formatter_generic] 104 | format = %(levelname)-5.5s [%(name)s] %(message)s 105 | datefmt = %H:%M:%S 106 | -------------------------------------------------------------------------------- /src/alembic/env.py: -------------------------------------------------------------------------------- 1 | from logging.config import fileConfig 2 | 3 | import sqlmodel 4 | from sqlalchemy import engine_from_config, pool 5 | 6 | from alembic import context 7 | 8 | # this is the Alembic Config object, which provides 9 | # access to the values within the .ini file in use. 
10 | config = context.config 11 | 12 | # Interpret the config file for Python logging. 13 | # This line sets up loggers basically. 14 | if config.config_file_name is not None: 15 | fileConfig(config.config_file_name) 16 | 17 | # add your model's MetaData object here 18 | # for 'autogenerate' support 19 | # from myapp import mymodel 20 | # target_metadata = mymodel.Base.metadata 21 | target_metadata = sqlmodel.SQLModel.metadata 22 | 23 | # other values from the config, defined by the needs of env.py, 24 | # can be acquired: 25 | # my_important_option = config.get_main_option("my_important_option") 26 | # ... etc. 27 | 28 | 29 | def run_migrations_offline() -> None: 30 | """Run migrations in 'offline' mode. 31 | 32 | This configures the context with just a URL 33 | and not an Engine, though an Engine is acceptable 34 | here as well. By skipping the Engine creation 35 | we don't even need a DBAPI to be available. 36 | 37 | Calls to context.execute() here emit the given string to the 38 | script output. 39 | 40 | """ 41 | url = config.get_main_option("sqlalchemy.url") 42 | context.configure( 43 | url=url, 44 | target_metadata=target_metadata, 45 | literal_binds=True, 46 | dialect_opts={"paramstyle": "named"}, 47 | ) 48 | 49 | with context.begin_transaction(): 50 | context.run_migrations() 51 | 52 | 53 | def run_migrations_online() -> None: 54 | """Run migrations in 'online' mode. 55 | 56 | In this scenario we need to create an Engine 57 | and associate a connection with the context. 58 | 59 | """ 60 | connectable = engine_from_config( 61 | config.get_section(config.config_ini_section), 62 | prefix="sqlalchemy.", 63 | poolclass=pool.NullPool, 64 | ) 65 | 66 | with connectable.connect() as connection: 67 | context.configure(connection=connection, target_metadata=target_metadata) 68 | 69 | with context.begin_transaction(): 70 | context.get_context()._ensure_version_table() 71 | connection.execute("LOCK TABLE alembic_version IN ACCESS EXCLUSIVE MODE") 72 | context.run_migrations() 73 | 74 | 75 | if context.is_offline_mode(): 76 | run_migrations_offline() 77 | else: 78 | run_migrations_online() 79 | -------------------------------------------------------------------------------- /src/ast_parsers/python_ast_parser.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import os 3 | 4 | import astor 5 | 6 | 7 | def parse_python_file(file_path): 8 | parsed_contents = { 9 | "imports": [], 10 | "globals": [], 11 | "classes": [], 12 | "functions": [], 13 | } 14 | 15 | with open(file_path, "r") as file: 16 | file_contents = file.read() 17 | parsed_tree = ast.parse(file_contents) 18 | 19 | for node in ast.iter_child_nodes(parsed_tree): 20 | if isinstance(node, ast.Import) or isinstance(node, ast.ImportFrom): 21 | parsed_contents["imports"].append(astor.to_source(node).strip()) 22 | elif isinstance(node, ast.Assign) and len(node.targets) == 1 and isinstance(node.targets[0], ast.Name): 23 | parsed_contents["globals"].append(astor.to_source(node).strip()) 24 | elif isinstance(node, ast.FunctionDef): 25 | if node.name == "main": 26 | parsed_contents["functions"].append(ast.get_source_segment(file_contents, node)) 27 | else: 28 | parsed_contents["functions"].append(ast.get_source_segment(file_contents, node)) 29 | elif isinstance(node, ast.ClassDef): 30 | parsed_contents["classes"].append(ast.get_source_segment(file_contents, node)) 31 | 32 | return parsed_contents 33 | 34 | 35 | def get_methods(class_or_str): 36 | if isinstance(class_or_str, str): 37 | 
class_or_str = ast.parse(class_or_str) 38 | 39 | method_nodes = [node for node in ast.iter_child_nodes(class_or_str) if isinstance(node, ast.FunctionDef)] 40 | method_sources = [] 41 | for node in method_nodes: 42 | source_lines, _ = ast.get_source_segment(class_or_str, node) 43 | method_sources.append("".join(source_lines).strip()) 44 | return method_sources 45 | 46 | 47 | def parse_github_repo(local_dir): 48 | parsed_files = [] 49 | content_labels = {0: "imports", 1: "globals", 2: "classes", 3: "functions", 4: "main", 5: "file_name"} 50 | 51 | for root, dirs, files in os.walk(local_dir): 52 | for file_name in files: 53 | if file_name.endswith(".py"): 54 | file_path = os.path.join(root, file_name) 55 | try: 56 | parsed_contents = parse_python_file(file_path) 57 | content = {content_labels[i]: v for i, v in enumerate(parsed_contents.values())} 58 | content[content_labels[5]] = file_path 59 | parsed_files.append(content) 60 | except Exception as e: 61 | print(e) 62 | continue 63 | 64 | return parsed_files 65 | -------------------------------------------------------------------------------- /src/backend/config.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, BaseSettings 2 | 3 | 4 | class MessageTreeManagerConfiguration(BaseModel): 5 | pass 6 | 7 | 8 | class Settings(BaseSettings): 9 | pass 10 | 11 | 12 | settings = Settings() 13 | -------------------------------------------------------------------------------- /src/backend/database.py: -------------------------------------------------------------------------------- 1 | # from sqlmodel import create_engine 2 | -------------------------------------------------------------------------------- /src/backend/memory_tree_manager.py: -------------------------------------------------------------------------------- 1 | class MemoryTreeManager: 2 | pass 3 | -------------------------------------------------------------------------------- /src/community_detection/extract_connected_components.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | from networkx.algorithms import community 3 | 4 | 5 | def community_detection(A): 6 | # Convert the adjacency matrix to a NetworkX graph 7 | G = nx.from_numpy_array(A) 8 | 9 | # Perform Girvan-Newman community detection 10 | communities_generator = community.girvan_newman(G) 11 | 12 | # Get the first level of communities 13 | communities = next(communities_generator) 14 | 15 | # Sort communities by size (number of nodes) 16 | sorted_communities = sorted(communities, key=lambda x: len(x), reverse=True) 17 | 18 | return sorted_communities 19 | 20 | 21 | def print_communities(data, A): 22 | # Print sorted communities 23 | sorted_communities = community_detection(data, A) 24 | for i, com in enumerate(sorted_communities): 25 | print(f"Community {i + 1}:") 26 | print(f"Nodes: {list(com)}") 27 | component_files_structure = [] 28 | component_text_structure = [] 29 | component_history_structure = [] 30 | for row_key in list(com): 31 | file_component = data.loc[row_key, "file_name"] 32 | text_component = data.loc[row_key, "assistant_reply"] 33 | history_component = data.loc[row_key, "conversation_history"] 34 | component_files_structure.append(file_component) 35 | component_text_structure.append(text_component) 36 | component_history_structure.append(history_component) 37 | print(component_files_structure) 38 | for text, history in zip(component_text_structure, component_history_structure): 39 
| # print(history[-2]['content']) 40 | print() 41 | print(text) 42 | print() 43 | print() 44 | 45 | 46 | def dfs(v, visited, adj_matrix, component): 47 | """Depth-first search algorithm.""" 48 | visited[v] = True 49 | component.append(v) 50 | for i, val in enumerate(adj_matrix[v]): 51 | if val > 0 and not visited[i]: 52 | dfs(i, visited, adj_matrix, component) 53 | 54 | 55 | def connected_components(adj_matrix): 56 | """Find connected components in a graph represented by an adjacency matrix.""" 57 | visited = [False for _ in range(adj_matrix.shape[0])] 58 | components = [] 59 | 60 | for v in range(adj_matrix.shape[0]): 61 | if not visited[v]: 62 | component = [] 63 | dfs(v, visited, adj_matrix, component) 64 | components.append(component) 65 | 66 | return components 67 | -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | 3 | 4 | class Config: 5 | def __init__(self, pipeline_settings, semantic_graph_context_generator, gnn_heads, topic_model): 6 | self.semantic_graph_context_generator = semantic_graph_context_generator 7 | self.gnn_heads = gnn_heads 8 | self.topic_model = topic_model 9 | self.pipeline_settings = pipeline_settings 10 | 11 | @classmethod 12 | def from_yaml(cls, yaml_path): 13 | with open(yaml_path, "r") as f: 14 | config_dict = yaml.safe_load(f) 15 | return cls(**config_dict) 16 | 17 | 18 | class PipelineSettings: 19 | def __init__(self, config: Config): 20 | self.config = config 21 | 22 | def __repr__(self) -> str: 23 | return f"PipelineSettings(config={self.config})" 24 | -------------------------------------------------------------------------------- /src/configs/config1.yaml: -------------------------------------------------------------------------------- 1 | semantic_graph_context_generator: 2 | - __impl__: SemanticGraphContextGenerator 3 | generator_id: summary 4 | model_id: gpt-3.5-turbo 5 | temperature: 0 6 | max_tokens: 500 7 | with_tree: False 8 | - __impl__: SemanticGraphContextGenerator 9 | generator_id: qa_chain 10 | model_id: gpt-3.5-turbo 11 | temperature: 0 12 | max_tokens: 600 13 | with_tree: False 14 | - __impl__: SemanticGraphContextGenerator 15 | generator_id: ux_ui_consult 16 | model_id: gpt-3.5-turbo 17 | temperature: 0 18 | max_tokens: 500 19 | with_tree: False 20 | - __impl__: SemanticGraphContextGenerator 21 | generator_id: bug_fix 22 | model_id: gpt-3.5-turbo 23 | temperature: 0 24 | max_tokens: 500 25 | with_tree: False 26 | 27 | gnn_heads: 28 | - __impl__: GNNHead 29 | generator_id: message_passing 30 | k_hop: 2 31 | is_sparse: True 32 | embbedding_id: sbert 33 | embbedding_model: all-MiniLM-L6-v2 34 | kernel_id: cosine_similarity 35 | graph_id: adjacency 36 | connection_threshold: 0.7 37 | 38 | topic_model: 39 | - __impl__: TopicModel 40 | topic_model_id: topic_model 41 | embedding_model: all-MiniLM-L6-v2 42 | clustering_model: 43 | HDBSCAN(min_cluster_size=15, metric='euclidean', 44 | cluster_selection_method='eom', prediction_data=True) 45 | dimensionality_reduction: 46 | umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, 47 | metric='cosine') 48 | vectorizer_model: CountVectorizer(stop_words="english") 49 | representation_model: 50 | OpenAI(model="gpt-3.5-turbo", delay_in_seconds=1, chat=True) 51 | hierarchical_linkage_fn: sch.linkage(x, 'single', optimal_ordering=True) 52 | with_tree: True 53 | auto_cluster: True 54 | 
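# Note (assumed contract, based on src/ml_models/topic_modeling/topic_model.py):
# clustering_model, dimensionality_reduction, vectorizer_model and
# representation_model above are Python constructor strings that TopicModel
# passes to eval() when building the BERTopic model, so each must be a valid
# expression for a constructor importable in that module. With
# auto_cluster: True only the vectorizer and representation strings are used.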
-------------------------------------------------------------------------------- /src/configs/config2.yaml: -------------------------------------------------------------------------------- 1 | pipeline_settings: 2 | - __impl__: PipelineSettings 3 | combine_gnn_strategy: sum 4 | norm_fn: z_score 5 | skip_graph_creation: True 6 | 7 | semantic_graph_context_generator: 8 | - __impl__: SemanticGraphContextGenerator 9 | generator_id: summary 10 | model_id: gpt-3.5-turbo 11 | temperature: 0 12 | max_tokens: 300 13 | with_tree: False 14 | - __impl__: SemanticGraphContextGenerator 15 | generator_id: qa_chain 16 | model_id: gpt-3.5-turbo 17 | temperature: 0 18 | max_tokens: 300 19 | with_tree: False 20 | 21 | gnn_heads: 22 | - __impl__: GNNHead 23 | generator_id: message_passing 24 | k_hop: 2 25 | is_sparse: True 26 | embbedding_id: sbert 27 | embbedding_model: all-MiniLM-L6-v2 28 | kernel_id: cosine_similarity 29 | graph_id: adjacency 30 | connection_threshold: 0.7 31 | - __impl__: GNNHead 32 | generator_id: message_passing 33 | k_hop: 3 34 | is_sparse: True 35 | embbedding_id: sbert 36 | embbedding_model: all-MiniLM-L6-v2 37 | kernel_id: cosine_similarity 38 | graph_id: adjacency 39 | connection_threshold: 0.7 40 | - __impl__: GNNHead 41 | generator_id: message_passing 42 | k_hop: 2 43 | is_sparse: True 44 | embbedding_id: sbert 45 | embbedding_model: all-MiniLM-L6-v2 46 | kernel_id: gaussian 47 | sigma: 1 48 | graph_id: laplacian 49 | connection_threshold: 0.7 50 | 51 | topic_model: 52 | - __impl__: TopicModel 53 | topic_model_id: topic_model 54 | embedding_model: all-MiniLM-L6-v2 55 | clustering_model: 56 | HDBSCAN(min_cluster_size=15, metric='euclidean', 57 | cluster_selection_method='eom', prediction_data=True) 58 | dimensionality_reduction: 59 | umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, 60 | metric='cosine') 61 | vectorizer_model: CountVectorizer(stop_words="english") 62 | representation_model: 63 | OpenAI(model="gpt-3.5-turbo", delay_in_seconds=1, chat=True) 64 | hierarchical_linkage_fn: sch.linkage(x, 'centroid', optimal_ordering=True) 65 | with_tree: True 66 | auto_cluster: True 67 | -------------------------------------------------------------------------------- /src/dynamic_import.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from typing import Dict 3 | 4 | from src.config import Config, PipelineSettings 5 | from src.ml_models.graph_networks.gnn_head import GNNHead 6 | from src.ml_models.topic_modeling.topic_model import TopicModel 7 | from src.ml_models.transformers.semantic_graph_context_generator import SemanticGraphContextGenerator 8 | 9 | 10 | def instantiate_class_from_config(config: Config) -> Dict[str, list]: 11 | config = deepcopy(config) 12 | 13 | obj_map = {} 14 | for component_name in ["pipeline_settings", "semantic_graph_context_generator", "gnn_heads", "topic_model"]: 15 | if component_name not in obj_map: 16 | obj_map[component_name] = [] 17 | 18 | for args in getattr(config, component_name): 19 | impl = args.pop("__impl__") 20 | try: 21 | _cls = { 22 | "PipelineSettings": PipelineSettings, 23 | "SemanticGraphContextGenerator": SemanticGraphContextGenerator, 24 | "GNNHead": GNNHead, 25 | "TopicModel": TopicModel, 26 | }[impl] 27 | except KeyError: 28 | raise Exception(f"{impl} cannot be found in module my_module") 29 | obj_map[component_name].append(_cls(config=args)) 30 | 31 | # Print the recovered dictionary 32 | return obj_map 33 | 
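# Usage sketch (illustrative only): build the component map from one of the
# YAML configs in src/configs. Assumes the config file defines all four keys
# expected by Config.__init__ (config2.yaml does; config1.yaml has no
# pipeline_settings block).
if __name__ == "__main__":
    cfg = Config.from_yaml("src/configs/config2.yaml")
    components = instantiate_class_from_config(cfg)
    # one list of instantiated objects per component family, e.g. three GNNHead
    # instances and one TopicModel for config2.yaml
    print({name: len(objs) for name, objs in components.items()})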
-------------------------------------------------------------------------------- /src/fetch_repo.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from git import Repo 4 | 5 | 6 | def clone_and_create_context_folder(repo_name: str, id_name: str) -> str: 7 | # Create a work folder in the current working directory if it doesn't already exist 8 | work_folder = os.path.join(os.getcwd(), "work") 9 | if not os.path.exists(work_folder): 10 | os.makedirs(work_folder) 11 | 12 | # Clone the repository into the work folder 13 | repo_path = os.path.join(work_folder, id_name) 14 | if not os.path.exists(repo_path): 15 | Repo.clone_from(repo_name, repo_path) 16 | 17 | # Create a context folder with the specified id_name in the current working directory 18 | context_folder = os.path.join(os.getcwd(), f"context/{id_name}_context") 19 | if not os.path.exists(context_folder): 20 | os.makedirs(context_folder) 21 | 22 | return repo_path, context_folder 23 | -------------------------------------------------------------------------------- /src/ml_models/__init__.py: -------------------------------------------------------------------------------- 1 | from src.ml_models.graph_networks.gnn_head import GNNHead # noqa: F401 2 | from src.ml_models.topic_modeling.topic_model import TopicModel # noqa: F401 3 | from src.ml_models.transformers.semantic_graph_context_generator import SemanticGraphContextGenerator # noqa: F401 4 | -------------------------------------------------------------------------------- /src/ml_models/graph_networks/gnn_head.py: -------------------------------------------------------------------------------- 1 | import math 2 | import multiprocessing 3 | from typing import List, Tuple 4 | 5 | import numpy as np 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | from sentence_transformers import SentenceTransformer 10 | from transformers import AutoModel 11 | 12 | from src.config import Config 13 | from src.ml_models.graph_networks.kernels import compute_kernel_by_type, graph_laplacian 14 | from src.ml_models.graph_networks.message_passing import k_hop_message_passing, k_hop_message_passing_sparse 15 | 16 | 17 | class GNNHead: 18 | def __init__(self, config: Config): 19 | self.config = config 20 | 21 | def __repr__(self) -> str: 22 | return f"GNNHead(config={self.config})" 23 | 24 | def compute_kernel_matrix(self, node_features: np.ndarray) -> np.ndarray: 25 | """ 26 | Compute the kernel matrix using the specified kernel function. 27 | 28 | Parameters: 29 | node_features (numpy array): The feature matrix of the nodes. 30 | 31 | Returns: 32 | kernel_matrix (numpy array): The kernel matrix computed using the specified kernel function. 
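        Example (illustrative sketch; the config keys mirror the gnn_heads
        entries in src/configs/config1.yaml, and `embeddings` stands for any
        (n_nodes, dim) array):

            head = GNNHead(config={"kernel_id": "cosine_similarity", "connection_threshold": 0.7})
            kernel_matrix = head.compute_kernel_matrix(embeddings)  # thresholded (n_nodes, n_nodes) 0/1 matrix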
33 | """ 34 | if self.config["kernel_id"] == "cosine_similarity": 35 | kernel_matrix = compute_kernel_by_type( 36 | node_features, threshold=self.config["connection_threshold"], kernel_type="cosine" 37 | ) 38 | elif self.config["kernel_id"] == "gaussian": 39 | kernel_matrix = compute_kernel_by_type( 40 | node_features, 41 | threshold=self.config["connection_threshold"], 42 | kernel_type="gaussian", 43 | sigma=self.config["sigma"], 44 | ) 45 | else: 46 | raise ValueError("Invalid kernel function specified.") 47 | return kernel_matrix 48 | 49 | def embed_data(self, data, key="query", cores=1, gpu=False, batch_size=128): 50 | """ 51 | Embed the sentences/text using the MiniLM language model (which uses mean pooling) 52 | """ 53 | print("Embedding data") 54 | if self.config["embbedding_model"] == "all-MiniLM-L6-v2": 55 | model = SentenceTransformer(self.config["embbedding_model"]) 56 | elif self.config["embbedding_model"] == "graphcodebert-base": 57 | model = AutoModel.from_pretrained("sentence-transformers/bert-base-nli-mean-tokens") 58 | else: 59 | raise ValueError("Invalid embedding model specified.") 60 | print("Model loaded") 61 | 62 | sentences = data[key].tolist() 63 | unique_sentences = data[key].unique() 64 | print("Unique sentences", len(unique_sentences)) 65 | 66 | if cores == 1: 67 | embeddings = model.encode(unique_sentences, show_progress_bar=True, batch_size=batch_size) 68 | else: 69 | devices = ["cpu"] * cores 70 | if gpu: 71 | devices = None # use all CUDA devices 72 | 73 | # Start the multi-process pool on multiple devices 74 | print("Multi-process pool starting") 75 | pool = model.start_multi_process_pool(devices) 76 | print("Multi-process pool started") 77 | 78 | chunk_size = math.ceil(len(unique_sentences) / cores) 79 | 80 | # Compute the embeddings using the multi-process pool 81 | embeddings = model.encode_multi_process( 82 | unique_sentences, pool, batch_size=batch_size, chunk_size=chunk_size 83 | ) 84 | model.stop_multi_process_pool(pool) 85 | 86 | print("Embeddings computed") 87 | 88 | mapping = {sentence: embedding for sentence, embedding in zip(unique_sentences, embeddings)} 89 | embeddings = np.array([mapping[sentence] for sentence in sentences]) 90 | 91 | return embeddings 92 | 93 | def generate_graph(self, data) -> Tuple[np.ndarray, np.ndarray]: 94 | """ 95 | Generate a graph using the kernel function and message passing. 96 | 97 | Parameters: 98 | node_features (numpy array): The feature matrix of the nodes. 99 | 100 | Returns: 101 | A_k (numpy array): The k-hop adjacency matrix. 102 | agg_features (numpy array): The aggregated feature matrix for each node in the k-hop neighborhood. 
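        Example (illustrative; `data` is assumed to be a pandas DataFrame with
        a "query" column of text to embed, and `head` a configured GNNHead):

            A_k, agg_features = head.generate_graph(data)
            # A_k: k-hop adjacency built from the thresholded kernel (or the
            # degree-normalized Laplacian when graph_id is "laplacian")
            # agg_features: node embeddings aggregated over the k-hop neighborhood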
103 | """ 104 | embs = self.embed_data(data) 105 | kernel_matrix = self.compute_kernel_matrix(embs) 106 | if self.config["graph_id"] == "adjacency": 107 | A = kernel_matrix 108 | elif self.config["graph_id"] == "laplacian": 109 | L, D = graph_laplacian(kernel_matrix) 110 | A = np.linalg.pinv(D) @ L 111 | else: 112 | raise ValueError("Invalid graph type specified.") 113 | 114 | if self.config["is_sparse"]: 115 | A_k, agg_features = k_hop_message_passing_sparse(A, embs, self.config["k_hop"]) 116 | else: 117 | A_k, agg_features = k_hop_message_passing(A, embs, self.config["k_hop"]) 118 | 119 | return A_k, agg_features 120 | 121 | 122 | class GraphCodeBERT: 123 | def __init__(self, config: Config): 124 | self.config = config 125 | 126 | def __repr__(self) -> str: 127 | return f"GraphCodeBERT(config={self.config})" 128 | 129 | def mean_pooling(self, model_output, attention_mask): 130 | token_embeddings = model_output[0] # First element of model_output contains all token embeddings 131 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 132 | return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) 133 | 134 | def encode(self): 135 | pass 136 | 137 | 138 | class GATLayer(nn.Module): 139 | """ 140 | A single layer of the Graph Attention Network. 141 | """ 142 | 143 | def __init__(self, in_features, out_features, dropout=0.0): 144 | super().__init__() 145 | self.in_features = in_features 146 | self.out_features = out_features 147 | 148 | self.W = nn.Linear(in_features, out_features, bias=False) 149 | self.a = nn.Linear(2 * out_features, 1, bias=False) 150 | 151 | self.dropout = nn.Dropout(dropout) 152 | 153 | def forward(self, h, adj): 154 | Wh = self.W(h) 155 | a_input = self.prepare_attention_input(Wh) 156 | e = F.leaky_relu(self.a(a_input)) 157 | attention = F.softmax(e, dim=1) 158 | attention = self.dropout(attention) 159 | 160 | h_prime = torch.matmul(attention.transpose(1, 2), Wh) 161 | return h_prime 162 | 163 | def prepare_attention_input(self, Wh): 164 | N = Wh.size()[0] 165 | Wh_repeated_in_chunks = Wh.repeat_interleave(N, dim=0) 166 | Wh_repeated_alternating = Wh.repeat(N, 1) 167 | all_combinations_matrix = torch.cat([Wh_repeated_in_chunks, Wh_repeated_alternating], dim=1) 168 | return all_combinations_matrix.view(N, N, 2 * self.out_features) 169 | 170 | 171 | class GAT(nn.Module): 172 | """ 173 | The Graph Attention Network model. 174 | """ 175 | 176 | def __init__(self, in_features, out_features, dropout=0.0): 177 | super().__init__() 178 | self.in_features = in_features 179 | self.out_features = out_features 180 | 181 | self.layers = nn.ModuleList( 182 | [ 183 | GATLayer(in_features, out_features, dropout=dropout), 184 | GATLayer(out_features, out_features, dropout=dropout), 185 | ] 186 | ) 187 | 188 | def forward(self, embs, adj_matrices): 189 | h = embs 190 | for layer, adj in zip(self.layers, adj_matrices): 191 | h = layer(h, adj) 192 | return h 193 | 194 | 195 | def combine_graphs_with_gat(graphs): 196 | """ 197 | Combine a list of graphs using the Graph Attention Network. 198 | 199 | Parameters: 200 | graphs (List of Tuple): List of tuples where each tuple contains an adjacency matrix and its 201 | corresponding node embeddings. 202 | 203 | Returns: 204 | combined_adj_matrix (numpy array): The combined adjacency matrix. 205 | combined_embs (numpy array): The combined node embeddings. 
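    Example (illustrative; `gnn_heads` is assumed to be a list of GNNHead
    instances and `data` a DataFrame with a "query" column):

        graphs = [head.generate_graph(data) for head in gnn_heads]
        combined_A, combined_embs = combine_graphs_with_gat(graphs)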
206 | """ 207 | in_features = graphs[0][1].shape[1] # input dimensionality of the embeddings 208 | out_features = 256 # output dimensionality of the embeddings 209 | gat = GAT(in_features, out_features) 210 | 211 | adj_matrices = [t[0] for t in graphs] 212 | embs = torch.cat([torch.FloatTensor(t[1]) for t in graphs], dim=0) 213 | 214 | with torch.no_grad(): 215 | combined_embs = gat(embs, adj_matrices) 216 | 217 | combined_adj_matrix = sum(adj_matrices) 218 | max_value = np.max(combined_adj_matrix) 219 | normalized_matrix = combined_adj_matrix / max_value 220 | multiprocessing.resource_tracker.unregister() 221 | 222 | return normalized_matrix, combined_embs.numpy() 223 | 224 | 225 | def linearly_sum_gnn_heads( 226 | matrices: List[Tuple[np.ndarray, np.ndarray]], normalization_fn: str = "max" 227 | ) -> Tuple[np.ndarray, np.ndarray]: 228 | """ 229 | Linearly sum a list of adjacency matrices and normalize the result. Also computes the average of the node embeddings 230 | and checks that the number of documents in the adjacency matrices matches the number of documents in the embeddings set. 231 | 232 | Parameters: 233 | matrices (List of Tuple): List of tuples where each tuple contains an adjacency matrix and its 234 | corresponding node embeddings. 235 | 236 | Returns: 237 | normalized_matrix (numpy array): The linearly summed and normalized adjacency matrix. 238 | avg_embeddings (numpy array): The average node embeddings. 239 | """ 240 | if len(matrices) == 1: 241 | return matrices[0] 242 | # Compute the average of the embeddings 243 | avg_embeddings = np.mean([t[1] for t in matrices], axis=0) 244 | 245 | # Check that the number of documents in the adjacency matrices matches the number of documents in the embeddings set 246 | num_docs_adj = matrices[0][0].shape[0] 247 | num_docs_embs = avg_embeddings.shape[0] 248 | if num_docs_adj != num_docs_embs: 249 | raise ValueError( 250 | "Number of documents in adjacency matrix does not match number of documents in embeddings set." 
251 | ) 252 | 253 | # Linearly sum the adjacency matrices and normalize the matrix by dividing it by the maximum value 254 | summed_matrix = sum([t[0] for t in matrices]) 255 | if normalization_fn == "max": 256 | max_value = np.max(summed_matrix) 257 | normalized_matrix = summed_matrix / max_value 258 | elif normalization_fn == "sum": 259 | normalized_matrix = summed_matrix / len(matrices) 260 | elif normalization_fn == "min-max": 261 | min_value = np.min(summed_matrix) 262 | max_value = np.max(summed_matrix) 263 | normalized_matrix = (summed_matrix - min_value) / (max_value - min_value) 264 | elif normalization_fn == "z_score": 265 | mean = np.mean(summed_matrix) 266 | std = np.std(summed_matrix) 267 | normalized_matrix = (summed_matrix - mean) / std 268 | elif normalization_fn == "circular": 269 | min_value = np.min(summed_matrix) 270 | max_value = np.max(summed_matrix) 271 | normalized_matrix_linear = (summed_matrix - min_value) / (max_value - min_value) 272 | normalized_matrix = np.sin(2 * np.pi * normalized_matrix_linear) 273 | else: 274 | raise ValueError(f"Normalization function {normalization_fn} not supported.") 275 | 276 | A_k, agg_embs = k_hop_message_passing_sparse(normalized_matrix, avg_embeddings, 2) 277 | print(f"adj matrix shape: {A_k.shape}") 278 | print(f"agg embeddings shape: {agg_embs.shape}") 279 | return A_k, agg_embs 280 | -------------------------------------------------------------------------------- /src/ml_models/graph_networks/kernels.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy 3 | import torch 4 | import torch.nn.functional as F 5 | from torch import Tensor 6 | 7 | 8 | def gaussian_kernel_torch(embs_a, embs_b, sigma=1.0): 9 | """ 10 | Computes the Gaussian kernel matrix between two sets of embeddings using PyTorch. 11 | :param embs_a: Tensor of shape (batch_size_a, embedding_dim) containing the first set of embeddings. 12 | :param embs_b: Tensor of shape (batch_size_b, embedding_dim) containing the second set of embeddings. 13 | :param sigma: Width of the Gaussian kernel. 14 | :return: Tensor of shape (batch_size_a, batch_size_b) containing the Gaussian kernel matrix. 15 | """ 16 | if not isinstance(embs_a, torch.Tensor): 17 | embs_a = torch.tensor(embs_a) 18 | 19 | if not isinstance(embs_b, torch.Tensor): 20 | embs_b = torch.tensor(embs_b) 21 | 22 | # Compute the pairwise distances between the embeddings 23 | dist_matrix = torch.cdist(embs_a, embs_b) 24 | 25 | # Compute the Gaussian kernel matrix 26 | kernel_matrix = torch.exp(-(dist_matrix**2) / (2 * sigma**2)) 27 | 28 | return kernel_matrix 29 | 30 | 31 | def estimate_pdf(scores: list): 32 | """ 33 | estimate scores probability density function 34 | :scores: list of distance scores from topic features to topic centroid 35 | :return: distribution 36 | """ 37 | return scipy.stats.gaussian_kde(scores) 38 | 39 | 40 | def cos_sim(a: Tensor, b: Tensor): 41 | """ 42 | Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j. 
43 | :return: Matrix with res[i][j] = cos_sim(a[i], b[j]) 44 | """ 45 | if not isinstance(a, torch.Tensor): 46 | a = torch.tensor(np.array(a)) 47 | 48 | if not isinstance(b, torch.Tensor): 49 | b = torch.tensor(np.array(b)) 50 | 51 | if len(a.shape) == 1: 52 | a = a.unsqueeze(0) 53 | 54 | if len(b.shape) == 1: 55 | b = b.unsqueeze(0) 56 | 57 | a_norm = torch.nn.functional.normalize(a, p=2, dim=1) 58 | b_norm = torch.nn.functional.normalize(b, p=2, dim=1) 59 | return torch.mm(a_norm, b_norm.transpose(0, 1)) 60 | 61 | 62 | def cos_sim_torch(embs_a: Tensor, embs_b: Tensor) -> Tensor: 63 | """ 64 | Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j. 65 | Using torch.nn.functional.cosine_similarity 66 | :return: Matrix with res[i][j] = cos_sim(a[i], b[j]) 67 | """ 68 | if not isinstance(embs_a, torch.Tensor): 69 | embs_a = torch.tensor(np.array(embs_a)) 70 | 71 | if not isinstance(embs_b, torch.Tensor): 72 | embs_b = torch.tensor(np.array(embs_b)) 73 | 74 | if len(embs_a.shape) == 1: 75 | embs_a = embs_a.unsqueeze(0) 76 | 77 | if len(embs_b.shape) == 1: 78 | embs_b = embs_b.unsqueeze(0) 79 | A = F.cosine_similarity(embs_a.unsqueeze(1), embs_b.unsqueeze(0), dim=2) 80 | return A 81 | 82 | 83 | def prune_ref_docs(qa_embs, ref_embs, ref_docs, threshold=0.1): 84 | """ 85 | Drops unnecessary documents from the reference embeddings and updates the list of reference documents, 86 | and then recomputes the adjacency matrix. 87 | 88 | Parameters: 89 | qa_embs (numpy array): The embedding matrix of QA pairs. 90 | ref_embs (numpy array): The embedding matrix of reference sentences. 91 | ref_docs (list): The list of reference documents. 92 | threshold (float): The threshold below which documents are considered unnecessary. 93 | 94 | Returns: 95 | pruned_ref_embs (numpy array): The pruned embedding matrix of reference sentences. 96 | pruned_ref_docs (list): The pruned list of reference documents. 97 | pruned_A (numpy array): The pruned adjacency matrix. 
98 | """ 99 | 100 | # Compute the initial adjacency matrix with full reference embeddings 101 | A = gaussian_kernel_torch(qa_embs, ref_embs, sigma=0.5) 102 | print(f"Before: {A.shape}") 103 | # Compute the row-wise sum of the adjacency matrix 104 | row_sum = torch.sum(A, dim=0) 105 | 106 | # Identify the indexes of the relevant documents 107 | relevant_idx = torch.where(row_sum > threshold * row_sum.max())[0] 108 | 109 | # Drop unnecessary rows from the reference embeddings 110 | pruned_ref_embs = ref_embs[relevant_idx] 111 | 112 | # Update the list of reference documents 113 | pruned_ref_docs = [ref_docs[i] for i in relevant_idx] 114 | 115 | # Recompute the adjacency matrix with pruned reference embeddings 116 | pruned_A = gaussian_kernel_torch(qa_embs, pruned_ref_embs, sigma=0.5) 117 | print(f"After: {pruned_A.shape}") 118 | return pruned_ref_embs, pruned_ref_docs, pruned_A 119 | 120 | 121 | def compute_kernel_by_type(embs, threshold=0.65, kernel_type="cosine", sigma=1.0): 122 | # match case to kernel type 123 | if kernel_type == "gaussian": 124 | A = gaussian_kernel_torch(embs, embs, sigma) 125 | if kernel_type == "cosine": 126 | A = cos_sim(embs, embs) 127 | adj_matrix = torch.zeros_like(A) 128 | adj_matrix[A > threshold] = 1 129 | adj_matrix[A <= threshold] = 0 130 | adj_matrix = adj_matrix.numpy().astype(np.float32) 131 | return adj_matrix 132 | 133 | 134 | def compute_kernel_with_activation_fn(embs, activation_fn=F.relu, kernel_type="cosine", sigma=1.0): 135 | # match case to kernel type 136 | if kernel_type == "gaussian": 137 | A = gaussian_kernel_torch(embs, embs, sigma) 138 | if kernel_type == "cosine": 139 | A = cos_sim(embs, embs) 140 | A = activation_fn(A) 141 | A = A.numpy().astype(np.float32) 142 | return A 143 | 144 | 145 | def compute_kernel(embs): 146 | # TODO match case to kernel type 147 | A = cos_sim(embs, embs) 148 | adj_matrix = A.numpy().astype(np.float32) 149 | return adj_matrix 150 | 151 | 152 | def degree_matrix(A): 153 | """ 154 | compute degree matrix using adjacency distance matrix from pairwise distances 155 | :A: nxn size matrix embedding minmaxed using mu sigma and pairwise distances 156 | :return: degree matrix 157 | """ 158 | n = A.shape[0] 159 | D = np.zeros((n, n)) 160 | for i in range(n): 161 | D[i, i] = np.sum(A[i, :]) 162 | return D 163 | 164 | 165 | def graph_laplacian(A): 166 | """ 167 | compute graph laplacian using degree and adjacency matrix from pairwise distances 168 | :A: nxn size matrix embedding minmaxed using mu sigma and pairwise distances 169 | :return: graph laplacian, and degree matrix 170 | """ 171 | D = degree_matrix(A) 172 | L = D - A 173 | return L, D 174 | -------------------------------------------------------------------------------- /src/ml_models/graph_networks/message_passing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse as sp 3 | from tqdm import tqdm 4 | 5 | 6 | def k_hop_message_passing(A, node_features, k): 7 | """ 8 | Compute the k-hop adjacency matrix and aggregated features using message passing. 9 | 10 | Parameters: 11 | A (numpy array): The adjacency matrix of the graph. 12 | node_features (numpy array): The feature matrix of the nodes. 13 | k (int): The number of hops for message passing. 14 | 15 | Returns: 16 | A_k (numpy array): The k-hop adjacency matrix. 17 | agg_features (numpy array): The aggregated feature matrix for each node in the k-hop neighborhood. 
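    Dense formulation of what the loop below computes (X denotes node_features)::

        A_k          = A^k
        agg_features = X + A @ X + A^2 @ X + ... + A^k @ X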
18 | """ 19 | 20 | print("Compute the k-hop adjacency matrix") 21 | A_k = np.linalg.matrix_power(A, k) 22 | 23 | print("Aggregate the messages from the k-hop neighborhood:") 24 | agg_features = node_features.copy() 25 | 26 | for i in tqdm(range(k)): 27 | agg_features += np.matmul(np.linalg.matrix_power(A, i + 1), node_features) 28 | 29 | return A_k, agg_features 30 | 31 | 32 | def k_hop_message_passing_sparse(A, node_features, k): 33 | """ 34 | Compute the k-hop adjacency matrix and aggregated features using message passing. 35 | 36 | Parameters: 37 | A (numpy array or scipy sparse matrix): The adjacency matrix of the graph. 38 | node_features (numpy array or scipy sparse matrix): The feature matrix of the nodes. 39 | k (int): The number of hops for message passing. 40 | 41 | Returns: 42 | A_k (numpy array): The k-hop adjacency matrix. 43 | agg_features (numpy array): The aggregated feature matrix for each node in the k-hop neighborhood. 44 | """ 45 | 46 | # Convert input matrices to sparse matrices if they are not already 47 | if not sp.issparse(A): 48 | A = sp.csr_matrix(A) 49 | if not sp.issparse(node_features): 50 | node_features = sp.csr_matrix(node_features) 51 | 52 | # Compute the k-hop adjacency matrix and the aggregated features 53 | A_k = A.copy() 54 | agg_features = node_features.copy() 55 | 56 | for i in tqdm(range(k)): 57 | # Compute the message passing for the k-hop neighborhood 58 | message = A_k.dot(node_features) 59 | # Apply a GCN layer to aggregate the messages 60 | agg_features = A_k.dot(agg_features) + message 61 | # Update the k-hop adjacency matrix by adding new edges 62 | A_k += A_k.dot(A) 63 | 64 | return A_k.toarray(), agg_features.toarray() 65 | -------------------------------------------------------------------------------- /src/ml_models/topic_modeling/topic_model.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | from typing import Tuple 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from bertopic import BERTopic 9 | from bertopic.representation import MaximalMarginalRelevance, OpenAI, TextGeneration 10 | from bertopic.vectorizers import ClassTfidfTransformer 11 | from sentence_transformers import SentenceTransformer 12 | from sklearn.cluster import AgglomerativeClustering 13 | from sklearn.decomposition import TruncatedSVD 14 | from sklearn.feature_extraction.text import CountVectorizer 15 | from transformers import pipeline 16 | 17 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 18 | lg = logging.getLogger(__name__) 19 | lg.setLevel(logging.DEBUG) 20 | handler = logging.StreamHandler(sys.stdout) 21 | handler.setFormatter(logging.Formatter("[%(asctime)s] [%(levelname)s] - %(message)s")) 22 | lg.addHandler(handler) 23 | 24 | 25 | class TopicModel: 26 | def __init__(self, config): 27 | self.config = config 28 | 29 | def __repr__(self) -> str: 30 | return f"TopicModel(config={self.config})" 31 | 32 | def fit_topic_model(self, topic_model, data, embeddings, key="query"): 33 | topics, probs = topic_model.fit_transform(data[key].to_list(), embeddings) 34 | return topics, probs 35 | 36 | def get_topic_info(self, topic_model): 37 | return topic_model.get_topic_info() 38 | 39 | def reduce_topics(self, topic_model, data, nr_topics, key="query"): 40 | topic_model.reduce_topics(data[key].to_list(), nr_topics) 41 | return topic_model 42 | 43 | def get_representative_docs(self, topic_model): 44 | return topic_model.get_representative_docs() 45 | 46 | def reduce_outliers(self, 
topic_model, data, topics, probs, key="query", strategy="c-tf-idf"): 47 | if strategy == "c-tf-idf": 48 | new_topics = topic_model.reduce_outliers(data[key].to_list(), topics, strategy, threshold=0.1) 49 | elif strategy == "embeddings": 50 | new_topics = topic_model.reduce_outliers(data[key].to_list(), topics, strategy) 51 | elif strategy == "distributions": 52 | new_topics = topic_model.reduce_outliers( 53 | data[key].to_list(), topics, probabilities=probs, strategy=strategy 54 | ) 55 | else: 56 | raise ValueError("Invalid strategy") 57 | topic_model.update_topics( 58 | data[key].to_list(), 59 | topics=new_topics, 60 | representation_model=self.representation_model, 61 | vectorizer_model=self.vectorizer_model, 62 | ctfidf_model=self.ctfidf_model, 63 | ) 64 | return topic_model, new_topics 65 | 66 | def compute_hierarchical_topic_tree(self, topic_model, data, key="query"): 67 | hierarchical_topics = topic_model.hierarchical_topics(data[key].to_list()) 68 | tree = topic_model.get_topic_tree(hierarchical_topics) 69 | return hierarchical_topics, tree 70 | 71 | def run(self, data, gnn_outputs: Tuple[np.ndarray, np.ndarray], key="query") -> dict: 72 | """ 73 | Run the topic modeling. 74 | """ 75 | # Prepare the data 76 | if self.config["auto_cluster"]: 77 | if self.config["representation_model"].split("(")[0] == "pipeline": 78 | generator = self.config["representation_model"] 79 | representation_model = [MaximalMarginalRelevance(diversity=0.3), TextGeneration(eval(generator))] 80 | elif self.config["representation_model"].split("(")[0] == "OpenAI": 81 | representation_model = [ 82 | MaximalMarginalRelevance(diversity=0.3), 83 | eval(self.config["representation_model"]), 84 | ] 85 | else: 86 | lg.info("Using MaximalMarginalRelevance as representation model") 87 | representation_model = MaximalMarginalRelevance(diversity=0.3) 88 | 89 | topic_model = BERTopic( 90 | nr_topics="auto", 91 | vectorizer_model=eval(self.config["vectorizer_model"]), 92 | embedding_model=self.config["embedding_model"], 93 | representation_model=representation_model, 94 | n_gram_range=(1, 2), 95 | min_topic_size=10, 96 | top_n_words=10, 97 | calculate_probabilities=False, 98 | ) 99 | else: 100 | # Fit the BERTopic model 101 | topic_model = BERTopic( 102 | nr_topics="auto", 103 | vectorizer_model=eval(self.config["vectorizer_model"]), 104 | umap_model=eval(self.config["dimensionality_reduction"]), 105 | hdbscan_model=eval(self.config["clustering_model"]), 106 | embedding_model=eval(self.config["embedding_model"]), 107 | representation_model=eval(self.config["representation_model"]), 108 | n_gram_range=(1, 2), 109 | min_topic_size=10, 110 | top_n_words=10, 111 | calculate_probabilities=False, 112 | ) 113 | topics, probabilities = topic_model.fit_transform(data[key].to_list(), gnn_outputs[1]) 114 | freq = self.get_topic_info(topic_model) 115 | rep_docs = self.get_representative_docs(topic_model) 116 | hr, tree = self.compute_hierarchical_topic_tree(topic_model=topic_model, data=data) 117 | # add dict with topic info 118 | docs = [] 119 | for k, v in rep_docs.items(): 120 | docs.append((k, v)) 121 | rep = pd.DataFrame(docs, columns=["Topic", "Rep_docs"]) 122 | topic_info_freq = pd.merge(freq, rep, on="Topic") 123 | data["topic"] = topics 124 | data["prob"] = probabilities 125 | return {"data": data, "topic_info": topic_info_freq, "tree": tree} 126 | -------------------------------------------------------------------------------- /src/ml_models/transformers/category_decomposition.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | from getpass import getpass 3 | 4 | import openai 5 | 6 | openai_secret = getpass("Enter the secret key: ") 7 | # Set up OpenAI API credentials 8 | openai.api_key = openai_secret 9 | 10 | 11 | def chat_gpt_inference(messages: list): 12 | response = openai.ChatCompletion.create( 13 | model="gpt-3.5-turbo", 14 | messages=messages, 15 | max_tokens=1500, 16 | temperature=0.2, 17 | ) 18 | return response 19 | 20 | 21 | def create_prompt_message_template(text, role="user"): 22 | if role not in ["user", "assistant"]: 23 | raise ValueError("Not a valid role. Please use 'user' or 'assistant'.") 24 | return {"role": role, "content": text} 25 | 26 | 27 | def compose_inference(text_block, messages): 28 | user_template = create_prompt_message_template(text_block, role="user") 29 | messages.append(user_template) 30 | chat_resp = chat_gpt_inference(messages) 31 | reply_text = chat_resp["choices"][0]["message"]["content"] 32 | assistant_template = create_prompt_message_template(reply_text, role="assistant") 33 | messages.append(assistant_template) 34 | return messages, reply_text 35 | 36 | 37 | def category_generation(topic: str, topic_reasoning_chain: list): 38 | """Decompose a category into its constituent parts. 39 | 40 | Args: 41 | category (str): The category to decompose. 42 | 43 | Returns: 44 | list: The list of constituent parts of the category. 45 | """ 46 | # Define the prompt 47 | system_prompt = """Please design a categorical framework that captures the essential qualities and thought processes of a genius category theorist. Begin by identifying the key objects and morphisms that represent the foundational elements of a genius category theorist's intellectual toolbox. Then, outline the relationships and transformations between these objects and morphisms, highlighting the critical reasoning, creativity, and innovative problem-solving strategies employed by such a theorist. Finally, explain how this category can serve as a foundation for further exploration and development of novel ideas within the realm of category theory.""" 48 | 49 | messages = [{"role": "system", "content": system_prompt}] 50 | 51 | with open(topic_reasoning_chain, "r") as f: 52 | segments = f.readlines() 53 | 54 | with open("ct_output_file.jsonl", "a") as f: 55 | for i, line in enumerate(segments): 56 | # print(line) 57 | topic = eval(line)["label"] 58 | print(topic) 59 | prompt = f"""Please provide a description of {topic} within a categorical framework, 60 | detailing the objects and morphisms that represent its essential components. 
Offer a brief summary of these objects and 61 | morphisms as they pertain to the category specific to {topic}.""" 62 | try: 63 | messages, reply_text = compose_inference(prompt, messages) 64 | 65 | except Exception: 66 | messages, reply_text = compose_inference(prompt, [{"role": "system", "content": system_prompt}]) 67 | print(reply_text) 68 | print() 69 | row = { 70 | "topic": topic, 71 | "assistant_reply": reply_text, 72 | } 73 | json.dump(row, f) 74 | f.write("\n") 75 | -------------------------------------------------------------------------------- /src/ml_models/transformers/semantic_graph_context_generator.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import openai 4 | 5 | from src.ast_parsers.python_ast_parser import get_methods, parse_github_repo 6 | from src.config import Config 7 | from src.system_prompts.format_system_prompts import ( 8 | extract_questions, 9 | format_system_prompts, 10 | format_system_prompts_with_tree, 11 | ) 12 | 13 | 14 | class SemanticGraphContextGenerator: 15 | def __init__(self, config: Config): 16 | self.config = config 17 | 18 | def __repr__(self) -> str: 19 | return f"SemanticGraphContextGenerator(config={self.config})" 20 | 21 | def get_repo_contents(self, git_repo_path): 22 | contents = parse_github_repo(git_repo_path) 23 | print(len(contents)) 24 | pruned_contents = [] 25 | for cont in contents: 26 | fp = cont["file_name"] 27 | fn = fp.split("/")[-1] 28 | fn_ = fn.split(".")[0] 29 | if fn_ in ["__init__"] or fn_.split("_")[-1] in ["test"]: 30 | continue 31 | else: 32 | pruned_contents.append(cont) 33 | return pruned_contents 34 | 35 | def chat_gpt_inference(self, messages: list): 36 | response = openai.ChatCompletion.create( 37 | model="gpt-3.5-turbo", 38 | messages=messages, 39 | max_tokens=self.config["max_tokens"], 40 | temperature=self.config["temperature"], 41 | ) 42 | return response 43 | 44 | def create_prompt_message_template(self, text, role="user"): 45 | if role not in ["user", "assistant"]: 46 | raise ValueError("Not a valid role. 
Please use 'user' or 'assistant'.") 47 | return {"role": role, "content": text} 48 | 49 | def compose_inference(self, text_block, messages): 50 | user_template = self.create_prompt_message_template(text_block, role="user") 51 | messages.append(user_template) 52 | chat_resp = self.chat_gpt_inference(messages) 53 | reply_text = chat_resp["choices"][0]["message"]["content"] 54 | assistant_template = self.create_prompt_message_template(reply_text, role="assistant") 55 | messages.append(assistant_template) 56 | return messages, reply_text 57 | 58 | def process_transcript(self, segments, file_name, git_repo_path, output_file_path, system_prompt, task, code_type): 59 | messages = [{"role": "system", "content": system_prompt}] 60 | 61 | with open(output_file_path, "a") as f: 62 | for i, sent in enumerate(segments): 63 | 64 | text_block = f"""```{sent}```""" 65 | try: 66 | messages, reply_text = self.compose_inference(text_block[:2000], messages) 67 | 68 | except Exception: 69 | messages, reply_text = self.compose_inference( 70 | text_block[:2000], [{"role": "system", "content": system_prompt}] 71 | ) 72 | if task == "self-instruction": 73 | questions = extract_questions(reply_text) 74 | if len(questions) > 0: 75 | for q in questions: 76 | sub_text_block = f"""{q}```{sent}```""" 77 | print(sub_text_block) 78 | print() 79 | messages, reply_text = self.compose_inference(sub_text_block[:2000], messages) 80 | print(reply_text) 81 | row = { 82 | "git_repo_path": git_repo_path, 83 | "file_name": file_name, 84 | "code_type": code_type, 85 | "system_task": task, 86 | "system_prompt": system_prompt, 87 | "conversation_history": messages, 88 | "assistant_reply": reply_text, 89 | } 90 | json.dump(row, f) 91 | f.write("\n") 92 | else: 93 | row = { 94 | "git_repo_path": git_repo_path, 95 | "file_name": file_name, 96 | "code_type": code_type, 97 | "system_task": task, 98 | "system_prompt": system_prompt, 99 | "conversation_history": messages, 100 | "assistant_reply": reply_text, 101 | } 102 | json.dump(row, f) 103 | f.write("\n") 104 | 105 | return messages 106 | 107 | def decompose_repo(self, git_repo_path, name_id, out_path, skip_graph_generation=False): 108 | contents = self.get_repo_contents(git_repo_path) 109 | context_paths = [] 110 | for cont in contents: 111 | 112 | if not self.config["with_tree"]: 113 | system_prompts = format_system_prompts(git_repo_path, cont["file_name"]) 114 | else: 115 | system_prompts = format_system_prompts_with_tree( 116 | git_repo_path, cont["file_name"], self.config["topic_tree"] 117 | ) 118 | for k, v in zip(system_prompts.keys(), system_prompts.values()): 119 | 120 | func_task = k 121 | out_file_name = f"{name_id}_{func_task}" 122 | print(f"file_name: {cont['file_name']}") 123 | num_funcs = len(cont["functions"]) 124 | num_classes = len(cont["classes"]) 125 | print(f"Imports: {cont['imports']}") 126 | context_paths.append(f"{out_path}/{out_file_name}.jsonl") 127 | if skip_graph_generation: 128 | continue 129 | try: 130 | if num_funcs > 0 or num_classes > 0: 131 | print(f" len of functions: {len(cont['functions'])}") 132 | _ = self.process_transcript( 133 | cont["functions"], 134 | cont["file_name"], 135 | git_repo_path, 136 | f"{out_path}/{out_file_name}.jsonl", 137 | system_prompts[func_task], 138 | func_task, 139 | "functions", 140 | ) 141 | _ = self.process_transcript( 142 | cont["classes"], 143 | cont["file_name"], 144 | git_repo_path, 145 | f"{out_path}/{out_file_name}.jsonl", 146 | system_prompts[func_task], 147 | func_task, 148 | "classes", 149 | ) 150 | for cls in 
cont["classes"]: 151 | cls_funcs = get_methods(cls) 152 | 153 | print(f"len of class: {len(cls)}") 154 | for method in cls_funcs: 155 | print(f"len of method: {len(method)}") 156 | _ = self.process_transcript( 157 | cls_funcs, 158 | cont["file_name"], 159 | git_repo_path, 160 | f"{out_path}/{out_file_name}.jsonl", 161 | system_prompts[func_task], 162 | func_task, 163 | "methods", 164 | ) 165 | except Exception as e: 166 | print(e) 167 | continue 168 | print("\n\n") 169 | return set(context_paths) 170 | -------------------------------------------------------------------------------- /src/notebooks/inference_me.md: -------------------------------------------------------------------------------- 1 | ```python 2 | from ast_parsers.python_ast_parser import parse_python_file, get_methods, parse_github_repo 3 | from system_prompts.format_system_prompts import format_system_prompts, format_system_prompts_with_tree 4 | from ..ml_models.transformers.chatgpt_api_inference import process_transcript 5 | from ..repo_graph_generation import decompose_repo 6 | from ..ml_models.graph_networks.kernels import compute_kernel_by_type, graph_laplacian, compute_kernel 7 | from ..ml_models.graph_networks.message_passing import k_hop_message_passing_sparse 8 | from ..ml_models.transformers.sentence_embeddings import embed_data 9 | from ..ml_models.topic_modeling.umap_hdbscan_pipeline import ( 10 | load_topic_model, 11 | get_topic_model, 12 | get_representative_docs, 13 | reduce_outliers, 14 | fit_topic_model, 15 | compute_hierarchical_topic_tree, 16 | get_topic_info, 17 | ) 18 | 19 | 20 | from getpass import getpass 21 | 22 | openai_secret = getpass("Enter the secret value: ") 23 | # Set up OpenAI API credentials 24 | openai.api_key = openai_secret 25 | 26 | 27 | !git clone https://github.com/danielpatrickhug/GitModel.git 28 | ``` 29 | 30 | TODO. 
Standardize this section. 31 | 32 | ```python 33 | git_repo_path = "/content/GitModel" 34 | out_path = "/content/gitmodel_sum" 35 | name_id = "gitmodel" 36 | contents = parse_github_repo(git_repo_path) 37 | print(len(contents)) 38 | pruned_contents = [] 39 | for cont in contents: 40 | fp = cont["file_name"] 41 | fn = fp.split("/")[-1] 42 | fn_ = fn.split(".")[0] 43 | if fn_ in ["__init__"] or fn_.split("_")[-1] in ["test"]: 44 | continue 45 | else: 46 | print(cont["file_name"]) 47 | pruned_contents.append(cont) 48 | 49 | decompose_repo(git_repo_path, name_id, out_path) # assumed arguments, mirroring SemanticGraphContextGenerator.decompose_repo 50 | ``` 51 | 52 | ```python 53 | def load_jsonl(filepaths): 54 | data = [] 55 | for filepath in filepaths: 56 | with open(filepath, "r") as f: 57 | for line in f: 58 | data.append(json.loads(line)) 59 | return data 60 | 61 | 62 | root_dir = "/content/gitmodel_sum" 63 | repo = "gitmodel" 64 | repo_files = [ 65 | f"{root_dir}/{repo}_summary.jsonl", 66 | f"{root_dir}/{repo}_question_asking.jsonl", 67 | ] 68 | 69 | res = load_jsonl(repo_files) 70 | sents = [] 71 | for r in res: 72 | messages = r["conversation_history"] 73 | reply = r["assistant_reply"] 74 | sents.append(reply) 75 | sents.append(messages[-2]["content"]) 76 | ``` 77 | 78 | ```python 79 | data = pd.DataFrame(sents, columns=["query"]) 80 | data["_id"] = data.index 81 | ``` 82 | 83 | ```python 84 | MODEL_NAME = "all-MiniLM-L6-v2" # alternative: "allenai-specter" 85 | embs = embed_data(data, model_name=MODEL_NAME) 86 | A = compute_kernel_by_type(embs, threshold=0.6, kernel_type="cosine") 87 | k = 2 88 | A_k, agg_features = k_hop_message_passing_sparse(A, embs, k) 89 | ``` 90 | 91 | Graph Laplacian 92 | 93 | ```python 94 | L, D = graph_laplacian(A) 95 | L_k, D_k = graph_laplacian(A_k) 96 | ``` 97 | 98 | SVD of the kernel matrix A and its k-hop counterpart A_k 99 | 100 | ```python 101 | U, S, VT = np.linalg.svd(A) 102 | print(f"U: {U.shape}\n") 103 | print(f"S: {S.shape}\n") 104 | print(f"VT: {VT.shape}\n") 105 | plt.plot(S) 106 | plt.xlabel("Singular value index") 107 | plt.ylabel("Singular value") 108 | plt.title("Singular values of A") 109 | plt.show() 110 | ``` 111 | 112 | ```python 113 | U_k, S_k, VT_k = np.linalg.svd(A_k) 114 | print(f"U_{k}: {U_k.shape}\n") 115 | print(f"S_{k}: {S_k.shape}\n") 116 | print(f"VT_{k}: {VT_k.shape}\n") 117 | plt.plot(S_k) 118 | plt.xlabel("Singular value index") 119 | plt.ylabel("Singular value") 120 | plt.title("Singular values of A_k") 121 | plt.show() 122 | ``` 123 | 124 | ```python 125 | topic_model = load_topic_model(nr_topics="auto") 126 | topics, probs = fit_topic_model(topic_model, data, agg_features) 127 | freq = get_topic_info(topic_model) 128 | rep_docs = get_representative_docs(topic_model) # assumes the helper takes the fitted topic model 129 | hr, tree = compute_hierarchical_topic_tree(topic_model=topic_model, data=data) 130 | ``` 131 | -------------------------------------------------------------------------------- /src/pipeline.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import random 5 | import sys 6 | from typing import Tuple 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | from src.config import Config 12 | from src.dynamic_import import instantiate_class_from_config 13 | from src.fetch_repo import clone_and_create_context_folder 14 | from src.ml_models.graph_networks.gnn_head import combine_graphs_with_gat, linearly_sum_gnn_heads 15 | 16 | lg = logging.getLogger(__name__) 17 | lg.setLevel(logging.DEBUG) 18 | handler = logging.StreamHandler(sys.stdout) 19 | handler.setFormatter(logging.Formatter("[%(asctime)s] [%(levelname)s] - %(message)s")) 20 |
lg.addHandler(handler) 21 | 22 | 23 | class Pipeline: 24 | def __init__(self, config: Config): 25 | self.config = config 26 | self._obj_map = instantiate_class_from_config(config) 27 | self.pipeline_settings = self._obj_map["pipeline_settings"][0] 28 | self.semantic_graph_context_generator = [ 29 | self._obj_map["semantic_graph_context_generator"][i] 30 | for i in range(len(self._obj_map["semantic_graph_context_generator"])) 31 | ] 32 | self.gnn_heads = [self._obj_map["gnn_heads"][i] for i in range(len(self._obj_map["gnn_heads"]))] 33 | self.topic_model = self._obj_map["topic_model"][0] 34 | 35 | @classmethod 36 | def from_config(cls, config: Config): 37 | return cls(config) 38 | 39 | @classmethod 40 | def from_yaml(cls, yaml_path): 41 | config = Config.from_yaml(yaml_path) 42 | return cls(config) 43 | 44 | def find_files_with_substring(self, root_dir, substring): 45 | for dirpath, dirnames, filenames in os.walk(root_dir): 46 | for filename in filenames: 47 | if substring in filename: 48 | yield os.path.join(dirpath, filename) 49 | 50 | def load_jsonl(self, filepaths): 51 | res = [] 52 | for filepath in filepaths: 53 | with open(filepath, "r") as f: 54 | for line in f: 55 | res.append(json.loads(line)) 56 | sents = [] 57 | for r in res: 58 | messages = r["conversation_history"] 59 | reply = r["assistant_reply"] 60 | sents.append(reply) 61 | sents.append(messages[-2]["content"]) 62 | data = pd.DataFrame(sents, columns=["query"]) 63 | data["_id"] = data.index 64 | return data 65 | 66 | def run(self, git_repo: str, repo_name: str) -> Tuple[Tuple[np.ndarray, np.ndarray], dict]: 67 | """ 68 | Run the pipeline.""" 69 | # clone the repo, generate semantic context, build and combine graph heads, then fit the topic model 70 | lg.info("Running pipeline...") 71 | lg.info("Fetching repo...") 72 | repo_folder, context_folder = clone_and_create_context_folder(git_repo, repo_name) 73 | lg.info("Generating semantic graph context...") 74 | 75 | semantic_graph_context = [ 76 | context_generator.decompose_repo( 77 | repo_folder, 78 | repo_name, 79 | context_folder, 80 | skip_graph_generation=self.pipeline_settings.config["skip_graph_creation"], 81 | ) 82 | for context_generator in self.semantic_graph_context_generator 83 | ] 84 | # decompose_repo writes the per-file context .jsonl files into context_folder; 85 | # collect them from disk so every generated file is picked up. 86 | lg.info(semantic_graph_context) 87 | context_files = list(self.find_files_with_substring(context_folder, repo_name)) 88 | lg.info(context_files) 89 | data = self.load_jsonl(context_files) 90 | 91 | lg.info("Running GNN heads...") 92 | gnn_head_outputs = [gnn_head.generate_graph(data) for gnn_head in self.gnn_heads] 93 | lg.info("Combining GNN heads...") 94 | lg.info(self.pipeline_settings) 95 | 96 | if self.pipeline_settings.config["combine_gnn_strategy"] == "sum": 97 | combined_gnn_head = linearly_sum_gnn_heads(gnn_head_outputs, self.pipeline_settings.config["norm_fn"]) 98 | elif self.pipeline_settings.config["combine_gnn_strategy"] == "gat": 99 | combined_gnn_head = combine_graphs_with_gat(gnn_head_outputs) 100 | elif self.pipeline_settings.config["combine_gnn_strategy"] == "none": 101 | combined_gnn_head = gnn_head_outputs[0] 102 | elif self.pipeline_settings.config["combine_gnn_strategy"] == "random": 103 | # Choose a random GNN head as the final output 104 | combined_gnn_head = random.choice(gnn_head_outputs) 105 | else: 106 | raise ValueError(f"Unknown combine_gnn_strategy setting: {self.pipeline_settings.config['combine_gnn_strategy']}") 107 | 108 | lg.info("Running topic model...") 109 | 110 | topic_model_outputs = self.topic_model.run(data, combined_gnn_head) 111 | 112 | return gnn_head_outputs,
[topic_model_outputs] 113 | 114 | def get_repo_contents(self, git_repo_path): 115 | contents = self.semantic_graph_context_generator[0].get_repo_contents(git_repo_path) # first configured generator 116 | return contents 117 | 118 | def decompose_repo(self, git_repo_path, name_id, out_path, skip_graph_generation=False): 119 | return self.semantic_graph_context_generator[0].decompose_repo(git_repo_path, name_id, out_path, skip_graph_generation) 120 | -------------------------------------------------------------------------------- /src/system_prompts/format_system_prompts.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def format_system_prompts(git_repo_path, file_name): 5 | system_prompts = { 6 | "self-instruction": f""" 7 | GitHub repository: {git_repo_path}; you're currently in the file {file_name}. 8 | Generate a list of irreducible questions related to a given piece of code that can help understand its functionality and behavior. Ensure that the questions are consistently formatted and easy to parse. The irreducible questions should be answerable with the provided piece of code and cannot be further simplified. Consider the following aspects when generating the questions: 9 | 10 | What is the purpose of the code? 11 | What input does the code take, and how does it process the input? 12 | What output does the code produce? 13 | Use these aspects to generate a set of irreducible questions that can help anyone understand the code and its behavior. 14 | """, 15 | } 16 | 17 | return system_prompts 18 | 19 | 20 | def extract_questions(prompt_output): 21 | # Define a regular expression to match the questions 22 | question_regex = r"\d+\.\s(.+\?)\n" 23 | 24 | # Find all matches of the question regex in the prompt output 25 | questions = re.findall(question_regex, prompt_output) 26 | return questions 27 | 28 | 29 | def format_system_prompts_with_tree(git_repo_path, file_name, topic_tree): 30 | system_prompts = { 31 | "summary2": f""" 32 | Provide a concise overview of the codebase located at {git_repo_path} and the file {file_name}, including its main features and capabilities. 33 | ChatGPT will use its natural language processing capabilities to analyze the codebase and generate a brief summary of its main functionalities, tools, and technologies used. Here is the repo's hierarchical topic tree 34 | {topic_tree} You can use it to better understand the global structure of the repository 35 | """, 36 | "summary3": f""" 37 | Can you summarize the main functionality of the codebase located at {git_repo_path} and the file {file_name}, as well as any notable libraries or frameworks used? 38 | ChatGPT will use its natural language processing capabilities to analyze the codebase and provide a brief summary of its main functionalities, libraries, and frameworks used. Here is the repo's hierarchical topic tree 39 | {topic_tree} You can use it to better understand the global structure of the repository 40 | """, 41 | "summary": f""" 42 | Summarize the code in the GitHub repository: {git_repo_path}; you're currently in the file {file_name}. 43 | I want you to act as a code summarizer; you will use advanced natural language processing capabilities to analyze the code and generate a concise summary that captures 44 | the main functionality and purpose of the codebase. Additionally, ChatGPT can provide insights into the programming languages and libraries used in the repository, 45 | as well as any notable features or functionalities that are present.
Here is a topic tree of the repository 46 | {topic_tree} You can use it to better understand the global structure of the repository 47 | """, 48 | "qa_chain": f""" 49 | GitHub repository: {git_repo_path}; you're currently in the file {file_name}. 50 | I want you to ask questions that a new developer may ask about the codebase in the repository, 51 | as well as answer each question with step-by-step reasoning, as a senior dev would. All responses should first ask a question and then answer with reasoning. 52 | Here is a topic tree of the repository 53 | {topic_tree} You can use it to better understand the global structure of the repository 54 | """, 55 | } 56 | return system_prompts 57 | -------------------------------------------------------------------------------- /test_config.yaml: -------------------------------------------------------------------------------- 1 | pipeline_settings: 2 | - __impl__: PipelineSettings 3 | combine_gnn_strategy: none 4 | norm_fn: none 5 | skip_graph_creation: False 6 | 7 | semantic_graph_context_generator: 8 | - __impl__: SemanticGraphContextGenerator 9 | generator_id: self-instruction 10 | model_id: gpt-3.5-turbo 11 | temperature: 0 12 | max_tokens: 500 13 | with_tree: False 14 | 15 | gnn_heads: 16 | - __impl__: GNNHead 17 | generator_id: message_passing 18 | k_hop: 2 19 | is_sparse: True 20 | embbedding_id: sbert 21 | embbedding_model: all-MiniLM-L6-v2 22 | kernel_id: cosine_similarity 23 | graph_id: adjacency 24 | connection_threshold: 0.7 25 | 26 | topic_model: 27 | - __impl__: TopicModel 28 | topic_model_id: topic_model 29 | embedding_model: all-MiniLM-L6-v2 30 | clustering_model: 31 | HDBSCAN(min_cluster_size=15, metric='euclidean', 32 | cluster_selection_method='eom', prediction_data=True) 33 | dimensionality_reduction: 34 | umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, 35 | metric='cosine') 36 | vectorizer_model: CountVectorizer(stop_words="english") 37 | representation_model: 38 | OpenAI(model="gpt-3.5-turbo", delay_in_seconds=3, chat=True) 39 | hierarchical_linkage_fn: sch.linkage(x, 'single', optimal_ordering=True) 40 | with_tree: True 41 | auto_cluster: True 42 | --------------------------------------------------------------------------------
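Taken together, `test_config.yaml` and `src/pipeline.py` describe one end-to-end flow: the config instantiates a single `SemanticGraphContextGenerator`, `GNNHead`, and `TopicModel`, and `Pipeline.from_yaml(...)` / `Pipeline.run(...)` wire them together. The snippet below is a minimal, hypothetical driver sketching that usage; it is not the repo's actual `main.py` (whose contents are not shown here), and it assumes the package is importable as `src`, that an `OPENAI_API_KEY` environment variable is set, and that the repo URL and name are simply the example values from `src/notebooks/inference_me.md`.

```python
import os

import openai

from src.pipeline import Pipeline

# SemanticGraphContextGenerator calls openai.ChatCompletion.create, which reads the
# module-level key, so set it before running the pipeline (assumes OPENAI_API_KEY
# is exported in the environment).
openai.api_key = os.environ["OPENAI_API_KEY"]

# Build the pipeline from the checked-in test configuration and run it end to end:
# clone the repo, generate per-file context, run the GNN head, fit the topic model.
pipeline = Pipeline.from_yaml("test_config.yaml")
gnn_head_outputs, topic_model_outputs = pipeline.run(
    git_repo="https://github.com/danielpatrickhug/GitModel.git",
    repo_name="gitmodel",
)
```

With `combine_gnn_strategy: none` in `test_config.yaml`, the single GNN head's output is passed to the topic model unchanged, so this configuration exercises the simplest path through `Pipeline.run`.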