├── .gitattributes
├── .gitignore
├── .pre-commit-config.yaml
├── .python-version
├── LICENSE
├── README.md
├── __main__.py
├── git-frontend
│   ├── .gitignore
│   ├── README.md
│   ├── package-lock.json
│   ├── package.json
│   ├── public
│   │   ├── favicon.ico
│   │   ├── index.html
│   │   ├── logo192.png
│   │   ├── logo512.png
│   │   ├── manifest.json
│   │   └── robots.txt
│   └── src
│       ├── App.css
│       ├── App.js
│       ├── App.test.js
│       ├── index.css
│       ├── index.js
│       ├── logo.svg
│       ├── reportWebVitals.js
│       └── setupTests.js
├── images
│   └── self_portrait.jpg
├── main.py
├── measure_process_template_diffusion.ipynb
├── papers
│   ├── 2203.15544.pdf
│   └── README.md
├── pyproject.toml
├── random.ipynb
├── requirements.txt
├── setup.cfg
├── src
│   ├── __init__.py
│   ├── alembic.ini
│   ├── alembic
│   │   └── env.py
│   ├── ast_parsers
│   │   └── python_ast_parser.py
│   ├── backend
│   │   ├── config.py
│   │   ├── database.py
│   │   └── memory_tree_manager.py
│   ├── community_detection
│   │   └── extract_connected_components.py
│   ├── config.py
│   ├── configs
│   │   ├── config1.yaml
│   │   └── config2.yaml
│   ├── dynamic_import.py
│   ├── fetch_repo.py
│   ├── ml_models
│   │   ├── __init__.py
│   │   ├── graph_networks
│   │   │   ├── gnn_head.py
│   │   │   ├── kernels.py
│   │   │   └── message_passing.py
│   │   ├── topic_modeling
│   │   │   └── topic_model.py
│   │   └── transformers
│   │       ├── category_decomposition.py
│   │       └── semantic_graph_context_generator.py
│   ├── notebooks
│   │   ├── bottom_up_vs_top_down.ipynb
│   │   ├── community_structure.ipynb
│   │   ├── github_repo_parser.ipynb
│   │   └── inference_me.md
│   ├── pipeline.py
│   └── system_prompts
│       └── format_system_prompts.py
└── test_config.yaml
/.gitattributes:
--------------------------------------------------------------------------------
1 | * text=auto eol=lf
2 | *.ipynb linguist-documentation
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .venv*
2 | venv
3 | .env
4 | *.pyc
5 | *.swp
6 | *.egg-info
7 | __pycache__
8 | .DS_Store
9 | work
10 | context
11 |
12 | # ignore jupyter notebook checkpoints
13 | .ipynb_checkpoints
14 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # WARNING!
2 | #
3 | # When making changes to auto-formatters used in pre-commit hooks, you are
4 | # likely to cause merge conflicts with main and/or other pull requests.
5 | # Fixing them might revert other people's work. Expect pain!
6 | # To avoid accidental reversions and keep it easy to review, please make sure
7 | # that changes here are in a pull request by themselves, that it consists of
8 | # two commits:
9 | #
10 | # 1. The changes to this file
11 | # 2. Changes made by running `python3 -m pre_commit run --all-files`.
12 | #
13 | # Then each time your pull request is blocked by a merge conflict, do the
14 | # following steps:
15 | #
16 | # git reset HEAD^1 && git checkout -f # discard the change commit
17 | # git rebase main # re-apply other people's changes
18 | # python3 -m pre_commit run --all-files # re-run the rules
19 | # git add . # add the newly changed files
20 | # git commit -m 'apply pre-commit' # commit it
21 | # git push -f # force push back to your branch
22 | #
23 | # Keep in mind you may have to do this a few times, as changes here may impact
24 | # other pull requests. Try to keep it up-to-date so they can go in when it'll
25 | # cause least disruption.
26 | #
27 | # /WARNING!
28 |
29 | exclude: build|stubs|^bot/templates/$|openassistant/templates|docs/docs/api/openapi.json
30 |
31 | repos:
32 | - repo: https://github.com/pre-commit/pre-commit-hooks
33 | rev: v4.4.0
34 | hooks:
35 | - id: trailing-whitespace
36 | - id: check-ast
37 | - id: check-yaml
38 | # Always check YAML but skip a few YAML files that are auto-generated
39 | # and which break the standard YAML check. The alternative would be to
40 | # skip any unsafe errors (and thus break YAML compatibility) or use
41 | # some other checker that may not work in general.
42 | exclude: ^copilot/.*/addons/.*$
43 | - id: check-json
44 | - id: check-case-conflict
45 | - id: detect-private-key
46 | - id: fix-encoding-pragma
47 | args: [--remove]
48 | - id: forbid-submodules
49 | - id: mixed-line-ending
50 | - id: requirements-txt-fixer
51 | - id: check-executables-have-shebangs
52 | - id: check-shebang-scripts-are-executable
53 | - id: check-byte-order-marker
54 | - id: check-symlinks
55 | - id: check-merge-conflict
56 | - id: check-added-large-files
57 | args: [--maxkb=1024]
58 | - id: end-of-file-fixer
59 |
60 | - repo: https://github.com/psf/black
61 | rev: 22.12.0
62 | hooks:
63 | - id: black-jupyter
64 |
65 | - repo: https://github.com/pycqa/flake8
66 | rev: 6.0.0
67 | hooks:
68 | - id: flake8
69 | exclude: "^src/(ml_models/topic_modeling/topic_model|__init__)\\.py$"
70 |
71 | - repo: https://github.com/pycqa/isort
72 | rev: 5.12.0
73 | hooks:
74 | - id: isort
75 |
76 | - repo: https://github.com/pre-commit/mirrors-prettier
77 | rev: v2.7.1
78 | hooks:
79 | - id: prettier
80 | args:
81 | [
82 | --prose-wrap=always,
83 | --write,
84 | --ignore-path=./website/.prettierignore,
85 | ]
86 |
87 | - repo: local
88 | hooks:
89 | - id: next-lint-website
90 | name: Lint website
91 | files: ^website/
92 | exclude: ^website/node_modules/
93 | types_or: [javascript, jsx, ts, tsx]
94 | language: node
95 | pass_filenames: false
96 | entry: website/next-lint.js
97 |
--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
1 | 3.10.8
2 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # GitModel (Graph Intelligence Tensor)
2 |
3 | 
4 |
5 | GitModel is a pipeline that generates high-quality topic models of GitHub repos using GPT, GNN
6 | message-passing layers, and BERTopic.
7 |
8 | ⚠️ Warning: Not under active development. A more expansive system is being developed at https://github.com/Neural-Dragon-AI (currently private).
9 | A preview from the org, including the openbugger and babydragon packages, can be seen here: https://www.youtube.com/live/BbUOoGfpjvQ?feature=share&t=18645
10 | - Generate documentation for Python repos and create enriched subgraphs for your
11 |   codebases.
12 | - Swap system-prompt tasks (bug hunting, TODO hunting, documentation labeling, etc.) to
13 |   enrich the semantic graph and build datasets.
14 | - The generated data is saved to the `context` folder.
15 | - System prompts live in `src/system_prompts/format_system_prompts.py`; the "with tree" variant works but
16 |   currently requires changing one line of code by hand. Will be fixed soon.
17 | - Pre-commit hooks for isort, prettier, private-key detection, black, flake8, and more.
18 | - GNN message passing and topic-modeling pipeline as an inductive bias (GRNN); see the sketch below.
19 | - BERTopic is highly customizable and can compose different clustering,
20 |   embedding, vectorizer, bag-of-words, and dimensionality-reduction techniques.
21 |
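At a high level the pipeline embeds per-function/per-file summaries, links them in a similarity graph, smooths the embeddings with a few rounds of message passing, and hands the result to BERTopic. The sketch below only illustrates that flow; it is not the repo's implementation (which lives under `src/ml_models/`). It assumes sentence-transformers, NumPy, and BERTopic are installed, `docs` is a hypothetical list of code summaries, and the MiniLM model name and 0.7 threshold are placeholder choices.

```python
# Illustrative sketch only -- not GitModel's actual code (see src/ml_models/).
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

docs = ["..."]  # hypothetical: one summary string per function/file

# 1. Embed the summaries.
encoder = SentenceTransformer("all-MiniLM-L6-v2")
emb = encoder.encode(docs, normalize_embeddings=True)

# 2. Thresholded cosine-similarity adjacency matrix over the embeddings.
sim = emb @ emb.T
adj = np.where(sim > 0.7, sim, 0.0)

# 3. k-hop message passing: average each node with its neighbours, k times.
def k_hop_message_passing(adj: np.ndarray, feats: np.ndarray, k: int = 2) -> np.ndarray:
    norm_adj = adj / np.clip(adj.sum(axis=1, keepdims=True), 1e-9, None)
    for _ in range(k):
        feats = norm_adj @ feats
    return feats

smoothed = k_hop_message_passing(adj, emb)

# 4. Topic model the smoothed embeddings with BERTopic.
topic_model = BERTopic(min_topic_size=5)
topics, probs = topic_model.fit_transform(docs, embeddings=smoothed)

# 5. Hierarchical topic tree (the example trees below are printed in this format).
hierarchy = topic_model.hierarchical_topics(docs)
print(topic_model.get_topic_tree(hierarchy))
```

BERTopic's components (embedding model, UMAP, HDBSCAN, vectorizer, representation model) can each be swapped out, which is what the composability bullet above refers to.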
22 |
23 |
24 |
25 | - https://github.com/huggingface/diffusers/blob/main/CONTRIBUTING.md
26 |
27 | ## How to Use
28 |
29 | ```bash
30 | python3.10 -m venv venv && source venv/bin/activate && pip install -U pip setuptools wheel
31 | pip install -r requirements.txt
32 | python __main__.py
33 | ```
34 |
35 | **main.py**
36 |
37 | ```python
38 | import argparse
39 | from getpass import getpass
40 |
41 | import openai
42 |
43 | from src import Pipeline
44 |
45 | if __name__ == "__main__":
46 |     argsparse = argparse.ArgumentParser()
47 |     argsparse.add_argument("--config", type=str, default="./test_config.yaml")
48 |     argsparse.add_argument("--repo", type=str, default="https://github.com/danielpatrickhug/GitModel.git")
49 |     argsparse.add_argument("--repo_name", type=str, default="gitmodel")
50 |
51 |     args = argsparse.parse_args()
52 |
53 |     openai_secret = getpass("Enter the secret key: ")
54 |     # Set up OpenAI API credentials
55 |     openai.api_key = openai_secret
56 |
57 |     print("starting pipeline")
58 |     pipeline = Pipeline.from_yaml(args.config)
59 |     gnn_head_outputs, topic_model_outputs = pipeline.run(args.repo, args.repo_name)
60 |     for i, topic_model_output in enumerate(topic_model_outputs):
61 |         topic_model_output["data"].to_csv(f"context/{args.repo_name}_topic_model_outputs_{i}.csv")
62 |         topic_model_output["topic_info"].to_csv(f"context/{args.repo_name}_topic_info_{i}.csv")
63 |         with open(f"context/{args.repo_name}_tree_{i}.txt", "w", encoding="utf-8") as f:
64 |             f.write(topic_model_output["tree"])
65 | ```
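Assuming `__main__.py` exposes the same argparse flags as the `main.py` listing above (an assumption; only `main.py` is shown here), a run against another repository would look like:

```bash
# Hypothetical invocation; the flags come from the argparse setup shown above.
python __main__.py \
  --config ./test_config.yaml \
  --repo https://github.com/deepmind/clrs.git \
  --repo_name clrs
```

Per the loop above, results are written to `context/<repo_name>_topic_model_outputs_<i>.csv`, `context/<repo_name>_topic_info_<i>.csv`, and `context/<repo_name>_tree_<i>.txt`.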
66 | ## Topic model your dependencies
67 |
68 | If you have enough patience, or enough money to afford more than one computer,
69 | run GitModel on `venv/lib/python3.10/site-packages`.
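One way to do that (an untested sketch: it assumes the pipeline clones whatever `--repo` points at, so the site-packages directory is first wrapped in a throwaway git repo):

```bash
# Untested sketch: snapshot installed packages as a local git repo, then
# point GitModel at that local path.
cd venv/lib/python3.10/site-packages
git init -q && git add -A && git commit -qm "site-packages snapshot"
cd - >/dev/null
python __main__.py --config ./test_config.yaml \
  --repo ./venv/lib/python3.10/site-packages --repo_name site_packages
```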
70 |
71 |
72 | ## Examples
73 |
74 | ### GitModel
75 |
76 | - https://github.com/danielpatrickhug/GitModel
77 | - uses DeepMind's CLRS topic tree in the system prompt during semantic graph
78 |   generation
79 |
80 | ```
81 |
82 | . ├─Function description and comparison including Gaussian kernel and sparse
83 | matrices\_**\_ │ ├─■──Understanding the Purpose and Handling of a Function for
84 | Sparse Matrices with Inputs, Outputs, and P ── Topic: 9 │ └─Understanding
85 | kernels and functions in the 'kernels.py' file for estimating PDF and computing
86 | simila │ ├─■──Purpose and functions of kernel-related functions in kernels.py
87 | file of a Python program, including ── Topic: 22 │ └─■──Understanding the
88 | cos_sim and cos_sim_torch functions in kernels.py file\_\_** ── Topic: 25
89 | └─Graph message passing and adjacency matrix computation using embeddings\_**\_
90 | ├─k-hop message passing and cosine similarity kernel computation for graph
91 | embeddings\_\_** │ ├─k-hop message passing with adjacency matrix and node
92 | features\_**\_ │ │ ├─Computation of Gaussian Kernel Matrix between Two Sets of
93 | Embeddings using PyTorch\_\_** │ │ │ ├─■──Cosine Similarity with PyTorch Tensors
94 | and Functional.\_**_ ── Topic: 1 │ │ │ └─■──Function to compute adjacency matrix
95 | for embeddings using specified kernel type and threshold value_ ── Topic: 19 │ │
96 | └─Message Passing and K-hop Aggregation in Graphs using Sparse Matrices and Node
97 | Features\_\_** │ │ ├─■──Document pruning and adjacency matrix recomputation
98 | using embeddings and thresholding\_**\_ ── Topic: 11 │ │ └─k-hop message passing
99 | and adjacency matrix computation in sparse graphs.\_\_** │ │ ├─■──Computing
100 | graph laplacian and degree matrix from pairwise distances using a given
101 | function.\_**\_ ── Topic: 7 │ │ └─■──Message Passing with K-hop Adjacency and
102 | Aggregated Features in Sparse Matrices\_\_** ── Topic: 8 │ └─"Outlier Reduction
103 | Using Count-TF-IDF and OpenAI Representation Model"\_**\_ │ ├─Topic Modeling and
104 | Outlier Reduction in Natural Language Processing (NLP)\_\_** │ │ ├─Understanding
105 | the compose\*inference function in a chatbot system.\_**\_ │ │ │ ├─■──Processing
106 | conversation transcripts with Python functions\_\_** ── Topic: 18 │ │ │
107 | └─Understanding the compose_inference function in a chatbot conversation with
108 | message templates\_\*\*\* │ │ │ ├─■──Understanding the `compose_inference`
109 | Function in Chatbot Conversation Generation with OpenAI GPT\_** ── Topic: 2 │ │
110 | │ └─■──Function to create prompt message template with role and text input
111 | parameters and validation of rol ── Topic: 17 │ │ └─Outlier Reduction with
112 | Machine Learning Models\_**\_ │ │ ├─Document processing and reduction techniques
113 | for topic modeling with various machine learning models │ │ │ ├─MiniLM language
114 | model for sentence embedding\_\_** │ │ │ │ ├─■──Embedding sentences using MiniLM
115 | language model with multiprocessing and GPU acceleration\_**\_ ── Topic: 15 │ │
116 | │ │ └─■──Embedding Sentences using Pre-Trained Language Model with
117 | SentenceTransformer Library\_\_** ── Topic: 23 │ │ │ └─■──Topic modeling
118 | algorithms and document reduction techniques\_**\_ ── Topic: 0 │ │ └─SQLalchemy
119 | migrations in online mode with engine configuration\_\_** │ │ ├─■──Probability
120 | Density Estimation with Gaussian Kernel Density Estimator\_**\_ ── Topic: 12 │ │
121 | └─Running database migrations with SQLAlchemy and Alembic\_\_** │ │ ├─■──Graph
122 | network message passing & Mobile App Navigation System Design\_**\_ ── Topic: 21
123 | │ │ └─■──Running migrations with SQLAlchemy and Alembic in online mode\_\_** ──
124 | Topic: 6 │ └─Class Settings definition using BaseSettings and its purpose for
125 | managing configuration in a third-p │ ├─■──Empty class definition for managing
126 | application settings using Pydantic's BaseSettings\_**\_ ── Topic: 3 │
127 | └─■──MemoryTreeManager class implementation\_\_** ── Topic: 16 └─Codebase
128 | decomposition and analysis with Git repository and AST nodes.\_**\_ ├─Code
129 | decomposition and processing in Git repositories.\_\_** │ ├─■──Python code
130 | parsing and analysis\_**\_ ── Topic: 4 │ └─Code decomposition in a Git
131 | repository\_\_** │ ├─■──Decomposing Git Repositories with System Prompts.\_**\_
132 | ── Topic: 10 │ └─Parsing and pruning files in a GitHub repository\_\_** │
133 | ├─■──parsing and pruning files in a local Git repository\_**\_ ── Topic: 5 │
134 | └─■──purpose of `get_repo_contents` function in `repo_graph_generation.py` for
135 | retrieving and pruning Git ── Topic: 24 └─Analyzing chatbot main capabilities in
136 | a codebase using natural language processing and notable fram ├─■──summarizing
137 | code in a GitHub repository using ChatGPT\_\_** ── Topic: 14 └─Understanding
138 | Codebase Structure and Functionality with Hierarchical Trees and
139 | Frameworks\_**\_ ├─■──Analyzing codebase structure and functionalities using a
140 | hierarchical topic tree\_\_\*\* ── Topic: 13 └─■──Understanding the difference
141 | between format_system_prompts and format_system_prompts_with_tree in a ── Topic:
142 | 20
143 |
144 | ```
145 |
146 | With GraphCodeBERT embeddings:
147 |
148 | ```
149 |
150 | . ├─"The Pipeline Class and Its Methods in GitModel Project"\_**\_ │
151 | ├─Probability Density Estimation using Gaussian KDE in SciPy\_\_** │ │
152 | ├─Probability density function estimation using Gaussian kernel density
153 | estimation\_**\_ │ │ │ ├─■──Probability density estimation with Gaussian
154 | kernel\_\_** ── Topic: 16 │ │ │ └─■──Understanding cos\*sim_torch function and
155 | configuring context with URL and target metadata\_**\_ ── Topic: 14 │ │ └─Empty
156 | class definition for MessageTreeManagerConfiguration in Python\_\_** │ │ ├─Empty
157 | class definition in MessageTreeManagerConfiguration with BaseModel
158 | inheritance.\_**\_ │ │ │ ├─■──Questions about bug fixing with system prompts in
159 | kernel computation with tensors and matrices.\_\_** ── Topic: 13 │ │ │ └─Empty
160 | class definitions and inability to determine expected behavior of
161 | MemoryTreeManager class\_**\_ │ │ │ ├─■──Purpose of run_migrations_online in
162 | Alembic environment file\_\_** ── Topic: 12 │ │ │ └─■──Empty class definition of
163 | MessageTreeManagerConfiguration inheriting from BaseModel\_\*\*\* ── Topic: 25 │
164 | │ └─Understanding the purpose of SemanticGraphContextGenerator and TopicModel
165 | classes in the codebase\_** │ │ ├─■──Purpose of Pipeline class in codebase with
166 | SemanticGraphContextGenerator, MessageTreeManagerConfigur ── Topic: 15 │ │
167 | └─■──Understanding the purpose and usage of TopicModel class in dimensional
168 | tensors and input shape setti ── Topic: 20 │ └─GitModel Pipeline class with
169 | find_files_with_substring method\_**\_ │ ├─GitModel Pipeline Class and
170 | find_files_with_substring Method Description\_\_** │ │ ├─■──Understanding the
171 | `clone_and_create_context_folder` Function\_**\_ ── Topic: 4 │ │ └─GitModel
172 | Pipeline class and methods for searching files with substring\_\_** │ │
173 | ├─GitModel Pipeline class and methods for file searching\_**\_ │ │ │ ├─■──Python
174 | class for loading and initializing configuration values from a YAML file with
175 | dynamic imports ── Topic: 9 │ │ │ └─■──The Pipeline class and its methods in
176 | GitModel project configuration and file searching.\_\_** ── Topic: 10 │ │
177 | └─■──Python Pipeline Class for Generating a Semantic Graph Context for Git
178 | Repository Data Processing\_**\_ ── Topic: 8 │ └─■──Cloning and Storing
179 | Repository in "Work" Folder with Custom Name using Python Function\_\_** ──
180 | Topic: 22 └─Understanding the purpose and input of a Pipeline class in a project
181 | involving semantic graphs and e ├─Topic Modeling with Hierarchical Topics and
182 | Outlier Reduction Strategies in Python\_**\_ │ ├─Working with context folders
183 | and creating directories using os module.\_\_** │ │ ├─■──Creating a work folder
184 | and cloning a repository to create a context folder in Python\_**\_ ── Topic: 18
185 | │ │ └─■──Working with context and folder paths in Python\_\_** ── Topic: 3 │
186 | └─■──Topic modeling and representation using hierarchical and ctfidf
187 | models\_**\_ ── Topic: 5 └─PyTorch function for computing Gaussian kernel matrix
188 | and k-hop message passing on an adjacency matr ├─Compute k-hop adjacency matrix
189 | and aggregated features using message passing in graph analysis.\_\_** │ ├─k-hop
190 | message passing with adjacency matrix and node features\_**\_ │ │ ├─■──Document
191 | Pruning and Adjacency Matrix Recomputation\_\_** ── Topic: 23 │ │ └─Computing
192 | k-hop adjacency matrix with message passing in graph neural networks.\_**\_ │ │
193 | ├─■──Computing k-hop adjacency matrix and aggregated features using message
194 | passing\_\_** ── Topic: 0 │ │ └─■──GNNHead class for computing kernel matrix
195 | with node features in numpy array\_**\_ ── Topic: 1 │ └─Data Migrations in
196 | Offline Mode.\_\_** │ ├─■──Degree matrix computation using adjacency distance
197 | matrix and pairwise distances in Python\_**\_ ── Topic: 21 │ └─■──SQLAlchemy
198 | migration in 'offline' mode\_\_** ── Topic: 11 └─Understanding code inputs and
199 | purpose in a Pipeline class\_**\_ ├─Parsing Python files using AST module and
200 | extracting specific information\_\_** │ ├─■──Cosine Similarity Computation using
201 | PyTorch and NumPy\_**\_ ── Topic: 6 │ └─■──Python code parsing and data
202 | extraction using AST\_\_** ── Topic: 17 └─Code Structure and Purpose of Pipeline
203 | Class with Config and Semantic Graph Context Generator in Pyt ├─Code for a
204 | Pipeline with Semantic Graph Context Generator\_**\_ │ ├─■──Understanding
205 | Pipeline Class and Semantic Graph Context Generation in Python Code\_\_** ──
206 | Topic: 24 │ └─■──Summarizing code in a GitHub repository using ChatGPT\_**\_ ──
207 | Topic: 2 └─Semantic Graph Context Generator Class and Methods\_\_**
208 | ├─■──Semantic Graph Context Generation for Git Repositories.\_**\_ ── Topic: 19
209 | └─■──Implementation of class instantiation using configuration and dictionary
210 | mapping.\_\_\*\* ── Topic: 7
211 |
212 | ```
213 |
214 | ### DeepMind CLRS
215 |
216 | - https://github.com/deepmind/clrs
217 |
218 | ```
219 |
220 | . ├─Purpose and Attributes of the `Net` Class in Graph Neural Networks\_**\_ │
221 | ├─Graph Attention Networks and DAG Shortest Paths in JAX.\_\_** │ │ ├─Graph
222 | Attention Networks (GAT and GATv2) code implementation\_**\_ │ │ │ ├─Code for
223 | DAG shortest path and depth-first search algorithms\_\_** │ │ │ │ ├─■──String
224 | Matching and Maximum Subarray\_**\_ ── Topic: 10 │ │ │ │ └─Depth-First Search
225 | and DAG Shortest Path Algorithms implemented in Python\_\_** │ │ │ │
226 | ├─■──Description of string probe functions in probing.py file for Hash Table
227 | probing.\_**\_ ── Topic: 1 │ │ │ │ └─Graph Algorithms - DFS and DAG Shortest
228 | Paths\_\_** │ │ │ │ ├─■──Graph algorithms (DFS and DAG shortest path) in
229 | Python\_**\_ ── Topic: 0 │ │ │ │ └─■──Functions for decoding diff and graph
230 | features in PyTorch graph neural networks.\_\_** ── Topic: 6 │ │ │ └─■──Graph
231 | Attention Networks (GAT and GATv2)\_**\_ ── Topic: 20 │ │ └─■──Message Passing
232 | with \_MessagePassingScanState, \_MessagePassingOutputChunked and
233 | MessagePassingStateC ── Topic: 17 │ └─Implementing a Baseline Model with
234 | Selectable Message Passing Algorithm and its Dataset Sampler.\_\_** │ ├─Handling
235 | of untrained parameters in optimization updates\_**\_ │ │ ├─■──Updating
236 | parameters with filtered gradients from multiple algorithms.\_\_** ── Topic: 8 │
237 | │ └─■──Processing trajectory hints with variable-length time dimension using
238 | batching.\_**\_ ── Topic: 9 │ └─Processing time-chunked data with batched
239 | samplers and message passing nets.\_\_** │ ├─Model processing of time-chunked
240 | data with dataset sampling and batch processing\_**\_ │ │ ├─■──CLRS dataset
241 | download and URL retrieval on Google Cloud Platform\_\_** ── Topic: 13 │ │
242 | └─Chunked data and dataset sampling with JAX.\_**\_ │ │ ├─■──JAX functions for
243 | reshaping and restacking data for pmap computation\_\_** ── Topic: 4 │ │ └─Data
244 | chunking with batched sampling and message passing in neural networks.\_**\_ │ │
245 | ├─Processing time-chunked data with batch samplers and a NetChunked class\_\_**
246 | │ │ │ ├─■──Time-chunked data processing using BaselineModelChunked and
247 | NetChunked in TensorFlow.\_**\_ ── Topic: 2 │ │ │ └─■──Creating samplers for
248 | training data.\_\_** ── Topic: 11 │ │ └─■──Documented code for sampling
249 | algorithms using randomized position generation.\_**\_ ── Topic: 3 │ └─■──Point
250 | Sampling and Convex Hull Computation\_\_** ── Topic: 18 └─Loss functions for
251 | training with time-chunked data\_**\_ ├─Loss calculation for time-chunked and
252 | full-sample training.\_\_** │ ├─Code functions for evaluating predictions using
253 | permutations and masking\_**\_ │ │ ├─■──Functions for Evaluating Predictions in
254 | Probing Tasks.\_\_** ── Topic: 7 │ │ └─■──permutation pointer manipulation and
255 | reduction in predictions\_**\_ ── Topic: 16 │ └─Loss calculation and decoder
256 | output postprocessing in neural networks.\_\_** │ ├─■──Postprocessing with
257 | Sinkhorn operator in log space\_**\_ ── Topic: 15 │ └─■──Loss calculation
258 | methods for training with time and full samples\_\_** ── Topic: 19 └─Functions
259 | for expanding and broadcasting JAX arrays\_**\_ ├─Description and input/output
260 | parameters of \_expand_to and \_is_not_done_broadcast functions\_\_** │
261 | ├─■──Array expansion and broadcasting techniques\_**\_ ── Topic: 21 │
262 | └─■──Purpose and Functionality of \_is_not_done_broadcast Function\_\_** ──
263 | Topic: 14 └─Sampler classes and associated data generation types\_**\_
264 | ├─■──Understanding Parameters and Expected Input/Output of Various Functions
265 | (including mst_prim, floyd_w ── Topic: 5 └─■──Sampling classes and their data
266 | generation purpose\_\_** ── Topic: 12
267 |
268 | ```
269 |
270 | - a second generation pass, augmented with the above topic tree in the system prompt
271 |
272 | ```
273 |
274 | . ├─DFS and DAG Shortest Paths Algorithm Implementation with Probing\_**\_ │
275 | ├─■──Bipartite matching-based flow networks\_\_** ── Topic: 34 │ └─Search and
276 | Shortest Path Algorithms\_**\_ │ ├─DAG shortest path algorithm with probing and
277 | initialization\_\_** │ │ ├─■──Strongly Connected Components Algorithm with
278 | Kosaraju's Implementation\_**\_ ── Topic: 37 │ │ └─Graph Sampling and DAG
279 | Shortest Path Algorithm\_\_** │ │ ├─■──Bipartite matching using Edmonds-Karp
280 | algorithm\_**\_ ── Topic: 18 │ │ └─■──Random graph generation using Bellman-Ford
281 | algorithm in Python\_\_** ── Topic: 0 │ └─Graham scan convex hull algorithm
282 | implementation in Python\_**\_ │ ├─■──Maximum subarray algorithm
283 | implementation\_\_** ── Topic: 6 │ └─■──Graham scan convex hull algorithm
284 | implementation\_**\_ ── Topic: 12 └─Postprocessing Decoder Output for Chunked
285 | Data Processing Net\_\_** ├─Postprocessing Decoder Output with Chunked Data in
286 | JAX\_**\_ │ ├─Functions and Files in Probing.py Explained\_\_** │ │ ├─Functions
287 | and techniques for data splitting and replication in probing and pmap
288 | computation.\_**\_ │ │ │ ├─Understanding the strings_pair_cat function and
289 | split_stages function in probing.py file\_\_** │ │ │ │ ├─TFDS CLRSDataset
290 | Command-Line Tool for Sampling Datasets\_**\_ │ │ │ │ │ ├─■──CLRS30 dataset and
291 | related functions explanation\_\_** ── Topic: 5 │ │ │ │ │ └─■──TFDS CLRSDataset
292 | Builder Implementation\_**\_ ── Topic: 16 │ │ │ │ └─Functions and Probing in
293 | Python Code\_\_** │ │ │ │ ├─Purpose of the `split_stages` function in
294 | `probing.py` and related functions for evaluating
295 | `ProbesD │ │ │ │ │ ├─Functions for evaluating hint and output predictions using permutation objects and dictionaries.____ │ │ │ │ │ │ ├─Processing randomized `pos`input in a sampler with pointers and permutations.____ │ │ │ │ │ │ │ ├─■──Process randomization of`pos`input in algorithms including string algorithms____ ── Topic: 29 │ │ │ │ │ │ │ └─■──A function to replace should-be permutations with proper permutation pointers using a sample iterato ── Topic: 19 │ │ │ │ │ │ └─Function for Evaluating Permutation Predictions using Hint Data____ │ │ │ │ │ │ ├─■──Function to Reduce Permutations in a Dictionary of Result Objects____ ── Topic: 11 │ │ │ │ │ │ └─■──Function to evaluate hint predictions with tuple and list inputs____ ── Topic: 17 │ │ │ │ │ └─Understanding probing functions in Hash Table implementation____ │ │ │ │ │ ├─Hash Table Probing Functions in probing.py File____ │ │ │ │ │ │ ├─■──Splitting ProbesDict into DataPoints by stage in Python____ ── Topic: 14 │ │ │ │ │ │ └─■──Understanding Hash Table Probing Functions (strings_pi, strings_pos, strings_pair_cat) in Python's`
296 | ── Topic: 1 │ │ │ │ │ └─■──Functions for Checking Input Dimensions in Machine
297 | Learning Models\_**_ ── Topic: 15 │ │ │ │ └─JAX pmap reshaping and computation
298 | functions (\_pmap_reshape, \_maybe_pmap_reshape, \_maybe_pmap_data)_ │ │ │ │
299 | ├─JAX pmap computation and pytree reshaping\_\_** │ │ │ │ │ ├─■──Purpose and
300 | attributes of the Stage and OutputClass classes\_**\_ ── Topic: 22 │ │ │ │ │
301 | └─■──JAX tree reshaping for pmap computation with \_pmap_reshape and
302 | \_maybe_pmap_reshape functions\_\_** ── Topic: 3 │ │ │ │ └─Numpy array copying
303 | functions with assertions\_**\_ │ │ │ │ ├─■──Functions for copying data between
304 | numpy arrays in Python\_\_** ── Topic: 21 │ │ │ │ └─■──Function Purpose and
305 | Parameters Analysis in Codebase\_**\_ ── Topic: 9 │ │ │ └─Trajectory Batching
306 | with Variable-Length Time Dimension\_\_** │ │ │ ├─■──Trajectory Batching and
307 | Concatenation\_**\_ ── Topic: 35 │ │ │ └─■──Batch processing of variable-length
308 | hint trajectories.\_\_** ── Topic: 31 │ │ └─Understanding the
309 | `_is_not_done_broadcast` function and its input/output parameters.\_**\_ │ │
310 | ├─■──Understanding the \_is_not_done_broadcast function in JAX array for
311 | sequence completion.\_\_** ── Topic: 8 │ │ └─■──Array broadcasting and expansion
312 | with \_expand_and_broadcast_to and \_expand_to functions\_**\_ ── Topic: 27 │
313 | └─Postprocessing Decoder Output with Sinkhorn Algorithm and Hard
314 | Categorization\_\_** │ ├─Node Feature Decoding with Encoders and Decoders\_**\_
315 | │ │ ├─■──Position Encoding Function for Natural Language Processing\_\_** ──
316 | Topic: 23 │ │ └─Node feature decoding using decoders and edge features\_**\_ │ │
317 | ├─■──Creating Encoders with Xavier Initialization and Truncated Normal
318 | Distribution for Encoding Categori ── Topic: 33 │ │ └─Node feature decoding with
319 | decoders and edge features\_\_** │ │ ├─■──Node feature decoding and encoding
320 | with decoders and edge features\_**\_ ── Topic: 2 │ │ └─■──Graph diff
321 | decoders\_\_** ── Topic: 32 │ └─Postprocessing of decoder output in graph neural
322 | networks.\_**\_ │ ├─Decoder Output Postprocessing with Sinkhorn Algorithm and
323 | Cross-Entropy Loss\_\_** │ │ ├─Message Passing Net with Time-Chunked Data
324 | Processing\_**\_ │ │ │ ├─■──Python Class for Message Passing Model with
325 | Selectable Algorithm\_\_** ── Topic: 26 │ │ │ └─■──NetChunked message passing
326 | operation with LSTM states for time-chunked data\_**\_ ── Topic: 7 │ │ └─Loss
327 | calculation for time-chunked training with scalar truth data.\_\_** │ │ ├─Loss
328 | calculation function for time-chunked training with scalar truth data.\_**\_ │ │
329 | │ ├─■──Loss calculation for time-chunked training data\_\_** ── Topic: 4 │ │ │
330 | └─■──Logarithmic Sinkhorn Operator for Permutation Pointer Logits\_**\_ ──
331 | Topic: 10 │ │ └─■──Decoder postprocessing with Sinkhorn operator\_\_** ── Topic:
332 | 28 │ └─Gradient Filtering for Optimizer Updates\_**\_ │ ├─■──Filtering processor
333 | parameters in Haiku models\_\_** ── Topic: 30 │ └─■──Filtering null gradients
334 | for untrained parameters during optimization.\_**\_ ── Topic: 24 └─PGN with Jax
335 | implementation and NeurIPS 2020 paper\_\_** ├─Message-Passing Neural Network
336 | (MPNN) for Graph Convolutional Networks (GCNs)\_**\_ │ ├─■──"Applying Triplet
337 | Message Passing with HK Transforms in MPNN for Graph Neural Networks"\_\_** ──
338 | Topic: 20 │ └─■──Implementation of Deep Sets (Zaheer et al., NeurIPS 2017) using
339 | adjacency matrices and memory networ ── Topic: 13 └─GATv2 Graph Attention
340 | Network with adjustable sizes of multi-head attention and residual connections
341 | ├─■──Graph Attention Network v2 architecture with adjustable head number and
342 | output size.\_**\_ ── Topic: 36 └─■──Processor factory with various models and
343 | configurations\_\_** ── Topic: 25
344 |
345 | ```
346 |
347 | ### LangChain
348 |
349 | - https://github.com/hwchase17/langchain
350 |
351 | ```
352 |
353 | . ├─Combining documents with different chain types and LLM chains\_**\_ │
354 | ├─MapReduce Chain Loading and Combining\_\_** │ │ ├─Question answering chain
355 | with sources loading and combining\_**\_ │ │ │ ├─■──Loading question answering
356 | with sources chain with multiple loader mappings and chains.\_\_** ── Topic: 53
357 | │ │ │ └─■──Loading and Combining Documents with Language Models for Summarizing
358 | and QA\_**\_ ── Topic: 71 │ │ └─Map Reduce Chain Loading Function\_\_** │ │
359 | ├─Document Refinement using LLM Chains\_**\_ │ │ │ ├─■──Combining Documents with
360 | Stuffing and LLM Chain in Python\_\_** ── Topic: 97 │ │ │
361 | └─BaseQAWithSourcesChain document handling and processing.\_**\_ │ │ │
362 | ├─■──Question Answering with Sources over Documents Chain\_\_** ── Topic: 60 │ │
363 | │ └─■──Python class for chatbot with vector database and question
364 | generation\_**\_ ── Topic: 16 │ │ └─MapReduce chain implementation\_\_** │ │
365 | ├─■──MapReduceDocumentsChain document combination with chaining and
366 | mapping\_**\_ ── Topic: 12 │ │ └─■──MapReduce Chain Loading Function\_\_** ──
367 | Topic: 95 │ └─LLMBashChain document examples and related keywords\_**\_ │ ├─Bash
368 | operations and language modeling chain implementation\_\_** │ │
369 | ├─LLMSummarizationCheckerChain document samples\_**\_ │ │ │ ├─■──Working with
370 | SQL databases in Python using SQLDatabaseChain\_\_** ── Topic: 46 │ │ │
371 | └─Document processing with LLMSummarizationCheckerChain\_**\_ │ │ │
372 | ├─■──Implementation of Program-Aided Language Models with PALChain class and
373 | related prompts and assertio ── Topic: 31 │ │ │
374 | └─■──LLMSummarizationCheckerChain class and its functionality\_\_** ── Topic: 93
375 | │ │ └─LLMBashChain - interpreting prompts and executing bash code\_**\_ │ │
376 | ├─■──LLMMathChain - Python code execution for math prompts\_\_** ── Topic: 92 │
377 | │ └─■──Bash execution with LLMBashChain\_**\_ ── Topic: 80 │ └─■──MRKLChain
378 | implementation with ChainConfig and API integration\_\_** ── Topic: 59 └─Code
379 | organization and structure in Python including several classes related to
380 | self-hosted embedding ├─Code organization and improvement suggestions for a
381 | class definition.\_**\_ │ ├─Code Loading and Organization Best Practices\_\_** │
382 | │ ├─Web scraping Hacker News webpage titles\_**\_ │ │ │ ├─Loading files using
383 | unstructured in Python\_\_** │ │ │ │ ├─Unstructured file loading with retry and
384 | partitioning capabilities.\_**\_ │ │ │ │ │ ├─■──Retry Decorator for OpenAI API
385 | Calls\_\_** ── Topic: 45 │ │ │ │ │ └─Unstructured File Loading and
386 | Partitioning\_**\_ │ │ │ │ │ ├─■──Unstructured File Loader for Partitioning
387 | Files in Various Formats\_\_** ── Topic: 25 │ │ │ │ │ └─■──Loading files with
388 | Unstructured package in different modes (Python code).\_**\_ ── Topic: 26 │ │ │
389 | │ └─PDF manipulation in Python with pypdf, pdfminer, fitz and pymupdf
390 | libraries\_\_** │ │ │ │ ├─■──PDF file loading and text extraction using PyMuPDF
391 | and PDFMiner\_**\_ ── Topic: 69 │ │ │ │ └─■──Extracting Text from Paged PDF
392 | using PyPDF and PDFMiner\_\_** ── Topic: 96 │ │ │ └─Extracting Hacker News
393 | Webpage Information using WebBaseLoader and BeautifulSoup.\_**\_ │ │ │ ├─■──Web
394 | scraping Hacker News with BeautifulSoup and WebBaseLoader\_\_** ── Topic: 21 │ │
395 | │ └─■──Web Scraping for College Confidential and Lyrics Websites\_**\_ ── Topic:
396 | 76 │ │ └─Code organization and structure in various Python modules\_\_** │ │
397 | ├─Compliments on clear and structured codebase with good use of type hints for
398 | memory handling and con │ │ │ ├─Implementation of ReAct paper using ReActChain
399 | with examples in Python\_**\_ │ │ │ │ ├─■──Implementation of ReAct paper in
400 | ReActChain agent with OpenAI LLC model and tools\_\_** ── Topic: 101 │ │ │ │
401 | └─In-memory Docstore for Efficient Lookup and Exploration\_**\_ │ │ │ │
402 | ├─■──Document Store Exploration with DocstoreExplorer\_\_** ── Topic: 87 │ │ │ │
403 | └─■──InMemoryDocstore for Storing and Searching Documents with
404 | AddableMixin\_**\_ ── Topic: 61 │ │ │ └─Compliments on Code Readability and
405 | Organization in Python Codebase.\_\_** │ │ │ ├─Memory Handling and Conversation
406 | Management\_**\_ │ │ │ │ ├─Memory Conversation Summarizer Implementation\_\_** │
407 | │ │ │ │ ├─Memory and Conversation Summarization in AI-assisted dialogues.\_**\_
408 | │ │ │ │ │ │ ├─■──Purpose of ChatPromptValue class in chat.py\_\_** ── Topic: 30
409 | │ │ │ │ │ │ └─■──Memory management and conversation summarization in AI chatbot
410 | system.\_**\_ ── Topic: 6 │ │ │ │ │ └─■──Implementation of Chain class with
411 | CallbackManager and Memory attributes.\_\_** ── Topic: 52 │ │ │ │ └─Potential
412 | bugs and suggestions for loading LLM, few-shot prompts, and examples from JSON
413 | and YAML fi │ │ │ │ ├─Code structure and organization tips for loading examples
414 | and templates from files in Python.\_**\_ │ │ │ │ │ ├─Compliments on code
415 | structure and organization\_\_** │ │ │ │ │ │ ├─■──Loading few-shot prompts from
416 | config with prefix and suffix templates\_**\_ ── Topic: 34 │ │ │ │ │ │ └─Code
417 | organization and structure for creating chat prompt templates\_\_** │ │ │ │ │ │
418 | ├─■──Chat prompt template and message prompt templates for generating chatbot
419 | prompts.\_**\_ ── Topic: 8 │ │ │ │ │ │ └─■──Purpose of `_load_prompt_from_file`
420 | function in loading.py module.\_\_** ── Topic: 13 │ │ │ │ │ └─■──Function for
421 | Loading a Chain of LLM Checkers from a Configuration Dictionary.\_**\_ ── Topic:
422 | 3 │ │ │ │ └─Documented class definitions for tools used in handling API
423 | requests, including OpenSearchVectorSear │ │ │ │ ├─Handling API requests using
424 | tools such as RequestsPostTool and OpenSearchVectorSearch\_\_** │ │ │ │ │
425 | ├─Python requests wrapper for making HTTP requests with various tools and
426 | methods\_**\_ │ │ │ │ │ │ ├─■──DeepInfra API token and text generation model
427 | wrapper\_\_** ── Topic: 41 │ │ │ │ │ │ └─RequestsWrapper and BaseRequestsTool
428 | for making HTTP requests (POST, GET, PATCH, DELETE) to API endp │ │ │ │ │ │
429 | ├─■──Checking Validity of Template Strings with Input Variables and Formatter
430 | Mapping\_**\_ ── Topic: 14 │ │ │ │ │ │ └─■──Requests tools for making HTTP
431 | requests with Python\_\_** ── Topic: 10 │ │ │ │ │ └─Code organization and
432 | positive feedback\_**\_ │ │ │ │ │ ├─Bing Search API Wrapper and Handler
433 | Classes\_\_** │ │ │ │ │ │ ├─■──Langchain callback manager and codebase
434 | organization\_**\_ ── Topic: 2 │ │ │ │ │ │ └─■──Bing Search API Wrapper and SERP
435 | API Usage in Python\_\_** ── Topic: 1 │ │ │ │ │ └─Handling iFixit devices with
436 | models and remote hardware\_**\_ │ │ │ │ │ ├─■──Loading iFixit repair guides and
437 | device wikis with transformer model inference.\_\_** ── Topic: 0 │ │ │ │ │
438 | └─■──Potential Issues with Modifying Input Dictionary in a Prompt Loading
439 | Function\_**\_ ── Topic: 9 │ │ │ │ └─Implementation and Usage of
440 | SearxSearchWrapper with Environment Variables and SSL Support\_\_** │ │ │ │
441 | ├─Python Libraries for API Wrappers and Search Engines\_**\_ │ │ │ │ │ ├─Python
442 | packages for integrating with search engines: SearxSearchWrapper and
443 | QdrantClient.\_\_** │ │ │ │ │ │ ├─■──Implementation of Searx API Wrapper
444 | (SearxSearchWrapper) using Python's BaseModel with QdrantClient ── Topic: 33 │ │
445 | │ │ │ │ └─■──Handling environment variables and dictionaries with
446 | get*from_dict_or_env function\_**\_ ── Topic: 72 │ │ │ │ │ └─Purpose and Issues
447 | with `print_text` Function in `langchain` Repository's `input.py` File\_\_** │ │
448 | │ │ │ ├─■──Printing Highlighted Text with Options in Python\_**\_ ── Topic: 51 │
449 | │ │ │ │ └─■──Converting Python Objects to String Representation with Nested
450 | Structures and Joining on Newline Cha ── Topic: 66 │ │ │ │ └─GitbookLoader class
451 | and its methods\_\_** │ │ │ │ ├─■──Handling newlines recursively in data
452 | structures using pandas\_**\_ ── Topic: 29 │ │ │ │ └─GitBookLoader class for
453 | loading web pages with options to load all or single pages\_\_** │ │ │ │
454 | ├─■──GitbookLoader class for loading single or multiple pages from GitBook with
455 | relative paths in the nav ── Topic: 28 │ │ │ │ └─■──Length-Based Example
456 | Selection and Text Length Calculation\_**\_ ── Topic: 57 │ │ │ └─Ngram overlap
457 | score using sentence_bleu and method1 smoothing function\_\_** │ │ │ ├─Ngram
458 | overlap score using sentence_bleu method1 smoothing function and auto
459 | reweighting\_**\_ │ │ │ │ ├─■──Code structure and organization in langchain
460 | document loaders with support for parsing comma-separat ── Topic: 70 │ │ │ │
461 | └─Ngram overlap score using sentence_bleu and method1 smoothing function with
462 | auto reweighting in nltk │ │ │ │ ├─■──Compliments on well-structured and
463 | organized code in different classes and methods\_\_** ── Topic: 65 │ │ │ │
464 | └─■──Sentence BLEU score and ngram overlap computation with method1 smoothing
465 | function and auto reweighti ── Topic: 49 │ │ │ └─Model Definition and
466 | Experimentation with Datetime and UTCNow Attributes\_**\_ │ │ │ ├─■──Data
467 | Modeling with Time Zones in Python\_\_** ── Topic: 91 │ │ │ └─■──Constitutional
468 | Principles and Tracing in Python\_**\_ ── Topic: 68 │ │ └─Text splitting for
469 | knowledge triple extraction\_\_** │ │ ├─Text Splitting Toolkit\_**\_ │ │ │
470 | ├─Text splitting interface and implementation\_\_** │ │ │ │ ├─Python REPL Tool
471 | and AST Implementation\_**\_ │ │ │ │ │ ├─Python REPL Tool Implementation\_\_** │
472 | │ │ │ │ │ ├─SQL database metadata retrieval tool\_**\_ │ │ │ │ │ │ │ ├─■──Python
473 | function to concatenate cell information for AI and human usage\_\_** ── Topic:
474 | 44 │ │ │ │ │ │ │ └─SQL database metadata tool for listing table schema and
475 | metadata\_**\_ │ │ │ │ │ │ │ ├─■──SQL database metadata extraction tool for
476 | specified tables\_\_** ── Topic: 75 │ │ │ │ │ │ │ └─■──JSON and SQL database
477 | tools for listing and getting values\_**\_ ── Topic: 15 │ │ │ │ │ │ └─Python
478 | REPL Tool using AST and Coroutine\_\_** │ │ │ │ │ │ ├─■──Tool implementation
479 | with direct function or coroutine input and error handling.\_**\_ ── Topic: 99 │
480 | │ │ │ │ │ └─■──Python REPL Tool with AST and version validation\_\_** ── Topic:
481 | 74 │ │ │ │ │ └─Implementing API wrappers for news, movie information, and
482 | weather using APIChain\_**\_ │ │ │ │ │ ├─Implementing APIs for News, Weather,
483 | and Movie Information in LangChain's Load Tools Module\_\_** │ │ │ │ │ │
484 | ├─■──Language model for reasoning about position and color attributes of objects
485 | in weather forecasting w ── Topic: 73 │ │ │ │ │ │ └─Implementing APIs for
486 | fetching news and movies using Python\_**\_ │ │ │ │ │ │ ├─■──well-structured and
487 | readable implementation of API initialization functions in load_tools.py for Too
488 | ── Topic: 85 │ │ │ │ │ │ └─■──Working with API authentication and chaining for
489 | news and movie information retrieval (using news_ap ── Topic: 100 │ │ │ │ │
490 | └─■──Wolfram Alpha SDK querying using WolframAlphaQueryRun class and api_wrapper
491 | attribute\_\_** ── Topic: 89 │ │ │ │ └─TextSplitting for Vector Storage with
492 | Overlapping Chunks\_**\_ │ │ │ │ ├─Python's StrictFormatter class and its
493 | check_unused_args method for formatting and validation of inp │ │ │ │ │ ├─L2
494 | distance search using ndarray in Python\_\_** │ │ │ │ │ │ ├─■──L2 search for
495 | nearest neighbors with np.linalg.norm\_**\_ ── Topic: 32 │ │ │ │ │ │
496 | └─■──Parsing and Organizing Notes with Hashing and Embeddings\_\_** ── Topic: 67
497 | │ │ │ │ │ └─Python Class for Strict Formatter with Check on Unused Args\_**\_ │
498 | │ │ │ │ ├─Vector Store Toolkit and Deployment\_\_** │ │ │ │ │ │ ├─■──Vector
499 | Store Toolkit and Deployment with OpenAI LLM\_**\_ ── Topic: 35 │ │ │ │ │ │
500 | └─■──Working with AirbyteJSONLoader to load local Airbyte JSON files\_\_** ──
501 | Topic: 47 │ │ │ │ │ └─Python Formatter class with check_unused_args method and
502 | strict validation\_**\_ │ │ │ │ │ ├─Python's StrictFormatter class and its
503 | check_unused_args method for validating unused and extra argu │ │ │ │ │ │
504 | ├─■──Finding TODO Tasks in Code Snippets\_\_** ── Topic: 4 │ │ │ │ │ │ └─Python
505 | Formatter and StrictFormatter with check_unused_args method\_**\_ │ │ │ │ │ │
506 | ├─■──Color Mapping Function for Prompt Inputs with Exclusions\_\_** ── Topic: 88
507 | │ │ │ │ │ │ └─■──Implementing strict checking of unused and extra keys in a
508 | subclass of formatter\_**\_ ── Topic: 48 │ │ │ │ │ └─Python module for loading
509 | and manipulating language chain data with verbosity control.\_\_** │ │ │ │ │
510 | ├─■──Python function for getting verbosity from language chaining with Azure
511 | OpenAI and difference from O ── Topic: 64 │ │ │ │ │ └─■──Purpose of functions in
512 | loading.py and csv toolkit of langchain repository\_**\_ ── Topic: 42 │ │ │ │
513 | └─Text splitting using chunk size and overlap with various libraries and
514 | interfaces.\_\_** │ │ │ │ ├─Text splitting and chunking with overlap and length
515 | functions\_**\_ │ │ │ │ │ ├─■──Developing and Maintaining Docker Compose Modules
516 | in Python\_\_** ── Topic: 79 │ │ │ │ │ └─Text splitting and chunking using
517 | TextSplitter interface\_**\_ │ │ │ │ │ ├─Text Splitting Interface and
518 | Implementation\_\_** │ │ │ │ │ │ ├─■──Text splitting using TokenTextSplitter
519 | class.\_**\_ ── Topic: 7 │ │ │ │ │ │ └─■──Document Loading and Splitting with
520 | Text Splitting and Callback Management.\_\_** ── Topic: 84 │ │ │ │ │ └─■──Python
521 | code for initializing an agent with various optional arguments\_**\_ ── Topic:
522 | 18 │ │ │ │ └─Loading Google Docs from Google Drive using Credentials and Tokens
523 | with Python\_\_** │ │ │ │ ├─Document Loading from Cloud Storage (GCS and S3)
524 | using BaseLoader Class\_**\_ │ │ │ │ │ ├─■──Online PDF loading and caching using
525 | SQLite and temporary directories\_\_** ── Topic: 98 │ │ │ │ │ └─■──Loading
526 | documents from cloud storage using GCSFileLoader and S3FileLoader classes.\_**\_
527 | ── Topic: 36 │ │ │ │ └─■──Google Drive Loader and Credentials for Loading Google
528 | Docs\_\_** ── Topic: 86 │ │ │ └─StreamlitCallbackHandler for logging to
529 | streamlit in Python code\_**\_ │ │ │ ├─Streaming with LLMs and Callback
530 | Handlers\_\_** │ │ │ │ ├─Networkx wrapper for entity graph operations with Redis
531 | caching.\_**\_ │ │ │ │ │ ├─NetworkX Entity Graph with Missing Tables and
532 | Callback Manager\_\_** │ │ │ │ │ │ ├─■──Graph Index Creation and Operations
533 | using NetworkX Library in Python\_**\_ ── Topic: 58 │ │ │ │ │ │
534 | └─■──NetworkxEntityGraph and entity graph operations.\_\_** ── Topic: 20 │ │ │ │
535 | │ └─Redis cache implementation in Python\_**\_ │ │ │ │ │ ├─■──Implementing a
536 | SQAlchemy-based cache system with missing and existing prompts for better
537 | performance ── Topic: 17 │ │ │ │ │ └─■──Implementation of a Redis cache as a
538 | backend in Python\_\_** ── Topic: 39 │ │ │ │ └─Python Callback Handler for
539 | Streamlit Logging\_**\_ │ │ │ │ ├─■──Callback handlers for printing to standard
540 | output.\_\_** ── Topic: 43 │ │ │ │ └─■──StreamlitCallbackHandler for logging
541 | prompts and actions to Streamlit\_**\_ ── Topic: 90 │ │ │ └─ZeroShotAgent class
542 | and observation prefix property in Python\_\_** │ │ │ ├─Creating a JSON agent
543 | using a toolkit for zeroshot agent execution with format instructions and inpu │
544 | │ │ │ ├─■──Creating Pandas DataFrames using Agent Scratchpad and Python AST REPL
545 | Tool.\_**\_ ── Topic: 82 │ │ │ │ └─Creating a JSON agent with toolkit, format
546 | instructions, and prefix/suffix\_\_** │ │ │ │ ├─■──SQL agent creation with
547 | SQLDatabaseToolkit, BaseLLM and BaseCallbackManager\_**\_ ── Topic: 11 │ │ │ │
548 | └─■──Creating a JSON agent with OpenAPI toolkit and interacting with it using
549 | JSON tools\_\_** ── Topic: 56 │ │ │ └─Classes for language model-driven decision
550 | making and use of "agent_scratchpad" in LLMChain prompts* │ │ │ ├─■──Agent class
551 | and entity extraction using "agent*scratchpad" variable\_**\_ ── Topic: 38 │ │ │
552 | └─■──Code for a text-based game-playing agent using self-ask-with-search
553 | approach in TextWorld environmen ── Topic: 102 │ │ └─Text Mapping for
554 | Approximate k-NN Search using nmslib in Python\_\_** │ │ ├─Script Scoring with
555 | KNN Search\_**\_ │ │ │ ├─■──Document bulk-ingest function for embeddings in
556 | Elasticsearch index\_\_** ── Topic: 23 │ │ │ └─■──Script Scoring Search with
557 | Cosine Similarity and k-Nearest Neighbors (k-NN) Algorithm\_**\_ ── Topic: 19 │
558 | │ └─Default text mapping for Approximate k-NN Search in dense vector fields
559 | using NMSLIB engine\_\_** │ │ ├─■──Default Mapping for Approximate k-NN Search
560 | using NMSLIB Engine\_**\_ ── Topic: 81 │ │ └─■──Elasticsearch indexing and
561 | scripting with default mappings and painless scripting\_\_** ── Topic: 94 │
562 | └─Tracing and Recording Runs with SharedTracer and TracerStack\_**\_ │ ├─Python
563 | classes ToolRun and ChainRun in schemas.py file with additional attributes and
564 | their purpose. │ │ ├─■──Extracting information about ElementInViewPort instances
565 | in chainrun toolrun runs.\_\_** ── Topic: 77 │ │ └─■──Purpose and attributes of
566 | the ChainRun class in schemas.py file\_**\_ ── Topic: 78 │ └─Tracing and
567 | thread-safe execution with SharedTracer Singleton class\_\_** │ ├─■──Tracing
568 | Execution Order with BaseTracer in a Thread-Safe Manner\_**\_ ── Topic: 55 │
569 | └─■──TracerStack and SharedTracer Implementation in Python\_\_** ── Topic: 63
570 | └─Python wrapper for OpenAI and Hugging Face language models\_**\_ ├─Self-Hosted
571 | Hugging Face Instructor Embedding Models on Remote Hardware\_\_** │
572 | ├─HuggingFace and Sentence-Transformers Embeddings for Cohere\_**\_ │ │
573 | ├─■──Output parsing using regular expressions and the BaseOutputParser
574 | class\_\_** ── Topic: 54 │ │ └─NLP Embeddings using Hugging Face and Sentence
575 | Transformers\_**\_ │ │ ├─■──Neural Embeddings with Hugging Face and Cohere
576 | API\_\_** ── Topic: 24 │ │ └─■──Loading sentence embedding model with
577 | sentence_transformers library.\_**\_ ── Topic: 27 │ └─Self-hosted HuggingFace
578 | pipeline API for running models on remote hardware\_\_** │ ├─Self-hosted
579 | HuggingFace pipeline for remote GPU hardware inference with autolaunched
580 | instances on va │ │ ├─■──Self-hosted HuggingFace pipeline for remote hardware
581 | with HuggingFace Transformers and AutoTokenizer ── Topic: 40 │ │
582 | └─■──Self-hosted embeddings for sentence_transformers with remote hardware
583 | support.\_**\_ ── Topic: 22 │ └─■──Self-hosted embeddings for running custom
584 | embedding models on remote hardware\_\_** ── Topic: 62 └─Python wrapper for
585 | OpenAI language model with API key authentication and model parameters
586 | configurat ├─OpenAI Language Model Wrapper Class with API Key Authentication and
587 | Model Parameters Configuration\_\_ │ ├─■──StochasticAI Wrapper for Large
588 | Language Models with Environment Key Validation and PDF Partitioning* ── Topic:
589 | 50 │ └─Integration of OpenAI Language Model with GooseAI class for Text
590 | Generation\_**\_ │ ├─■──OpenAI Chat Model Implementation\_\_** ── Topic: 37 │
591 | └─■──Python Wrapper for OpenAI Language Models\_**\_ ── Topic: 5 └─■──Anthropic
592 | Large Language Models and API Usage in AtlasDB Project Management\_\_** ──
593 | Topic: 83
594 |
595 | ```
596 |
597 | ## Pyknotid
598 |
599 | - https://pyknotid.readthedocs.io/en/latest/
600 |
601 | ```
602 |
603 | └─Calculation of Alexander polynomial for knots in Python and Mathematica.\_**\_
604 | ├─Alexander polynomial calculation using Mathematica process and knot routing
605 | algorithm with various p │ ├─Calculation of Alexander polynomial for knots using
606 | Python and Mathematica representations\_\_** │ │ ├─CellKnot object
607 | initialization and properties with sin, cos, linspace, phi, psi, theta,
608 | rotation, pe │ │ │ ├─Mollweide projection and spherical coordinates\_**\_ │ │ │
609 | │ ├─Rotation of Spheres using Rotation Matrices\_\_** │ │ │ │ │ ├─■──Rotation of
610 | sphere to align given positions at the top\_**\_ ── Topic: 41 │ │ │ │ │
611 | └─■──Rotation matrix computation and manipulation using iterable angles.\_\_**
612 | ── Topic: 18 │ │ │ │ └─Mollweide projection and conversion of spherical
613 | coordinates\_**\_ │ │ │ │ ├─Mollweide projection and spherical coordinates
614 | conversion\_\_** │ │ │ │ │ ├─■──Vector magnitude calculation, Mollweide
615 | projection, and well-written code in Python.\_**\_ ── Topic: 51 │ │ │ │ │
616 | └─■──"Mollweide projection and spherical coordinate conversion"\_\_** ── Topic:
617 | 30 │ │ │ │ └─■──Verbose printing function for Pyknotid counters.\_**\_ ── Topic:
618 | 10 │ │ │ └─CellKnot class and points folding\_\_** │ │ │ ├─CellKnot and Knot
619 | Folding\_**\_ │ │ │ │ ├─■──Understanding the "cell_trefoil" function and the
620 | "aperiodic_trefoil" function for creating interpol ── Topic: 37 │ │ │ │
621 | └─■──CellKnot class and related methods\_\_** ── Topic: 33 │ │ │ └─3D geometric
622 | scaling with numpy and crossing signs\_**\_ │ │ │ ├─Geometric Transformation
623 | with Crossing Signs\_\_** │ │ │ │ ├─■──Numpy arrays for creating and perturbing
624 | a simple link using sin and cos in Python code.\_**\_ ── Topic: 15 │ │ │ │
625 | └─■──Geometric transformation with crossing signs and np array\_\_** ── Topic: 3
626 | │ │ │ └─■──3D point scaling helper functions in p4_3**1, p4_4**1, p4_5**1_false,
627 | p5_3**1 and p5_4**1.\_\_** ── Topic: 47 │ │ └─Knot representations and
628 | calculation of Alexander polynomial using Python and Mathematica\_**\_ │ │
629 | ├─Line Segment Open by Distance Generator\_\_** │ │ │ ├─Issues with missing
630 | function definitions and potential bugs in serialisation functions.\_**\_ │ │ │
631 | │ ├─■──JSON and polynomial serialisation with potential implementation
632 | issues\_\_** ── Topic: 17 │ │ │ │ └─■──Issues with incomplete function for
633 | serialising Jones polynomials in Python\_**\_ ── Topic: 36 │ │ │ └─Line vectors
634 | open by distance fraction with seed and number of segments as parameters.\_\_**
635 | │ │ │ ├─Line segment manipulation and generation\_**\_ │ │ │ │ ├─Line Segments
636 | and Open/Closed Loop Detection\_\_** │ │ │ │ │ ├─■──Open and closed line
637 | segments generation with distance constraint.\_**\_ ── Topic: 5 │ │ │ │ │
638 | └─■──Writing Mathematica code to file and running it using MathKernel\_\_** ──
639 | Topic: 28 │ │ │ │ └─Loading and manipulating CSV files with Pandas and saving to
640 | JSON.\_**\_ │ │ │ │ ├─■──Writing and loading data in json format with numpy and
641 | handling file paths (filenotfounderror explan ── Topic: 14 │ │ │ │ └─■──Parsing
642 | CSV data using pandas in Python\_\_** ── Topic: 19 │ │ │ └─Downloading Knots
643 | Database with Pyknotid Library.\_**\_ │ │ │ ├─Knots database download and
644 | management\_\_** │ │ │ │ ├─■──Downloading Knots Database using Pyknotid
645 | Library\_**\_ ── Topic: 23 │ │ │ │ └─■──Deleting old versions of database files
646 | in specific format using Python.\_\_** ── Topic: 44 │ │ │ └─■──Recursive file
647 | inclusion using fnmatch patterns in Python\_**\_ ── Topic: 43 │ │ └─Alexander
648 | polynomial computation using Mathematica for knot representations\_\_** │ │
649 | ├─Calculation of Alexander polynomial using Python and Mathematica code
650 | snippets.\_**\_ │ │ │ ├─MeshCollectionVisual class and associated methods for
651 | vertex colors and shading in mesh visualizatio │ │ │ │ ├─Code Refactoring and
652 | Todo Tasks with Comments and Unit Tests\_\_** │ │ │ │ │ ├─■──Classes and
653 | functionality for handling periodic boundary conditions in a 2D space.\_**\_ ──
654 | Topic: 39 │ │ │ │ │ └─■──Code Refactoring and Unit Testing\_\_** ── Topic: 4 │ │
655 | │ │ └─MeshCollectionVisual class and vertex colors in 3D mesh
656 | visualization.\_**\_ │ │ │ │ ├─Signal Smoothing with Window Functions\_\_** │ │
657 | │ │ │ ├─■──Testing vector intersection in a dp/dq region using
658 | do_vectors_intersect function and obtaining bool ── Topic: 20 │ │ │ │ │ └─signal
659 | smoothing with different windows and sizes\_**\_ │ │ │ │ │ ├─■──Signal Smoothing
660 | using Different Windows\_\_** ── Topic: 49 │ │ │ │ │ └─■──Code organization and
661 | readability of periodic_vassiliev_degree_2 function in adherence with PEP 8 gu
662 | ── Topic: 26 │ │ │ │ └─MeshCollectionVisual class and related methods\_**\_ │ │
663 | │ │ ├─■──MeshCollectionVisual class and its methods for mesh visualization and
664 | handling vertex colors and sha ── Topic: 9 │ │ │ │ └─■──Cell object for lines
665 | with periodic boundary conditions\_\_** ── Topic: 45 │ │ │ └─Alexander
666 | polynomial calculation using Mathematica\_**\_ │ │ │ ├─Calculating the Alexander
667 | polynomial of knots using various representations\_\_** │ │ │ │ ├─Gauss code
668 | conversion to crossing indices\_**\_ │ │ │ │ │ ├─Recommendations for the "mag"
669 | function implementation in periodic.py\_\_** │ │ │ │ │ │ ├─■──Implementing
670 | vector magnitude using dot product in Pyknotid\_**\_ ── Topic: 8 │ │ │ │ │ │
671 | └─■──Improving code with imports and using numpy.zeros instead of n.zeros.\_\_**
672 | ── Topic: 50 │ │ │ │ │ └─■──Converting Gauss code to crossings in a crossing
673 | object\_**\_ ── Topic: 22 │ │ │ │ └─Calculation of Alexander polynomial using
674 | Mathematica for knot representations\_\_** │ │ │ │ ├─Knot theory and Alexander
675 | polynomial calculation using Mathematica\_**\_ │ │ │ │ │ ├─■──BoundingBox class
676 | implementation in Python with numpy and axis manipulation\_\_** ── Topic: 1 │ │
677 | │ │ │ └─■──Calculation of Alexander polynomial for knot representations using
678 | Mathematica\_**\_ ── Topic: 0 │ │ │ │ └─3D sphere plotting with Mollweide
679 | projection using VisPy\_\_** │ │ │ │ ├─■──"3D visualization of spherical data
680 | using VisPy and Mollweide projection"\_**\_ ── Topic: 2 │ │ │ │ └─■──Class
681 | definition of MeshCollectionVisual that creates a mesh by concatenating visuals'
682 | vertices, ind ── Topic: 12 │ │ │ └─Database objects matching invariants using
683 | Python code\_\_** │ │ │ ├─Database objects and invariants in knot theory\_**\_ │
684 | │ │ │ ├─■──"Database storage and manipulation of knots using Peewee and
685 | optimized Cython routines"\_\_** ── Topic: 35 │ │ │ │ └─■──Database searching
686 | with invariants in Python\_**\_ ── Topic: 7 │ │ │ └─■──OpenKnot class for
687 | holding vertices of an open curve in spacecurves module\_\_** ── Topic: 38 │ │
688 | └─Parsing data files and visualizing with matplotlib and mayavi/vispy.\_**\_ │ │
689 | ├─Code for drawing bounding boxes in 3D using VisPy\_\_** │ │ │ ├─Plotting
690 | Lissajous Conformation with Vispy and Mayavi Toolkits\_**\_ │ │ │ │
691 | ├─■──Plotting Lissajous conformations with plot_cell using Vispy and Mayavi
692 | toolkits\_\_** ── Topic: 13 │ │ │ │ └─■──Understanding the plot_line() function
693 | in pyknotid's visualise.py and its 3D plotting toolkits (Maya ── Topic: 27 │ │ │
694 | └─■──Bounding box visualization with VisPy\_**\_ ── Topic: 32 │ │ └─Analyzing
695 | knot type of curve in a data file using argparse parser and VisPy canvas
696 | plotting.\_\_** │ │ ├─■──Plotting 2D projections with optional markers in Python
697 | using pyplot\_**\_ ── Topic: 48 │ │ └─Analysis of knot types in data files using
698 | argparse and vispy_canvas.\_\_** │ │ ├─■──Working with VisPy Canvas and Scene
699 | objects\_**\_ ── Topic: 40 │ │ └─■──Parsing and analyzing knot types in data
700 | files using argparse\_\_** ── Topic: 42 │ └─Alternative Periodic Vassiliev
701 | Function with Conway Notation (Degree 4, Z4 Coefficients) and Core Cr │
702 | ├─■──Calculation of writhing numbers using Arrow diagrams and Gauss codes\_**\_
703 | ── Topic: 16 │ └─Alternative periodic Vassiliev function for Conway notation
704 | with z4 coefficients and related invaria │ ├─■──Arnold's invariants and their
705 | calculation by transforming representation into an unknot\_\_** ── Topic: 6 │
706 | └─■──Alternative periodic Vassiliev function in pyknotid with Conway notation
707 | and Z4 coefficients\_**\_ ── Topic: 11 └─"Calculating higher order writhe
708 | integrals using numpy and cython"\_\_** ├─Calculation of Higher Order Writhe
709 | Integral with NumPy and Cython Implementation.\_**\_ │ ├─Calculation of higher
710 | order writhe integral using points and order contributions\_\_** │ │
711 | ├─■──higher_order_writhe_integral function implementation with numpy\_**\_ ──
712 | Topic: 29 │ │ └─■──Writhing matrix and coefficient calculations for points in
713 | 3-dimensional space\_\_** ── Topic: 46 │ └─■──Calculation of Writhe and Average
714 | Crossing Number using Integral\_**\_ ── Topic: 21 └─■──Distance Quantity
715 | Calculation from Curve Integral\_\_** ── Topic: 31
716 |
717 | ```
718 |
719 | ## PyReason
720 |
721 | - https://github.com/lab-v2/pyreason
722 |
723 | ```
724 |
725 | . ├─Updating Interpretations Graph with Nodes and Edges\_**\_ │ ├─Analysis of
726 | Python code implementing a graph data structure and functions to add nodes and
727 | edges, us │ │ ├─■──Code Refactoring and Commenting, Handling None Values in
728 | Python Code\_\_** ── Topic: 10 │ │ └─Code analysis and review of a function for
729 | adding edges and nodes to a graph, including checking for │ │ ├─■──Positive
730 | aspects of a codebase with clear and descriptive function names and variable
731 | names.\_**\_ ── Topic: 4 │ │ └─■──Methods for adding edges to a graph with
732 | considerations for existing nodes and labels.\_\_** ── Topic: 11 │ └─Updating
733 | nodes and edges in a graph with rule trace and atom trace, using Numba for
734 | optimization\_**\_ │ ├─Python functions for updating rule traces with graph
735 | attributes\_\_** │ │ ├─■──Updating nodes with rules and traces in a converging
736 | system.\_**\_ ── Topic: 15 │ │ └─■──Interpretation of Graph Labels Using Numba
737 | in Python\_\_** ── Topic: 5 │ └─analysis of profiling data for program
738 | optimization\_**\_ │ ├─■──Parsing YAML to create a list of rules\_\_** ── Topic:
739 | 9 │ └─■──Parsing and Profiling Data from CSV files in Python\_**\_ ── Topic: 1
740 | └─Python Object Getter Implementation for Fact Objects\_\_** ├─Python code
741 | structure and implementation in pyreason's numba_wrapper lib\_**\_ │ ├─Functions
742 | and objects in pyreason's numba_wrapper module that return nested functions and
743 | implement │ │ ├─■──Function for getting time lower and upper bounds from a fact
744 | object\_\_** ── Topic: 14 │ │ └─Higher-order functions in rule_type.py for
745 | accessing fields of a rule object in pyreason/pyreason/sc │ │ ├─Python classes
746 | and object-oriented programming concepts with "Fact" class examples.\_**\_ │ │ │
747 | ├─■──Nested function type checking with isinstance in Python\_\_** ── Topic: 8 │
748 | │ │ └─■──Class Fact and its attributes and methods.\_**\_ ── Topic: 7 │ │
749 | └─Numba implementation functions for label and world types in PyReason\_\_** │ │
750 | ├─Higher-order functions for getting attributes of rules in PyReason's
751 | numba_types module\_**\_ │ │ │ ├─■──Code structure and naming conventions in
752 | pyreason's numba_wrapper/numpy_types directory\_\_** ── Topic: 6 │ │ │
753 | └─■──Implementation details of the `get_target_criteria` function and
754 | `unbox_rule` in rule_type.py\_**\_ ── Topic: 0 │ │ └─■──Implementation of
755 | interval methods in pyreason using numba_wrapper\_\_** ── Topic: 2 │
756 | └─Compliments on codebase functions for calculating minimum, maximum, average,
757 | and average lower using │ ├─■──Implementing a reset function to update the
758 | bounds of an interval object in Python\_**\_ ── Topic: 18 │ └─■──Compliments on
759 | Function Names and Docstrings for Array Calculation Functions\_\_** ── Topic: 13
760 | └─Working with pyapi and objects in Python code\_**\_ ├─Understanding a Python
761 | code snippet for unboxing facts with structmodels\_\_** │ ├─■──Object Unboxing
762 | and Attribute Retrieval in Python with PyAPI\_**\_ ── Topic: 17 │ └─Numba code
763 | for creating and boxing a struct model instance\_\_** │ ├─■──Code snippet for
764 | creating a struct proxy and boxing its components in Numba.\_**\_ ── Topic: 3 │
765 | └─■──Python class RuleModel with labeled attributes and types\_\_** ── Topic: 12
766 | └─■──Functions for creating and boxing objects in a Python-C++ interface\_\_\_\_
767 | ── Topic: 16
768 |
769 | ```
770 |
--------------------------------------------------------------------------------
/__main__.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from getpass import getpass
3 |
4 | import openai
5 |
6 | from src import Pipeline
7 |
8 | if __name__ == "__main__":
9 | argsparse = argparse.ArgumentParser()
10 | argsparse.add_argument("--config", type=str, default="./test_config.yaml")
11 | argsparse.add_argument("--repo", type=str, default="https://github.com/LAION-AI/Open-Assistant.git")
12 | argsparse.add_argument("--repo_name", type=str, default="OA")
13 |
14 | args = argsparse.parse_args()
15 |
16 | openai_secret = getpass("Enter the secret key: ")
17 | # Set up OpenAI API credentials
18 | openai.api_key = openai_secret
19 |
20 | print("starting pipeline")
21 | pipeline = Pipeline.from_yaml(args.config)
22 | gnn_head_outputs, topic_model_outputs = pipeline.run(args.repo, args.repo_name)
23 | for i, topic_model_output in enumerate(topic_model_outputs):
24 | topic_model_output["data"].to_csv(f"context/{args.repo_name}_topic_model_outputs_{i}.csv")
25 | topic_model_output["topic_info"].to_csv(f"context/{args.repo_name}_topic_info_{i}.csv")
26 | with open(f"context/{args.repo_name}_tree_{i}.txt", "w", encoding="utf-8") as f:
27 | f.write(topic_model_output["tree"])
28 |
--------------------------------------------------------------------------------
/git-frontend/.gitignore:
--------------------------------------------------------------------------------
1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
2 |
3 | # dependencies
4 | /node_modules
5 | /.pnp
6 | .pnp.js
7 |
8 | # testing
9 | /coverage
10 |
11 | # production
12 | /build
13 |
14 | # misc
15 | .DS_Store
16 | .env.local
17 | .env.development.local
18 | .env.test.local
19 | .env.production.local
20 |
21 | npm-debug.log*
22 | yarn-debug.log*
23 | yarn-error.log*
24 |
--------------------------------------------------------------------------------
/git-frontend/README.md:
--------------------------------------------------------------------------------
1 | # Getting Started with Create React App
2 |
3 | This project was bootstrapped with
4 | [Create React App](https://github.com/facebook/create-react-app).
5 |
6 | ## Available Scripts
7 |
8 | In the project directory, you can run:
9 |
10 | ### `npm start`
11 |
12 | Runs the app in the development mode.\
13 | Open [http://localhost:3000](http://localhost:3000) to view it in your browser.
14 |
15 | The page will reload when you make changes.\
16 | You may also see any lint errors in the console.
17 |
18 | ### `npm test`
19 |
20 | Launches the test runner in the interactive watch mode.\
21 | See the section about [running tests](https://facebook.github.io/create-react-app/docs/running-tests)
22 | for more information.
23 |
24 | ### `npm run build`
25 |
26 | Builds the app for production to the `build` folder.\
27 | It correctly bundles React in production mode and optimizes the build for the best
28 | performance.
29 |
30 | The build is minified and the filenames include the hashes.\
31 | Your app is ready to be deployed!
32 |
33 | See the section about
34 | [deployment](https://facebook.github.io/create-react-app/docs/deployment) for
35 | more information.
36 |
37 | ### `npm run eject`
38 |
39 | **Note: this is a one-way operation. Once you `eject`, you can't go back!**
40 |
41 | If you aren't satisfied with the build tool and configuration choices, you can
42 | `eject` at any time. This command will remove the single build dependency from
43 | your project.
44 |
45 | Instead, it will copy all the configuration files and the transitive
46 | dependencies (webpack, Babel, ESLint, etc) right into your project so you have
47 | full control over them. All of the commands except `eject` will still work, but
48 | they will point to the copied scripts so you can tweak them. At this point
49 | you're on your own.
50 |
51 | You don't have to ever use `eject`. The curated feature set is suitable for
52 | small and middle deployments, and you shouldn't feel obligated to use this
53 | feature. However we understand that this tool wouldn't be useful if you couldn't
54 | customize it when you are ready for it.
55 |
56 | ## Learn More
57 |
58 | You can learn more in the
59 | [Create React App documentation](https://facebook.github.io/create-react-app/docs/getting-started).
60 |
61 | To learn React, check out the [React documentation](https://reactjs.org/).
62 |
63 | ### Code Splitting
64 |
65 | This section has moved here:
66 | [https://facebook.github.io/create-react-app/docs/code-splitting](https://facebook.github.io/create-react-app/docs/code-splitting)
67 |
68 | ### Analyzing the Bundle Size
69 |
70 | This section has moved here:
71 | [https://facebook.github.io/create-react-app/docs/analyzing-the-bundle-size](https://facebook.github.io/create-react-app/docs/analyzing-the-bundle-size)
72 |
73 | ### Making a Progressive Web App
74 |
75 | This section has moved here:
76 | [https://facebook.github.io/create-react-app/docs/making-a-progressive-web-app](https://facebook.github.io/create-react-app/docs/making-a-progressive-web-app)
77 |
78 | ### Advanced Configuration
79 |
80 | This section has moved here:
81 | [https://facebook.github.io/create-react-app/docs/advanced-configuration](https://facebook.github.io/create-react-app/docs/advanced-configuration)
82 |
83 | ### Deployment
84 |
85 | This section has moved here:
86 | [https://facebook.github.io/create-react-app/docs/deployment](https://facebook.github.io/create-react-app/docs/deployment)
87 |
88 | ### `npm run build` fails to minify
89 |
90 | This section has moved here:
91 | [https://facebook.github.io/create-react-app/docs/troubleshooting#npm-run-build-fails-to-minify](https://facebook.github.io/create-react-app/docs/troubleshooting#npm-run-build-fails-to-minify)
92 |
--------------------------------------------------------------------------------
/git-frontend/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "git-frontend",
3 | "version": "0.1.0",
4 | "private": true,
5 | "dependencies": {
6 | "@emotion/react": "^11.10.6",
7 | "@emotion/styled": "^11.10.6",
8 | "@mui/icons-material": "^5.11.11",
9 | "@mui/material": "^5.11.14",
10 | "@testing-library/jest-dom": "^5.16.5",
11 | "@testing-library/react": "^13.4.0",
12 | "@testing-library/user-event": "^13.5.0",
13 | "react": "^18.2.0",
14 | "react-dom": "^18.2.0",
15 | "react-scripts": "5.0.1",
16 | "web-vitals": "^2.1.4"
17 | },
18 | "scripts": {
19 | "preinstall": "npx npm-force-resolutions",
20 | "start": "react-scripts start",
21 | "build": "react-scripts build",
22 | "test": "react-scripts test",
23 | "eject": "react-scripts eject"
24 | },
25 | "eslintConfig": {
26 | "extends": [
27 | "react-app",
28 | "react-app/jest"
29 | ]
30 | },
31 | "browserslist": {
32 | "production": [
33 | ">0.2%",
34 | "not dead",
35 | "not op_mini all"
36 | ],
37 | "development": [
38 | "last 1 chrome version",
39 | "last 1 firefox version",
40 | "last 1 safari version"
41 | ]
42 | },
43 | "resolutions": {
44 | "nth-check": ">=2.0.1"
45 | },
46 | "devDependencies": {
47 | "npm-force-resolutions": "0.0.10"
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/git-frontend/public/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danielpatrickhug/GitModel/3764c938f659d0dc3b51ab3b4394aa5974fcc49e/git-frontend/public/favicon.ico
--------------------------------------------------------------------------------
/git-frontend/public/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
116 | );
117 | }
118 |
119 | export default App;
120 |
--------------------------------------------------------------------------------
/git-frontend/src/App.test.js:
--------------------------------------------------------------------------------
1 | import { render, screen } from "@testing-library/react";
2 | import App from "./App";
3 |
4 | test("renders learn react link", () => {
5 | render(<App />);
6 | const linkElement = screen.getByText(/learn react/i);
7 | expect(linkElement).toBeInTheDocument();
8 | });
9 |
--------------------------------------------------------------------------------
/git-frontend/src/index.css:
--------------------------------------------------------------------------------
1 | body {
2 | margin: 0;
3 | font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", "Roboto", "Oxygen",
4 | "Ubuntu", "Cantarell", "Fira Sans", "Droid Sans", "Helvetica Neue",
5 | sans-serif;
6 | -webkit-font-smoothing: antialiased;
7 | -moz-osx-font-smoothing: grayscale;
8 | }
9 |
10 | code {
11 | font-family: source-code-pro, Menlo, Monaco, Consolas, "Courier New",
12 | monospace;
13 | }
14 |
--------------------------------------------------------------------------------
/git-frontend/src/index.js:
--------------------------------------------------------------------------------
1 | import React from "react";
2 | import ReactDOM from "react-dom/client";
3 | import "./index.css";
4 | import App from "./App";
5 | import reportWebVitals from "./reportWebVitals";
6 |
7 | const root = ReactDOM.createRoot(document.getElementById("root"));
8 | root.render(
9 |   <React.StrictMode>
10 |     <App />
11 |   </React.StrictMode>
12 | );
13 |
14 | // If you want to start measuring performance in your app, pass a function
15 | // to log results (for example: reportWebVitals(console.log))
16 | // or send to an analytics endpoint. Learn more: https://bit.ly/CRA-vitals
17 | reportWebVitals();
18 |
--------------------------------------------------------------------------------
/git-frontend/src/logo.svg:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/git-frontend/src/reportWebVitals.js:
--------------------------------------------------------------------------------
1 | const reportWebVitals = (onPerfEntry) => {
2 | if (onPerfEntry && onPerfEntry instanceof Function) {
3 | import("web-vitals").then(({ getCLS, getFID, getFCP, getLCP, getTTFB }) => {
4 | getCLS(onPerfEntry);
5 | getFID(onPerfEntry);
6 | getFCP(onPerfEntry);
7 | getLCP(onPerfEntry);
8 | getTTFB(onPerfEntry);
9 | });
10 | }
11 | };
12 |
13 | export default reportWebVitals;
14 |
--------------------------------------------------------------------------------
/git-frontend/src/setupTests.js:
--------------------------------------------------------------------------------
1 | // jest-dom adds custom jest matchers for asserting on DOM nodes.
2 | // allows you to do things like:
3 | // expect(element).toHaveTextContent(/react/i)
4 | // learn more: https://github.com/testing-library/jest-dom
5 | import "@testing-library/jest-dom";
6 |
--------------------------------------------------------------------------------
/images/self_portrait.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danielpatrickhug/GitModel/3764c938f659d0dc3b51ab3b4394aa5974fcc49e/images/self_portrait.jpg
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import openai
2 | from fastapi import FastAPI, HTTPException
3 | from fastapi.middleware.cors import CORSMiddleware
4 | from fastapi.responses import JSONResponse
5 | from pydantic import BaseModel
6 |
7 | from src import Pipeline
8 |
9 | app = FastAPI()
10 |
11 | # Add the CORS middleware
12 | app.add_middleware(
13 | CORSMiddleware,
14 | allow_origins=["*"],
15 | allow_credentials=True,
16 | allow_methods=["*"],
17 | allow_headers=["*"],
18 | )
19 |
20 |
21 | class AnalyzeRepoInput(BaseModel):
22 | config: str
23 | repo: str
24 | repo_name: str
25 | openai_secret: str
26 |
27 |
28 | @app.post("/api/topic-modeling")
29 | async def analyze_repo(input_data: AnalyzeRepoInput):
30 | openai_secret = input_data.openai_secret
31 | openai.api_key = openai_secret
32 |
33 | pipeline = Pipeline.from_yaml(input_data.config)
34 | try:
35 | gnn_head_outputs, topic_model_outputs = pipeline.run(input_data.repo, input_data.repo_name)
36 | if topic_model_outputs:
37 | topic_model_output = topic_model_outputs[0]
38 | return JSONResponse(content=topic_model_output["tree"])
39 | else:
40 | raise HTTPException(status_code=400, detail="Error processing topic model")
41 | except Exception as e:
42 | raise HTTPException(status_code=500, detail=str(e))
43 |
44 |
45 | if __name__ == "__main__":
46 | import uvicorn
47 |
48 | uvicorn.run(app, host="0.0.0.0", port=8000)
49 |
--------------------------------------------------------------------------------
/papers/2203.15544.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danielpatrickhug/GitModel/3764c938f659d0dc3b51ab3b4394aa5974fcc49e/papers/2203.15544.pdf
--------------------------------------------------------------------------------
/papers/README.md:
--------------------------------------------------------------------------------
1 | # Essays
2 |
3 | ## Chat GPT 3.5
4 |
5 | Prompt:
6 |
7 | - Write an essay on code repository that can topic model any python repo even
8 | its own. what are the implications on a global scale. touch on topic modeling
9 | operating systems
10 |
11 | > Code repositories are integral to the software development process as they
12 | > enable developers to collaborate, share and maintain code across different
13 | > platforms. One significant advancement in code repository technology is the
14 | > ability to perform topic modeling on Python repositories. Topic modeling
15 | > refers to the process of identifying patterns and topics within a given text
16 | > corpus, which can be useful for tasks such as categorizing documents,
17 | > clustering similar documents, and even identifying latent semantic
18 | > relationships between different code files.
19 |
20 | > One exciting aspect of the ability to topic model Python repositories is that
21 | > it can be applied to any repository, including the repository of the tool
22 | > performing the analysis. This means that the tool itself can be continuously
23 | > improved as it analyzes new data, allowing it to better identify trends and
24 | > topics in the codebase. This continuous learning can lead to more accurate and
25 | > relevant insights, as the tool can evolve alongside the code it analyzes.
26 |
27 | > The implications of this technology on a global scale are significant. As more
28 | > and more Python code is produced and shared globally, the ability to
29 | > efficiently categorize, search and discover relevant code becomes increasingly
30 | > important. Topic modeling can enable better search and discovery of code,
31 | > allowing developers to more easily find code that is relevant to their needs.
32 | > This can lead to increased collaboration and faster development cycles, as
33 | > developers can more easily share code and build on top of existing work.
34 |
35 | > Furthermore, topic modeling can help identify trends and best practices in
36 | > code development. For example, if a topic model identifies that a particular
37 | > function or library is being used extensively across different projects, it
38 | > may indicate that this function or library is particularly useful or
39 | > efficient. This can help inform decisions on which libraries to use, which
40 | > functions to write, and even which design patterns to employ.
41 |
42 | > Finally, it is worth noting that topic modeling is not limited to Python
43 | > repositories. Operating systems themselves can be analyzed using topic
44 | > modeling techniques, allowing us to better understand the structure and
45 | > organization of operating system code. This can help improve the performance
46 | > and reliability of operating systems, which are critical components of modern
47 | > computing. In conclusion, the ability to topic model Python repositories has
48 | > significant implications for the development and maintenance of software on a
49 | > global scale. It can enable better search and discovery of code, identify
50 | > trends and best practices, and even inform decisions on which libraries to use
51 | > and which functions to write. As the volume of Python code continues to grow,
52 | > the ability to efficiently analyze and understand this code becomes
53 | > increasingly important.
54 |
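As a minimal sketch of the topic-modeling idea above (not necessarily the model
used in this repository), the snippet below fits a plain LDA model with
scikit-learn over a few invented docstring-like summaries; the strings, topic
count, and printed terms are illustrative only.

```python
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Hypothetical summaries standing in for code documentation.
docs = [
    "load a git repository and parse python files into an ast",
    "compute sentence embeddings and build a similarity graph",
    "run message passing over the graph and cluster node embeddings",
    "fit a topic model and print the hierarchical topic tree",
]

vectorizer = CountVectorizer(stop_words="english")
X = vectorizer.fit_transform(docs)

lda = LatentDirichletAllocation(n_components=2, random_state=0)
doc_topics = lda.fit_transform(X)  # shape: (n_docs, n_topics)

# Show the top terms per discovered topic.
terms = vectorizer.get_feature_names_out()
for k, weights in enumerate(lda.components_):
    top_terms = [terms[i] for i in weights.argsort()[::-1][:5]]
    print(f"topic {k}: {top_terms}")
```
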
55 | Prompt:
56 |
57 | - Write a section on how generating semantic content over a code repository with
58 | self instruct system prompts can then be passed into graph neural networks and
59 | graph attention networks with the code it was generated on can be used to
60 | topic model repositories and its dependencies
61 |
62 | > Once semantic content has been generated over a code repository using
63 | > self-instruct system prompts, the next step is to use this data to develop a
64 | > topic model for the repository and its dependencies. One powerful technique
65 | > for accomplishing this is through the use of graph neural networks (GNNs) and
66 | > graph attention networks (GATs).
67 |
68 | > GNNs are neural networks that can operate directly on graphs, allowing them to
69 | > model complex relationships between different nodes within the graph. In the
70 | > context of code repositories, each file or function within the repository can
71 | > be represented as a node in a graph, with edges connecting nodes that have
72 | > some form of relationship. For example, a function that calls another function
73 | > would be connected to the function it calls, while a file that imports another
74 | > file would be connected to the file it imports.
75 |
76 | > By passing the semantic content generated over a code repository into a GNN,
77 | > we can train the network to learn relationships between different nodes within
78 | > the graph. For example, the GNN may learn that certain functions tend to be
79 | > used together, or that certain files are commonly imported together. This
80 | > information can then be used to identify different topics or clusters within
81 | > the repository, based on the relationships between different nodes.
82 |
83 | > GATs are a variant of GNNs that use attention mechanisms to selectively focus
84 | > on specific parts of the graph when making predictions. This can be useful in
85 | > the context of code repositories, as it allows the network to identify the
86 | > most relevant nodes when identifying topics or clusters. For example, if the
87 | > network is trying to identify a particular library that is commonly used
88 | > within the repository, it may focus on nodes that are related to that library,
89 | > such as files that import the library or functions that use it extensively.
90 |
91 | > By combining semantic content generated over a code repository with GNNs and
92 | > GATs, we can develop a powerful tool for topic modeling repositories and their
93 | > dependencies. This can be useful for a range of tasks, including identifying
94 | > the most commonly used libraries, identifying clusters of related functions or
95 | > files, and even identifying potential code quality issues or security
96 | > vulnerabilities within the repository.
97 |
98 | > In summary, the combination of semantic content generation, GNNs and GATs can
99 | > enable powerful topic modeling of code repositories and their
100 | > dependencies. This can help developers more easily navigate and understand
101 | > large code bases, and can even lead to improved code quality and security.
102 |
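To make the "files as nodes, imports as edges" idea above concrete, here is a
toy sketch with networkx and numpy: the file names come from this repository's
tree, but the import edges and the random stand-in embeddings are assumptions,
and the single round of mean-aggregation message passing is an illustration of
the mechanism rather than the GNN implementation used here.

```python
import networkx as nx
import numpy as np

# Hypothetical code graph: nodes are files, edges are assumed import relationships.
g = nx.DiGraph()
g.add_edges_from([
    ("pipeline.py", "fetch_repo.py"),
    ("pipeline.py", "topic_model.py"),
    ("topic_model.py", "kernels.py"),
])

# Stand-in node features (e.g. embeddings of generated summaries), dim = 4.
rng = np.random.default_rng(0)
h = {n: rng.normal(size=4) for n in g.nodes}

# One round of mean-aggregation message passing over the undirected graph:
# each node's new feature is the mean of itself and its neighbours.
ug = g.to_undirected()
h_next = {
    n: np.stack([h[n]] + [h[m] for m in ug.neighbors(n)]).mean(axis=0)
    for n in ug.nodes
}

for name, vec in h_next.items():
    print(name, np.round(vec, 2))
```
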
103 | ## GPT4
104 |
105 | #### Prompt:
106 |
107 | - Write an essay on code repository that can topic model any python repo even
108 | its own. what are the implications on a global scale. touch on topic modeling
109 | operating systems, and using GNN to message pass between repos
110 |
111 | > Title: Topic Modeling and Code Repositories: A Future of Global Collaboration
112 | > and Innovation
113 |
114 | #### Introduction
115 |
116 | > Code repositories have revolutionized the software development process by
117 | > providing a centralized platform for developers to store, manage, and share
118 | > their code. With the rapid growth of open-source software, there is an
119 | > increasing need to leverage advanced machine learning techniques to better
120 | > understand and organize the vast amount of code available. One such method is
121 | > topic modeling, which can be applied to any Python repository, including its
122 | > own. This essay will explore the implications of such a system on a global
123 | > scale, discussing its potential impact on operating systems and the use of
124 | > Graph Neural Networks (GNN) for message passing between repositories.
125 |
126 | #### Topic Modeling and Code Repositories
127 |
128 | > Topic modeling is a machine learning technique used to automatically identify
129 | > and group related themes or topics within a large collection of documents. In
130 | > the context of code repositories, this technique can be employed to analyze
131 | > the underlying structure and organization of the codebase, making it easier
132 | > for developers to navigate, understand, and contribute to the project. By
133 | > applying topic modeling to any Python repository, including its own, a new
134 | > level of abstraction and organization can be achieved, allowing for more
135 | > efficient collaboration and innovation.
136 |
137 | #### Global Implications
138 |
139 | > The ability to automatically topic model any Python repository, even its own,
140 | > has far-reaching implications on a global scale. First and foremost, this
141 | > technique can lead to better collaboration between developers from diverse
142 | > backgrounds and skill levels. By providing a more organized and understandable
143 | > view of a codebase, developers can more easily find relevant sections of the
144 | > code and identify where their expertise can be applied.
145 |
146 | > Furthermore, topic modeling can facilitate the identification and reuse of
147 | > existing code, promoting more efficient software development practices. By
148 | > identifying and categorizing similar code snippets across different
149 | > repositories, developers can more easily leverage pre-existing solutions to
150 | > common problems, reducing the need for redundant work and improving overall
151 | > software quality.
152 |
153 | #### Topic Modeling Operating Systems
154 |
155 | > Operating systems are complex and consist of a vast amount of code. The
156 | > application of topic modeling to operating systems can significantly improve
157 | > their organization, maintainability, and extensibility. By grouping related
158 | > code modules, developers can more easily understand the interactions between
159 | > different components of the system, allowing them to more effectively debug,
160 | > optimize, and enhance the operating system.
161 |
162 | > Additionally, topic modeling operating systems can lead to more modular
163 | > designs, enabling better cross-platform compatibility and easier integration
164 | > of new features. This, in turn, can foster greater innovation and
165 | > collaboration between different operating system communities, ultimately
166 | > benefiting the global development ecosystem.
167 |
168 | #### Graph Neural Networks for Message Passing between Repositories
169 |
170 | > Graph Neural Networks (GNN) provide a powerful tool for message passing
171 | > between repositories, enabling more efficient collaboration and knowledge
172 | > sharing. By representing code repositories as nodes in a graph and connecting
173 | > them based on their topical similarity, GNNs can be used to propagate
174 | > information and learn high-level representations of the code.
175 |
176 | > This approach can help identify and bridge gaps in knowledge between different
177 | > repositories, allowing developers to leverage insights from related projects
178 | > more effectively. Moreover, the use of GNNs for message passing between
179 | > repositories can facilitate the transfer of best practices and code patterns
180 | > across different development communities, promoting more consistent and
181 | > high-quality software development on a global scale.
182 |
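As a rough illustration of the repository-level graph described above (the
repository names, topic vectors, and threshold are all made up), the sketch
below connects repositories whose topic distributions are cosine-similar,
producing the kind of graph a GNN could message-pass over.

```python
import numpy as np

# Hypothetical repo-level topic distributions (one row per repository).
repos = ["repo_a", "repo_b", "repo_c"]
topics = np.array([
    [0.6, 0.3, 0.1],
    [0.2, 0.2, 0.6],
    [0.5, 0.4, 0.1],
])

# Cosine similarity between repositories; add an edge when it exceeds a threshold.
unit = topics / np.linalg.norm(topics, axis=1, keepdims=True)
sim = unit @ unit.T

threshold = 0.9
edges = [
    (repos[i], repos[j], round(float(sim[i, j]), 2))
    for i in range(len(repos))
    for j in range(i + 1, len(repos))
    if sim[i, j] > threshold
]
print(edges)
```
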
183 | #### Conclusion
184 |
185 | > The application of topic modeling to code repositories, including the ability
186 | > to model any Python repository, even its own, has the potential to
187 | > revolutionize software development on a global scale. By improving code
188 | > organization, facilitating collaboration, and fostering the reuse of existing
189 | > solutions, topic modeling can contribute to more efficient and innovative
190 | > software development practices. The integration of topic modeling with
191 | > operating systems and the use of Graph Neural Networks for message passing
192 | > between repositories further enhances the potential for global collaboration,
193 | > knowledge sharing, and the advancement of the software development ecosystem.
194 |
195 | #### Prompt:
196 |
197 | - Write a story about a person that builds a code repository that can topic
198 | model any python repo even its own.
199 |
200 | > Once upon a time in the quaint little town of Codeville, a programmer named
201 | > Alex was known for their extraordinary coding skills. Having spent years
202 | > mastering the intricacies of Python, Alex was well-respected and admired by
203 | > their peers. Yet, they weren't satisfied with just writing code—they aspired
204 | > to create something groundbreaking, a testament to their talents and a gift to
205 | > the programming world.
206 |
207 | > One day, while reading about machine learning, Alex stumbled upon an
208 | > interesting idea: topic modeling. It was a powerful technique to discover
209 | > hidden patterns in large collections of text documents. Alex's eyes lit up
210 | > with excitement, and an ambitious plan took shape in their mind—a code
211 | > repository that could perform topic modeling on any Python repository, even
212 | > its own!
213 |
214 | > And so, Alex embarked on a journey to create the ultimate code repository.
215 | > They named their project "PythonThinker," as a tribute to the famed
216 | > philosopher and mathematician, Pythagoras. They dedicated countless hours to
217 | > researching natural language processing, machine learning algorithms, and
218 | > various Python libraries. With every passing day, the idea of PythonThinker
219 | > became more real, as Alex poured their heart and soul into the project.
220 |
221 | > Alex began by designing a system to extract and analyze code from any Python
222 | > repository. This involved tokenizing the code, removing stop words, and
223 | > normalizing the data. They then implemented a state-of-the-art topic modeling
224 | > algorithm that could adapt to the structure and complexity of any codebase.
225 | > This, in turn, allowed PythonThinker to reveal hidden topics, relationships,
226 | > and patterns within the code.
227 |
228 | > As the project progressed, Alex realized that PythonThinker had the potential
229 | > to revolutionize software development. By understanding the underlying topics
230 | > within codebases, developers could gain insights into the organization and
231 | > structure of projects, allowing them to quickly identify areas for improvement
232 | > and optimization.
233 |
234 | > After months of tireless work, PythonThinker was finally complete. It was a
235 | > beautifully crafted piece of software, capable of ingesting any Python
236 | > repository and spewing out valuable insights. What truly set it apart,
237 | > however, was its ability to introspect—to topic model its very own codebase.
238 |
239 | > With great anticipation, Alex fed PythonThinker its own codebase as input. As
240 | > the algorithm whirred to life, it analyzed every line of code, every function,
241 | > and every module it contained. The moment of truth was finally here. Alex
242 | > nervously awaited the results, hoping their masterpiece would live up to its
243 | > promise.
244 |
245 | > And it did. PythonThinker was able to successfully topic model its own code,
246 | > revealing a beautiful tapestry of interconnected topics and dependencies. Alex
247 | > was elated. Their creation was not only a testament to their programming
248 | > prowess, but it also demonstrated the power of self-aware software—a code
249 | > repository capable of understanding and analyzing itself.
250 |
251 | > The news of PythonThinker's success spread like wildfire across Codeville and
252 | > beyond. Developers from around the world flocked to the project, eager to
253 | > harness its power to improve their own codebases. And as Alex watched their
254 | > creation soar to new heights, they felt an immense sense of pride and
255 | > accomplishment.
256 |
257 | > PythonThinker continued to evolve, becoming an indispensable tool for
258 | > developers everywhere. Alex's creation had transformed the world of
259 | > programming, proving that with hard work, passion, and a touch of genius, even
260 | > the wildest dreams can come true. And so, the legend of Alex and PythonThinker
261 | > lived on, inspiring countless programmers to push the boundaries of what is
262 | > possible with code.
263 |
264 | #### Prompt:
265 |
266 | - Write a story about a 26-year-old AI developer from long island, that builds a
267 | code repository that can topic model any python repo even its own.
268 |
269 | > Once upon a time on the picturesque shores of Long Island, there lived a
270 | > brilliant 26-year-old AI developer named Charlie. Known for his remarkable
271 | > coding skills and knack for innovation, Charlie was already a highly respected
272 | > member of the tech community. His passion for programming and machine learning
273 | > seemed boundless, but there was one challenge that consumed his thoughts:
274 | > creating a code repository that could perform topic modeling on any Python
275 | > repository, including its very own.
276 |
277 | > Charlie had long been intrigued by topic modeling—a powerful technique that
278 | > uncovers hidden patterns and relationships within large collections of text
279 | > documents. He believed that such an algorithm could revolutionize the software
280 | > development process, offering valuable insights and fostering collaboration
281 | > among developers.
282 |
283 | > And so, Charlie embarked on a quest to create the ultimate code repository,
284 | > one that would stand as a testament to his ingenuity and determination. He
285 | > named his ambitious project "AthenaCode," inspired by the Greek goddess of
286 | > wisdom and strategy, Athena.
287 |
288 | > Charlie dedicated himself to the task with unmatched intensity. He spent
289 | > countless hours poring over research papers, studying natural language
290 | > processing, and experimenting with cutting-edge machine learning algorithms.
291 | > He even reached out to experts from around the world, seeking their guidance
292 | > and insights.
293 |
294 | > AthenaCode's foundation was built on a system that could efficiently extract
295 | > and analyze code from any Python repository. Charlie designed it to tokenize
296 | > the code, remove stop words, and normalize the data. Once this was done, he
297 | > implemented a state-of-the-art topic modeling algorithm capable of adapting to
298 | > the unique structure and complexity of any given codebase.
299 |
300 | > Charlie's hard work soon bore fruit. AthenaCode began to reveal hidden topics,
301 | > relationships, and patterns within codebases, providing valuable insights into
302 | > the organization and structure of projects. Developers could now quickly
303 | > identify areas for improvement and optimization, making their work more
304 | > efficient and effective.
305 |
306 | > But the true test of AthenaCode's power lay in its ability to introspect—to
307 | > topic model its own codebase. With a mix of excitement and trepidation,
308 | > Charlie fed AthenaCode's code into its algorithm. As it processed each line of
309 | > code, every function, and every module, Charlie watched in awe as his creation
310 | > came to life.
311 |
312 | > The results were nothing short of astonishing. AthenaCode had successfully
313 | > topic modeled its own codebase, revealing a web of interconnected topics and
314 | > dependencies that even Charlie had not anticipated. Overwhelmed with pride, he
315 | > knew his creation was destined to change the world of programming.
316 |
317 | > Word of AthenaCode's success spread like wildfire across the tech community.
318 | > Developers from around the globe clamored to use Charlie's revolutionary tool
319 | > to optimize their own codebases. As he watched his creation gain momentum,
320 | > Charlie couldn't help but feel a deep sense of satisfaction and
321 | > accomplishment.
322 |
323 | > With time, AthenaCode continued to evolve and grow, becoming an indispensable
324 | > resource for developers everywhere. Charlie's groundbreaking creation had
325 | > transformed the software development landscape, demonstrating that with
326 | > determination, hard work, and a touch of genius, even the most ambitious
327 | > dreams can be realized.
328 |
329 | > And so, the story of Charlie and AthenaCode lived on, inspiring countless
330 | > programmers to reach for the stars and push the boundaries of what is possible
331 | > with code. And as the sun set over the shores of Long Island, the tale of a
332 | > young AI developer and his remarkable creation continued to spread, leaving an
333 | > indelible mark on the world of programming.
334 |
335 | #### Prompt:
336 |
337 | - Write a story about a person that builds a code repository that can topic model any python
338 |   repo even its own. The main focus is message passing between code bases from different branches of physics and the quest for a unifying framework.
339 |
340 | > In the bustling city of Scientia, a curious programmer named Dr. Ada Smith was
341 | > known for her relentless pursuit of knowledge. Adept at Python programming and
342 | > passionate about both machine learning and the mysteries of the universe, Ada
343 | > sought to understand the complex relationships between different branches of
344 | > physics. Her dream was to uncover a unifying framework that could bridge the
345 | > gaps between these fields, and she believed that the key to unlocking this
346 | > puzzle lay in the code repositories of the world's leading physicists.
347 |
348 | > In her quest, Ada conceived an ambitious plan: to build a code repository that
349 | > could topic model any Python repo—even its own. She envisioned a system that
350 | > could not only analyze and understand the intricate patterns within individual
351 | > codebases, but also facilitate message passing between these codebases,
352 | > allowing for seamless collaboration and the exchange of knowledge between
353 | > physicists working in diverse fields.
354 |
355 | > And so, with unwavering determination, Ada embarked on her groundbreaking
356 | > project, which she named "QuantumWeaver." The name was inspired by the way her
357 | > creation would weave together the fabric of knowledge from various branches of
358 | > physics, much like the interconnected threads of a tapestry.
359 |
360 | > Ada devoted herself to the task, meticulously studying natural language
361 | > processing, machine learning algorithms, and various Python libraries to
362 | > create a state-of-the-art topic modeling system. QuantumWeaver's primary
363 | > function was to analyze code from Python repositories, identifying hidden
364 | > topics and relationships within the codebases of physicists working in fields
365 | > such as quantum mechanics, relativity, and thermodynamics.
366 |
367 | > As QuantumWeaver began to take shape, Ada focused on implementing message
368 | > passing capabilities between the codebases. This would allow researchers to
369 | > share insights, methods, and algorithms across different branches of physics,
370 | > fostering a collaborative environment that could lead to the discovery of the
371 | > long-sought unifying framework.
372 |
373 | > After months of tireless work, Ada's masterpiece was finally complete. It was
374 | > time to test QuantumWeaver's ability to topic model its own codebase—a true
375 | > demonstration of the power and versatility of her creation.
376 |
377 | > With bated breath, Ada fed QuantumWeaver its own code as input. The algorithm
378 | > sprang to life, analyzing every line of code, every function, and every module
379 | > it contained. As the results poured in, Ada's eyes widened with
380 | > amazement—QuantumWeaver had successfully topic modeled its own code, a
381 | > testament to its extraordinary capabilities.
382 |
383 | > The news of QuantumWeaver's success spread rapidly throughout the scientific
384 | > community. Physicists from around the world eagerly adopted the system, using
385 | > it to analyze their codebases and exchange knowledge between seemingly
386 | > disparate fields. As they shared insights, algorithms, and methodologies, it
387 | > became clear that QuantumWeaver was on the verge of revolutionizing the field
388 | > of physics.
389 |
390 | > As the years passed, the collaborative environment fostered by QuantumWeaver
391 | > led to numerous breakthroughs and discoveries. Researchers found unexpected
392 | > connections between various branches of physics, paving the way for a deeper
393 | > understanding of the universe.
394 |
395 | > And one fateful day, Ada's dream was realized. A brilliant young physicist,
396 | > armed with the insights and knowledge gleaned from the QuantumWeaver network,
397 | > unveiled a groundbreaking unifying framework—a beautiful theory that elegantly
398 | > connected the diverse branches of physics under a single, coherent paradigm.
399 |
400 | > QuantumWeaver had not only transformed the way physicists collaborated and
401 | > shared knowledge, but it had also played a pivotal role in uncovering the
402 | > elusive unifying framework that had captivated the scientific community for
403 | > centuries. Ada's creation would be remembered as a turning point in the
404 | > history of science, and her relentless pursuit of knowledge would inspire
405 | > generations of physicists and programmers to reach for the stars and uncover
406 | > the mysteries of the universe.
407 |
408 | #### Prompt:
409 |
410 | - Write a formal essay on this topic
411 |
412 | ##### Title: QuantumWeaver: The Role of Topic Modeling and Message Passing in Unifying the Branches of Physics
413 |
414 | ##### Abstract
415 |
416 | > This essay explores the development and impact of QuantumWeaver, a
417 | > groundbreaking code repository designed to perform topic modeling on any
418 | > Python repo, including its own. We examine how the system's message passing
419 | > capabilities fostered collaboration between researchers in different branches
420 | > of physics and ultimately led to the discovery of a unifying framework. By
421 | > highlighting the transformative potential of advanced algorithms and
422 | > interdisciplinary collaboration, we underscore the significance of
423 | > QuantumWeaver as a turning point in the history of science.
424 |
425 | ##### Introduction
426 |
427 | > The quest for a unifying framework in physics—a theory that could bridge the
428 | > gaps between distinct branches such as quantum mechanics, relativity, and
429 | > thermodynamics—has captivated scientists for centuries. One such visionary,
430 | > Dr. Ada Smith, believed that the key to discovering this elusive framework lay
431 | > in the code repositories of leading physicists. Her groundbreaking project,
432 | > QuantumWeaver, sought to facilitate seamless collaboration and the exchange of
433 | > knowledge between researchers by leveraging topic modeling and message passing
434 | > between codebases.
435 |
436 | ##### QuantumWeaver: A Groundbreaking Code Repository
437 |
438 | > Dr. Smith's QuantumWeaver was designed to analyze code from Python
439 | > repositories and identify hidden topics and relationships within the codebases
440 | > of physicists working in diverse fields. By incorporating state-of-the-art
441 | > natural language processing and machine learning algorithms, the system could
442 | > efficiently perform topic modeling on any Python repo—even its own.
443 |
444 | > The true innovation of QuantumWeaver, however, lay in its message passing
445 | > capabilities. This feature allowed researchers to share insights, methods, and
446 | > algorithms across different branches of physics, fostering a collaborative
447 | > environment that could potentially reveal the unifying framework.
448 |
449 | ##### Impact on the Scientific Community
450 |
451 | > The introduction of QuantumWeaver led to a paradigm shift in the way
452 | > physicists collaborated and shared knowledge. The interconnected network of
453 | > researchers facilitated by the system's message passing capabilities enabled
454 | > the exchange of ideas and methodologies between seemingly disparate fields.
455 | > This led to unexpected connections and insights, paving the way for a deeper
456 | > understanding of the universe.
457 |
458 | > As QuantumWeaver continued to be adopted by physicists around the world, the
459 | > collaborative environment it fostered resulted in numerous breakthroughs and
460 | > discoveries. Ultimately, the insights and knowledge gleaned from this
461 | > interconnected network of researchers culminated in the unveiling of a
462 | > groundbreaking unifying framework—a theory that elegantly connected the
463 | > diverse branches of physics under a single, coherent paradigm.
464 |
465 | ##### Conclusion
466 |
467 | > QuantumWeaver, as an innovative code repository and collaborative platform,
468 | > played a pivotal role in uncovering the long-sought unifying framework in
469 | > physics. By harnessing the power of topic modeling and message passing between
470 | > codebases, the system revolutionized the way physicists collaborated, shared
471 | > knowledge, and approached the exploration of the universe. The story of
472 | > QuantumWeaver and Dr. Ada Smith serves as a testament to the transformative
473 | > potential of advanced algorithms and interdisciplinary collaboration,
474 | > inspiring future generations of scientists and programmers to push the
475 | > boundaries of human understanding.
476 |
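477 | The story above is playful, but the mechanism it dramatizes is concrete: embed summaries of a
478 | repository's code, connect them in a similarity graph, pass messages along that graph, and then
479 | topic model the aggregated features. The sketch below is illustrative only and is not the project's
480 | pipeline; the helper names are invented for the example, and `toy_embeddings` is a made-up stand-in
481 | for real sentence embeddings of per-file summaries. It shows thresholded cosine similarity followed
482 | by k-hop aggregation:
483 |
484 | ```python
485 | import numpy as np
486 |
487 |
488 | def cosine_adjacency(embs, threshold=0.5):
489 |     # Row-normalize, take pairwise cosine similarity, and threshold into a 0/1 adjacency matrix
490 |     normed = embs / np.linalg.norm(embs, axis=1, keepdims=True)
491 |     sim = normed @ normed.T
492 |     return (sim > threshold).astype(np.float32)
493 |
494 |
495 | def k_hop_aggregate(adj, features, k=2):
496 |     # Add in the features reachable within 1..k hops, mirroring simple message passing
497 |     agg = features.copy()
498 |     for hop in range(1, k + 1):
499 |         agg = agg + np.linalg.matrix_power(adj, hop) @ features
500 |     return agg
501 |
502 |
503 | # toy_embeddings stands in for sentence embeddings of per-file code summaries
504 | toy_embeddings = np.random.default_rng(0).normal(size=(6, 8))
505 | adjacency = cosine_adjacency(toy_embeddings, threshold=0.2)
506 | smoothed = k_hop_aggregate(adjacency, toy_embeddings, k=2)
507 | print(adjacency.shape, smoothed.shape)  # the aggregated features are what a topic model would cluster
508 | ```
509 |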
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "GitModel"
3 | version = "0.1.0"
4 | description = "GitModel is a Python library for working with Git repositories as a database for RAG."
5 | authors = ["Daniel Hug "]
6 |
7 | [tool.isort]
8 | profile = "black"
9 | filter_files = true
10 | multi_line_output = 3
11 | include_trailing_comma = true
12 | force_grid_wrap = 0
13 | use_parentheses = true
14 | ensure_newline_before_comments = true
15 | line_length = 120
16 |
17 | [tool.black]
18 | line-length = 120
19 | target-version = ['py310']
20 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | astor
2 | bertopic
3 | GitPython
4 | networkx
5 | numpy
6 | openai
7 | pandas
8 | pre-commit
9 | scipy
10 | sentence-transformers
11 | torch
12 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 120
3 | extend-ignore = E203, W503, E501, E741
4 |
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
1 | from src.config import *
2 | from src.pipeline import Pipeline
3 | from src.system_prompts.format_system_prompts import *
4 |
--------------------------------------------------------------------------------
/src/alembic.ini:
--------------------------------------------------------------------------------
1 | # A generic, single database configuration.
2 |
3 | [alembic]
4 | # path to migration scripts
5 | script_location = %(here)s/alembic
6 |
7 | # template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
8 | # Uncomment the line below if you want the files to be prepended with date and time
9 | # see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file
10 | # for all available tokens
11 | file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s
12 |
13 | # sys.path path, will be prepended to sys.path if present.
14 | # defaults to the current working directory.
15 | prepend_sys_path = .
16 |
17 | # timezone to use when rendering the date within the migration file
18 | # as well as the filename.
19 | # If specified, requires the python-dateutil library that can be
20 | # installed by adding `alembic[tz]` to the pip requirements
21 | # string value is passed to dateutil.tz.gettz()
22 | # leave blank for localtime
23 | # timezone =
24 |
25 | # max length of characters to apply to the
26 | # "slug" field
27 | # truncate_slug_length = 40
28 |
29 | # set to 'true' to run the environment during
30 | # the 'revision' command, regardless of autogenerate
31 | # revision_environment = false
32 |
33 | # set to 'true' to allow .pyc and .pyo files without
34 | # a source .py file to be detected as revisions in the
35 | # versions/ directory
36 | # sourceless = false
37 |
38 | # version location specification; This defaults
39 | # to alembic/versions. When using multiple version
40 | # directories, initial revisions must be specified with --version-path.
41 | # The path separator used here should be the separator specified by "version_path_separator" below.
42 | # version_locations = %(here)s/bar:%(here)s/bat:alembic/versions
43 |
44 | # version path separator; As mentioned above, this is the character used to split
45 | # version_locations. The default within new alembic.ini files is "os", which uses os.pathsep.
46 | # If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas.
47 | # Valid values for version_path_separator are:
48 | #
49 | # version_path_separator = :
50 | # version_path_separator = ;
51 | # version_path_separator = space
52 | version_path_separator = os # Use os.pathsep. Default configuration used for new projects.
53 |
54 | # the output encoding used when revision files
55 | # are written from script.py.mako
56 | # output_encoding = utf-8
57 |
58 | # sqlalchemy.url = postgresql://:@/
59 | sqlalchemy.url = postgresql://postgres:postgres@localhost:5432/postgres
60 |
61 | [post_write_hooks]
62 | # post_write_hooks defines scripts or Python functions that are run
63 | # on newly generated revision scripts. See the documentation for further
64 | # detail and examples
65 |
66 | # format using "black" - use the console_scripts runner, against the "black" entrypoint
67 | hooks = black
68 | black.type = console_scripts
69 | black.entrypoint = black
70 | # black.options = -l 79 REVISION_SCRIPT_FILENAME
71 |
72 | # Logging configuration
73 | [loggers]
74 | keys = root,sqlalchemy,alembic
75 |
76 | [handlers]
77 | keys = console
78 |
79 | [formatters]
80 | keys = generic
81 |
82 | [logger_root]
83 | level = WARN
84 | handlers = console
85 | qualname =
86 |
87 | [logger_sqlalchemy]
88 | level = WARN
89 | handlers =
90 | qualname = sqlalchemy.engine
91 |
92 | [logger_alembic]
93 | level = INFO
94 | handlers =
95 | qualname = alembic
96 |
97 | [handler_console]
98 | class = StreamHandler
99 | args = (sys.stderr,)
100 | level = NOTSET
101 | formatter = generic
102 |
103 | [formatter_generic]
104 | format = %(levelname)-5.5s [%(name)s] %(message)s
105 | datefmt = %H:%M:%S
106 |
--------------------------------------------------------------------------------
/src/alembic/env.py:
--------------------------------------------------------------------------------
1 | from logging.config import fileConfig
2 |
3 | import sqlmodel
4 | from sqlalchemy import engine_from_config, pool, text
5 |
6 | from alembic import context
7 |
8 | # this is the Alembic Config object, which provides
9 | # access to the values within the .ini file in use.
10 | config = context.config
11 |
12 | # Interpret the config file for Python logging.
13 | # This line sets up loggers basically.
14 | if config.config_file_name is not None:
15 | fileConfig(config.config_file_name)
16 |
17 | # add your model's MetaData object here
18 | # for 'autogenerate' support
19 | # from myapp import mymodel
20 | # target_metadata = mymodel.Base.metadata
21 | target_metadata = sqlmodel.SQLModel.metadata
22 |
23 | # other values from the config, defined by the needs of env.py,
24 | # can be acquired:
25 | # my_important_option = config.get_main_option("my_important_option")
26 | # ... etc.
27 |
28 |
29 | def run_migrations_offline() -> None:
30 | """Run migrations in 'offline' mode.
31 |
32 | This configures the context with just a URL
33 | and not an Engine, though an Engine is acceptable
34 | here as well. By skipping the Engine creation
35 | we don't even need a DBAPI to be available.
36 |
37 | Calls to context.execute() here emit the given string to the
38 | script output.
39 |
40 | """
41 | url = config.get_main_option("sqlalchemy.url")
42 | context.configure(
43 | url=url,
44 | target_metadata=target_metadata,
45 | literal_binds=True,
46 | dialect_opts={"paramstyle": "named"},
47 | )
48 |
49 | with context.begin_transaction():
50 | context.run_migrations()
51 |
52 |
53 | def run_migrations_online() -> None:
54 | """Run migrations in 'online' mode.
55 |
56 | In this scenario we need to create an Engine
57 | and associate a connection with the context.
58 |
59 | """
60 | connectable = engine_from_config(
61 | config.get_section(config.config_ini_section),
62 | prefix="sqlalchemy.",
63 | poolclass=pool.NullPool,
64 | )
65 |
66 | with connectable.connect() as connection:
67 | context.configure(connection=connection, target_metadata=target_metadata)
68 |
69 | with context.begin_transaction():
70 | context.get_context()._ensure_version_table()
71 |         connection.execute(text("LOCK TABLE alembic_version IN ACCESS EXCLUSIVE MODE"))
72 | context.run_migrations()
73 |
74 |
75 | if context.is_offline_mode():
76 | run_migrations_offline()
77 | else:
78 | run_migrations_online()
79 |
--------------------------------------------------------------------------------
/src/ast_parsers/python_ast_parser.py:
--------------------------------------------------------------------------------
1 | import ast
2 | import os
3 |
4 | import astor
5 |
6 |
7 | def parse_python_file(file_path):
8 | parsed_contents = {
9 | "imports": [],
10 | "globals": [],
11 | "classes": [],
12 | "functions": [],
13 | }
14 |
15 | with open(file_path, "r") as file:
16 | file_contents = file.read()
17 | parsed_tree = ast.parse(file_contents)
18 |
19 | for node in ast.iter_child_nodes(parsed_tree):
20 | if isinstance(node, ast.Import) or isinstance(node, ast.ImportFrom):
21 | parsed_contents["imports"].append(astor.to_source(node).strip())
22 | elif isinstance(node, ast.Assign) and len(node.targets) == 1 and isinstance(node.targets[0], ast.Name):
23 | parsed_contents["globals"].append(astor.to_source(node).strip())
24 |         elif isinstance(node, ast.FunctionDef):
25 |             # every top-level function, including main, is collected under "functions";
26 |             # ast.get_source_segment returns the exact source text for the node
27 |             parsed_contents["functions"].append(ast.get_source_segment(file_contents, node))
28 |         elif isinstance(node, ast.ClassDef):
29 |             # classes are captured whole; their methods are split out later with get_methods
30 |             parsed_contents["classes"].append(ast.get_source_segment(file_contents, node))
31 |
32 | return parsed_contents
33 |
34 |
35 | def get_methods(class_or_str):
36 |     """Return the source of every method defined in a class body."""
37 |     if isinstance(class_or_str, str):
38 |         class_or_str = ast.parse(class_or_str)
39 |     method_sources = []
40 |     for node in ast.walk(class_or_str):
41 |         if isinstance(node, ast.ClassDef):
42 |             methods = [child for child in node.body if isinstance(child, ast.FunctionDef)]
43 |             method_sources.extend(astor.to_source(m).strip() for m in methods)
44 |     return method_sources
45 |
46 |
47 | def parse_github_repo(local_dir):
48 | parsed_files = []
49 | content_labels = {0: "imports", 1: "globals", 2: "classes", 3: "functions", 4: "main", 5: "file_name"}
50 |
51 | for root, dirs, files in os.walk(local_dir):
52 | for file_name in files:
53 | if file_name.endswith(".py"):
54 | file_path = os.path.join(root, file_name)
55 | try:
56 | parsed_contents = parse_python_file(file_path)
57 | content = {content_labels[i]: v for i, v in enumerate(parsed_contents.values())}
58 | content[content_labels[5]] = file_path
59 | parsed_files.append(content)
60 | except Exception as e:
61 | print(e)
62 | continue
63 |
64 | return parsed_files
65 |
--------------------------------------------------------------------------------
/src/backend/config.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel, BaseSettings
2 |
3 |
4 | class MessageTreeManagerConfiguration(BaseModel):
5 | pass
6 |
7 |
8 | class Settings(BaseSettings):
9 | pass
10 |
11 |
12 | settings = Settings()
13 |
--------------------------------------------------------------------------------
/src/backend/database.py:
--------------------------------------------------------------------------------
1 | # from sqlmodel import create_engine
2 |
--------------------------------------------------------------------------------
/src/backend/memory_tree_manager.py:
--------------------------------------------------------------------------------
1 | class MemoryTreeManager:
2 | pass
3 |
--------------------------------------------------------------------------------
/src/community_detection/extract_connected_components.py:
--------------------------------------------------------------------------------
1 | import networkx as nx
2 | from networkx.algorithms import community
3 |
4 |
5 | def community_detection(A):
6 | # Convert the adjacency matrix to a NetworkX graph
7 | G = nx.from_numpy_array(A)
8 |
9 | # Perform Girvan-Newman community detection
10 | communities_generator = community.girvan_newman(G)
11 |
12 | # Get the first level of communities
13 | communities = next(communities_generator)
14 |
15 | # Sort communities by size (number of nodes)
16 | sorted_communities = sorted(communities, key=lambda x: len(x), reverse=True)
17 |
18 | return sorted_communities
19 |
20 |
21 | def print_communities(data, A):
22 | # Print sorted communities
23 |     sorted_communities = community_detection(A)
24 | for i, com in enumerate(sorted_communities):
25 | print(f"Community {i + 1}:")
26 | print(f"Nodes: {list(com)}")
27 | component_files_structure = []
28 | component_text_structure = []
29 | component_history_structure = []
30 | for row_key in list(com):
31 | file_component = data.loc[row_key, "file_name"]
32 | text_component = data.loc[row_key, "assistant_reply"]
33 | history_component = data.loc[row_key, "conversation_history"]
34 | component_files_structure.append(file_component)
35 | component_text_structure.append(text_component)
36 | component_history_structure.append(history_component)
37 | print(component_files_structure)
38 | for text, history in zip(component_text_structure, component_history_structure):
39 | # print(history[-2]['content'])
40 | print()
41 | print(text)
42 | print()
43 | print()
44 |
45 |
46 | def dfs(v, visited, adj_matrix, component):
47 | """Depth-first search algorithm."""
48 | visited[v] = True
49 | component.append(v)
50 | for i, val in enumerate(adj_matrix[v]):
51 | if val > 0 and not visited[i]:
52 | dfs(i, visited, adj_matrix, component)
53 |
54 |
55 | def connected_components(adj_matrix):
56 | """Find connected components in a graph represented by an adjacency matrix."""
57 | visited = [False for _ in range(adj_matrix.shape[0])]
58 | components = []
59 |
60 | for v in range(adj_matrix.shape[0]):
61 | if not visited[v]:
62 | component = []
63 | dfs(v, visited, adj_matrix, component)
64 | components.append(component)
65 |
66 | return components
67 |
--------------------------------------------------------------------------------
/src/config.py:
--------------------------------------------------------------------------------
1 | import yaml
2 |
3 |
4 | class Config:
5 |     def __init__(self, semantic_graph_context_generator, gnn_heads, topic_model, pipeline_settings=None):
6 |         self.semantic_graph_context_generator = semantic_graph_context_generator
7 |         self.gnn_heads = gnn_heads
8 |         self.topic_model = topic_model
9 |         self.pipeline_settings = pipeline_settings or []  # configs such as config1.yaml omit pipeline_settings
10 |
11 | @classmethod
12 | def from_yaml(cls, yaml_path):
13 | with open(yaml_path, "r") as f:
14 | config_dict = yaml.safe_load(f)
15 | return cls(**config_dict)
16 |
17 |
18 | class PipelineSettings:
19 | def __init__(self, config: Config):
20 | self.config = config
21 |
22 | def __repr__(self) -> str:
23 | return f"PipelineSettings(config={self.config})"
24 |
--------------------------------------------------------------------------------
/src/configs/config1.yaml:
--------------------------------------------------------------------------------
1 | semantic_graph_context_generator:
2 | - __impl__: SemanticGraphContextGenerator
3 | generator_id: summary
4 | model_id: gpt-3.5-turbo
5 | temperature: 0
6 | max_tokens: 500
7 | with_tree: False
8 | - __impl__: SemanticGraphContextGenerator
9 | generator_id: qa_chain
10 | model_id: gpt-3.5-turbo
11 | temperature: 0
12 | max_tokens: 600
13 | with_tree: False
14 | - __impl__: SemanticGraphContextGenerator
15 | generator_id: ux_ui_consult
16 | model_id: gpt-3.5-turbo
17 | temperature: 0
18 | max_tokens: 500
19 | with_tree: False
20 | - __impl__: SemanticGraphContextGenerator
21 | generator_id: bug_fix
22 | model_id: gpt-3.5-turbo
23 | temperature: 0
24 | max_tokens: 500
25 | with_tree: False
26 |
27 | gnn_heads:
28 | - __impl__: GNNHead
29 | generator_id: message_passing
30 | k_hop: 2
31 | is_sparse: True
32 | embbedding_id: sbert
33 | embbedding_model: all-MiniLM-L6-v2
34 | kernel_id: cosine_similarity
35 | graph_id: adjacency
36 | connection_threshold: 0.7
37 |
38 | topic_model:
39 | - __impl__: TopicModel
40 | topic_model_id: topic_model
41 | embedding_model: all-MiniLM-L6-v2
42 | clustering_model:
43 | HDBSCAN(min_cluster_size=15, metric='euclidean',
44 | cluster_selection_method='eom', prediction_data=True)
45 | dimensionality_reduction:
46 | umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0,
47 | metric='cosine')
48 | vectorizer_model: CountVectorizer(stop_words="english")
49 | representation_model:
50 | OpenAI(model="gpt-3.5-turbo", delay_in_seconds=1, chat=True)
51 | hierarchical_linkage_fn: sch.linkage(x, 'single', optimal_ordering=True)
52 | with_tree: True
53 | auto_cluster: True
54 |
--------------------------------------------------------------------------------
/src/configs/config2.yaml:
--------------------------------------------------------------------------------
1 | pipeline_settings:
2 | - __impl__: PipelineSettings
3 | combine_gnn_strategy: sum
4 | norm_fn: z_score
5 | skip_graph_creation: True
6 |
7 | semantic_graph_context_generator:
8 | - __impl__: SemanticGraphContextGenerator
9 | generator_id: summary
10 | model_id: gpt-3.5-turbo
11 | temperature: 0
12 | max_tokens: 300
13 | with_tree: False
14 | - __impl__: SemanticGraphContextGenerator
15 | generator_id: qa_chain
16 | model_id: gpt-3.5-turbo
17 | temperature: 0
18 | max_tokens: 300
19 | with_tree: False
20 |
21 | gnn_heads:
22 | - __impl__: GNNHead
23 | generator_id: message_passing
24 | k_hop: 2
25 | is_sparse: True
26 | embbedding_id: sbert
27 | embbedding_model: all-MiniLM-L6-v2
28 | kernel_id: cosine_similarity
29 | graph_id: adjacency
30 | connection_threshold: 0.7
31 | - __impl__: GNNHead
32 | generator_id: message_passing
33 | k_hop: 3
34 | is_sparse: True
35 | embbedding_id: sbert
36 | embbedding_model: all-MiniLM-L6-v2
37 | kernel_id: cosine_similarity
38 | graph_id: adjacency
39 | connection_threshold: 0.7
40 | - __impl__: GNNHead
41 | generator_id: message_passing
42 | k_hop: 2
43 | is_sparse: True
44 | embbedding_id: sbert
45 | embbedding_model: all-MiniLM-L6-v2
46 | kernel_id: gaussian
47 | sigma: 1
48 | graph_id: laplacian
49 | connection_threshold: 0.7
50 |
51 | topic_model:
52 | - __impl__: TopicModel
53 | topic_model_id: topic_model
54 | embedding_model: all-MiniLM-L6-v2
55 | clustering_model:
56 | HDBSCAN(min_cluster_size=15, metric='euclidean',
57 | cluster_selection_method='eom', prediction_data=True)
58 | dimensionality_reduction:
59 | umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0,
60 | metric='cosine')
61 | vectorizer_model: CountVectorizer(stop_words="english")
62 | representation_model:
63 | OpenAI(model="gpt-3.5-turbo", delay_in_seconds=1, chat=True)
64 | hierarchical_linkage_fn: sch.linkage(x, 'centroid', optimal_ordering=True)
65 | with_tree: True
66 | auto_cluster: True
67 |
--------------------------------------------------------------------------------
/src/dynamic_import.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | from typing import Dict
3 |
4 | from src.config import Config, PipelineSettings
5 | from src.ml_models.graph_networks.gnn_head import GNNHead
6 | from src.ml_models.topic_modeling.topic_model import TopicModel
7 | from src.ml_models.transformers.semantic_graph_context_generator import SemanticGraphContextGenerator
8 |
9 |
10 | def instantiate_class_from_config(config: Config) -> Dict[str, list]:
11 | config = deepcopy(config)
12 |
13 | obj_map = {}
14 | for component_name in ["pipeline_settings", "semantic_graph_context_generator", "gnn_heads", "topic_model"]:
15 | if component_name not in obj_map:
16 | obj_map[component_name] = []
17 |
18 | for args in getattr(config, component_name):
19 | impl = args.pop("__impl__")
20 | try:
21 | _cls = {
22 | "PipelineSettings": PipelineSettings,
23 | "SemanticGraphContextGenerator": SemanticGraphContextGenerator,
24 | "GNNHead": GNNHead,
25 | "TopicModel": TopicModel,
26 | }[impl]
27 | except KeyError:
28 |                 raise Exception(f"Unknown component implementation: {impl}")
29 | obj_map[component_name].append(_cls(config=args))
30 |
31 |     # Return the map of instantiated component objects
32 | return obj_map
33 |
--------------------------------------------------------------------------------
/src/fetch_repo.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from git import Repo
4 |
5 |
6 | def clone_and_create_context_folder(repo_name: str, id_name: str) -> tuple[str, str]:
7 | # Create a work folder in the current working directory if it doesn't already exist
8 | work_folder = os.path.join(os.getcwd(), "work")
9 | if not os.path.exists(work_folder):
10 | os.makedirs(work_folder)
11 |
12 | # Clone the repository into the work folder
13 | repo_path = os.path.join(work_folder, id_name)
14 | if not os.path.exists(repo_path):
15 | Repo.clone_from(repo_name, repo_path)
16 |
17 | # Create a context folder with the specified id_name in the current working directory
18 | context_folder = os.path.join(os.getcwd(), f"context/{id_name}_context")
19 | if not os.path.exists(context_folder):
20 | os.makedirs(context_folder)
21 |
22 | return repo_path, context_folder
23 |
--------------------------------------------------------------------------------
/src/ml_models/__init__.py:
--------------------------------------------------------------------------------
1 | from src.ml_models.graph_networks.gnn_head import GNNHead # noqa: F401
2 | from src.ml_models.topic_modeling.topic_model import TopicModel # noqa: F401
3 | from src.ml_models.transformers.semantic_graph_context_generator import SemanticGraphContextGenerator # noqa: F401
4 |
--------------------------------------------------------------------------------
/src/ml_models/graph_networks/gnn_head.py:
--------------------------------------------------------------------------------
1 | import math
2 | import multiprocessing
3 | from typing import List, Tuple
4 |
5 | import numpy as np
6 | import torch
7 | import torch.nn as nn
8 | import torch.nn.functional as F
9 | from sentence_transformers import SentenceTransformer
10 | from transformers import AutoModel
11 |
12 | from src.config import Config
13 | from src.ml_models.graph_networks.kernels import compute_kernel_by_type, graph_laplacian
14 | from src.ml_models.graph_networks.message_passing import k_hop_message_passing, k_hop_message_passing_sparse
15 |
16 |
17 | class GNNHead:
18 | def __init__(self, config: Config):
19 | self.config = config
20 |
21 | def __repr__(self) -> str:
22 | return f"GNNHead(config={self.config})"
23 |
24 | def compute_kernel_matrix(self, node_features: np.ndarray) -> np.ndarray:
25 | """
26 | Compute the kernel matrix using the specified kernel function.
27 |
28 | Parameters:
29 | node_features (numpy array): The feature matrix of the nodes.
30 |
31 | Returns:
32 | kernel_matrix (numpy array): The kernel matrix computed using the specified kernel function.
33 | """
34 | if self.config["kernel_id"] == "cosine_similarity":
35 | kernel_matrix = compute_kernel_by_type(
36 | node_features, threshold=self.config["connection_threshold"], kernel_type="cosine"
37 | )
38 | elif self.config["kernel_id"] == "gaussian":
39 | kernel_matrix = compute_kernel_by_type(
40 | node_features,
41 | threshold=self.config["connection_threshold"],
42 | kernel_type="gaussian",
43 | sigma=self.config["sigma"],
44 | )
45 | else:
46 | raise ValueError("Invalid kernel function specified.")
47 | return kernel_matrix
48 |
49 | def embed_data(self, data, key="query", cores=1, gpu=False, batch_size=128):
50 | """
51 | Embed the sentences/text using the MiniLM language model (which uses mean pooling)
52 | """
53 | print("Embedding data")
54 | if self.config["embbedding_model"] == "all-MiniLM-L6-v2":
55 | model = SentenceTransformer(self.config["embbedding_model"])
56 | elif self.config["embbedding_model"] == "graphcodebert-base":
57 | model = AutoModel.from_pretrained("sentence-transformers/bert-base-nli-mean-tokens")
58 | else:
59 | raise ValueError("Invalid embedding model specified.")
60 | print("Model loaded")
61 |
62 | sentences = data[key].tolist()
63 | unique_sentences = data[key].unique()
64 | print("Unique sentences", len(unique_sentences))
65 |
66 | if cores == 1:
67 | embeddings = model.encode(unique_sentences, show_progress_bar=True, batch_size=batch_size)
68 | else:
69 | devices = ["cpu"] * cores
70 | if gpu:
71 | devices = None # use all CUDA devices
72 |
73 | # Start the multi-process pool on multiple devices
74 | print("Multi-process pool starting")
75 | pool = model.start_multi_process_pool(devices)
76 | print("Multi-process pool started")
77 |
78 | chunk_size = math.ceil(len(unique_sentences) / cores)
79 |
80 | # Compute the embeddings using the multi-process pool
81 | embeddings = model.encode_multi_process(
82 | unique_sentences, pool, batch_size=batch_size, chunk_size=chunk_size
83 | )
84 | model.stop_multi_process_pool(pool)
85 |
86 | print("Embeddings computed")
87 |
88 | mapping = {sentence: embedding for sentence, embedding in zip(unique_sentences, embeddings)}
89 | embeddings = np.array([mapping[sentence] for sentence in sentences])
90 |
91 | return embeddings
92 |
93 | def generate_graph(self, data) -> Tuple[np.ndarray, np.ndarray]:
94 | """
95 | Generate a graph using the kernel function and message passing.
96 |
97 |         Parameters:
98 |         data (pandas.DataFrame): The documents to embed (the "query" column is used by default).
99 |
100 | Returns:
101 | A_k (numpy array): The k-hop adjacency matrix.
102 | agg_features (numpy array): The aggregated feature matrix for each node in the k-hop neighborhood.
103 | """
104 | embs = self.embed_data(data)
105 | kernel_matrix = self.compute_kernel_matrix(embs)
106 | if self.config["graph_id"] == "adjacency":
107 | A = kernel_matrix
108 | elif self.config["graph_id"] == "laplacian":
109 | L, D = graph_laplacian(kernel_matrix)
110 | A = np.linalg.pinv(D) @ L
111 | else:
112 | raise ValueError("Invalid graph type specified.")
113 |
114 | if self.config["is_sparse"]:
115 | A_k, agg_features = k_hop_message_passing_sparse(A, embs, self.config["k_hop"])
116 | else:
117 | A_k, agg_features = k_hop_message_passing(A, embs, self.config["k_hop"])
118 |
119 | return A_k, agg_features
120 |
121 |
122 | class GraphCodeBERT:
123 | def __init__(self, config: Config):
124 | self.config = config
125 |
126 | def __repr__(self) -> str:
127 | return f"GraphCodeBERT(config={self.config})"
128 |
129 | def mean_pooling(self, model_output, attention_mask):
130 | token_embeddings = model_output[0] # First element of model_output contains all token embeddings
131 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
132 | return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
133 |
134 | def encode(self):
135 | pass
136 |
137 |
138 | class GATLayer(nn.Module):
139 | """
140 | A single layer of the Graph Attention Network.
141 | """
142 |
143 | def __init__(self, in_features, out_features, dropout=0.0):
144 | super().__init__()
145 | self.in_features = in_features
146 | self.out_features = out_features
147 |
148 | self.W = nn.Linear(in_features, out_features, bias=False)
149 | self.a = nn.Linear(2 * out_features, 1, bias=False)
150 |
151 | self.dropout = nn.Dropout(dropout)
152 |
153 | def forward(self, h, adj):
154 | Wh = self.W(h)
155 | a_input = self.prepare_attention_input(Wh)
156 | e = F.leaky_relu(self.a(a_input))
157 | attention = F.softmax(e, dim=1)
158 | attention = self.dropout(attention)
159 |
160 | h_prime = torch.matmul(attention.transpose(1, 2), Wh)
161 | return h_prime
162 |
163 | def prepare_attention_input(self, Wh):
164 | N = Wh.size()[0]
165 | Wh_repeated_in_chunks = Wh.repeat_interleave(N, dim=0)
166 | Wh_repeated_alternating = Wh.repeat(N, 1)
167 | all_combinations_matrix = torch.cat([Wh_repeated_in_chunks, Wh_repeated_alternating], dim=1)
168 | return all_combinations_matrix.view(N, N, 2 * self.out_features)
169 |
170 |
171 | class GAT(nn.Module):
172 | """
173 | The Graph Attention Network model.
174 | """
175 |
176 | def __init__(self, in_features, out_features, dropout=0.0):
177 | super().__init__()
178 | self.in_features = in_features
179 | self.out_features = out_features
180 |
181 | self.layers = nn.ModuleList(
182 | [
183 | GATLayer(in_features, out_features, dropout=dropout),
184 | GATLayer(out_features, out_features, dropout=dropout),
185 | ]
186 | )
187 |
188 | def forward(self, embs, adj_matrices):
189 | h = embs
190 | for layer, adj in zip(self.layers, adj_matrices):
191 | h = layer(h, adj)
192 | return h
193 |
194 |
195 | def combine_graphs_with_gat(graphs):
196 | """
197 | Combine a list of graphs using the Graph Attention Network.
198 |
199 | Parameters:
200 | graphs (List of Tuple): List of tuples where each tuple contains an adjacency matrix and its
201 | corresponding node embeddings.
202 |
203 | Returns:
204 | combined_adj_matrix (numpy array): The combined adjacency matrix.
205 | combined_embs (numpy array): The combined node embeddings.
206 | """
207 | in_features = graphs[0][1].shape[1] # input dimensionality of the embeddings
208 | out_features = 256 # output dimensionality of the embeddings
209 | gat = GAT(in_features, out_features)
210 |
211 | adj_matrices = [t[0] for t in graphs]
212 | embs = torch.cat([torch.FloatTensor(t[1]) for t in graphs], dim=0)
213 |
214 | with torch.no_grad():
215 | combined_embs = gat(embs, adj_matrices)
216 |
217 |     # Sum the per-graph adjacency matrices and normalize by the maximum entry
218 |     combined_adj_matrix = sum(adj_matrices)
219 |     max_value = np.max(combined_adj_matrix)
220 |     normalized_matrix = combined_adj_matrix / max_value
221 |
222 | return normalized_matrix, combined_embs.numpy()
223 |
224 |
225 | def linearly_sum_gnn_heads(
226 | matrices: List[Tuple[np.ndarray, np.ndarray]], normalization_fn: str = "max"
227 | ) -> Tuple[np.ndarray, np.ndarray]:
228 | """
229 | Linearly sum a list of adjacency matrices and normalize the result. Also computes the average of the node embeddings
230 | and checks that the number of documents in the adjacency matrices matches the number of documents in the embeddings set.
231 |
232 | Parameters:
233 | matrices (List of Tuple): List of tuples where each tuple contains an adjacency matrix and its
234 | corresponding node embeddings.
235 |
236 | Returns:
237 | normalized_matrix (numpy array): The linearly summed and normalized adjacency matrix.
238 | avg_embeddings (numpy array): The average node embeddings.
239 | """
240 | if len(matrices) == 1:
241 | return matrices[0]
242 | # Compute the average of the embeddings
243 | avg_embeddings = np.mean([t[1] for t in matrices], axis=0)
244 |
245 | # Check that the number of documents in the adjacency matrices matches the number of documents in the embeddings set
246 | num_docs_adj = matrices[0][0].shape[0]
247 | num_docs_embs = avg_embeddings.shape[0]
248 | if num_docs_adj != num_docs_embs:
249 | raise ValueError(
250 | "Number of documents in adjacency matrix does not match number of documents in embeddings set."
251 | )
252 |
253 | # Linearly sum the adjacency matrices and normalize the matrix by dividing it by the maximum value
254 | summed_matrix = sum([t[0] for t in matrices])
255 | if normalization_fn == "max":
256 | max_value = np.max(summed_matrix)
257 | normalized_matrix = summed_matrix / max_value
258 | elif normalization_fn == "sum":
259 | normalized_matrix = summed_matrix / len(matrices)
260 | elif normalization_fn == "min-max":
261 | min_value = np.min(summed_matrix)
262 | max_value = np.max(summed_matrix)
263 | normalized_matrix = (summed_matrix - min_value) / (max_value - min_value)
264 | elif normalization_fn == "z_score":
265 | mean = np.mean(summed_matrix)
266 | std = np.std(summed_matrix)
267 | normalized_matrix = (summed_matrix - mean) / std
268 | elif normalization_fn == "circular":
269 | min_value = np.min(summed_matrix)
270 | max_value = np.max(summed_matrix)
271 | normalized_matrix_linear = (summed_matrix - min_value) / (max_value - min_value)
272 | normalized_matrix = np.sin(2 * np.pi * normalized_matrix_linear)
273 | else:
274 | raise ValueError(f"Normalization function {normalization_fn} not supported.")
275 |
276 | A_k, agg_embs = k_hop_message_passing_sparse(normalized_matrix, avg_embeddings, 2)
277 | print(f"adj matrix shape: {A_k.shape}")
278 | print(f"agg embeddings shape: {agg_embs.shape}")
279 | return A_k, agg_embs
280 |
--------------------------------------------------------------------------------
/src/ml_models/graph_networks/kernels.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import scipy
3 | import torch
4 | import torch.nn.functional as F
5 | from torch import Tensor
6 |
7 |
8 | def gaussian_kernel_torch(embs_a, embs_b, sigma=1.0):
9 | """
10 | Computes the Gaussian kernel matrix between two sets of embeddings using PyTorch.
11 | :param embs_a: Tensor of shape (batch_size_a, embedding_dim) containing the first set of embeddings.
12 | :param embs_b: Tensor of shape (batch_size_b, embedding_dim) containing the second set of embeddings.
13 | :param sigma: Width of the Gaussian kernel.
14 | :return: Tensor of shape (batch_size_a, batch_size_b) containing the Gaussian kernel matrix.
15 | """
16 | if not isinstance(embs_a, torch.Tensor):
17 | embs_a = torch.tensor(embs_a)
18 |
19 | if not isinstance(embs_b, torch.Tensor):
20 | embs_b = torch.tensor(embs_b)
21 |
22 | # Compute the pairwise distances between the embeddings
23 | dist_matrix = torch.cdist(embs_a, embs_b)
24 |
25 | # Compute the Gaussian kernel matrix
26 | kernel_matrix = torch.exp(-(dist_matrix**2) / (2 * sigma**2))
27 |
28 | return kernel_matrix
29 |
30 |
31 | def estimate_pdf(scores: list):
32 | """
33 | estimate scores probability density function
34 | :scores: list of distance scores from topic features to topic centroid
35 | :return: distribution
36 | """
37 | return scipy.stats.gaussian_kde(scores)
38 |
39 |
40 | def cos_sim(a: Tensor, b: Tensor):
41 | """
42 | Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j.
43 | :return: Matrix with res[i][j] = cos_sim(a[i], b[j])
44 | """
45 | if not isinstance(a, torch.Tensor):
46 | a = torch.tensor(np.array(a))
47 |
48 | if not isinstance(b, torch.Tensor):
49 | b = torch.tensor(np.array(b))
50 |
51 | if len(a.shape) == 1:
52 | a = a.unsqueeze(0)
53 |
54 | if len(b.shape) == 1:
55 | b = b.unsqueeze(0)
56 |
57 | a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
58 | b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
59 | return torch.mm(a_norm, b_norm.transpose(0, 1))
60 |
61 |
62 | def cos_sim_torch(embs_a: Tensor, embs_b: Tensor) -> Tensor:
63 | """
64 | Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j.
65 | Using torch.nn.functional.cosine_similarity
66 | :return: Matrix with res[i][j] = cos_sim(a[i], b[j])
67 | """
68 | if not isinstance(embs_a, torch.Tensor):
69 | embs_a = torch.tensor(np.array(embs_a))
70 |
71 | if not isinstance(embs_b, torch.Tensor):
72 | embs_b = torch.tensor(np.array(embs_b))
73 |
74 | if len(embs_a.shape) == 1:
75 | embs_a = embs_a.unsqueeze(0)
76 |
77 | if len(embs_b.shape) == 1:
78 | embs_b = embs_b.unsqueeze(0)
79 | A = F.cosine_similarity(embs_a.unsqueeze(1), embs_b.unsqueeze(0), dim=2)
80 | return A
81 |
82 |
83 | def prune_ref_docs(qa_embs, ref_embs, ref_docs, threshold=0.1):
84 | """
85 | Drops unnecessary documents from the reference embeddings and updates the list of reference documents,
86 | and then recomputes the adjacency matrix.
87 |
88 | Parameters:
89 | qa_embs (numpy array): The embedding matrix of QA pairs.
90 | ref_embs (numpy array): The embedding matrix of reference sentences.
91 | ref_docs (list): The list of reference documents.
92 | threshold (float): The threshold below which documents are considered unnecessary.
93 |
94 | Returns:
95 | pruned_ref_embs (numpy array): The pruned embedding matrix of reference sentences.
96 | pruned_ref_docs (list): The pruned list of reference documents.
97 | pruned_A (numpy array): The pruned adjacency matrix.
98 | """
99 |
100 | # Compute the initial adjacency matrix with full reference embeddings
101 | A = gaussian_kernel_torch(qa_embs, ref_embs, sigma=0.5)
102 | print(f"Before: {A.shape}")
103 |     # Sum similarity mass over the QA axis to score each reference document
104 | row_sum = torch.sum(A, dim=0)
105 |
106 | # Identify the indexes of the relevant documents
107 | relevant_idx = torch.where(row_sum > threshold * row_sum.max())[0]
108 |
109 | # Drop unnecessary rows from the reference embeddings
110 | pruned_ref_embs = ref_embs[relevant_idx]
111 |
112 | # Update the list of reference documents
113 | pruned_ref_docs = [ref_docs[i] for i in relevant_idx]
114 |
115 | # Recompute the adjacency matrix with pruned reference embeddings
116 | pruned_A = gaussian_kernel_torch(qa_embs, pruned_ref_embs, sigma=0.5)
117 | print(f"After: {pruned_A.shape}")
118 | return pruned_ref_embs, pruned_ref_docs, pruned_A
119 |
120 |
121 | def compute_kernel_by_type(embs, threshold=0.65, kernel_type="cosine", sigma=1.0):
122 | # match case to kernel type
123 | if kernel_type == "gaussian":
124 | A = gaussian_kernel_torch(embs, embs, sigma)
125 |     elif kernel_type == "cosine":
126 |         A = cos_sim(embs, embs)
127 |     adj_matrix = torch.zeros_like(A)
128 | adj_matrix[A > threshold] = 1
129 | adj_matrix[A <= threshold] = 0
130 | adj_matrix = adj_matrix.numpy().astype(np.float32)
131 | return adj_matrix
132 |
133 |
134 | def compute_kernel_with_activation_fn(embs, activation_fn=F.relu, kernel_type="cosine", sigma=1.0):
135 | # match case to kernel type
136 | if kernel_type == "gaussian":
137 | A = gaussian_kernel_torch(embs, embs, sigma)
138 |     elif kernel_type == "cosine":
139 | A = cos_sim(embs, embs)
140 | A = activation_fn(A)
141 | A = A.numpy().astype(np.float32)
142 | return A
143 |
144 |
145 | def compute_kernel(embs):
146 | # TODO match case to kernel type
147 | A = cos_sim(embs, embs)
148 | adj_matrix = A.numpy().astype(np.float32)
149 | return adj_matrix
150 |
151 |
152 | def degree_matrix(A):
153 | """
154 | compute degree matrix using adjacency distance matrix from pairwise distances
155 | :A: nxn size matrix embedding minmaxed using mu sigma and pairwise distances
156 | :return: degree matrix
157 | """
158 | n = A.shape[0]
159 | D = np.zeros((n, n))
160 | for i in range(n):
161 | D[i, i] = np.sum(A[i, :])
162 | return D
163 |
164 |
165 | def graph_laplacian(A):
166 | """
167 | compute graph laplacian using degree and adjacency matrix from pairwise distances
168 | :A: nxn size matrix embedding minmaxed using mu sigma and pairwise distances
169 | :return: graph laplacian, and degree matrix
170 | """
171 | D = degree_matrix(A)
172 | L = D - A
173 | return L, D
174 |
--------------------------------------------------------------------------------
/src/ml_models/graph_networks/message_passing.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import scipy.sparse as sp
3 | from tqdm import tqdm
4 |
5 |
6 | def k_hop_message_passing(A, node_features, k):
7 | """
8 | Compute the k-hop adjacency matrix and aggregated features using message passing.
9 |
10 | Parameters:
11 | A (numpy array): The adjacency matrix of the graph.
12 | node_features (numpy array): The feature matrix of the nodes.
13 | k (int): The number of hops for message passing.
14 |
15 | Returns:
16 | A_k (numpy array): The k-hop adjacency matrix.
17 | agg_features (numpy array): The aggregated feature matrix for each node in the k-hop neighborhood.
18 | """
19 |
20 | print("Compute the k-hop adjacency matrix")
21 | A_k = np.linalg.matrix_power(A, k)
22 |
23 | print("Aggregate the messages from the k-hop neighborhood:")
24 | agg_features = node_features.copy()
25 |
26 | for i in tqdm(range(k)):
27 | agg_features += np.matmul(np.linalg.matrix_power(A, i + 1), node_features)
28 |
29 | return A_k, agg_features
30 |
31 |
32 | def k_hop_message_passing_sparse(A, node_features, k):
33 | """
34 | Compute the k-hop adjacency matrix and aggregated features using message passing.
35 |
36 | Parameters:
37 | A (numpy array or scipy sparse matrix): The adjacency matrix of the graph.
38 | node_features (numpy array or scipy sparse matrix): The feature matrix of the nodes.
39 | k (int): The number of hops for message passing.
40 |
41 | Returns:
42 | A_k (numpy array): The k-hop adjacency matrix.
43 | agg_features (numpy array): The aggregated feature matrix for each node in the k-hop neighborhood.
44 | """
45 |
46 | # Convert input matrices to sparse matrices if they are not already
47 | if not sp.issparse(A):
48 | A = sp.csr_matrix(A)
49 | if not sp.issparse(node_features):
50 | node_features = sp.csr_matrix(node_features)
51 |
52 | # Compute the k-hop adjacency matrix and the aggregated features
53 | A_k = A.copy()
54 | agg_features = node_features.copy()
55 |
56 | for i in tqdm(range(k)):
57 | # Compute the message passing for the k-hop neighborhood
58 | message = A_k.dot(node_features)
59 | # Apply a GCN layer to aggregate the messages
60 | agg_features = A_k.dot(agg_features) + message
61 | # Update the k-hop adjacency matrix by adding new edges
62 | A_k += A_k.dot(A)
63 |
64 | return A_k.toarray(), agg_features.toarray()
65 |
--------------------------------------------------------------------------------
/src/ml_models/topic_modeling/topic_model.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import sys
4 | from typing import Tuple
5 |
6 | import numpy as np
7 | import pandas as pd
8 | from bertopic import BERTopic
9 | from bertopic.representation import MaximalMarginalRelevance, OpenAI, TextGeneration
10 | from bertopic.vectorizers import ClassTfidfTransformer
11 | from sentence_transformers import SentenceTransformer
12 | from sklearn.cluster import AgglomerativeClustering
13 | from sklearn.decomposition import TruncatedSVD
14 | from sklearn.feature_extraction.text import CountVectorizer
15 | from transformers import pipeline
16 |
17 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
18 | lg = logging.getLogger(__name__)
19 | lg.setLevel(logging.DEBUG)
20 | handler = logging.StreamHandler(sys.stdout)
21 | handler.setFormatter(logging.Formatter("[%(asctime)s] [%(levelname)s] - %(message)s"))
22 | lg.addHandler(handler)
23 |
24 |
25 | class TopicModel:
26 | def __init__(self, config):
27 | self.config = config
28 |
29 | def __repr__(self) -> str:
30 | return f"TopicModel(config={self.config})"
31 |
32 | def fit_topic_model(self, topic_model, data, embeddings, key="query"):
33 | topics, probs = topic_model.fit_transform(data[key].to_list(), embeddings)
34 | return topics, probs
35 |
36 | def get_topic_info(self, topic_model):
37 | return topic_model.get_topic_info()
38 |
39 | def reduce_topics(self, topic_model, data, nr_topics, key="query"):
40 | topic_model.reduce_topics(data[key].to_list(), nr_topics)
41 | return topic_model
42 |
43 | def get_representative_docs(self, topic_model):
44 | return topic_model.get_representative_docs()
45 |
46 | def reduce_outliers(self, topic_model, data, topics, probs, key="query", strategy="c-tf-idf"):
47 | if strategy == "c-tf-idf":
48 | new_topics = topic_model.reduce_outliers(data[key].to_list(), topics, strategy, threshold=0.1)
49 | elif strategy == "embeddings":
50 | new_topics = topic_model.reduce_outliers(data[key].to_list(), topics, strategy)
51 | elif strategy == "distributions":
52 | new_topics = topic_model.reduce_outliers(
53 | data[key].to_list(), topics, probabilities=probs, strategy=strategy
54 | )
55 | else:
56 | raise ValueError("Invalid strategy")
57 | topic_model.update_topics(
58 | data[key].to_list(),
59 | topics=new_topics,
60 | representation_model=self.representation_model,
61 | vectorizer_model=self.vectorizer_model,
62 | ctfidf_model=self.ctfidf_model,
63 | )
64 | return topic_model, new_topics
65 |
66 | def compute_hierarchical_topic_tree(self, topic_model, data, key="query"):
67 | hierarchical_topics = topic_model.hierarchical_topics(data[key].to_list())
68 | tree = topic_model.get_topic_tree(hierarchical_topics)
69 | return hierarchical_topics, tree
70 |
71 | def run(self, data, gnn_outputs: Tuple[np.ndarray, np.ndarray], key="query") -> dict:
72 | """
73 | Run the topic modeling.
74 | """
75 | # Prepare the data
76 | if self.config["auto_cluster"]:
77 | if self.config["representation_model"].split("(")[0] == "pipeline":
78 | generator = self.config["representation_model"]
79 | representation_model = [MaximalMarginalRelevance(diversity=0.3), TextGeneration(eval(generator))]
80 | elif self.config["representation_model"].split("(")[0] == "OpenAI":
81 | representation_model = [
82 | MaximalMarginalRelevance(diversity=0.3),
83 | eval(self.config["representation_model"]),
84 | ]
85 | else:
86 | lg.info("Using MaximalMarginalRelevance as representation model")
87 | representation_model = MaximalMarginalRelevance(diversity=0.3)
88 |
89 | topic_model = BERTopic(
90 | nr_topics="auto",
91 | vectorizer_model=eval(self.config["vectorizer_model"]),
92 | embedding_model=self.config["embedding_model"],
93 | representation_model=representation_model,
94 | n_gram_range=(1, 2),
95 | min_topic_size=10,
96 | top_n_words=10,
97 | calculate_probabilities=False,
98 | )
99 | else:
100 | # Fit the BERTopic model
101 | topic_model = BERTopic(
102 | nr_topics="auto",
103 | vectorizer_model=eval(self.config["vectorizer_model"]),
104 | umap_model=eval(self.config["dimensionality_reduction"]),
105 | hdbscan_model=eval(self.config["clustering_model"]),
106 |                 embedding_model=self.config["embedding_model"],
107 | representation_model=eval(self.config["representation_model"]),
108 | n_gram_range=(1, 2),
109 | min_topic_size=10,
110 | top_n_words=10,
111 | calculate_probabilities=False,
112 | )
113 | topics, probabilities = topic_model.fit_transform(data[key].to_list(), gnn_outputs[1])
114 | freq = self.get_topic_info(topic_model)
115 | rep_docs = self.get_representative_docs(topic_model)
116 | hr, tree = self.compute_hierarchical_topic_tree(topic_model=topic_model, data=data)
117 | # add dict with topic info
118 | docs = []
119 | for k, v in rep_docs.items():
120 | docs.append((k, v))
121 | rep = pd.DataFrame(docs, columns=["Topic", "Rep_docs"])
122 | topic_info_freq = pd.merge(freq, rep, on="Topic")
123 | data["topic"] = topics
124 | data["prob"] = probabilities
125 | return {"data": data, "topic_info": topic_info_freq, "tree": tree}
126 |
--------------------------------------------------------------------------------
/src/ml_models/transformers/category_decomposition.py:
--------------------------------------------------------------------------------
1 | import json
2 | from getpass import getpass
3 |
4 | import openai
5 |
6 | openai_secret = getpass("Enter the secret key: ")
7 | # Set up OpenAI API credentials
8 | openai.api_key = openai_secret
9 |
10 |
11 | def chat_gpt_inference(messages: list):
12 | response = openai.ChatCompletion.create(
13 | model="gpt-3.5-turbo",
14 | messages=messages,
15 | max_tokens=1500,
16 | temperature=0.2,
17 | )
18 | return response
19 |
20 |
21 | def create_prompt_message_template(text, role="user"):
22 | if role not in ["user", "assistant"]:
23 | raise ValueError("Not a valid role. Please use 'user' or 'assistant'.")
24 | return {"role": role, "content": text}
25 |
26 |
27 | def compose_inference(text_block, messages):
28 | user_template = create_prompt_message_template(text_block, role="user")
29 | messages.append(user_template)
30 | chat_resp = chat_gpt_inference(messages)
31 | reply_text = chat_resp["choices"][0]["message"]["content"]
32 | assistant_template = create_prompt_message_template(reply_text, role="assistant")
33 | messages.append(assistant_template)
34 | return messages, reply_text
35 |
36 |
37 | def category_generation(topic: str, topic_reasoning_chain: list):
38 |     """Generate a categorical-framework description for each topic in a reasoning-chain file.
39 |
40 |     Args:
41 |         topic (str): Initial topic label; replaced by the "label" field of each line in the file.
42 |         topic_reasoning_chain (str): Path to a JSONL file with one {"label": ...} record per line.
43 |
44 |     Writes one JSON row per topic to ct_output_file.jsonl.
45 |     """
46 | # Define the prompt
47 | system_prompt = """Please design a categorical framework that captures the essential qualities and thought processes of a genius category theorist. Begin by identifying the key objects and morphisms that represent the foundational elements of a genius category theorist's intellectual toolbox. Then, outline the relationships and transformations between these objects and morphisms, highlighting the critical reasoning, creativity, and innovative problem-solving strategies employed by such a theorist. Finally, explain how this category can serve as a foundation for further exploration and development of novel ideas within the realm of category theory."""
48 |
49 | messages = [{"role": "system", "content": system_prompt}]
50 |
51 | with open(topic_reasoning_chain, "r") as f:
52 | segments = f.readlines()
53 |
54 | with open("ct_output_file.jsonl", "a") as f:
55 | for i, line in enumerate(segments):
56 | # print(line)
57 |             topic = json.loads(line)["label"]
58 | print(topic)
59 | prompt = f"""Please provide a description of {topic} within a categorical framework,
60 | detailing the objects and morphisms that represent its essential components. Offer a brief summary of these objects and
61 | morphisms as they pertain to the category specific to {topic}."""
62 | try:
63 | messages, reply_text = compose_inference(prompt, messages)
64 |
65 | except Exception:
66 | messages, reply_text = compose_inference(prompt, [{"role": "system", "content": system_prompt}])
67 | print(reply_text)
68 | print()
69 | row = {
70 | "topic": topic,
71 | "assistant_reply": reply_text,
72 | }
73 | json.dump(row, f)
74 | f.write("\n")
75 |
--------------------------------------------------------------------------------
/src/ml_models/transformers/semantic_graph_context_generator.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import openai
4 |
5 | from src.ast_parsers.python_ast_parser import get_methods, parse_github_repo
6 | from src.config import Config
7 | from src.system_prompts.format_system_prompts import (
8 | extract_questions,
9 | format_system_prompts,
10 | format_system_prompts_with_tree,
11 | )
12 |
13 |
14 | class SemanticGraphContextGenerator:
15 | def __init__(self, config: Config):
16 | self.config = config
17 |
18 | def __repr__(self) -> str:
19 | return f"SemanticGraphContextGenerator(config={self.config})"
20 |
21 | def get_repo_contents(self, git_repo_path):
22 | contents = parse_github_repo(git_repo_path)
23 | print(len(contents))
24 | pruned_contents = []
25 | for cont in contents:
26 | fp = cont["file_name"]
27 | fn = fp.split("/")[-1]
28 | fn_ = fn.split(".")[0]
29 | if fn_ in ["__init__"] or fn_.split("_")[-1] in ["test"]:
30 | continue
31 | else:
32 | pruned_contents.append(cont)
33 | return pruned_contents
34 |
35 | def chat_gpt_inference(self, messages: list):
36 | response = openai.ChatCompletion.create(
37 | model="gpt-3.5-turbo",
38 | messages=messages,
39 | max_tokens=self.config["max_tokens"],
40 | temperature=self.config["temperature"],
41 | )
42 | return response
43 |
44 | def create_prompt_message_template(self, text, role="user"):
45 | if role not in ["user", "assistant"]:
46 | raise ValueError("Not a valid role. Please use 'user' or 'assistant'.")
47 | return {"role": role, "content": text}
48 |
49 | def compose_inference(self, text_block, messages):
50 | user_template = self.create_prompt_message_template(text_block, role="user")
51 | messages.append(user_template)
52 | chat_resp = self.chat_gpt_inference(messages)
53 | reply_text = chat_resp["choices"][0]["message"]["content"]
54 | assistant_template = self.create_prompt_message_template(reply_text, role="assistant")
55 | messages.append(assistant_template)
56 | return messages, reply_text
57 |
58 | def process_transcript(self, segments, file_name, git_repo_path, output_file_path, system_prompt, task, code_type):
59 | messages = [{"role": "system", "content": system_prompt}]
60 |
61 | with open(output_file_path, "a") as f:
62 | for i, sent in enumerate(segments):
63 |
64 | text_block = f"""```{sent}```"""
65 | try:
66 | messages, reply_text = self.compose_inference(text_block[:2000], messages)
67 |
68 | except Exception:
69 | messages, reply_text = self.compose_inference(
70 | text_block[:2000], [{"role": "system", "content": system_prompt}]
71 | )
72 | if task == "self-instruction":
73 | questions = extract_questions(reply_text)
74 | if len(questions) > 0:
75 | for q in questions:
76 | sub_text_block = f"""{q}```{sent}```"""
77 | print(sub_text_block)
78 | print()
79 | messages, reply_text = self.compose_inference(sub_text_block[:2000], messages)
80 | print(reply_text)
81 | row = {
82 | "git_repo_path": git_repo_path,
83 | "file_name": file_name,
84 | "code_type": code_type,
85 | "system_task": task,
86 | "system_prompt": system_prompt,
87 | "conversation_history": messages,
88 | "assistant_reply": reply_text,
89 | }
90 | json.dump(row, f)
91 | f.write("\n")
92 | else:
93 | row = {
94 | "git_repo_path": git_repo_path,
95 | "file_name": file_name,
96 | "code_type": code_type,
97 | "system_task": task,
98 | "system_prompt": system_prompt,
99 | "conversation_history": messages,
100 | "assistant_reply": reply_text,
101 | }
102 | json.dump(row, f)
103 | f.write("\n")
104 |
105 | return messages
106 |
107 | def decompose_repo(self, git_repo_path, name_id, out_path, skip_graph_generation=False):
108 | contents = self.get_repo_contents(git_repo_path)
109 | context_paths = []
110 | for cont in contents:
111 |
112 | if not self.config["with_tree"]:
113 | system_prompts = format_system_prompts(git_repo_path, cont["file_name"])
114 | else:
115 | system_prompts = format_system_prompts_with_tree(
116 | git_repo_path, cont["file_name"], self.config["topic_tree"]
117 | )
118 |             for k in system_prompts:
119 |
120 | func_task = k
121 | out_file_name = f"{name_id}_{func_task}"
122 | print(f"file_name: {cont['file_name']}")
123 | num_funcs = len(cont["functions"])
124 | num_classes = len(cont["classes"])
125 | print(f"Imports: {cont['imports']}")
126 | context_paths.append(f"{out_path}/{out_file_name}.jsonl")
127 | if skip_graph_generation:
128 | continue
129 | try:
130 | if num_funcs > 0 or num_classes > 0:
131 | print(f" len of functions: {len(cont['functions'])}")
132 | _ = self.process_transcript(
133 | cont["functions"],
134 | cont["file_name"],
135 | git_repo_path,
136 | f"{out_path}/{out_file_name}.jsonl",
137 | system_prompts[func_task],
138 | func_task,
139 | "functions",
140 | )
141 | _ = self.process_transcript(
142 | cont["classes"],
143 | cont["file_name"],
144 | git_repo_path,
145 | f"{out_path}/{out_file_name}.jsonl",
146 | system_prompts[func_task],
147 | func_task,
148 | "classes",
149 | )
150 | for cls in cont["classes"]:
151 | cls_funcs = get_methods(cls)
152 |
153 | print(f"len of class: {len(cls)}")
154 | for method in cls_funcs:
155 | print(f"len of method: {len(method)}")
156 | _ = self.process_transcript(
157 | cls_funcs,
158 | cont["file_name"],
159 | git_repo_path,
160 | f"{out_path}/{out_file_name}.jsonl",
161 | system_prompts[func_task],
162 | func_task,
163 | "methods",
164 | )
165 | except Exception as e:
166 | print(e)
167 | continue
168 | print("\n\n")
169 | return set(context_paths)
170 |
--------------------------------------------------------------------------------
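A note on `src/ml_models/transformers/semantic_graph_context_generator.py` above: `compose_inference` and `process_transcript` implement a simple accumulating chat loop in which each code segment appends a user/assistant pair to a shared `messages` list, and that list is what gets written out as `conversation_history`. The sketch below reproduces the pattern offline; `fake_chat` is a made-up stand-in for the OpenAI call and is not part of the repository.

```python
# Minimal sketch of the conversation-accumulation pattern used by
# compose_inference/process_transcript. `fake_chat` is a stub so the example
# runs offline; it is NOT the repository's inference code.
def fake_chat(messages):
    last = messages[-1]["content"]
    return {"choices": [{"message": {"content": f"summary of: {last[:40]}..."}}]}

def compose(text_block, messages):
    messages.append({"role": "user", "content": text_block})
    reply = fake_chat(messages)["choices"][0]["message"]["content"]
    messages.append({"role": "assistant", "content": reply})
    return messages, reply

messages = [{"role": "system", "content": "Summarize each code block."}]
for segment in ["def add(a, b): return a + b", "class Greeter: ..."]:
    messages, reply = compose(f"```{segment}```", messages)
    print(reply)
# Each turn appends a user/assistant pair, so `messages` holds the full history
# that later lands in the conversation_history field of each .jsonl row.
```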
/src/notebooks/inference_me.md:
--------------------------------------------------------------------------------
1 | ```python
2 | from ast_parsers.python_ast_parser import parse_python_file, get_methods, parse_github_repo
3 | from system_prompts.format_system_prompts import format_system_prompts, format_system_prompts_with_tree
4 | from ..ml_models.transformers.chatgpt_api_inference import process_transcript
5 | from ..repo_graph_generation import decompose_repo
6 | from ..ml_models.graph_networks.kernels import compute_kernel_by_type, graph_laplacian, compute_kernel
7 | from ..ml_models.graph_networks.message_passing import k_hop_message_passing_sparse
8 | from ..ml_models.transformers.sentence_embeddings import embed_data
9 | from ..ml_models.topic_modeling.umap_hdbscan_pipeline import (
10 | load_topic_model,
11 | get_topic_model,
12 | get_representative_docs,
13 | reduce_outliers,
14 | fit_topic_model,
15 | compute_hierarchical_topic_tree,
16 | get_topic_info,
17 | )
18 | import json
19 | import openai
20 | from getpass import getpass
21 |
22 | openai_secret = getpass("Enter the secret value: ")
23 | # Set up OpenAI API credentials
24 | openai.api_key = openai_secret
25 |
26 |
27 | !git clone https://github.com/danielpatrickhug/GitModel.git
28 | ```
29 |
30 | TODO: standardize this setup.
31 |
32 | ```python
33 | git_repo_path = "/content/GitModel"
34 | out_path = "/content/gitmodel_sum"
35 | name_id = "gitmodel"
36 | contents = parse_github_repo(git_repo_path)
37 | print(len(contents))
38 | pruned_contents = []
39 | for cont in contents:
40 | fp = cont["file_name"]
41 | fn = fp.split("/")[-1]
42 | fn_ = fn.split(".")[0]
43 | if fn_ in ["__init__"] or fn_.split("_")[-1] in ["test"]:
44 | continue
45 | else:
46 | print(cont["file_name"])
47 | pruned_contents.append(cont)
48 |
49 | decompose_repo(git_repo_path, name_id, out_path)  # arguments mirror SemanticGraphContextGenerator.decompose_repo
50 | ```
51 |
52 | ```python
53 | def load_jsonl(filepaths):
54 | data = []
55 | for filepath in filepaths:
56 | with open(filepath, "r") as f:
57 | for line in f:
58 | data.append(json.loads(line))
59 | return data
60 |
61 |
62 | root_dir = "/content/gitmodel_sum"
63 | repo = "gitmodel"
64 | repo_files = [
65 | f"{root_dir}/{repo}_summary.jsonl",
66 | f"{root_dir}/{repo}_question_asking.jsonl",
67 | ]
68 |
69 | res = load_jsonl(repo_files)
70 | sents = []
71 | for r in res:
72 | messages = r["conversation_history"]
73 | reply = r["assistant_reply"]
74 | sents.append(reply)
75 | sents.append(messages[-2]["content"])
76 | ```
77 |
78 | ```python
79 | data = pd.DataFrame(sents, columns=["query"])
80 | data["_id"] = data.index
81 | ```
82 |
83 | ```python
84 | MODEL_NAME = "all-MiniLM-L6-v2"  # alternative: "allenai-specter"
85 | embs = embed_data(data, model_name=MODEL_NAME)
86 | A = compute_kernel_by_type(embs, threshold=0.6, kernel_type="cosine")
87 | k = 2
88 | A_k, agg_features = k_hop_message_passing_sparse(A, embs, k)
89 | ```
90 |
91 | Graph Laplacian
92 |
93 | ```python
94 | L, D = graph_laplacian(A)
95 | L_k, D_k = graph_laplacian(A_k)
96 | ```
97 |
98 | SVD of A and A_k, to inspect their singular value spectra
99 |
100 | ```python
101 | U, S, VT = np.linalg.svd(A)
102 | print(f"U: {U.shape}\n")
103 | print(f"S: {S.shape}\n")
104 | print(f"VT: {VT.shape}\n")
105 | plt.plot(S)
106 | plt.xlabel("Singular value index")
107 | plt.ylabel("Singular value")
108 | plt.title("Singular values of A")
109 | plt.show()
110 | ```
111 |
112 | ```python
113 | U_k, S_k, VT_k = np.linalg.svd(A_k)
114 | print(f"U_{k}: {U_k.shape}\n")
115 | print(f"S_{k}: {S_k.shape}\n")
116 | print(f"VT_{k}: {VT_k.shape}\n")
117 | plt.plot(S_k)
118 | plt.xlabel("Singular value index")
119 | plt.ylabel("Singular value")
120 | plt.title("Singular values of A_k")
121 | plt.show()
122 | ```
123 |
124 | ```python
125 | topic_model = load_topic_model(nr_topics="auto")
126 | topics, probs = fit_topic_model(topic_model, data, agg_features)
127 | freq = get_topic_info(topic_model)
128 | rep_docs = get_representative_docs(topic_model)
129 | hr, tree = compute_hierarchical_topic_tree(topic_model=topic_model, data=data)
130 | ```
131 |
--------------------------------------------------------------------------------
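For reference on the Laplacian and SVD cells in `src/notebooks/inference_me.md` above, and assuming `graph_laplacian` follows the standard unnormalized definition (the actual implementation lives in `src/ml_models/graph_networks/kernels.py` and may differ), the quantities being inspected are:

```latex
\[
D_{ii} = \sum_{j} A_{ij}, \qquad L = D - A, \qquad A = U \Sigma V^{\top}
\]
```

Plotting the singular values of A and of the k-hop matrix A_k then gives a rough view of how much effective rank the message-passing step adds to or removes from the semantic graph.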
/src/pipeline.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import os
4 | import random
5 | import sys
6 | from typing import Tuple
7 |
8 | import numpy as np
9 | import pandas as pd
10 |
11 | from src.config import Config
12 | from src.dynamic_import import instantiate_class_from_config
13 | from src.fetch_repo import clone_and_create_context_folder
14 | from src.ml_models.graph_networks.gnn_head import combine_graphs_with_gat, linearly_sum_gnn_heads
15 |
16 | lg = logging.getLogger(__name__)
17 | lg.setLevel(logging.DEBUG)
18 | handler = logging.StreamHandler(sys.stdout)
19 | handler.setFormatter(logging.Formatter("[%(asctime)s] [%(levelname)s] - %(message)s"))
20 | lg.addHandler(handler)
21 |
22 |
23 | class Pipeline:
24 | def __init__(self, config: Config):
25 | self.config = config
26 | self._obj_map = instantiate_class_from_config(config)
27 | self.pipeline_settings = self._obj_map["pipeline_settings"][0]
28 |         # components instantiated from the config are stored per group as lists
29 |         self.semantic_graph_context_generator = list(
30 |             self._obj_map["semantic_graph_context_generator"]
31 |         )
32 |         self.gnn_heads = list(self._obj_map["gnn_heads"])
33 | self.topic_model = self._obj_map["topic_model"][0]
34 |
35 | @classmethod
36 | def from_config(cls, config: Config):
37 | return cls(config)
38 |
39 | @classmethod
40 | def from_yaml(cls, yaml_path):
41 | config = Config.from_yaml(yaml_path)
42 | return cls(config)
43 |
44 | def find_files_with_substring(self, root_dir, substring):
45 | for dirpath, dirnames, filenames in os.walk(root_dir):
46 | for filename in filenames:
47 | if substring in filename:
48 | yield os.path.join(dirpath, filename)
49 |
50 | def load_jsonl(self, filepaths):
51 | res = []
52 | for filepath in filepaths:
53 | with open(filepath, "r") as f:
54 | for line in f:
55 | res.append(json.loads(line))
56 | sents = []
57 | for r in res:
58 | messages = r["conversation_history"]
59 | reply = r["assistant_reply"]
60 | sents.append(reply)
61 | sents.append(messages[-2]["content"])
62 | data = pd.DataFrame(sents, columns=["query"])
63 | data["_id"] = data.index
64 | return data
65 |
66 |     def run(self, git_repo: str, repo_name: str) -> Tuple[list, list]:
67 |         """
68 |         Run the full pipeline: clone the repo, generate semantic graph context,
69 |         build and combine the GNN heads, and fit the topic model."""
70 | lg.info("Running pipeline...")
71 | lg.info("Fetching repo...")
72 | repo_folder, context_folder = clone_and_create_context_folder(git_repo, repo_name)
73 | lg.info("Generating semantic graph context...")
74 |
75 | semantic_graph_context = [
76 | context_generator.decompose_repo(
77 | repo_folder,
78 | repo_name,
79 | context_folder,
80 | skip_graph_generation=self.pipeline_settings.config["skip_graph_creation"],
81 | )
82 | for context_generator in self.semantic_graph_context_generator
83 | ]
84 |         # the generated .jsonl context files are rediscovered on disk rather than
85 |         # taken from decompose_repo's return value
86 |         context_files = list(self.find_files_with_substring(context_folder, repo_name))
87 |
88 | lg.info(context_files)
89 | data = self.load_jsonl(context_files)
90 |
91 | lg.info("Running GNN heads...")
92 | gnn_head_outputs = [gnn_head.generate_graph(data) for gnn_head in self.gnn_heads]
93 | lg.info("Combining GNN heads...")
94 | lg.info(self.pipeline_settings)
95 |
96 | if self.pipeline_settings.config["combine_gnn_strategy"] == "sum":
97 | combined_gnn_head = linearly_sum_gnn_heads(gnn_head_outputs, self.pipeline_settings.config["norm_fn"])
98 | elif self.pipeline_settings.config["combine_gnn_strategy"] == "gat":
99 | combined_gnn_head = combine_graphs_with_gat(gnn_head_outputs)
100 | elif self.pipeline_settings.config["combine_gnn_strategy"] == "none":
101 | combined_gnn_head = gnn_head_outputs[0]
102 | elif self.pipeline_settings.config["combine_gnn_strategy"] == "random":
103 | # Choose a random GNN head as the final output
104 | combined_gnn_head = random.choice(gnn_head_outputs)
105 | else:
106 |             raise ValueError(f"Unknown combine_gnn_strategy setting: {self.pipeline_settings.config['combine_gnn_strategy']}")
107 |
108 | lg.info("Running topic model...")
109 |
110 | topic_model_outputs = self.topic_model.run(data, combined_gnn_head)
111 |
112 | return gnn_head_outputs, [topic_model_outputs]
113 |
114 |     def get_repo_contents(self, git_repo_path):
115 |         # delegate to the first configured context generator
116 |         return self.semantic_graph_context_generator[0].get_repo_contents(git_repo_path)
117 |
118 |     def decompose_repo(self, git_repo_path, name_id, out_path, skip_graph_generation=False):
119 |         return self.semantic_graph_context_generator[0].decompose_repo(git_repo_path, name_id, out_path, skip_graph_generation)
120 |
--------------------------------------------------------------------------------
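A usage note for `src/pipeline.py` above: `Pipeline.from_yaml` and `run` are the two entry points shown in the file. The snippet below is a minimal sketch of driving them with the repository's own `test_config.yaml`; it assumes the config path resolves from the working directory, that `run` accepts a clonable repo URL, and that OpenAI credentials are already configured, since context generation calls the chat API.

```python
# Minimal sketch; the config path and repo URL are illustrative, not prescribed
# by the file above.
from src.pipeline import Pipeline

pipeline = Pipeline.from_yaml("test_config.yaml")
gnn_head_outputs, topic_model_outputs = pipeline.run(
    "https://github.com/danielpatrickhug/GitModel.git",  # git_repo
    "gitmodel",                                          # repo_name
)
print(len(gnn_head_outputs), len(topic_model_outputs))
```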
/src/system_prompts/format_system_prompts.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 |
4 | def format_system_prompts(git_repo_path, file_name):
5 | system_prompts = {
6 | "self-instruction": f"""
7 | GitHub repository: {git_repo_path} you're currently in the file {file_name}.
8 | Generate a list of irreducible questions related to a given piece of code that can help understand its functionality and behavior. Ensure that the questions are consistently formatted and easy to parse. The irreducible questions should be answerable with the provided piece of code and cannot be further simplified. Consider the following aspects when generating the questions:
9 |
10 | What is the purpose of the code?
11 | What input does the code take, and how does it process the input?
12 | What output does the code produce?
13 | Use these aspects to generate a set of irreducible questions that can help anyone understand the code and its behavior.
14 | """,
15 | }
16 |
17 | return system_prompts
18 |
19 |
20 | def extract_questions(prompt_output):
21 | # Define a regular expression to match the questions
22 | question_regex = r"\d+\.\s(.+\?)\n"
23 |
24 | # Find all matches of the question regex in the prompt output
25 | questions = re.findall(question_regex, prompt_output)
26 | return questions
27 |
28 |
29 | def format_system_prompts_with_tree(git_repo_path, file_name, topic_tree):
30 | system_prompts = {
31 | "summary2": f"""
32 | Provide a concise overview of the codebase located at {git_repo_path} and the file {file_name}, including its main features and capabilities.
33 |     ChatGPT will use its natural language processing capabilities to analyze the codebase and generate a brief summary of its main functionalities, tools, and technologies used. Here is the repo's hierarchical topic tree
34 | {topic_tree} You can use it to better understand the global structure of the repository
35 | """,
36 | "summary3": f"""
37 | Can you summarize the main functionality of the codebase located at {git_repo_path} and the file {file_name}, as well as any notable libraries or frameworks used?
38 |     ChatGPT will use its natural language processing capabilities to analyze the codebase and provide a brief summary of its main functionalities, libraries, and frameworks used. Here is the repo's hierarchical topic tree
39 | {topic_tree} You can use it to better understand the global structure of the repository
40 | """,
41 | "summary": f"""
42 |     Summarize the code in the GitHub repository: {git_repo_path} you're currently in the file {file_name}
43 | I want you to act as a code summarizer, you will use advanced natural language processing capabilities to analyze the code and generate a concise summary that captures
44 | the main functionality and purpose of the codebase. Additionally, ChatGPT can provide insights into the programming languages and libraries used in the repository,
45 |     as well as any notable features or functionalities that are present. Here is a topic tree of the repository
46 | {topic_tree} You can use it to better understand the global structure of the repository
47 | """,
48 | "qa_chain": f"""
49 | GitHub repository: {git_repo_path} you're currently in the file {file_name}.
50 |     I want you to ask questions that a new developer may ask about the codebase used in the repository,
51 |     as well as answer each question with step-by-step reasoning as a senior dev would. All responses should first ask a question and then answer with reasoning.
52 | Here is a topic tree of the repository
53 | {topic_tree} You can use it to better understand the global structure of the repository
54 | """,
55 | }
56 | return system_prompts
57 |
--------------------------------------------------------------------------------
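A quick check on `extract_questions` in `src/system_prompts/format_system_prompts.py` above: the regex `\d+\.\s(.+\?)\n` only captures numbered lines that end in a question mark followed by a newline. The sample reply text below is invented for the illustration.

```python
# Illustration of what extract_questions matches; the reply string is made up.
import re

question_regex = r"\d+\.\s(.+\?)\n"
reply = (
    "1. What is the purpose of the code?\n"
    "2. What input does it take?\n"
    "3. This line has no question mark\n"
    "4. What output does it produce?"  # no trailing newline
)
print(re.findall(question_regex, reply))
# ['What is the purpose of the code?', 'What input does it take?']
# Line 3 is skipped (no '?'), and line 4 is skipped because the pattern
# requires a newline after the question mark.
```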
/test_config.yaml:
--------------------------------------------------------------------------------
1 | pipeline_settings:
2 | - __impl__: PipelineSettings
3 | combine_gnn_strategy: none
4 | norm_fn: none
5 | skip_graph_creation: False
6 |
7 | semantic_graph_context_generator:
8 | - __impl__: SemanticGraphContextGenerator
9 | generator_id: self-instruction
10 | model_id: gpt-3.5-turbo
11 | temperature: 0
12 | max_tokens: 500
13 | with_tree: False
14 |
15 | gnn_heads:
16 | - __impl__: GNNHead
17 | generator_id: message_passing
18 | k_hop: 2
19 | is_sparse: True
20 | embbedding_id: sbert
21 | embbedding_model: all-MiniLM-L6-v2
22 | kernel_id: cosine_similarity
23 | graph_id: adjacency
24 | connection_threshold: 0.7
25 |
26 | topic_model:
27 | - __impl__: TopicModel
28 | topic_model_id: topic_model
29 | embedding_model: all-MiniLM-L6-v2
30 | clustering_model:
31 | HDBSCAN(min_cluster_size=15, metric='euclidean',
32 | cluster_selection_method='eom', prediction_data=True)
33 | dimensionality_reduction:
34 |     UMAP(n_neighbors=15, n_components=5, min_dist=0.0,
35 | metric='cosine')
36 | vectorizer_model: CountVectorizer(stop_words="english")
37 | representation_model:
38 | OpenAI(model="gpt-3.5-turbo", delay_in_seconds=3, chat=True)
39 | hierarchical_linkage_fn: sch.linkage(x, 'single', optimal_ordering=True)
40 | with_tree: True
41 | auto_cluster: True
42 |
--------------------------------------------------------------------------------
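The `__impl__` keys in `test_config.yaml` above appear to name the class each entry should instantiate, with the remaining keys acting as that object's configuration; this reading is inferred from how `Pipeline` consumes the groups, and the authoritative logic is `instantiate_class_from_config` in `src/dynamic_import.py`, which is not shown in this excerpt. A hypothetical sketch of consuming the convention:

```python
# Hypothetical sketch of how the __impl__ convention could be consumed; the real
# resolution logic in src/dynamic_import.py may differ from this.
import yaml

with open("test_config.yaml") as f:
    raw = yaml.safe_load(f)

for group, entries in raw.items():            # e.g. "gnn_heads" -> [{...}]
    for entry in entries:
        impl_name = entry.pop("__impl__")     # class to instantiate, e.g. "GNNHead"
        print(group, impl_name, entry)        # remaining keys become that object's config
        # e.g. cls = registry[impl_name]; obj = cls(entry)   (illustrative only)
```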