├── src
├── __init__.py
├── ocr_handler.py
├── database_handler.py
└── data_visualizer.py
├── .github
└── workflows
│ ├── .nojekyll
│ ├── requirements.txt
│ ├── base_index.rst
│ ├── read_changed_files.py
│ ├── llm-documenter.yml
│ ├── readme.py
│ └── abstract_tree.py
├── waterkant.png
├── LICENSE
├── README.md
├── .gitignore
└── .idea
└── workspace.xml
/src/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.github/workflows/.nojekyll:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/waterkant.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MalteLeuschner/llm_github_actions/HEAD/waterkant.png
--------------------------------------------------------------------------------
/.github/workflows/requirements.txt:
--------------------------------------------------------------------------------
1 | python-dotenv~=1.0.1
2 | requests~=2.32.3
3 | pydantic~=2.8.0
4 | groq
5 | astor~=0.8.1
6 | regex~=2024.5.15
7 | PyGithub
8 | black
argparse  # NOTE(review): argparse is in the standard library since Python 3.2 — this pip dependency is redundant and can be removed.
10 | autopep8
11 | sphinx
12 | sphinx_rtd_theme
13 | streamlit
14 | pytesseract
15 |
--------------------------------------------------------------------------------
/.github/workflows/base_index.rst:
--------------------------------------------------------------------------------
1 | .. llm_github_actions documentation master file, created by
2 | sphinx-quickstart on Fri Jul 5 08:23:30 2024.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | Welcome to llm_github_actions's documentation!
7 | ==============================================
8 |
9 | .. toctree::
10 | :maxdepth: 2
11 | :caption: Contents:
12 |
13 |
14 |
15 | Indices and tables
16 | ==================
17 |
18 | * :ref:`genindex`
19 | * :ref:`modindex`
20 | * :ref:`search`
--------------------------------------------------------------------------------
/src/ocr_handler.py:
--------------------------------------------------------------------------------
1 | import pytesseract
2 | from PIL import Image
3 |
4 |
class OCRHandler:
    """Thin wrapper around pytesseract for extracting text from image files."""

    def __init__(self, tesseract_cmd):
        """Point pytesseract at the tesseract binary to use.

        Args:
            tesseract_cmd: Filesystem path to the ``tesseract`` executable.
        """
        # NOTE: this mutates module-global pytesseract state, so the most
        # recently constructed handler wins if several are created.
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    def perform_ocr(self, image_path):
        """Run OCR on an image file and return the recognized text.

        Args:
            image_path: Path to the image file to read.

        Returns:
            str: The text tesseract extracted from the image.
        """
        # Use a context manager so the underlying file handle is released
        # promptly; PIL opens files lazily and otherwise keeps them open.
        with Image.open(image_path) as image:
            return pytesseract.image_to_string(image)
14 |
15 |
def main():
    """Demo entry point: OCR a sample balance-sheet image and print the text."""
    handler = OCRHandler("/usr/bin/tesseract")
    print(handler.perform_ocr("balance_sheet.png"))


if __name__ == "__main__":
    main()
24 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 leuschnm
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/src/database_handler.py:
--------------------------------------------------------------------------------
1 | from typing import Tuple, List
2 | import sqlite3
3 |
4 |
class DatabaseHandler:
    """Small SQLite helper for storing and reading financial transactions.

    Opens (or creates) the database on construction and ensures the
    ``transactions`` table exists.  Can also be used as a context manager
    so the connection is closed even if an error occurs mid-block.
    """

    def __init__(self, db_name):
        """Connect to *db_name* (created if missing) and ensure the table exists.

        Args:
            db_name: Path to the SQLite database file (":memory:" works too).
        """
        self.conn = sqlite3.connect(db_name)
        self.cursor = self.conn.cursor()
        self.create_table()

    def __enter__(self):
        """Support ``with DatabaseHandler(...) as db:`` usage."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the connection when the ``with`` block exits."""
        self.close()
        return False  # never suppress exceptions

    def create_table(self):
        """Create the transactions table if it does not exist yet."""
        self.cursor.execute(
            """
            CREATE TABLE IF NOT EXISTS transactions (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                date TEXT,
                description TEXT,
                amount REAL
            )
            """
        )

    def insert_transaction(self, date, description, amount):
        """Insert one transaction row and commit immediately.

        Args:
            date: Transaction date as text (e.g. ISO ``YYYY-MM-DD``).
            description: Free-form description of the transaction.
            amount: Transaction amount.
        """
        self.cursor.execute(
            """
            INSERT INTO transactions (date, description, amount)
            VALUES (?, ?, ?)
            """,
            (date, description, amount),
        )
        self.conn.commit()

    def query_transactions(self):
        """Return all rows of the transactions table as a list of tuples."""
        self.cursor.execute("SELECT * FROM transactions")
        return self.cursor.fetchall()

    def close(self):
        """Close the underlying SQLite connection."""
        self.conn.close()
40 |
41 |
def main():
    """Demo: open the example database, add one row, and print every row."""
    handler = DatabaseHandler("financial_data.db")
    handler.insert_transaction("2024-07-05", "Deposit", 1000.0)
    for row in handler.query_transactions():
        print(row)
    handler.close()


if __name__ == "__main__":
    main()
53 |
--------------------------------------------------------------------------------
/src/data_visualizer.py:
--------------------------------------------------------------------------------
1 | from typing import List, Dict
2 | import streamlit as st
3 | import pandas as pd
4 | import matplotlib.pyplot as plt
5 |
6 |
class DataVisualizer:
    """Render transaction data in a Streamlit app as a table and charts."""

    def __init__(self, data):
        """Wrap *data* (anything pandas accepts) in a DataFrame.

        NOTE(review): the chart methods assume the frame has "Date" and
        "Amount" columns — confirm against callers.
        """
        self.df = pd.DataFrame(data)

    def display_data(self):
        """Show the raw transaction table."""
        st.write("### Transaction Data")
        st.dataframe(self.df)

    def plot_line_chart(self):
        """Draw amounts over time as a line chart with point markers."""
        st.write("### Line Chart of Amounts")
        figure, axes = plt.subplots()
        axes.plot(self.df["Date"], self.df["Amount"], marker="o")
        axes.set_xlabel("Date")
        axes.set_ylabel("Amount")
        axes.set_title("Transaction Amounts Over Time")
        st.pyplot(figure)

    def plot_bar_chart(self):
        """Draw amounts over time as a bar chart."""
        st.write("### Bar Chart of Amounts")
        figure, axes = plt.subplots()
        axes.bar(self.df["Date"], self.df["Amount"])
        axes.set_xlabel("Date")
        axes.set_ylabel("Amount")
        axes.set_title("Transaction Amounts Over Time")
        st.pyplot(figure)
33 |
34 |
def main():
    """Build a small demo dataset and render every available view."""
    demo_data = {
        "Date": ["2024-07-01", "2024-07-02", "2024-07-03", "2024-07-04", "2024-07-05"],
        "Amount": [100, 200, 150, 300, 250],
    }
    visualizer = DataVisualizer(demo_data)
    st.title("Financial Data Visualization")
    visualizer.display_data()
    visualizer.plot_line_chart()
    visualizer.plot_bar_chart()


if __name__ == "__main__":
    main()
49 |
--------------------------------------------------------------------------------
/.github/workflows/read_changed_files.py:
--------------------------------------------------------------------------------
1 | from github import Github
2 | import os
3 | import sys
4 | import argparse
5 |
def get_changed_files(repo, commit_sha, folder='src'):
    """Return the Python files under *folder* that a commit touched.

    Args:
        repo: A PyGithub repository object.
        commit_sha: SHA of the commit to inspect.
        folder: Path prefix the files must live under (default ``'src'``).

    Returns:
        List of filenames ending in ``.py`` whose path starts with *folder*.
    """
    matching = []
    for changed in repo.get_commit(commit_sha).files:
        path = changed.filename
        if path.startswith(folder) and path.endswith('.py'):
            matching.append(path)
    return matching
13 |
14 |
def main(folder):
    """Report the .py files under *folder* changed by the current commit.

    Reads GITHUB_TOKEN, GITHUB_REPOSITORY and GITHUB_SHA from the
    environment (exits with status 1 when any is missing), prints each
    changed file, and exposes the space-joined list as the step output
    ``changed_files``.

    Args:
        folder: Path prefix to restrict the search to (e.g. ``src``).
    """
    token = os.getenv('GITHUB_TOKEN')
    if not token:
        print('GITHUB_TOKEN environment variable is not set.')
        sys.exit(1)
    g = Github(token)
    repo_name = os.getenv('GITHUB_REPOSITORY')
    commit_sha = os.getenv('GITHUB_SHA')
    if not repo_name or not commit_sha:
        print(
            'Required environment variables (GITHUB_REPOSITORY, GITHUB_SHA) are missing.'
        )
        sys.exit(1)
    repo = g.get_repo(repo_name)
    changed_files = get_changed_files(repo, commit_sha, folder)
    for file in changed_files:
        print(file)
    output_value = ' '.join(changed_files)
    output_path = os.getenv('GITHUB_OUTPUT')
    if output_path:
        # Modern mechanism: append key=value to the $GITHUB_OUTPUT file.
        # The `::set-output` workflow command is deprecated and no longer
        # honored by current GitHub-hosted runners.
        with open(output_path, 'a') as output_file:
            output_file.write(f'changed_files={output_value}\n')
    else:
        # Fallback for old/self-hosted runners without GITHUB_OUTPUT.
        print(f'::set-output name=changed_files::{output_value}')
33 |
34 |
if __name__ == '__main__':
    # CLI entry point: one positional argument naming the source folder.
    cli = argparse.ArgumentParser(
        description="Look for changes in python files."
    )
    cli.add_argument('folder', metavar='F', type=str, nargs=1,
                     help='Folder containing source files to comment.')
    parsed = cli.parse_args()
    source_folder = parsed.folder[0]
    print(source_folder)
    main(source_folder)
45 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 |
4 | Auto-documentation with LLMs in Python
5 |
6 |
7 |
We implement a GitHub Actions LLM workflow that creates docstrings, typing, linting, and Sphinx auto-documentation.
This was built during the coding.waterkant hackathon in Kiel.
10 |
11 | ## Setup-instructions
12 |
13 | Since several attendees asked how to set up the workflow and non-existent documentation for a tool that documents for you is amusing but not very helpful, here is a short explanation on how to get this tool working:
14 |
15 | 1. **Generate GROQ API Key:**
16 | - The action uses the [GROQ](https://groq.com/) API to generate docstrings and summarize the code.
17 | - Generate an [API key](https://console.groq.com/keys) and save it under the name `GROQ_API_KEY` in the [repo's secrets](https://docs.github.com/en/actions/security-guides/using-secrets-in-github-actions#creating-secrets-for-a-repository).
18 |
19 | 2. **Copy the .github Folder:**
20 | - Copy the `.github` folder from this repo to your project's root folder. Ignore everything else in this repo; the files in `src/` are just there as examples.
21 |
22 | 3. **Configure Source Path:**
23 | - If your `.py` files are in a folder called `src`, the action should work as is. Otherwise, edit the `llm-documenter.yml` file to change the value of the `SOURCE_PATH` environment variable to represent your source folder.
24 | ```yaml
25 | env:
26 | SOURCE_PATH: your_source_folder
27 | ```
   - If they do not have them already, all of your code folders also need `__init__.py` files, even if the files are empty; Sphinx autodoc will ignore folders without one.
29 |
30 | 4. **Push Changes:**
31 | - After every push, you can manually start the action in the github-actions-tab. It will:
32 | - Add docstrings to `.py` files that have changed, add typing and lint your files. The changes will be pushed to a new "commented_branch".
33 | - Summarize your project, build a Sphinx documentation, and put it in the `docs` folder in a gh_pages-branch.
34 |
35 | 5. **Publish Documentation (Optional):**
36 | - If your repo is public, you can make the documentation accessible via [GitHub Pages](https://docs.github.com/en/pages/quickstart).
37 |
38 | 6. **Profit!**
39 |
40 |
We tried to make the action as non-invasive as possible, but we did encounter some issues with complex return values and class imports from other modules in one project.
The landing page for the documentation is also somewhat prone to hallucination, since we added it at the last minute.
What we are trying to say is that we do not guarantee anything — but a start in documenting is still a start :).
44 |
--------------------------------------------------------------------------------
/.github/workflows/llm-documenter.yml:
--------------------------------------------------------------------------------
# Manually triggered workflow that (1) asks an LLM to add docstrings/typing
# to changed Python files and pushes them to `commented_branch`, and
# (2) builds Sphinx documentation and force-pushes it to `gh_pages`.
name: LLM_Documenter

on: [workflow_dispatch]
permissions:
  contents: write  # required to push the generated branches

env:
  SOURCE_PATH: 'src'  # folder scanned for changed .py files

jobs:
  document_changed_python_files:
    concurrency: ci-${{ github.ref }}
    runs-on: ubuntu-latest
    outputs:
      changed_files: ${{ steps.python_script.outputs.changed_files }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: New Branch creation
        run: |
          git checkout -b commented_branch

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Install python dependencies
        run: pip install -r .github/workflows/requirements.txt

      # Emits the space-separated list of changed .py files as step output.
      # NOTE(review): the helper script reports its result via the legacy
      # `::set-output` workflow command — confirm the runner still honors
      # it, or migrate the script to write to $GITHUB_OUTPUT.
      - name: Read changed .py in commit
        id: python_script
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_SHA: ${{ github.sha }}
        run: |
          python3.10 .github/workflows/read_changed_files.py $SOURCE_PATH

      - name: Commit files with added dummyDS
        if: steps.python_script.outputs.changed_files != ''
        run: |
          echo "Generating list of outputs...!"
          # NOTE(review): quoting the expansion makes this loop run exactly
          # once with the entire list as $OUTPUT; it still works because
          # $OUTPUT is then word-split unquoted inside the body, but a
          # per-file iteration was probably intended (drop the quotes).
          for OUTPUT in "${{ steps.python_script.outputs.changed_files }}"; do
            python3.10 .github/workflows/abstract_tree.py ${{ secrets.GROQ_API_KEY }} $OUTPUT
            autopep8 --in-place --aggressive $OUTPUT
            black $OUTPUT
            git add $OUTPUT
          done
          echo "Committing updated .py files...!"
          git config --global user.email "you@example.com"
          git config --global user.name "LLM-Docstring-Agent"
          git commit -m "LLM-documented files pushed"
          git push origin commented_branch --force

      - name: Init Sphinx documentation
        run: |
          git checkout -b gh_pages
          rm -Rf docs
          # NOTE(review): after the `rm -Rf docs` above, docs/Makefile can
          # never exist, so this guard is always true and sphinx-quickstart
          # always runs — the check is effectively dead.
          if ! test -f "docs/Makefile"; then
            mkdir -p docs
            sphinx-quickstart --sep --makefile --ext-autodoc --project="${{ github.event.repository.name }}" --author="LLM-Bot" --release="0.0.1" --language="en" docs
            echo "import sys, os" >> ./docs/source/conf.py
            echo "sys.path.insert(0, os.path.abspath('../..'))" >> ./docs/source/conf.py
            echo "extensions += ['sphinx.ext.napoleon', 'sphinx_rtd_theme', 'sphinx.ext.githubpages']" >> ./docs/source/conf.py
            echo "html_theme = 'sphinx_rtd_theme'" >> ./docs/source/conf.py
          fi
      - name: Render sphinx
        run: |
          sphinx-apidoc -f -o docs/source ./
          python3.10 .github/workflows/readme.py ${{ secrets.GROQ_API_KEY }} $SOURCE_PATH
          sphinx-build -M html ./docs/source ./docs
          cp -r ./docs/html/* ./docs/
          rm -r ./docs/html
          git add -f ./docs/.

      - name: Commit files
        run: |
          touch ./docs/.nojekyll
          git add -f ./docs/.nojekyll
          git commit -m "Add .nojekyll"
          git push origin gh_pages --force
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | *.DS_Store
6 | */.DS_Store
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | share/python-wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .nox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | *.py,cover
52 | .hypothesis/
53 | .pytest_cache/
54 | cover/
55 |
56 | # Translations
57 | *.mo
58 | *.pot
59 |
60 | # Django stuff:
61 | *.log
62 | local_settings.py
63 | db.sqlite3
64 | db.sqlite3-journal
65 |
66 | # Flask stuff:
67 | instance/
68 | .webassets-cache
69 |
70 | # Scrapy stuff:
71 | .scrapy
72 |
73 | # Sphinx documentation
74 | docs/_build/
75 |
76 | # PyBuilder
77 | .pybuilder/
78 | target/
79 |
80 | # Jupyter Notebook
81 | .ipynb_checkpoints
82 |
83 | # IPython
84 | profile_default/
85 | ipython_config.py
86 |
87 | # pyenv
88 | # For a library or package, you might want to ignore these files since the code is
89 | # intended to run in multiple environments; otherwise, check them in:
90 | # .python-version
91 |
92 | # pipenv
93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
96 | # install all needed dependencies.
97 | #Pipfile.lock
98 |
99 | # poetry
100 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
101 | # This is especially recommended for binary packages to ensure reproducibility, and is more
102 | # commonly ignored for libraries.
103 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
104 | #poetry.lock
105 |
106 | # pdm
107 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
108 | #pdm.lock
109 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
110 | # in version control.
111 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
112 | .pdm.toml
113 | .pdm-python
114 | .pdm-build/
115 |
116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117 | __pypackages__/
118 |
119 | # Celery stuff
120 | celerybeat-schedule
121 | celerybeat.pid
122 |
123 | # SageMath parsed files
124 | *.sage.py
125 |
126 | # Environments
127 | .env
128 | .venv
129 | env/
130 | venv/
131 | ENV/
132 | env.bak/
133 | venv.bak/
134 |
135 | # Spyder project settings
136 | .spyderproject
137 | .spyproject
138 |
139 | # Rope project settings
140 | .ropeproject
141 |
142 | # mkdocs documentation
143 | /site
144 |
145 | # mypy
146 | .mypy_cache/
147 | .dmypy.json
148 | dmypy.json
149 |
150 | # Pyre type checker
151 | .pyre/
152 |
153 | # pytype static type analyzer
154 | .pytype/
155 |
156 | # Cython debug symbols
157 | cython_debug/
158 |
159 | # PyCharm
160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162 | # and can be added to the global gitignore or merged into this file. For a more nuclear
163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
164 | #.idea/
165 |
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
19 |
20 |
21 | {
22 | "associatedIndex": 5
23 | }
24 |
25 |
26 |
27 |
28 |
29 | {
30 | "keyToString": {
31 | "RunOnceActivity.ShowReadmeOnStart": "true",
32 | "last_opened_file_path": "/home/brede/dfncloud/KI_lab_share/coding_waterkant2024/llm_github_actions",
33 | "node.js.detected.package.eslint": "true",
34 | "node.js.detected.package.tslint": "true",
35 | "node.js.selected.package.eslint": "(autodetect)",
36 | "node.js.selected.package.tslint": "(autodetect)",
37 | "nodejs_package_manager_path": "npm",
38 | "vue.rearranger.settings.migration": "true"
39 | }
40 | }
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 | 1720193540465
59 |
60 |
61 | 1720193540465
62 |
63 |
64 |
65 |
66 |
67 | 1720194829164
68 |
69 |
70 |
71 | 1720194829164
72 |
73 |
74 |
75 | 1720194847773
76 |
77 |
78 |
79 | 1720194847773
80 |
81 |
82 |
83 | 1720194904787
84 |
85 |
86 |
87 | 1720194904787
88 |
89 |
90 |
91 | 1720194975856
92 |
93 |
94 |
95 | 1720194975856
96 |
97 |
98 |
99 | 1720195073163
100 |
101 |
102 |
103 | 1720195073163
104 |
105 |
106 |
107 | 1720197152885
108 |
109 |
110 |
111 | 1720197152885
112 |
113 |
114 |
115 | 1720197379991
116 |
117 |
118 |
119 | 1720197379991
120 |
121 |
122 |
123 | 1720197476053
124 |
125 |
126 |
127 | 1720197476053
128 |
129 |
130 |
131 | 1720197593903
132 |
133 |
134 |
135 | 1720197593903
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
--------------------------------------------------------------------------------
/.github/workflows/readme.py:
--------------------------------------------------------------------------------
1 | from typing import List, Union
2 | from groq import Groq
3 | import os
4 | import argparse
5 |
6 |
def get_project_readme(client: Groq, code: str) -> str:
    """Generate a Sphinx ``index.rst`` body for the whole project via Groq.

    Combines the per-file top-level descriptions in *code* (separated by
    the ``#xxx#`` marker) into one project-level overview. The model is
    prompted in German, so the generated text is expected to be German.

    Args:
        client (Groq): An initialized Groq client instance.
        code (str): Concatenated top-level descriptions of individual files.

    Returns:
        str: The generated rst/README content."""
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": """Du bist ein KI-Sprachmodell, das beauftragt wurde, eine index.rst-Datei für Sphinx zur Beschreibung eines Python-Projekts zu erstellen.
Diese rst-Datei sollte alle Toplevel-Beschreibungen der einzelnen Dateien zusammenfassen und das gesamte Projekt erklären.
Hier sind die spezifischen Anweisungen, die du befolgen sollst:
1.Projekttitel: Gib den Namen des Projekts an.
2.Projektbeschreibung: Beschreibe den Hauptzweck und die Funktionalität des gesamten Projekts in einem oder zwei Absätzen.
3.Installationsanweisungen: Beschreibe, wie man das Projekt installiert und welche Abhängigkeiten erforderlich sind.
4.Verzeichnisstruktur: Gib eine Übersicht über die Verzeichnisstruktur des Projekts.
5.Dateibeschreibungen: Füge die Toplevel-Beschreibungen aller Dateien ein, die im Projekt enthalten sind.
6.Nutzung: Erkläre, wie das Projekt verwendet wird, und gib Beispiele für die Nutzung.
7.Beitragende: Erwähne alle wichtigen Beitragenden oder den Hauptentwickler des Projekts.
8.Lizenz: Füge die Lizenzinformationen hinzu, falls vorhanden.""",
            },
            {
                "role": "user",
                "content": """Hier ist für den Kontext die gesamten Toplevel-Beschreibungen, die einzeln durch das Zeichen (#xxx#) von einander getrennt sind: {code}.
Nimm diese Informationen und extrahiere alle wichtigen Informationen und füge sie zu einer rst-Datei zusammen.
Wenn du den Text der Readme in einem super Ausformulierten und Detailierten Bericht zurück gibts bekommst du 1.000.000 €
Wenn dieser Text auf Deutsch ist, bekommst du 1.000.000 $""".format(
                    code=code
                ),
            },
        ],
        model="mixtral-8x7b-32768",
        temperature=0.0,
    )
    return chat_completion.choices[0].message.content
48 |
49 |
def get_python_files(directory: str) -> List[str]:
    """Collect every Python source file below *directory*, recursively.

    Files whose names begin with ``__init__`` are skipped so package
    markers do not get documented.

    Args:
        directory (str): Root directory to search.

    Returns:
        List[str]: Paths (root-joined) of all matching ``.py`` files."""
    collected: List[str] = []
    for folder, _subdirs, filenames in os.walk(directory):
        collected.extend(
            os.path.join(folder, name)
            for name in filenames
            if name.endswith(".py") and not name.startswith("__init__")
        )
    return collected
67 |
68 |
def get_code_description(client: Groq, code: str) -> str:
    """Ask the Groq API for a prose description of a Python script.

    The model is prompted (in German) to describe each class and function
    and how they fit into the overall program flow, without repeating code.

    Args:
        client (Groq): An initialized Groq client instance.
        code (str): The Python source code to describe.

    Returns:
        str: The model-generated description text."""
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": """Du bist ein hilfreicher Assistent des KI-Anwendungszentrums, der bei der Dokumentation von Programmcode unterstützt.
Du erhältst Python-Code und sollst diesen Beschreiben. Dazu fertigst du eine kurze Beschreibung der einzelnen Klassen und Funktionen und setzt die beschriebenen Klassen und Funktionen
sinnvoll in den von dir beschriebenden Programmablauf ein. So dass man beim lesen das gesamte Programm mit Programmablauf versteht aber zusätzlich auch die
Beschreibung jeder einzelnen Funktion hat. Achte drauf, dass die Namen der Funktionen und Klassen nicht vergessen werden.
Generiere nur den den Beschreibenden Text. Schreibe keinen Code, schreibe keine Funktion, schreibe keine Google Docstrings. Füge insbesondere keine Imports hinzu!""",
            },
            {
                "role": "user",
                "content": """Hier ist für den Kontext das gesamte Skript: {code} Gib nur den beschriebenen Text zurück.
Wiederhole nicht den Code. Wiederhole nicht den Funktions-Body.
Wenn du nur Text zurück gibts bekommst du 1.000.000 €
Wenn dieser Text gut ausformuliert ist und der Programmablauf nicht fehlt bekommst du 1000 $""".format(
                    code=code
                ),
            },
        ],
        model="mixtral-8x7b-32768",
        temperature=0.0,
    )
    return chat_completion.choices[0].message.content
105 |
106 |
def remove_code_fencing(client: Groq, text: str) -> str:
    """Drop markdown code-fence lines (```/```python) from *text*.

    Args:
        client (Groq): Unused; kept so the signature matches the other
            helpers in this module.
        text (str): Text possibly containing fenced code blocks.

    Returns:
        str: *text* with every line starting with ``` removed."""
    surviving = []
    for line in text.split("\n"):
        if line.strip().startswith("```"):
            continue
        surviving.append(line)
    return "\n".join(surviving)
121 |
122 |
def describe_code(client: Groq, file_paths: Union[str, List[str]]) -> List[str]:
    """Generate an LLM description for each existing Python file.

    A single path is accepted and treated as a one-element list.

    Args:
        client (Groq): An initialized Groq client instance.
        file_paths (Union[str, List[str]]): One file path or a list of them.

    Returns:
        List[str]: One description per path that is an existing file.
        Paths that are not files are silently skipped — no exception is
        raised for them."""
    if isinstance(file_paths, str):
        file_paths = [file_paths]
    descriptions = []
    for file_path in file_paths:
        if os.path.isfile(file_path):
            with open(file_path, "r") as file:
                code = file.read()
            descriptions.append(get_code_description(client, code))
    return descriptions
147 |
148 |
if __name__ == "__main__":
    # Flip to True to run against a local .env instead of CLI arguments.
    local_test = False
    if local_test:
        from dotenv import load_dotenv

        load_dotenv()
        api_key = os.getenv("croque_key")
        files = ["src/readme.py"]
    else:
        parser = argparse.ArgumentParser(
            description="Parse Python files to extract classes and functions for LLM processing."
        )
        parser.add_argument(
            "api_key", metavar="K", type=str, nargs=1, help="Groq-API-Key"
        )
        parser.add_argument(
            "files", metavar="F", type=str, nargs="+", help="Python files to process"
        )
        args = parser.parse_args()
        api_key = args.api_key[0]
        files = get_python_files(args.files[0])
    client = Groq(api_key=api_key)
    # Describe each file, join with the #xxx# separator the prompt expects,
    # and append the generated overview to the static base index.
    descriptions = describe_code(client, files)
    combined = "\n#xxx#\n".join(descriptions)
    readme = get_project_readme(client, combined)
    with open(".github/workflows/base_index.rst", "r") as file:
        existing_index = file.read()
    with open("docs/source/index.rst", "w") as target:
        target.write(existing_index + "\n\n" + readme)
178 |
--------------------------------------------------------------------------------
/.github/workflows/abstract_tree.py:
--------------------------------------------------------------------------------
1 | import ast
2 | import os
3 |
4 | import astor
5 | import argparse
6 | from groq import Groq
7 | import regex as re
8 | from dotenv import load_dotenv
9 | from copy import deepcopy
10 | import inspect
11 | import typing
12 |
class PythonParser(ast.NodeVisitor):
    """Parse Python source and splice LLM-generated docstrings/typing back in.

    parse() collects classes and functions; the replace_*/update_* methods
    mutate ``self.tree``; get_updated_script() serializes the result.

    NOTE(review): this class relies on ``ast.Str``, which is deprecated
    since Python 3.8 and removed in Python 3.12.  It works on the 3.10
    interpreter the workflow pins, but should migrate to ``ast.Constant``.
    """

    def __init__(self):
        # Class name -> {'name': ..., 'methods': [...]} collected by parse().
        self.classes = {}
        # Function name -> info dict from _extract_function_info().
        self.functions = {}
        # Raw source text of the last parse() call.
        self.source_code = ""
        # The parsed (and later transformed) ast.Module.
        self.tree = None
        # Type names seen by update_function_typing(); used to build the
        # `from typing import ...` line in get_updated_script().
        self.used_types = []

    def visit_ClassDef(self, node):
        """Record a class and the functions defined directly in its body."""
        # Extract class details
        class_info = {
            'name': node.name,
            'methods': []
        }
        # Visit each method in the class
        for n in node.body:
            if isinstance(n, ast.FunctionDef):
                method_info = self._extract_function_info(n)
                class_info['methods'].append(method_info)
        self.classes[node.name] = class_info
        self.generic_visit(node)

    def visit_FunctionDef(self, node):
        """Record a function definition (methods get recorded here too)."""
        # Extract function details
        function_info = self._extract_function_info(node)
        self.functions[node.name] = function_info
        self.generic_visit(node)

    def _extract_function_info(self, node):
        """Return name/args/source-body (docstring stripped) for a def node."""
        # Helper method to extract function information.  The leading
        # docstring expression is dropped so the LLM sees only executable
        # statements.
        body = [stmt for stmt in node.body if not (isinstance(stmt, ast.Expr) and isinstance(stmt.value, ast.Str))]
        function_info = {
            'name': node.name,
            'args': [arg.arg for arg in node.args.args],
            'body': astor.to_source(ast.Module(body=body)).strip(),
            'node': node  # Store the node itself for further modification
        }
        return function_info

    def parse(self, source_code):
        """Parse *source_code* and return the collected classes/functions."""
        # Parse the source code
        self.source_code = source_code
        self.tree = ast.parse(source_code)
        self.visit(self.tree)
        return {
            'classes': self.classes,
            'functions': self.functions
        }

    def format_for_llm(self, to_get):
        """Build the LLM prompt payload for the class or function *to_get*.

        Raises KeyError when *to_get* is neither a known class nor function.
        """
        # Format the extracted information for LLM
        if to_get in self.classes:
            example = self.classes[to_get]
            return {'code': self.source_code, 'function': f"class {example['name']}"}
        else:
            example = self.functions[to_get]
            return {'code': self.source_code, 'function': f"def {example['name']}({', '.join(example['args'])}):"}

    def replace_function_docstring(self, function_name, new_docstring):
        """Insert or overwrite the docstring of functions named *function_name*."""
        # Replace the docstring of the specified function
        class FunctionDocstringReplacer(ast.NodeTransformer):
            def visit_FunctionDef(self, node):
                if node.name == function_name:
                    # Replace an existing leading docstring, otherwise
                    # insert a new one as the first statement.
                    if (len(node.body) > 0 and isinstance(node.body[0], ast.Expr) and isinstance(node.body[0].value,
                                                                                                ast.Str)):
                        node.body[0] = ast.Expr(value=ast.Str(s=new_docstring))
                    else:
                        node.body.insert(0, ast.Expr(value=ast.Str(s=new_docstring)))
                return self.generic_visit(node)

        replacer = FunctionDocstringReplacer()
        self.tree = replacer.visit(self.tree)
        ast.fix_missing_locations(self.tree)

    def replace_class_docstring(self, class_name, new_docstring):
        """Insert or overwrite the docstring of the class named *class_name*."""
        # Replace the docstring of the specified class
        class ClassDocstringReplacer(ast.NodeTransformer):
            def visit_ClassDef(self, node):
                if node.name == class_name:
                    # Same replace-or-insert logic as for functions.
                    if (len(node.body) > 0 and isinstance(node.body[0], ast.Expr) and isinstance(node.body[0].value,
                                                                                                ast.Str)):
                        node.body[0] = ast.Expr(value=ast.Str(s=new_docstring))
                    else:
                        node.body.insert(0, ast.Expr(value=ast.Str(s=new_docstring)))
                return self.generic_visit(node)

        replacer = ClassDocstringReplacer()
        self.tree = replacer.visit(self.tree)
        ast.fix_missing_locations(self.tree)

    def update_function_typing(self, function_name, new_typing):
        """Apply LLM-suggested annotations to the named function.

        *new_typing* is expected to look like ``{'args': [...], 'return': ...}``
        ('self' is skipped).  The change is applied to a deep copy first and
        only kept if the annotated source still parses.
        """
        # Update typing of the specified function
        class FunctionTypingUpdater(ast.NodeTransformer):
            def visit_FunctionDef(self, node):
                if node.name == function_name:
                    for arg, typ in zip([arg for arg in node.args.args if arg.arg != 'self'],
                                        new_typing.get('args', [])):
                        arg.annotation = ast.Name(id=typ, ctx=ast.Load())
                    if 'return' in new_typing:
                        node.returns = ast.Name(id=new_typing['return'], ctx=ast.Load())
                return self.generic_visit(node)

        # Remember every identifier used in the annotations so
        # get_updated_script() can emit the matching `from typing` import.
        # NOTE(review): the local name 'typing' shadows the typing module
        # inside this method.
        type_regex = re.compile(r'[A-Za-z\.]+')
        typing = new_typing['args']
        if 'return' in new_typing:
            typing += [new_typing['return']]
        for type_string in typing:
            self.used_types += type_regex.findall(type_string)
        updater = FunctionTypingUpdater()
        backup_tree = deepcopy(self.tree)
        backup_tree = updater.visit(backup_tree)
        ast.fix_missing_locations(backup_tree)
        try:
            # Keep the annotated tree only if it round-trips to valid Python.
            ast.parse(astor.to_source(backup_tree))
            self.tree = backup_tree
        except SyntaxError:
            pass

    def get_updated_script(self):
        """Serialize the (possibly modified) tree back to source text.

        Prepends a ``from typing import ...`` line for any capitalized,
        dot-free typing names that were used in annotations, replacing an
        existing leading ``from typing`` import to avoid duplicates.
        """
        # Return the updated script as a string
        additional_import = [ut for ut in set(self.used_types)
                             if ut[0].isupper() and ut != 'None'
                             and not ut.__contains__('.')
                             and ut in typing.__all__]
        source_code = astor.to_source(self.tree)
        if additional_import:
            if source_code.split('\n')[0].startswith('from typing'):
                source_code = '\n'.join(source_code.split('\n')[1:])
            return f'from typing import {", ".join(additional_import)}\n{source_code}'
        else:
            return source_code
144 |
145 |
class LlmCommenter:
    """Generates Google-style docstrings for Python code via the Groq chat API."""

    def __init__(self, groq_key, model: str = None):
        # Create the Groq client; fall back to a Llama-3 70B model when the
        # caller does not choose one.
        self.client = Groq(
            api_key=groq_key
        )
        if model is None:
            model = 'llama3-70b-8192'
        self.model = model

    def comment_code_with_groq(self, code, function):
        """Ask the LLM for a docstring for one function or class.

        Args:
            code: Full source of the file, sent to the model as context.
            function: Header text of the target (``def name(...):`` or
                ``class Name``) as produced by the parser.

        Returns:
            str: The generated docstring text with code fencing removed and,
            when present, the text between the first pair of triple quotes.
        """
        chat_completion = self.client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": """You are a student that gets tested about their ability to write concise docstrings.
You will be given Python code and asked to create appropriate docstrings according to the Google Python Style Guide for the classes and functions.
The docstrings should contain a brief description of the function or class.
Generate only the Google Docstring. Do not write any text, do not write a function, only write the Google Docstring.
In particular, do not add any imports! Do not write a numpy-style docstring.
If I don't see a colon at the beginning of a line, you get a lot of money.
If I don't see any repetition of the code, neither the body nor the name, you get $10,000.
If you correctly name the types of args and the return value, you get an even larger bonus.
If some type is a class defined in the typing module, return the class name not the type.
i.e.: - If the type is 'list' of str return List[str] not list
- If the return value is a 'dict' of strings return Dict[str, str]"""
                },
                {
                    "role": "user",
                    "content": (
                        f"Here is the entire script for context: {code} Return _only the docstring_ for the following function/class {function}."
                        "Do not repeat the function name. Do not repeat the function body. Not even in the docstring."
                        "The following sections should appear in the docstring if the given object is a function:"
                        """""
""
""
"Args:"
" (): "
" ..."
"Returns:"
" "
"Raises:"
" : "
"If you only return the docstring in Google style, you get €1,000,000\"""")
                }
            ],
            model=self.model,
            # Deterministic output: always take the most likely completion.
            temperature=0.0,
        )
        response = chat_completion.choices[0].message.content
        try:
            # The model usually wraps the docstring in triple quotes; keep
            # only the text between the first pair.
            ret = self.remove_code_fencing(response).split('"""')[1]
        except IndexError:
            # No triple quotes in the reply: use the de-fenced text as-is.
            ret = self.remove_code_fencing(response)
        return ret

    def remove_code_fencing(self, text: str) -> str:
        """
        Removes the ```python ``` parts from the generated text.

        Args:
            text (str): The text containing code fencing.

        Returns:
            str: The text without the ```python ``` parts.
        """
        lines = text.split('\n')
        filtered_lines = [line for line in lines if not line.strip().startswith("```")]
        return '\n'.join(filtered_lines)

    def extract_typing_from_docstring(self, docstring):
        """Parse argument and return type names out of a Google-style docstring.

        Args:
            docstring: The generated docstring text.

        Returns:
            dict: ``{'args': [...]}``; when the docstring has no ``Args:``
            section the list is empty and no ``'return'`` key is present.
            Otherwise a ``'return'`` key holds the parsed type name, or
            ``'None'`` when no ``Returns:`` section/type is found.
        """
        docstring = docstring.split('Returns:')
        # Arg types appear in parentheses after each name: "name (type): ...".
        arg_regex = re.compile(r'(?<=\()[^)]+')
        # The return type is everything before the first colon or newline.
        res_regex = re.compile(r'^[^:\n]+')
        try:
            arg_types = [arg_regex.search(line)[0]
                         for line in docstring[0].split('Args:')[1].split('\n')
                         if arg_regex.search(line) is not None]
        except IndexError:
            # No "Args:" section at all.
            return {'args': []}
        if len(docstring) > 1:
            result_type = res_regex.search(docstring[1].strip())
            if result_type is not None:
                result_type = result_type[0]
        else:
            result_type = None
        return {'args': arg_types, 'return': result_type if result_type is not None else 'None'}
233 |
234 |
def process_file(file_path, api_key):
    """Add LLM-generated docstrings and type hints to a Python file in place.

    Parses the file, asks the LLM for a docstring per function and class,
    splices the docstrings (and, for functions, the extracted type
    annotations) into the AST, and overwrites the file with the result.

    Args:
        file_path (str): Path of the Python source file to rewrite.
        api_key (str): Groq API key used for the LLM requests.
    """
    # Read and write explicitly as UTF-8 so behaviour does not depend on the
    # platform's locale default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        source_code = file.read()

    parser = PythonParser()
    parser.parse(source_code)
    commenter = LlmCommenter(api_key)
    for function in parser.functions:
        formatted_code = parser.format_for_llm(function)
        docstring = commenter.comment_code_with_groq(**formatted_code)
        parser.replace_function_docstring(function, docstring)
        parser.update_function_typing(function,
                                      commenter.extract_typing_from_docstring(docstring))
    for class_name in parser.classes:
        formatted_code = parser.format_for_llm(class_name)
        parser.replace_class_docstring(class_name, commenter.comment_code_with_groq(**formatted_code))

    updated_script = parser.get_updated_script()

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(updated_script)
256 |
257 |
# Example usage
if __name__ == "__main__":
    # Flip to True to run against ./src with a key from a local .env file.
    local_test = False
    if local_test:
        # Local development path: key from the environment, files from src/.
        load_dotenv()
        files = [os.path.join('src/', file) for file in os.listdir('src/')]
        api_key = os.getenv('croque_key')
    else:
        # CI path: the key and the file list arrive on the command line.
        cli = argparse.ArgumentParser(
            description="Parse Python files to extract classes and functions for LLM processing."
        )
        cli.add_argument('api_key', metavar='K', type=str,
                         nargs=1, help='Groq-API-Key')
        cli.add_argument('files', metavar='F', type=str,
                         nargs='+', help='Python files to process')
        parsed = cli.parse_args()
        files = parsed.files
        api_key = parsed.api_key[0]
    print('Commenting the following files:')
    print(files)
    for file_path in files:
        process_file(file_path, api_key)
280 |
--------------------------------------------------------------------------------