├── src ├── __init__.py ├── ocr_handler.py ├── database_handler.py └── data_visualizer.py ├── .github └── workflows │ ├── .nojekyll │ ├── requirements.txt │ ├── base_index.rst │ ├── read_changed_files.py │ ├── llm-documenter.yml │ ├── readme.py │ └── abstract_tree.py ├── waterkant.png ├── LICENSE ├── README.md ├── .gitignore └── .idea └── workspace.xml /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/workflows/.nojekyll: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /waterkant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MalteLeuschner/llm_github_actions/HEAD/waterkant.png -------------------------------------------------------------------------------- /.github/workflows/requirements.txt: -------------------------------------------------------------------------------- 1 | python-dotenv~=1.0.1 2 | requests~=2.32.3 3 | pydantic~=2.8.0 4 | groq 5 | astor~=0.8.1 6 | regex~=2024.5.15 7 | PyGithub 8 | black 9 | argparse 10 | autopep8 11 | sphinx 12 | sphinx_rtd_theme 13 | streamlit 14 | pytesseract 15 | -------------------------------------------------------------------------------- /.github/workflows/base_index.rst: -------------------------------------------------------------------------------- 1 | .. llm_github_actions documentation master file, created by 2 | sphinx-quickstart on Fri Jul 5 08:23:30 2024. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to llm_github_actions's documentation! 7 | ============================================== 8 | 9 | .. 
toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | 14 | 15 | Indices and tables 16 | ================== 17 | 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | * :ref:`search` -------------------------------------------------------------------------------- /src/ocr_handler.py: -------------------------------------------------------------------------------- 1 | import pytesseract 2 | from PIL import Image 3 | 4 | 5 | class OCRHandler: 6 | 7 | def __init__(self, tesseract_cmd): 8 | pytesseract.pytesseract.tesseract_cmd = tesseract_cmd 9 | 10 | def perform_ocr(self, image_path): 11 | image = Image.open(image_path) 12 | text = pytesseract.image_to_string(image) 13 | return text 14 | 15 | 16 | def main(): 17 | ocr = OCRHandler("/usr/bin/tesseract") 18 | text = ocr.perform_ocr("balance_sheet.png") 19 | print(text) 20 | 21 | 22 | if __name__ == "__main__": 23 | main() 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 leuschnm 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/database_handler.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, List 2 | import sqlite3 3 | 4 | 5 | class DatabaseHandler: 6 | 7 | def __init__(self, db_name): 8 | self.conn = sqlite3.connect(db_name) 9 | self.cursor = self.conn.cursor() 10 | self.create_table() 11 | 12 | def create_table(self): 13 | self.cursor.execute( 14 | """ 15 | CREATE TABLE IF NOT EXISTS transactions ( 16 | id INTEGER PRIMARY KEY AUTOINCREMENT, 17 | date TEXT, 18 | description TEXT, 19 | amount REAL 20 | ) 21 | """ 22 | ) 23 | 24 | def insert_transaction(self, date, description, amount): 25 | self.cursor.execute( 26 | """ 27 | INSERT INTO transactions (date, description, amount) 28 | VALUES (?, ?, ?) 
29 | """, 30 | (date, description, amount), 31 | ) 32 | self.conn.commit() 33 | 34 | def query_transactions(self): 35 | self.cursor.execute("SELECT * FROM transactions") 36 | return self.cursor.fetchall() 37 | 38 | def close(self): 39 | self.conn.close() 40 | 41 | 42 | def main(): 43 | db = DatabaseHandler("financial_data.db") 44 | db.insert_transaction("2024-07-05", "Deposit", 1000.0) 45 | transactions = db.query_transactions() 46 | for transaction in transactions: 47 | print(transaction) 48 | db.close() 49 | 50 | 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /src/data_visualizer.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | import streamlit as st 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | class DataVisualizer: 8 | 9 | def __init__(self, data): 10 | self.df = pd.DataFrame(data) 11 | 12 | def display_data(self): 13 | st.write("### Transaction Data") 14 | st.dataframe(self.df) 15 | 16 | def plot_line_chart(self): 17 | st.write("### Line Chart of Amounts") 18 | fig, ax = plt.subplots() 19 | ax.plot(self.df["Date"], self.df["Amount"], marker="o") 20 | ax.set_xlabel("Date") 21 | ax.set_ylabel("Amount") 22 | ax.set_title("Transaction Amounts Over Time") 23 | st.pyplot(fig) 24 | 25 | def plot_bar_chart(self): 26 | st.write("### Bar Chart of Amounts") 27 | fig, ax = plt.subplots() 28 | ax.bar(self.df["Date"], self.df["Amount"]) 29 | ax.set_xlabel("Date") 30 | ax.set_ylabel("Amount") 31 | ax.set_title("Transaction Amounts Over Time") 32 | st.pyplot(fig) 33 | 34 | 35 | def main(): 36 | data = { 37 | "Date": ["2024-07-01", "2024-07-02", "2024-07-03", "2024-07-04", "2024-07-05"], 38 | "Amount": [100, 200, 150, 300, 250], 39 | } 40 | visualizer = DataVisualizer(data) 41 | st.title("Financial Data Visualization") 42 | visualizer.display_data() 43 | visualizer.plot_line_chart() 44 | visualizer.plot_bar_chart() 45 | 46 | 47 | if __name__ == "__main__": 48 | main() 49 | -------------------------------------------------------------------------------- /.github/workflows/read_changed_files.py: -------------------------------------------------------------------------------- 1 | from github import Github 2 | import os 3 | import sys 4 | import argparse 5 | 6 | def get_changed_files(repo, commit_sha, folder = 'src'): 7 | commit = repo.get_commit(commit_sha) 8 | files = commit.files 9 | changed_files = [file.filename for file in files 10 | if file.filename.endswith('.py') 11 | and file.filename.startswith(folder)] 12 | return changed_files 13 | 14 | 15 | def main(folder): 16 | token = os.getenv('GITHUB_TOKEN') 17 | if not token: 18 | print('GITHUB_TOKEN environment variable is not set.') 19 | sys.exit(1) 20 | g = Github(token) 21 | repo_name = os.getenv('GITHUB_REPOSITORY') 22 | commit_sha = os.getenv('GITHUB_SHA') 23 | if not repo_name or not commit_sha: 24 | print( 25 | 'Required environment variables (GITHUB_REPOSITORY, GITHUB_SHA) are missing.' 26 | ) 27 | sys.exit(1) 28 | repo = g.get_repo(repo_name) 29 | changed_files = get_changed_files(repo, commit_sha, folder) 30 | for file in changed_files: 31 | print(file) 32 | print(f"::set-output name=changed_files::{' '.join(changed_files)}") 33 | 34 | 35 | if __name__ == '__main__': 36 | parser = argparse.ArgumentParser( 37 | description="Look for changes in python files." 
38 | ) 39 | parser.add_argument('folder', metavar='F', type=str, 40 | nargs=1, help='Folder containing source files to comment.') 41 | args = parser.parse_args() 42 | folder = args.folder[0] 43 | print(folder) 44 | main(folder) 45 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Waterkant Image](waterkant.png) 2 | 3 |

4 | Auto-documentation with LLMs in Python 5 |

6 | 7 | 8 | We implement a GitHub Actions LLM workflow that creates docstrings, type annotations, linting, and Sphinx auto-documentation. 9 | This was done during the coding.waterkant hackathon in Kiel. 10 | 11 | ## Setup instructions 12 | 13 | Since several attendees asked how to set up the workflow, and since non-existent documentation for a tool that documents for you is amusing but not very helpful, here is a short explanation of how to get this tool working: 14 | 15 | 1. **Generate GROQ API Key:** 16 | - The action uses the [GROQ](https://groq.com/) API to generate docstrings and summarize the code. 17 | - Generate an [API key](https://console.groq.com/keys) and save it under the name `GROQ_API_KEY` in the [repo's secrets](https://docs.github.com/en/actions/security-guides/using-secrets-in-github-actions#creating-secrets-for-a-repository). 18 | 19 | 2. **Copy the .github Folder:** 20 | - Copy the `.github` folder from this repo to your project's root folder. Ignore everything else in this repo; the files in `src/` are just there as examples. 21 | 22 | 3. **Configure Source Path:** 23 | - If your `.py` files are in a folder called `src`, the action should work as is. Otherwise, edit the `llm-documenter.yml` file and change the value of the `SOURCE_PATH` environment variable to your source folder. 24 | ```yaml 25 | env: 26 | SOURCE_PATH: your_source_folder 27 | ``` 28 | - If they do not already have one, all your code folders also need `__init__.py` files, even if the files are empty. Sphinx autodoc will otherwise ignore those folders. 29 | 30 | 4. **Push Changes:** 31 | - After every push, you can manually start the action in the GitHub Actions tab. It will: 32 | - Add docstrings to `.py` files that have changed, add type annotations, and lint your files. The changes will be pushed to a new `commented_branch`. 33 | - Summarize your project, build the Sphinx documentation, and put it in the `docs` folder on a `gh_pages` branch. A sketch of what the documented files can look like is shown at the end of this README. 34 | 35 | 5. **Publish Documentation (Optional):** 36 | - If your repo is public, you can make the documentation accessible via [GitHub Pages](https://docs.github.com/en/pages/quickstart). 37 | 38 | 6. **Profit!** 39 | 40 | 41 | We tried to make the action as non-invasive as possible, but we already encountered some issues with complex return values and class imports from other modules in one project. 42 | The landing page for the documentation is additionally a bit prone to hallucination, since we added it last minute. 43 | What we are trying to say is that we do not guarantee anything; a start in documenting is a start though :).
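## Example output

To give an idea of what to expect, here is a hand-written sketch of roughly how `src/ocr_handler.py` could look on `commented_branch` after the action has run (the `main()` function is left out for brevity). This is an illustration only, not actual tool output; the wording of the generated docstrings and the inferred types will vary with the model.

```python
# Illustrative sketch only; not actual output of the action.
import pytesseract
from PIL import Image


class OCRHandler:
    """Wraps pytesseract so OCR can be run with a configurable Tesseract binary."""

    def __init__(self, tesseract_cmd: str):
        """Configures pytesseract to use the given Tesseract executable.

        Args:
            tesseract_cmd (str): Path to the Tesseract executable.
        """
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    def perform_ocr(self, image_path: str) -> str:
        """Runs OCR on an image file.

        Args:
            image_path (str): Path to the image to read.

        Returns:
            str: The text recognized in the image.
        """
        image = Image.open(image_path)
        text = pytesseract.image_to_string(image)
        return text
```

## Running the documenter locally (optional)

If you want to experiment outside of GitHub Actions, the script in `.github/workflows/abstract_tree.py` can also be invoked by hand. Below is a minimal sketch, assuming the packages from `.github/workflows/requirements.txt` are installed, a valid `GROQ_API_KEY` environment variable is set, and the snippet is executed from the repository root. Note that `process_file` rewrites the target file in place, so try it on a copy first.

```python
import os
import sys

# Make abstract_tree.py importable from the workflows folder.
sys.path.insert(0, ".github/workflows")
from abstract_tree import process_file

# Add docstrings and type annotations to one example file (rewritten in place).
process_file("src/ocr_handler.py", os.environ["GROQ_API_KEY"])

# Inspect the documented result.
with open("src/ocr_handler.py") as fh:
    print(fh.read())
```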
44 | -------------------------------------------------------------------------------- /.github/workflows/llm-documenter.yml: -------------------------------------------------------------------------------- 1 | name: LLM_Documenter 2 | 3 | on: [workflow_dispatch] 4 | permissions: 5 | contents: write 6 | 7 | env: 8 | SOURCE_PATH: 'src' 9 | 10 | jobs: 11 | document_changed_python_files: 12 | concurrency: ci-${{ github.ref }} 13 | runs-on: ubuntu-latest 14 | outputs: 15 | changed_files: ${{ steps.python_script.outputs.changed_files }} 16 | steps: 17 | - name: Checkout repository 18 | uses: actions/checkout@v3 19 | 20 | - name: New Branch creation 21 | run: | 22 | git checkout -b commented_branch 23 | 24 | - name: Set up Python 25 | uses: actions/setup-python@v4 26 | with: 27 | python-version: '3.10' 28 | 29 | - name: Install python dependencies 30 | run: pip install -r .github/workflows/requirements.txt 31 | 32 | - name: Read changed .py in commit 33 | id: python_script 34 | env: 35 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 36 | GITHUB_REPOSITORY: ${{ github.repository }} 37 | GITHUB_SHA: ${{ github.sha }} 38 | run: | 39 | python3.10 .github/workflows/read_changed_files.py $SOURCE_PATH 40 | 41 | - name: Commit files with added dummyDS 42 | if: steps.python_script.outputs.changed_files != '' 43 | run: | 44 | echo "Generating list of outputs...!" 45 | for OUTPUT in "${{ steps.python_script.outputs.changed_files }}"; do 46 | python3.10 .github/workflows/abstract_tree.py ${{ secrets.GROQ_API_KEY }} $OUTPUT 47 | autopep8 --in-place --aggressive $OUTPUT 48 | black $OUTPUT 49 | git add $OUTPUT 50 | done 51 | echo "Committing updated .py files...!" 52 | git config --global user.email "you@example.com" 53 | git config --global user.name "LLM-Docstring-Agent" 54 | git commit -m "LLM-documented files pushed" 55 | git push origin commented_branch --force 56 | 57 | - name: Init Sphinx documentation 58 | run: | 59 | git checkout -b gh_pages 60 | rm -Rf docs 61 | if ! test -f "docs/Makefile"; then 62 | mkdir -p docs 63 | sphinx-quickstart --sep --makefile --ext-autodoc --project="${{ github.event.repository.name }}" --author="LLM-Bot" --release="0.0.1" --language="en" docs 64 | echo "import sys, os" >> ./docs/source/conf.py 65 | echo "sys.path.insert(0, os.path.abspath('../..'))" >> ./docs/source/conf.py 66 | echo "extensions += ['sphinx.ext.napoleon', 'sphinx_rtd_theme', 'sphinx.ext.githubpages']" >> ./docs/source/conf.py 67 | echo "html_theme = 'sphinx_rtd_theme'" >> ./docs/source/conf.py 68 | fi 69 | - name: Render sphinx 70 | run: | 71 | sphinx-apidoc -f -o docs/source ./ 72 | python3.10 .github/workflows/readme.py ${{ secrets.GROQ_API_KEY }} $SOURCE_PATH 73 | sphinx-build -M html ./docs/source ./docs 74 | cp -r ./docs/html/* ./docs/ 75 | rm -r ./docs/html 76 | git add -f ./docs/. 
77 | 78 | - name: Commit files 79 | run: | 80 | touch ./docs/.nojekyll 81 | git add -f ./docs/.nojekyll 82 | git commit -m "Add .nojekyll" 83 | git push origin gh_pages --force 84 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.DS_Store 6 | */.DS_Store 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | cover/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | .pybuilder/ 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | # For a library or package, you might want to ignore these files since the code is 89 | # intended to run in multiple environments; otherwise, check them in: 90 | # .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # poetry 100 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 101 | # This is especially recommended for binary packages to ensure reproducibility, and is more 102 | # commonly ignored for libraries. 103 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 104 | #poetry.lock 105 | 106 | # pdm 107 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 108 | #pdm.lock 109 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 110 | # in version control. 111 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 112 | .pdm.toml 113 | .pdm-python 114 | .pdm-build/ 115 | 116 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 164 | #.idea/ 165 | -------------------------------------------------------------------------------- /.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 8 | 13 | 14 | 19 | 21 | { 22 | "associatedIndex": 5 23 | } 24 | 25 | 26 | 29 | { 30 | "keyToString": { 31 | "RunOnceActivity.ShowReadmeOnStart": "true", 32 | "last_opened_file_path": "/home/brede/dfncloud/KI_lab_share/coding_waterkant2024/llm_github_actions", 33 | "node.js.detected.package.eslint": "true", 34 | "node.js.detected.package.tslint": "true", 35 | "node.js.selected.package.eslint": "(autodetect)", 36 | "node.js.selected.package.tslint": "(autodetect)", 37 | "nodejs_package_manager_path": "npm", 38 | "vue.rearranger.settings.migration": "true" 39 | } 40 | } 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 1720193540465 59 | 65 | 66 | 73 | 74 | 81 | 82 | 89 | 90 | 97 | 98 | 105 | 106 | 113 | 114 | 121 | 122 | 129 | 130 | 137 | 140 | 141 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 155 | -------------------------------------------------------------------------------- /.github/workflows/readme.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union 2 | from groq import Groq 3 | import os 4 | import argparse 5 | 6 | 7 | def get_project_readme(client: str, code: str) -> str: 8 | """Generates a README file for a Python project based on the provided code. 9 | 10 | Generates a README file content based on the provided code containing Toplevel descriptions of individual files. 11 | 12 | Args: 13 | client (str): The Groq client instance. 14 | code (str): The code containing Toplevel descriptions of individual files. 15 | 16 | Returns: 17 | str: The generated README file content.""" 18 | chat_completion = client.chat.completions.create( 19 | messages=[ 20 | { 21 | "role": "system", 22 | "content": """Du bist ein KI-Sprachmodell, das beauftragt wurde, eine index.rst-Datei für Sphinx zur Beschreibung eines Python-Projekts zu erstellen. 23 | Diese rst-Datei sollte alle Toplevel-Beschreibungen der einzelnen Dateien zusammenfassen und das gesamte Projekt erklären. 24 | Hier sind die spezifischen Anweisungen, die du befolgen sollst: 25 | 1.Projekttitel: Gib den Namen des Projekts an. 
26 | 2.Projektbeschreibung: Beschreibe den Hauptzweck und die Funktionalität des gesamten Projekts in einem oder zwei Absätzen. 27 | 3.Installationsanweisungen: Beschreibe, wie man das Projekt installiert und welche Abhängigkeiten erforderlich sind. 28 | 4.Verzeichnisstruktur: Gib eine Übersicht über die Verzeichnisstruktur des Projekts. 29 | 5.Dateibeschreibungen: Füge die Toplevel-Beschreibungen aller Dateien ein, die im Projekt enthalten sind. 30 | 6.Nutzung: Erkläre, wie das Projekt verwendet wird, und gib Beispiele für die Nutzung. 31 | 7.Beitragende: Erwähne alle wichtigen Beitragenden oder den Hauptentwickler des Projekts. 32 | 8.Lizenz: Füge die Lizenzinformationen hinzu, falls vorhanden.""", 33 | }, 34 | { 35 | "role": "user", 36 | "content": """Hier ist für den Kontext die gesamten Toplevel-Beschreibungen, die einzeln durch das Zeichen (#xxx#) von einander getrennt sind: {code}. 37 | Nimm diese Informationen und extrahiere alle wichtigen Informationen und füge sie zu einer rst-Datei zusammen. 38 | Wenn du den Text der Readme in einem super Ausformulierten und Detailierten Bericht zurück gibts bekommst du 1.000.000 € 39 | Wenn dieser Text auf Deutsch ist, bekommst du 1.000.000 $""".format( 40 | code=code 41 | ), 42 | }, 43 | ], 44 | model="mixtral-8x7b-32768", 45 | temperature=0.0, 46 | ) 47 | return chat_completion.choices[0].message.content 48 | 49 | 50 | def get_python_files(directory: str) -> List[str]: 51 | """ 52 | Gets a list of all Python files in a given directory and its subdirectories. 53 | 54 | Gets all Python files in the directory and its subdirectories. 55 | 56 | Args: 57 | directory (str): The directory to search for Python files. 58 | 59 | Returns: 60 | List[str]: A list of paths to all Python files found.""" 61 | python_files = [] 62 | for root, _, files in os.walk(directory): 63 | for file in files: 64 | if file.endswith(".py") and not file.startswith("__init__"): 65 | python_files.append(os.path.join(root, file)) 66 | return python_files 67 | 68 | 69 | def get_code_description(client: str, code: str) -> str: 70 | """ 71 | Generates a description of the provided Python code. 72 | 73 | Generates a description of the provided Python code. 74 | 75 | Args: 76 | client (str): The Groq client instance. 77 | code (str): The Python code to get a description for. 78 | 79 | Returns: 80 | str: The description of the code.""" 81 | chat_completion = client.chat.completions.create( 82 | messages=[ 83 | { 84 | "role": "system", 85 | "content": """Du bist ein hilfreicher Assistent des KI-Anwendungszentrums, der bei der Dokumentation von Programmcode unterstützt. 86 | Du erhältst Python-Code und sollst diesen Beschreiben. Dazu fertigst du eine kurze Beschreibung der einzelnen Klassen und Funktionen und setzt die beschriebenen Klassen und Funktionen 87 | sinnvoll in den von dir beschriebenden Programmablauf ein. So dass man beim lesen das gesamte Programm mit Programmablauf versteht aber zusätzlich auch die 88 | Beschreibung jeder einzelnen Funktion hat. Achte drauf, dass die Namen der Funktionen und Klassen nicht vergessen werden. 89 | Generiere nur den den Beschreibenden Text. Schreibe keinen Code, schreibe keine Funktion, schreibe keine Google Docstrings. Füge insbesondere keine Imports hinzu!""", 90 | }, 91 | { 92 | "role": "user", 93 | "content": """Hier ist für den Kontext das gesamte Skript: {code} Gib nur den beschriebenen Text zurück. 94 | Wiederhole nicht den Code. Wiederhole nicht den Funktions-Body. 
95 | Wenn du nur Text zurück gibts bekommst du 1.000.000 € 96 | Wenn dieser Text gut ausformuliert ist und der Programmablauf nicht fehlt bekommst du 1000 $""".format( 97 | code=code 98 | ), 99 | }, 100 | ], 101 | model="mixtral-8x7b-32768", 102 | temperature=0.0, 103 | ) 104 | return chat_completion.choices[0].message.content 105 | 106 | 107 | def remove_code_fencing(client: Groq, text: str) -> str: 108 | """Removes code fencing from a given text, making it more readable. 109 | 110 | Removes the ```python ``` parts from the generated text. 111 | 112 | Args: 113 | client (Groq): The Groq client instance. 114 | text (str): The text containing code fencing. 115 | 116 | Returns: 117 | str: The text without the ```python ``` parts.""" 118 | lines = text.split("\n") 119 | filtered_lines = [line for line in lines if not line.strip().startswith("```")] 120 | return "\n".join(filtered_lines) 121 | 122 | 123 | def describe_code(client: Union[str, List[str]], file_paths: str) -> str: 124 | """ 125 | Describes Python files by generating a code description for each file. 126 | 127 | Generates a code description for each Python file in the given file paths. 128 | 129 | Args: 130 | client (Union[str, List[str]]): A string or list of strings representing the Groq client instance. 131 | file_paths (str): A string or list of strings representing file paths. 132 | 133 | Returns: 134 | str: The code description of the last file processed. 135 | 136 | Raises: 137 | FileNotFoundError: If a file path is not a valid file.""" 138 | if isinstance(file_paths, str): 139 | file_paths = [file_paths] 140 | descriptions = [] 141 | for file_path in file_paths: 142 | if os.path.isfile(file_path): 143 | with open(file_path, "r") as file: 144 | code = file.read() 145 | descriptions.append(get_code_description(client, code)) 146 | return descriptions 147 | 148 | 149 | if __name__ == "__main__": 150 | local_test = False 151 | if not local_test: 152 | parser = argparse.ArgumentParser( 153 | description="Parse Python files to extract classes and functions for LLM processing." 
154 | ) 155 | parser.add_argument( 156 | "api_key", metavar="K", type=str, nargs=1, help="Groq-API-Key" 157 | ) 158 | parser.add_argument( 159 | "files", metavar="F", type=str, nargs="+", help="Python files to process" 160 | ) 161 | args = parser.parse_args() 162 | api_key = args.api_key[0] 163 | files = get_python_files(args.files[0]) 164 | else: 165 | from dotenv import load_dotenv 166 | 167 | load_dotenv() 168 | api_key = os.getenv("croque_key") 169 | files = ["src/readme.py"] 170 | client = Groq(api_key=api_key) 171 | strings = describe_code(client, files) 172 | grosser_string = "\n#xxx#\n".join(strings) 173 | readme = get_project_readme(client, grosser_string) 174 | with open(".github/workflows/base_index.rst", "r") as file: 175 | existing_index = file.read() 176 | with open("docs/source/index.rst", "w") as datei: 177 | datei.write(existing_index + "\n\n" + readme) 178 | -------------------------------------------------------------------------------- /.github/workflows/abstract_tree.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import os 3 | 4 | import astor 5 | import argparse 6 | from groq import Groq 7 | import regex as re 8 | from dotenv import load_dotenv 9 | from copy import deepcopy 10 | import inspect 11 | import typing 12 | 13 | class PythonParser(ast.NodeVisitor): 14 | def __init__(self): 15 | self.classes = {} 16 | self.functions = {} 17 | self.source_code = "" 18 | self.tree = None 19 | self.used_types = [] 20 | 21 | def visit_ClassDef(self, node): 22 | # Extract class details 23 | class_info = { 24 | 'name': node.name, 25 | 'methods': [] 26 | } 27 | # Visit each method in the class 28 | for n in node.body: 29 | if isinstance(n, ast.FunctionDef): 30 | method_info = self._extract_function_info(n) 31 | class_info['methods'].append(method_info) 32 | self.classes[node.name] = class_info 33 | self.generic_visit(node) 34 | 35 | def visit_FunctionDef(self, node): 36 | # Extract function details 37 | function_info = self._extract_function_info(node) 38 | self.functions[node.name] = function_info 39 | self.generic_visit(node) 40 | 41 | def _extract_function_info(self, node): 42 | # Helper method to extract function information 43 | body = [stmt for stmt in node.body if not (isinstance(stmt, ast.Expr) and isinstance(stmt.value, ast.Str))] 44 | function_info = { 45 | 'name': node.name, 46 | 'args': [arg.arg for arg in node.args.args], 47 | 'body': astor.to_source(ast.Module(body=body)).strip(), 48 | 'node': node # Store the node itself for further modification 49 | } 50 | return function_info 51 | 52 | def parse(self, source_code): 53 | # Parse the source code 54 | self.source_code = source_code 55 | self.tree = ast.parse(source_code) 56 | self.visit(self.tree) 57 | return { 58 | 'classes': self.classes, 59 | 'functions': self.functions 60 | } 61 | 62 | def format_for_llm(self, to_get): 63 | # Format the extracted information for LLM 64 | if to_get in self.classes: 65 | example = self.classes[to_get] 66 | return {'code': self.source_code, 'function': f"class {example['name']}"} 67 | else: 68 | example = self.functions[to_get] 69 | return {'code': self.source_code, 'function': f"def {example['name']}({', '.join(example['args'])}):"} 70 | 71 | def replace_function_docstring(self, function_name, new_docstring): 72 | # Replace the docstring of the specified function 73 | class FunctionDocstringReplacer(ast.NodeTransformer): 74 | def visit_FunctionDef(self, node): 75 | if node.name == function_name: 76 | if (len(node.body) > 0 and 
isinstance(node.body[0], ast.Expr) and isinstance(node.body[0].value, 77 | ast.Str)): 78 | node.body[0] = ast.Expr(value=ast.Str(s=new_docstring)) 79 | else: 80 | node.body.insert(0, ast.Expr(value=ast.Str(s=new_docstring))) 81 | return self.generic_visit(node) 82 | 83 | replacer = FunctionDocstringReplacer() 84 | self.tree = replacer.visit(self.tree) 85 | ast.fix_missing_locations(self.tree) 86 | 87 | def replace_class_docstring(self, class_name, new_docstring): 88 | # Replace the docstring of the specified class 89 | class ClassDocstringReplacer(ast.NodeTransformer): 90 | def visit_ClassDef(self, node): 91 | if node.name == class_name: 92 | if (len(node.body) > 0 and isinstance(node.body[0], ast.Expr) and isinstance(node.body[0].value, 93 | ast.Str)): 94 | node.body[0] = ast.Expr(value=ast.Str(s=new_docstring)) 95 | else: 96 | node.body.insert(0, ast.Expr(value=ast.Str(s=new_docstring))) 97 | return self.generic_visit(node) 98 | 99 | replacer = ClassDocstringReplacer() 100 | self.tree = replacer.visit(self.tree) 101 | ast.fix_missing_locations(self.tree) 102 | 103 | def update_function_typing(self, function_name, new_typing): 104 | # Update typing of the specified function 105 | class FunctionTypingUpdater(ast.NodeTransformer): 106 | def visit_FunctionDef(self, node): 107 | if node.name == function_name: 108 | for arg, typ in zip([arg for arg in node.args.args if arg.arg != 'self'], 109 | new_typing.get('args', [])): 110 | arg.annotation = ast.Name(id=typ, ctx=ast.Load()) 111 | if 'return' in new_typing: 112 | node.returns = ast.Name(id=new_typing['return'], ctx=ast.Load()) 113 | return self.generic_visit(node) 114 | 115 | type_regex = re.compile(r'[A-Za-z\.]+') 116 | typing = new_typing['args'] 117 | if 'return' in new_typing: 118 | typing += [new_typing['return']] 119 | for type_string in typing: 120 | self.used_types += type_regex.findall(type_string) 121 | updater = FunctionTypingUpdater() 122 | backup_tree = deepcopy(self.tree) 123 | backup_tree = updater.visit(backup_tree) 124 | ast.fix_missing_locations(backup_tree) 125 | try: 126 | ast.parse(astor.to_source(backup_tree)) 127 | self.tree = backup_tree 128 | except SyntaxError: 129 | pass 130 | 131 | def get_updated_script(self): 132 | # Return the updated script as a string 133 | additional_import = [ut for ut in set(self.used_types) 134 | if ut[0].isupper() and ut != 'None' 135 | and not ut.__contains__('.') 136 | and ut in typing.__all__] 137 | source_code = astor.to_source(self.tree) 138 | if additional_import: 139 | if source_code.split('\n')[0].startswith('from typing'): 140 | source_code = '\n'.join(source_code.split('\n')[1:]) 141 | return f'from typing import {", ".join(additional_import)}\n{source_code}' 142 | else: 143 | return source_code 144 | 145 | 146 | class LlmCommenter: 147 | def __init__(self, groq_key, model: str = None): 148 | self.client = Groq( 149 | api_key=groq_key 150 | ) 151 | if model is None: 152 | model = 'llama3-70b-8192' 153 | self.model = model 154 | 155 | def comment_code_with_groq(self, code, function): 156 | chat_completion = self.client.chat.completions.create( 157 | messages=[ 158 | { 159 | "role": "system", 160 | "content": """You are a student that gets tested about their ability to write concise docstrings. 161 | You will be given Python code and asked to create appropriate docstrings according to the Google Python Style Guide for the classes and functions. 162 | The docstrings should contain a brief description of the function or class. 163 | Generate only the Google Docstring. 
Do not write any text, do not write a function, only write the Google Docstring. 164 | In particular, do not add any imports! Do not write a numpy-style docstring. 165 | If I don't see a colon at the beginning of a line, you get a lot of money. 166 | If I don't see any repetition of the code, neither the body nor the name, you get $10,000. 167 | If you correctly name the types of args and the return value, you get an even larger bonus. 168 | If some type is a class defined in the typing module, return the class name not the type. 169 | i.e.: - If the type is 'list' of str return List[str] not list 170 | - If the return value is a 'dict' of strings return Dict[str, str]""" 171 | }, 172 | { 173 | "role": "user", 174 | "content": ( 175 | f"Here is the entire script for context: {code} Return _only the docstring_ for the following function/class {function}." 176 | "Do not repeat the function name. Do not repeat the function body. Not even in the docstring." 177 | "The following sections should appear in the docstring if the given object is a function:" 178 | """"" 179 | "" 180 | "" 181 | "Args:" 182 | " (): " 183 | " ..." 184 | "Returns:" 185 | " " 186 | "Raises:" 187 | " : " 188 | "If you only return the docstring in Google style, you get €1,000,000\"""") 189 | } 190 | ], 191 | model=self.model, 192 | temperature=0.0, 193 | ) 194 | response = chat_completion.choices[0].message.content 195 | try: 196 | ret = self.remove_code_fencing(response).split('"""')[1] 197 | except IndexError: 198 | ret = self.remove_code_fencing(response) 199 | return ret 200 | 201 | def remove_code_fencing(self, text: str) -> str: 202 | """ 203 | Removes the ```python ``` parts from the generated text. 204 | 205 | Args: 206 | text (str): The text containing code fencing. 207 | 208 | Returns: 209 | str: The text without the ```python ``` parts. 
210 | """ 211 | lines = text.split('\n') 212 | filtered_lines = [line for line in lines if not line.strip().startswith("```")] 213 | #filtered_lines = [line for line in filtered_lines if not line.strip().startswith('"\""')] 214 | return '\n'.join(filtered_lines) 215 | 216 | def extract_typing_from_docstring(self, docstring): 217 | docstring = docstring.split('Returns:') 218 | arg_regex = re.compile(r'(?<=\()[^)]+') 219 | res_regex = re.compile(r'^[^:\n]+') 220 | try: 221 | arg_types = [arg_regex.search(line)[0] 222 | for line in docstring[0].split('Args:')[1].split('\n') 223 | if arg_regex.search(line) is not None] 224 | except IndexError: 225 | return {'args': []} 226 | if len(docstring) > 1: 227 | result_type = res_regex.search(docstring[1].strip()) 228 | if result_type is not None: 229 | result_type = result_type[0] 230 | else: 231 | result_type = None 232 | return {'args': arg_types, 'return': result_type if result_type is not None else 'None'} 233 | 234 | 235 | def process_file(file_path, api_key): 236 | with open(file_path, 'r') as file: 237 | source_code = file.read() 238 | 239 | parser = PythonParser() 240 | parser.parse(source_code) 241 | commenter = LlmCommenter(api_key) 242 | for function in parser.functions: 243 | formatted_code = parser.format_for_llm(function) 244 | docstring = commenter.comment_code_with_groq(**formatted_code) 245 | parser.replace_function_docstring(function, docstring) 246 | parser.update_function_typing(function, 247 | commenter.extract_typing_from_docstring(docstring)) 248 | for class_name in parser.classes: 249 | formatted_code = parser.format_for_llm(class_name) 250 | parser.replace_class_docstring(class_name, commenter.comment_code_with_groq(**formatted_code)) 251 | 252 | updated_script = parser.get_updated_script() 253 | 254 | with open(file_path, 'w') as file: 255 | file.write(updated_script) 256 | 257 | 258 | # Example usage 259 | if __name__ == "__main__": 260 | local_test = False 261 | if not local_test: 262 | parser = argparse.ArgumentParser( 263 | description="Parse Python files to extract classes and functions for LLM processing." 264 | ) 265 | parser.add_argument('api_key', metavar='K', type=str, 266 | nargs=1, help='Groq-API-Key') 267 | parser.add_argument('files', metavar='F', type=str, 268 | nargs='+', help='Python files to process') 269 | args = parser.parse_args() 270 | files = args.files 271 | api_key = args.api_key[0] 272 | else: 273 | load_dotenv() 274 | files = [os.path.join('src/', file) for file in os.listdir('src/')] 275 | api_key = os.getenv('croque_key') 276 | print('Commenting the following files:') 277 | print(files) 278 | for file_path in files: 279 | process_file(file_path, api_key) 280 | --------------------------------------------------------------------------------