├── ARS3.png ├── ARS4.png ├── ARS6.png ├── requirements.txt ├── 8dateien.json ├── text4.json ├── pcfg-grafik.py ├── Text6.txt ├── Text3.txt ├── Text7.txt ├── 8dateienAppARS4.json ├── Text5.txt ├── Text2.txt ├── Text1.txt ├── setup.py ├── Text8.txt ├── text4.txt ├── ars_core.py ├── ars_gui_app.py ├── app.py ├── README.md ├── ars4_gui_app.py └── ars6_gui_app.py /ARS3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkoopongithub/algorithmisch-rekursive-sequenzanalyse-3.0/main/ARS3.png -------------------------------------------------------------------------------- /ARS4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkoopongithub/algorithmisch-rekursive-sequenzanalyse-3.0/main/ARS4.png -------------------------------------------------------------------------------- /ARS6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkoopongithub/algorithmisch-rekursive-sequenzanalyse-3.0/main/ARS6.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | sentence-transformers>=2.2.2 2 | hdbscan>=0.8.33 3 | scikit-learn>=1.1.3 4 | pandas>=1.5.3 5 | pyyaml>=6.0 6 | streamlit>=1.22.0 7 | networkx>=3.2.1 8 | matplotlib>=3.7.1 9 | -------------------------------------------------------------------------------- /8dateien.json: -------------------------------------------------------------------------------- 1 | { 2 | "0": { 3 | "2": 0.5333333333333333, 4 | "0": 0.4666666666666667 5 | }, 6 | "2": { 7 | "1": 1.0 8 | }, 9 | "1": { 10 | "1": 0.9230769230769231, 11 | "0": 0.07692307692307693 12 | } 13 | } -------------------------------------------------------------------------------- /text4.json: -------------------------------------------------------------------------------- 1 | { 2 | "-1": { 3 | "-1": 0.2727272727272727, 4 | "0": 0.45454545454545453, 5 | "1": 0.2727272727272727 6 | }, 7 | "0": { 8 | "1": 0.5, 9 | "-1": 0.5 10 | }, 11 | "1": { 12 | "0": 0.16666666666666666, 13 | "-1": 0.8333333333333334 14 | } 15 | } -------------------------------------------------------------------------------- /pcfg-grafik.py: -------------------------------------------------------------------------------- 1 | import graphviz 2 | 3 | def export_pcfg_to_dot(pcfg, filepath): 4 | dot = graphviz.Digraph() 5 | for rule in pcfg: 6 | lhs = rule['lhs'] 7 | rhs = ' '.join(rule['rhs']) 8 | prob = rule['probability'] 9 | dot.edge(lhs, rhs, label=f'{prob:.2f}') 10 | dot.render(filepath, format='png', cleanup=True) 11 | 12 | -------------------------------------------------------------------------------- /Text6.txt: -------------------------------------------------------------------------------- 1 | Text 6 2 | Datum: 28. Juni 1994, Ort: Käseverkaufsstand, Aachen, Marktplatz 3 | 4 | (Marktatmosphäre, Begrüßungen) 5 | 6 | Kunde 1: Guten Morgen! 7 | 8 | Verkäufer: Guten Morgen! 9 | 10 | Kunde 1: Ich hätte gerne fünfhundert Gramm holländischen Gouda. 11 | 12 | Verkäufer: Am Stück? 13 | 14 | Kunde 1: Ja, am Stück, bitte. 15 | 16 | Ende Text 6 -------------------------------------------------------------------------------- /Text3.txt: -------------------------------------------------------------------------------- 1 | Text 3 2 | Datum: 28. 
Juni 1994, Ort: Fischstand, Marktplatz, Aachen 3 | 4 | (Marktatmosphäre, Gespräch im Hintergrund, teilweise unverständlich) 5 | 6 | Kunde: Ein Pfund Seelachs, bitte. 7 | 8 | Verkäufer: Seelachs, alles klar. 9 | 10 | (Geräusche von Verpackung und Verkaufsvorbereitungen) 11 | 12 | Verkäufer: Vier Mark neunzehn, bitte. 13 | 14 | (Geräusche von Verpackung, Münzen klimpern) 15 | 16 | Verkäufer: Schönen Dank! 17 | 18 | Kunde: Ja, danke schön! 19 | 20 | Ende Text 3 -------------------------------------------------------------------------------- /Text7.txt: -------------------------------------------------------------------------------- 1 | Text 7 2 | Datum: 28. Juni 1994, Ort: Bonbonstand, Aachen, Marktplatz, 11:30 Uhr 3 | 4 | (Geräusche von Stimmen und Marktatmosphäre, teilweise unverständlich) 5 | 6 | Kunde: Von den gemischten hätte ich gerne hundert Gramm. 7 | 8 | (Unverständliche Fragen und Antworten) 9 | 10 | Verkäufer: Für zu Hause oder zum Mitnehmen? 11 | 12 | Kunde: Zum Mitnehmen, bitte. 13 | 14 | Verkäufer: Fünfzig Pfennig, bitte. 15 | 16 | (Klimpern von Münzen, Geräusche von Verpackung) 17 | 18 | Kunde: Danke! 19 | 20 | Ende Text 7 -------------------------------------------------------------------------------- /8dateienAppARS4.json: -------------------------------------------------------------------------------- 1 | { 2 | "T_1": { 3 | "T_4": 1.0 4 | }, 5 | "T_4": { 6 | "T_3": 1.0 7 | }, 8 | "T_3": { 9 | "T_3": 0.9230769230769231, 10 | "T_2": 0.07692307692307693 11 | }, 12 | "T_2": { 13 | "T_1": 1.0 14 | }, 15 | "NT_T_1_T_4_T_3": { 16 | "NT_T_1_T_4_T_3": 1.0 17 | }, 18 | "NT_T_4_T_3_T_3": { 19 | "NT_T_4_T_3_T_3": 1.0 20 | }, 21 | "NT_T_3_T_3_T_3": { 22 | "NT_T_3_T_3_T_3": 1.0 23 | }, 24 | "NT_T_3_T_3_T_2": { 25 | "NT_T_3_T_3_T_2": 1.0 26 | }, 27 | "NT_T_3_T_2_T_1": { 28 | "NT_T_3_T_2_T_1": 1.0 29 | }, 30 | "NT_T_2_T_1_T_4": { 31 | "NT_T_2_T_1_T_4": 1.0 32 | } 33 | } -------------------------------------------------------------------------------- /Text5.txt: -------------------------------------------------------------------------------- 1 | Text 5 2 | Datum: 26. Juni 1994, Ort: Gemüsestand, Aachen, Marktplatz, 11:00 Uhr 3 | 4 | (Marktatmosphäre, teilweise unverständlich) 5 | 6 | Verkäufer: So, bitte schön. 7 | 8 | Kunde 1: Auf Wiedersehen! 9 | 10 | Kunde 2: Ich hätte gern ein Kilo von den Granny Smith Äpfeln hier. 11 | 12 | (Unverständliches Gespräch im Hintergrund) 13 | 14 | Verkäufer: Sonst noch etwas? 15 | 16 | Kunde 2: Ja, noch ein Kilo Zwiebeln. 17 | 18 | Verkäufer: Alles klar. 19 | 20 | (Unverständliches Gespräch, Hintergrundgeräusche) 21 | 22 | Kunde 2: Das war's. 23 | 24 | Verkäufer: Sechs Mark fünfundzwanzig, bitte. 25 | 26 | (Unverständliches Gespräch, Geräusche von Münzen und Verpackung) 27 | 28 | Verkäufer: Wiedersehen! 29 | 30 | Kunde 2: Wiedersehen! 31 | 32 | Ende Text 5 -------------------------------------------------------------------------------- /Text2.txt: -------------------------------------------------------------------------------- 1 | Text 2 2 | Datum: 28. Juni 1994, Ort: Marktplatz, Aachen 3 | 4 | (Ständige Hintergrundgeräusche von Stimmen und Marktatmosphäre) 5 | 6 | Verkäufer: Kirschen kann jeder probieren hier, Kirschen kann jeder probieren hier! 7 | 8 | Kunde 1: Ein halbes Kilo Kirschen, bitte. 9 | 10 | Verkäufer: Ein halbes Kilo? Oder ein Kilo? 11 | 12 | (Unverständliches Gespräch, Münzen klimpern) 13 | 14 | Verkäufer: Danke schön! 15 | 16 | Verkäufer: Kirschen kann jeder probieren hier! Drei Mark, bitte. 17 | 18 | Kunde 1: Danke schön! 
19 | 20 | Verkäufer: Kirschen kann jeder probieren hier, Kirschen kann jeder probieren hier! 21 | 22 | (Weitere Stimmen im Hintergrund, unverständliches Gespräch, Münzen klimpern) 23 | 24 | Kunde 2: Ein halbes Kilo, bitte. 25 | 26 | (Unverständliches Gespräch) 27 | 28 | Ende Text 2 -------------------------------------------------------------------------------- /Text1.txt: -------------------------------------------------------------------------------- 1 | Text 1 2 | Datum: 28. Juni 1994, Ort: Metzgerei, Aachen, 11:00 Uhr 3 | 4 | (Die Geräusche eines geschäftigen Marktplatzes im Hintergrund, Stimmen und Gemurmel) 5 | 6 | Verkäuferin: Guten Tag, was darf es sein? 7 | 8 | Kunde: Einmal von der groben Leberwurst, bitte. 9 | 10 | Verkäuferin: Wie viel darf’s denn sein? 11 | 12 | Kunde: Zwei hundert Gramm. 13 | 14 | Verkäuferin: Zwei hundert Gramm. Sonst noch etwas? 15 | 16 | Kunde: Ja, dann noch ein Stück von dem Schwarzwälder Schinken. 17 | 18 | Verkäuferin: Wie groß soll das Stück sein? 19 | 20 | Kunde: So um die dreihundert Gramm. 21 | 22 | Verkäuferin: Alles klar. Kommt sofort. (Geräusche von Papier und Verpackung) 23 | 24 | Kunde: Danke schön. 25 | 26 | Verkäuferin: Das macht dann acht Mark zwanzig. 27 | 28 | Kunde: Bitte. (Klimpern von Münzen, Geräusche der Kasse) 29 | 30 | Verkäuferin: Danke und einen schönen Tag noch! 31 | 32 | Kunde: Danke, ebenfalls! 33 | 34 | Ende Text 1 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="ars3", 5 | version="1.0.0", 6 | description="Algorithmisch Rekursive Sequenzanalyse 3.0 – Analyse und Simulation von Transkripten mit PCFG", 7 | author="Dein Name", 8 | author_email="dein.email@example.com", 9 | packages=find_packages(), 10 | include_package_data=True, 11 | install_requires=[ 12 | "sentence-transformers", 13 | "hdbscan", 14 | "scikit-learn", 15 | "pandas", 16 | "pyyaml", 17 | "streamlit", 18 | "networkx", 19 | "matplotlib" 20 | ], 21 | entry_points={ 22 | 'console_scripts': [ 23 | 'ars-gui = app:main', # Voraussetzung: app.py enthält eine main()-Funktion 24 | ], 25 | }, 26 | classifiers=[ 27 | "Programming Language :: Python :: 3", 28 | "License :: OSI Approved :: MIT License", 29 | "Operating System :: OS Independent", 30 | ], 31 | python_requires='>=3.9', 32 | ) 33 | -------------------------------------------------------------------------------- /Text8.txt: -------------------------------------------------------------------------------- 1 | Text 8 2 | Datum: 9. Juli 1994, Ort: Bäckerei, Aachen, 12:00 Uhr 3 | 4 | (Schritte hörbar, Hintergrundgeräusche, teilweise unverständlich) 5 | 6 | Kunde: Guten Tag! 7 | 8 | (Unverständliche Begrüßung im Hintergrund) 9 | 10 | Verkäuferin: Einmal unser bester Kaffee, frisch gemahlen, bitte. 11 | 12 | (Geräusche der Kaffeemühle, Verpackungsgeräusche) 13 | 14 | Verkäuferin: Sonst noch etwas? 15 | 16 | Kunde: Ja, noch zwei Stück Obstsalat und ein Schälchen Sahne. 17 | 18 | Verkäuferin: In Ordnung! 19 | 20 | (Geräusche der Kaffeemühle, Papiergeräusche) 21 | 22 | Verkäuferin: Ein kleines Schälchen Sahne, ja? 23 | 24 | Kunde: Ja, danke. 25 | 26 | (Türgeräusch, Lachen, Papiergeräusche) 27 | 28 | Verkäuferin: Keiner kümmert sich darum, die Türen zu ölen. 29 | 30 | Kunde: Ja, das ist immer so. 31 | 32 | (Lachen, Geräusche von Münzen und Verpackung) 33 | 34 | Verkäuferin: Das macht vierzehn Mark und neunzehn Pfennig, bitte. 
35 | 36 | Kunde: Ich zahle in Kleingeld. 37 | 38 | (Lachen und Geräusche von Münzen) 39 | 40 | Verkäuferin: Vielen Dank, schönen Sonntag noch! 41 | 42 | Kunde: Danke, Ihnen auch! 43 | 44 | Ende Text 8 -------------------------------------------------------------------------------- /text4.txt: -------------------------------------------------------------------------------- 1 | Text 4 2 | Datum: 28. Juni 1994, Ort: Gemüsestand, Aachen, Marktplatz, 11:00 Uhr 3 | 4 | (Marktatmosphäre, teilweise unverständlich) 5 | 6 | Kunde: Hören Sie, ich nehme ein paar Champignons mit. 7 | 8 | Verkäufer: Braune oder helle? 9 | 10 | Kunde: Nehmen wir die hellen. 11 | 12 | Verkäufer: Alles klar, die hellen. 13 | 14 | (Unverständliche Unterhaltung im Hintergrund) 15 | 16 | Verkäufer: Die sind beide frisch, keine Sorge. 17 | 18 | Kunde: Wie ist es mit Pfifferlingen? 19 | 20 | Verkäufer: Ah, die sind super! 21 | 22 | (Unverständliches Gespräch) 23 | 24 | Kunde: Kann ich die in Reissalat tun? 25 | 26 | Verkäufer: Eher kurz anbraten in der Pfanne. 27 | 28 | Kunde: Okay, mache ich. 29 | 30 | Verkäufer: Die können Sie roh verwenden, aber ein bisschen anbraten ist besser. 31 | 32 | Kunde: Verstanden. 33 | 34 | (Weitere Unterhaltung, unverständliche Kommentare) 35 | 36 | Verkäufer: Noch etwas anderes? 37 | 38 | Kunde: Ja, dann nehme ich noch Erdbeeren. 39 | 40 | (Pause, Hintergrundgeräusche von Verpackung und Stimmen) 41 | 42 | Verkäufer: Schönen Tag noch! 43 | 44 | Kunde: Gleichfalls! 45 | 46 | Ende Text 4 -------------------------------------------------------------------------------- /ars_core.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import csv 4 | import yaml 5 | import numpy as np 6 | 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | from sklearn.decomposition import PCA 9 | from sentence_transformers import SentenceTransformer 10 | import hdbscan 11 | import random 12 | 13 | # Modell laden 14 | model = SentenceTransformer("all-MiniLM-L6-v2") 15 | 16 | def read_transcripts(file_paths): 17 | utterances = [] 18 | for file in file_paths: 19 | with open(file, 'r', encoding='utf-8') as f: 20 | for line in f: 21 | line = line.strip() 22 | if line: 23 | utterances.append(line) 24 | return utterances 25 | 26 | def embed_utterances(utterances): 27 | return model.encode(utterances) 28 | 29 | def cluster_embeddings(embeddings): 30 | clusterer = hdbscan.HDBSCAN(min_cluster_size=3) 31 | labels = clusterer.fit_predict(embeddings) 32 | return labels 33 | 34 | def build_pcfg(labels, utterances): 35 | pcfg = {} 36 | terminal_chain = [] 37 | for i in range(len(labels) - 1): 38 | src = str(labels[i]) 39 | dst = str(labels[i + 1]) 40 | terminal_chain.append(src) 41 | if src not in pcfg: 42 | pcfg[src] = {} 43 | pcfg[src][dst] = pcfg[src].get(dst, 0) + 1 44 | 45 | # Wahrscheinlichkeiten normalisieren 46 | for src in pcfg: 47 | total = sum(pcfg[src].values()) 48 | for dst in pcfg[src]: 49 | pcfg[src][dst] /= total 50 | 51 | return pcfg, terminal_chain 52 | 53 | def process_multiple_dialogs(file_paths): 54 | utterances = read_transcripts(file_paths) 55 | embeddings = embed_utterances(utterances) 56 | labels = cluster_embeddings(embeddings) 57 | pcfg, terminal_chain = build_pcfg(labels, utterances) 58 | return { 59 | "utterances": utterances, 60 | "embeddings": embeddings, 61 | "labels": labels, 62 | "pcfg": pcfg, 63 | "terminal_chain": terminal_chain 64 | } 65 | 66 | def simulate_dialog(pcfg, length=6): 67 | if not pcfg: 68 | return [] 69 | 
current = random.choice(list(pcfg.keys())) 70 | sequence = [current] 71 | for _ in range(length - 1): 72 | if current not in pcfg: 73 | break 74 | next_states = list(pcfg[current].keys()) 75 | probs = list(pcfg[current].values()) 76 | current = np.random.choice(next_states, p=probs) 77 | sequence.append(current) 78 | return sequence 79 | 80 | def export_pcfg_to_json(pcfg, filepath): 81 | with open(filepath, 'w', encoding='utf-8') as f: 82 | json.dump(pcfg, f, indent=2) 83 | 84 | def export_pcfg_to_csv(pcfg, filepath): 85 | with open(filepath, 'w', encoding='utf-8', newline='') as f: 86 | writer = csv.writer(f) 87 | writer.writerow(["Source", "Target", "Probability"]) 88 | for src in pcfg: 89 | for dst in pcfg[src]: 90 | writer.writerow([src, dst, pcfg[src][dst]]) 91 | 92 | def export_pcfg_to_yaml(pcfg, filepath): 93 | with open(filepath, 'w', encoding='utf-8') as f: 94 | yaml.dump(pcfg, f, sort_keys=False, allow_unicode=True) 95 | -------------------------------------------------------------------------------- /ars_gui_app.py: -------------------------------------------------------------------------------- 1 | import tkinter as tk 2 | from tkinter import filedialog, messagebox, ttk 3 | import os 4 | import json 5 | import csv 6 | import yaml 7 | import numpy as np 8 | 9 | from ars_core import ( 10 | process_multiple_dialogs, 11 | simulate_dialog, 12 | export_pcfg_to_json, 13 | export_pcfg_to_csv, 14 | export_pcfg_to_yaml 15 | ) 16 | 17 | class ARSGUIApp: 18 | def __init__(self, root): 19 | self.root = root 20 | self.root.title("ARS 3.0 – Algorithmisch Rekursive Sequenzanalyse") 21 | 22 | self.dialog_files = [] 23 | self.processed_data = None 24 | 25 | self.build_gui() 26 | 27 | def build_gui(self): 28 | frm = ttk.Frame(self.root, padding=10) 29 | frm.grid(row=0, column=0, sticky="nsew") 30 | 31 | ttk.Label(frm, text="ARS 3.0 Dialogverarbeitung", font=("Arial", 16)).grid(row=0, column=0, columnspan=3, pady=10) 32 | 33 | ttk.Button(frm, text="Transkripte laden", command=self.load_dialogs).grid(row=1, column=0, sticky="ew", pady=5) 34 | ttk.Button(frm, text="Verarbeiten", command=self.run_processing).grid(row=1, column=1, sticky="ew", pady=5) 35 | ttk.Button(frm, text="Dialog simulieren", command=self.run_simulation).grid(row=1, column=2, sticky="ew", pady=5) 36 | 37 | ttk.Separator(frm).grid(row=2, column=0, columnspan=3, pady=10, sticky="ew") 38 | 39 | ttk.Button(frm, text="PCFG → JSON", command=lambda: self.export_pcfg("json")).grid(row=3, column=0, pady=5) 40 | ttk.Button(frm, text="PCFG → CSV", command=lambda: self.export_pcfg("csv")).grid(row=3, column=1, pady=5) 41 | ttk.Button(frm, text="PCFG → YAML", command=lambda: self.export_pcfg("yaml")).grid(row=3, column=2, pady=5) 42 | 43 | self.text_output = tk.Text(frm, wrap="word", height=20) 44 | self.text_output.grid(row=4, column=0, columnspan=3, sticky="nsew", pady=10) 45 | 46 | def log(self, msg): 47 | self.text_output.insert(tk.END, msg + "\n") 48 | self.text_output.see(tk.END) 49 | 50 | def load_dialogs(self): 51 | files = filedialog.askopenfilenames(title="Transkriptdateien auswählen", filetypes=[("Textdateien", "*.txt")]) 52 | if files: 53 | self.dialog_files = list(files) 54 | self.log(f"{len(self.dialog_files)} Dateien geladen.") 55 | 56 | def run_processing(self): 57 | if not self.dialog_files: 58 | messagebox.showwarning("Warnung", "Keine Dateien ausgewählt.") 59 | return 60 | self.log("Starte Verarbeitung...") 61 | self.processed_data = process_multiple_dialogs(self.dialog_files) 62 | self.log("Verarbeitung abgeschlossen.") 63 
| self.log(f"Kategorien: {set(self.processed_data['terminal_chain'])}") 64 | 65 | def run_simulation(self): 66 | if not self.processed_data: 67 | messagebox.showwarning("Warnung", "Bitte zuerst Transkripte verarbeiten.") 68 | return 69 | chain = simulate_dialog(self.processed_data["pcfg"], length=6) 70 | self.log("Simulierter Dialog:") 71 | self.log(" → ".join(chain)) 72 | 73 | def export_pcfg(self, fmt): 74 | if not self.processed_data: 75 | messagebox.showwarning("Warnung", "Bitte zuerst Transkripte verarbeiten.") 76 | return 77 | filepath = filedialog.asksaveasfilename(defaultextension=f".{fmt}", filetypes=[(fmt.upper(), f"*.{fmt}")]) 78 | if filepath: 79 | if fmt == "json": 80 | export_pcfg_to_json(self.processed_data["pcfg"], filepath) 81 | elif fmt == "csv": 82 | export_pcfg_to_csv(self.processed_data["pcfg"], filepath) 83 | elif fmt == "yaml": 84 | export_pcfg_to_yaml(self.processed_data["pcfg"], filepath) 85 | self.log(f"PCFG exportiert als {filepath}") 86 | 87 | 88 | if __name__ == "__main__": 89 | root = tk.Tk() 90 | app = ARSGUIApp(root) 91 | root.mainloop() 92 | 93 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import os 3 | import json 4 | import yaml 5 | import numpy as np 6 | import hdbscan 7 | import umap 8 | import matplotlib.pyplot as plt 9 | from collections import defaultdict 10 | from sentence_transformers import SentenceTransformer 11 | import openai 12 | import random 13 | 14 | # === Konfiguration === 15 | USE_GPT = st.sidebar.checkbox("GPT zur Clusterbenennung verwenden?", value=False) 16 | openai.api_key = st.sidebar.text_input("OpenAI API-Key", type="password") 17 | 18 | @st.cache_data(show_spinner=False) 19 | def embed_utterances(utterances, model_name="all-MiniLM-L6-v2"): 20 | model = SentenceTransformer(model_name) 21 | return model.encode(utterances, show_progress_bar=False) 22 | 23 | def cluster_utterances(embeddings): 24 | clusterer = hdbscan.HDBSCAN(min_cluster_size=5, prediction_data=True) 25 | return clusterer.fit_predict(embeddings) 26 | 27 | def gpt_category(samples): 28 | prompt = "Gib eine knappe Kategorienbezeichnung (1–2 Wörter) für folgende Aussagen: 29 | " + "\n".join(f"- {s}" for s in samples[:5]) 30 | try: 31 | response = openai.ChatCompletion.create( 32 | model="gpt-4", 33 | messages=[{"role": "user", "content": prompt}], 34 | temperature=0.3 35 | ) 36 | return response["choices"][0]["message"]["content"].strip() 37 | except Exception: 38 | return local_category(samples) 39 | 40 | def local_category(samples): 41 | fallback = ["Frage", "Antwort", "Befehl", "Hinweis", "Ironie", "Zweifel"] 42 | return random.choice(fallback) 43 | 44 | def assign_categories(utterances, labels): 45 | clusters = defaultdict(list) 46 | for u, l in zip(utterances, labels): 47 | clusters[l].append(u) 48 | label_to_name = {} 49 | for l, samples in clusters.items(): 50 | label_to_name[l] = gpt_category(samples) if USE_GPT else local_category(samples) 51 | return [label_to_name[l] for l in labels], label_to_name 52 | 53 | def induce_pcfg(sequence): 54 | transitions = defaultdict(lambda: defaultdict(int)) 55 | for i in range(len(sequence) - 1): 56 | transitions[sequence[i]][sequence[i + 1]] += 1 57 | return {k: {kk: vv / sum(v.values()) for kk, vv in v.items()} for k, v in transitions.items()} 58 | 59 | def simulate_dialog(pcfg, start=None, maxlen=15): 60 | if not pcfg: return [] 61 | if not start: 62 | start = 
random.choice(list(pcfg.keys())) 63 | result = [start] 64 | for _ in range(maxlen - 1): 65 | if start not in pcfg: 66 | break 67 | next_items = list(pcfg[start].items()) 68 | next_tokens, probs = zip(*next_items) 69 | start = random.choices(next_tokens, probs)[0] 70 | result.append(start) 71 | return result 72 | 73 | def render_umap(embeddings, labels): 74 | reducer = umap.UMAP(random_state=42) 75 | reduced = reducer.fit_transform(embeddings) 76 | fig, ax = plt.subplots() 77 | unique_labels = set(labels) 78 | for label in unique_labels: 79 | mask = labels == label 80 | ax.scatter(reduced[mask, 0], reduced[mask, 1], label=f"Cluster {label}", alpha=0.6) 81 | ax.set_title("UMAP + HDBSCAN-Cluster") 82 | ax.legend() 83 | return fig 84 | 85 | def pcfg_to_dot(pcfg): 86 | lines = ["digraph PCFG {"] 87 | for src, dsts in pcfg.items(): 88 | for dst, prob in dsts.items(): 89 | lines.append(f'"{src}" -> "{dst}" [label="{prob:.2f}"];') 90 | lines.append("}") 91 | return "\n".join(lines) 92 | 93 | st.title("🗣️ Algorithmisch-Rekursive Sequenzanalyse 3.0") 94 | 95 | uploaded_files = st.file_uploader("Lade Transkripte hoch (.txt)", type="txt", accept_multiple_files=True) 96 | 97 | if uploaded_files: 98 | for file in uploaded_files: 99 | st.subheader(f"📄 Datei: {file.name}") 100 | raw_text = file.read().decode("utf-8") 101 | utterances = [line.split(":", 1)[1].strip() for line in raw_text.splitlines() if ":" in line] 102 | 103 | if not utterances: 104 | st.warning("Keine dialogischen Äußerungen gefunden.") 105 | continue 106 | 107 | embeddings = embed_utterances(utterances) 108 | labels = cluster_utterances(embeddings) 109 | categories, label_map = assign_categories(utterances, labels) 110 | pcfg = induce_pcfg(categories) 111 | 112 | st.markdown("### 🔖 Kategorien") 113 | for cluster, name in label_map.items(): 114 | st.write(f"**Cluster {cluster}** → {name}") 115 | 116 | st.markdown("### 🧠 PCFG-Simulation") 117 | if st.button(f"🎲 Simuliere Dialog ({file.name})"): 118 | dialog = simulate_dialog(pcfg) 119 | st.write(" → ".join(dialog)) 120 | 121 | st.markdown("### 📊 Cluster-Visualisierung") 122 | st.pyplot(render_umap(embeddings, labels)) 123 | 124 | st.markdown("### 📥 Export") 125 | col1, col2 = st.columns(2) 126 | 127 | with col1: 128 | st.download_button("📎 PCFG als YAML", yaml.dump(pcfg, allow_unicode=True), file_name=f"{file.name}_pcfg.yaml") 129 | with col2: 130 | st.download_button("📎 PCFG als DOT", pcfg_to_dot(pcfg), file_name=f"{file.name}_pcfg.dot") 131 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Algorithmic Recursive Sequence Analysis 3.0 2 | ![ARS 3.0 GUI](ARS3.png) 3 | ![ARS 4.0 GUI](ARS4.png) 4 | ![ARS 6.0 GUI](ARS6.png) 5 | 6 | This project provides a web-based application for the automatic analysis of dialog transcripts using Sentence-BERT, HDBSCAN clustering, and probabilistic context-free grammars (PCFG). 7 | 8 | ## Features 9 | 10 | * 📂 Upload multiple transcripts 11 | * 🧠 Sentence-BERT for embeddings 12 | * 📊 HDBSCAN for clustering 13 | * 🧾 Categorization via GPT or local methods 14 | * 📈 Visualization via UMAP 15 | * 🔁 PCFG induction from sequences 16 | * 🎲 Dialog simulation 17 | * 📎 Export PCFG as `.yaml`, `.json`, `.csv`, or `.dot` 18 | 19 | ## Files 20 | 21 | * `app.py` – Main file containing the Streamlit (or tkinter) GUI 22 | * `requirements.txt` – Python dependencies 23 | * `README.md` – This documentation 24 | 25 | ## Usage 26 | 27 | 1. 
Open the web app in your browser 28 | 2. Upload `.txt` transcripts 29 | 3. The app clusters utterances and generates semantic categories 30 | 4. You can export the resulting PCFG or simulate new dialogues 31 | 32 | --- 33 | 34 | # Algorithmic Recursive Sequence Analysis 3.0 (ARS 3.0) 35 | 36 | **Algorithmic Recursive Sequence Analysis (ARS 3.0)** is a modular system for the processing, analysis, and simulation of dialogical transcripts. It enables automatic clustering of semantically similar utterances, the generation of probabilistic context-free grammars (PCFG), and the creation of synthetic dialogues based on these structures. 37 | 38 | --- 39 | 40 | ## 🔧 Project Structure 41 | 42 | ``` 43 | ars3/ 44 | ├── ars_core.py # Core logic: processing, PCFG export, simulation 45 | ├── app.py # GUI (Streamlit or tkinter) 46 | ├── categories.json # Persistent storage of recognized categories 47 | ├── data/ 48 | │ └── test_transcript.txt # Example transcript data for analysis 49 | ├── output/ 50 | │ ├── pcfg.json # PCFG exported in JSON format 51 | │ ├── pcfg.csv # PCFG exported in CSV format 52 | │ ├── pcfg.yaml # PCFG exported in YAML format 53 | │ └── cluster_plot.png # Visualization of cluster structure 54 | ├── requirements.txt # Dependencies 55 | ├── setup.py # Installation script 56 | └── README.md # This file 57 | ``` 58 | 59 | --- 60 | 61 | ## 🧠 Core Functions 62 | 63 | ### `ars_core.py` 64 | 65 | * **`process_multiple_dialogs(transcript_paths)`** 66 | Loads and processes multiple transcripts, clusters semantically similar statements with HDBSCAN, and builds a PCFG. 67 | 68 | * **`simulate_dialog(pcfg, max_turns=10)`** 69 | Simulates a new dialog based on a given PCFG. 70 | 71 | * **`export_pcfg_to_json(pcfg, filepath)`** 72 | Exports the PCFG to a JSON file. 73 | 74 | * **`export_pcfg_to_csv(pcfg, filepath)`** 75 | Exports the PCFG to a CSV file for tabular analysis. 76 | 77 | * **`export_pcfg_to_yaml(pcfg, filepath)`** 78 | Exports the PCFG in YAML format for further processing or editing. 79 | 80 | --- 81 | 82 | ## 🖥️ GUI (`app.py`) 83 | 84 | * Select transcripts for processing 85 | * Start analysis and clustering 86 | * Visualize the cluster structure 87 | * Export PCFG in various formats 88 | * Simulate new dialogues at the press of a button 89 | 90 | The GUI is modular and can be run via Streamlit (web) or tkinter (local desktop interface). 91 | 92 | --- 93 | 94 | ## 📦 Installation 95 | 96 | 1. Clone or unzip the repository 97 | 98 | 2. Install the required dependencies: 99 | 100 | ```bash 101 | pip install -r requirements.txt 102 | ``` 103 | 104 | 3. Start the GUI (if `app.py` contains a `main()` function): 105 | 106 | ```bash 107 | python app.py 108 | ``` 109 | 110 | Or via a command-line shortcut (if installed as CLI): 111 | 112 | ```bash 113 | ars-gui 114 | ``` 115 | 116 | --- 117 | 118 | ## 📈 Export Formats 119 | 120 | * **JSON** – Structured, machine-readable format 121 | * **CSV** – For easy tabular analysis (e.g., in Excel or Pandas) 122 | * **YAML** – For readable configuration and external tool integration 123 | * **DOT** – For graph-based visualization via Graphviz or other tools 124 | 125 | 126 | 127 | --- 128 | 129 | 130 | 131 | 132 | # Algorithmisch-Rekursive Sequenzanalyse 3.0 133 | 134 | Dieses Projekt bietet eine Webanwendung zur automatischen Analyse von dialogischen Transkripten mithilfe von Sentence-BERT, HDBSCAN-Clustering und probabilistischen kontextfreien Grammatiken (PCFG). 
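
Zur Veranschaulichung der PCFG-Induktion eine minimale, eigenständige Skizze (vereinfachtes, hypothetisches Beispiel, angelehnt an `induce_pcfg` und `simulate_dialog` in `app.py`): Übergänge zwischen aufeinanderfolgenden Kategorien werden gezählt, pro Ausgangssymbol zu Wahrscheinlichkeiten normalisiert und anschließend zur Simulation einer neuen Sequenz verwendet.

```python
import random
from collections import defaultdict

def induce_pcfg(sequence):
    # Übergänge zwischen aufeinanderfolgenden Kategorien zählen ...
    transitions = defaultdict(lambda: defaultdict(int))
    for a, b in zip(sequence, sequence[1:]):
        transitions[a][b] += 1
    # ... und pro Ausgangssymbol zu Wahrscheinlichkeiten normalisieren
    return {src: {dst: n / sum(dsts.values()) for dst, n in dsts.items()}
            for src, dsts in transitions.items()}

def simulate(pcfg, start, maxlen=6):
    # Startsymbol vorgeben, dann Folgesymbole gemäß Übergangswahrscheinlichkeiten ziehen
    seq = [start]
    while len(seq) < maxlen and seq[-1] in pcfg:
        folge = pcfg[seq[-1]]
        seq.append(random.choices(list(folge), weights=list(folge.values()))[0])
    return seq

# Rein illustrative Beispielsequenz von Kategorien
kategorien = ["GREETING", "ORDER", "QUESTION", "ORDER", "THANKS"]
print(simulate(induce_pcfg(kategorien), "GREETING"))
```
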
135 | 136 | ## Funktionen 137 | 138 | - 📂 Mehrfacher Transkript-Upload 139 | - 🧠 Sentence-BERT für Embedding 140 | - 📊 HDBSCAN für Clusterbildung 141 | - 🧾 Kategorien mit GPT oder lokal 142 | - 📈 Visualisierung via UMAP 143 | - 🔁 PCFG-Induktion aus Sequenzen 144 | - 🎲 Simulation von Dialogen 145 | - 📎 Export der PCFG als `.yaml` oder `.dot` 146 | 147 | ## Dateien 148 | 149 | - `app.py` – Die Hauptdatei mit der Streamlit-App 150 | - `requirements.txt` – Python-Abhängigkeiten 151 | - `README.md` – Dieses Dokument 152 | 153 | 154 | 155 | ## Nutzung 156 | 157 | 1. Öffne die Web-App im Browser 158 | 2. Lade `.txt`-Transkripte hoch 159 | 3. Die App clustert Äußerungen und generiert Kategorien 160 | 4. Du kannst die resultierende PCFG exportieren oder Dialoge simulieren 161 | 162 | 163 | # Algorithmisch Rekursive Sequenzanalyse 3.0 164 | ![ARS 3.0 GUI](ARS3.png) 165 | ![ARS 4.0 GUI](ARS4.png) 166 | ![ARS 6.0 GUI](ARS6.png) 167 | 168 | **Algorithmisch Rekursive Sequenzanalyse (ARS 3.0)** ist ein modulares System zur Verarbeitung, Analyse und Simulation von dialogischen Transkripten. Es ermöglicht die automatische Clusterung semantisch ähnlicher Aussagen, den Aufbau probabilistischer kontextfreier Grammatiken (PCFG), sowie die Generierung synthetischer Dialoge auf Basis dieser Strukturen. 169 | 170 | --- 171 | 172 | ## 🔧 Projektstruktur 173 | 174 | 175 | 176 | 177 | ars3/
178 | ├── ars_core.py # Zentrale Logik: Verarbeitung, PCFG-Export, Simulation
179 | ├── app.py # GUI (Streamlit oder tkinter)
180 | ├── categories.json # Persistente Speicherung erkannter Kategorien
181 | ├── data/
182 | │ └── test_transcript.txt # Beispielhafte Dialogdaten zur Analyse
183 | ├── output/
184 | │ ├── pcfg.json # Exportierte PCFG im JSON-Format
185 | │ ├── pcfg.csv # Exportierte PCFG im CSV-Format
186 | │ ├── pcfg.yaml # Exportierte PCFG im YAML-Format
187 | │ └── cluster_plot.png # Visualisierung der Clusterstruktur
188 | ├── requirements.txt # Abhängigkeiten
189 | ├── setup.py # Installationsskript
190 | └── README.md # Diese Datei
191 | 192 | ``` 193 | 194 | --- 195 | 196 | ## 🧠 Hauptfunktionen 197 | 198 | ### `ars_core.py` 199 | 200 | - **`process_multiple_dialogs(transcript_paths)`** 201 | Lädt mehrere Transkripte, analysiert semantisch ähnliche Aussagen, clustert mit HDBSCAN und erstellt eine PCFG. 202 | 203 | - **`simulate_dialog(pcfg, max_turns=10)`** 204 | Simuliert einen plausiblen neuen Dialog basierend auf einer gegebenen PCFG. 205 | 206 | - **`export_pcfg_to_json(pcfg, filepath)`** 207 | Exportiert die PCFG in eine JSON-Datei. 208 | 209 | - **`export_pcfg_to_csv(pcfg, filepath)`** 210 | Exportiert die PCFG in eine CSV-Datei zur besseren tabellarischen Auswertung. 211 | 212 | - **`export_pcfg_to_yaml(pcfg, filepath)`** 213 | Exportiert die PCFG in das YAML-Format (z. B. für andere Tools oder manuelle Bearbeitung). 214 | 215 | --- 216 | 217 | ## 🖥️ GUI (`app.py`) 218 | 219 | - Wähle Transkripte zur Verarbeitung 220 | - Starte Analyse & Clustering 221 | - Visualisiere Clusterstruktur 222 | - Exportiere PCFG in verschiedenen Formaten 223 | - Simuliere neue Dialoge auf Knopfdruck 224 | 225 | Die GUI ist modular aufgebaut und kann wahlweise in Streamlit (Web) oder tkinter (lokal) betrieben werden. 226 | 227 | 228 | ## 📦 Installation 229 | 230 | 1. Klone oder entpacke das Repository: 231 | 232 | 2. Installiere alle Abhängigkeiten: 233 | 234 | ```bash 235 | pip install -r requirements.txt 236 | ``` 237 | 238 | 3. Starte die GUI (wenn `app.py` eine `main()`-Funktion enthält): 239 | 240 | ```bash 241 | python app.py 242 | ``` 243 | 244 | Oder über den Konsolenbefehl: 245 | 246 | ```bash 247 | ars-gui 248 | ``` 249 | 250 | 251 | 252 | --- 253 | 254 | ## 📈 Exportformate 255 | 256 | * **JSON**: Für strukturierten maschinenlesbaren Export 257 | * **CSV**: Zur einfachen tabellarischen Analyse (z. B. 
in Excel oder Pandas) 258 | * **YAML**: Für lesbare Konfigurationen und Weiterverarbeitung in externen Tools 259 | 260 | 261 | 262 | 263 | 264 | -------------------------------------------------------------------------------- /ars4_gui_app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | from sklearn.cluster import HDBSCAN 5 | from sentence_transformers import SentenceTransformer 6 | from collections import defaultdict 7 | from scipy.stats import pearsonr 8 | import tkinter as tk 9 | from tkinter import filedialog, ttk, messagebox 10 | 11 | # Modell für Embeddings 12 | model = SentenceTransformer("all-MiniLM-L6-v2") 13 | 14 | class ARSGUI: 15 | def __init__(self, root): 16 | self.root = root 17 | self.root.title("Auto-PCFG Generator") 18 | self.transcripts = [] 19 | self.terminal_symbols = [] 20 | self.pcfg = {} 21 | 22 | self.setup_ui() 23 | 24 | def setup_ui(self): 25 | main_frame = ttk.Frame(self.root, padding="10") 26 | main_frame.grid(row=0, column=0, sticky="nsew") 27 | 28 | ttk.Button(main_frame, text="Load Transcripts", command=self.load_transcripts).grid(row=0, column=0, pady=5) 29 | ttk.Button(main_frame, text="Generate Grammar", command=self.generate_grammar).grid(row=0, column=1, pady=5) 30 | ttk.Button(main_frame, text="Optimize Grammar", command=self.optimize_grammar).grid(row=0, column=2, pady=5) 31 | ttk.Button(main_frame, text="Simulate Dialog", command=self.simulate_dialog).grid(row=0, column=3, pady=5) 32 | 33 | self.output_text = tk.Text(main_frame, height=20, width=80) 34 | self.output_text.grid(row=1, column=0, columnspan=4, pady=10) 35 | 36 | ttk.Button(main_frame, text="Export JSON", command=lambda: self.export_grammar("json")).grid(row=2, column=0) 37 | ttk.Button(main_frame, text="Export YAML", command=lambda: self.export_grammar("yaml")).grid(row=2, column=1) 38 | 39 | def log(self, message): 40 | self.output_text.insert(tk.END, message + "\n") 41 | self.output_text.see(tk.END) 42 | 43 | def load_transcripts(self): 44 | files = filedialog.askopenfilenames(filetypes=[("Text files", "*.txt")]) 45 | if not files: 46 | return 47 | 48 | self.transcripts = [] 49 | for file in files: 50 | with open(file, 'r', encoding='utf-8') as f: 51 | self.transcripts.extend([line.strip() for line in f if line.strip()]) 52 | 53 | self.log(f"Loaded {len(self.transcripts)} utterances from {len(files)} files.") 54 | 55 | def generate_grammar(self): 56 | if not self.transcripts: 57 | messagebox.showwarning("Warning", "No transcripts loaded!") 58 | return 59 | 60 | # Schritt 1: Terminalzeichen generieren 61 | embeddings = model.encode(self.transcripts) 62 | 63 | # KORREKTUR: Parameter gen_min_span_tree entfernt 64 | clusterer = HDBSCAN(min_cluster_size=3) 65 | clusters = clusterer.fit_predict(embeddings) 66 | 67 | # Unique Terminalzeichen erstellen 68 | self.terminal_symbols = [f"T_{c+1}" for c in clusters] 69 | unique_terminals = list(set(self.terminal_symbols)) 70 | 71 | self.log(f"Generated {len(unique_terminals)} terminal symbols: {unique_terminals}") 72 | 73 | # Schritt 2: Nonterminale und Regeln ableiten 74 | self.pcfg = self.induce_grammar_rules(self.terminal_symbols) 75 | self.log("\nGenerated PCFG rules:") 76 | for nt, rules in self.pcfg.items(): 77 | self.log(f"{nt} → {rules}") 78 | 79 | def induce_grammar_rules(self, terminals, n=3): 80 | rules = defaultdict(dict) 81 | 82 | # Einfache Übergänge zwischen Terminalzeichen 83 | for i in range(len(terminals)-1): 84 | src = terminals[i] 85 | 
dst = terminals[i+1] 86 | rules[src][dst] = rules[src].get(dst, 0) + 1 87 | 88 | # Nonterminale für häufige N-Gramme 89 | ngram_counts = defaultdict(int) 90 | for i in range(len(terminals)-n+1): 91 | ngram = " ".join(terminals[i:i+n]) 92 | ngram_counts[ngram] += 1 93 | 94 | # Füge Nonterminale für häufige N-Gramme hinzu 95 | for ngram, count in ngram_counts.items(): 96 | if count > 1: 97 | nt = f"NT_{ngram.replace(' ', '_')}" 98 | rules[nt] = {ngram: 1.0} 99 | 100 | # Ersetze Vorkommen im Haupt-PCFG 101 | for src in list(rules.keys()): 102 | if ngram in rules[src]: 103 | rules[src][nt] = rules[src].pop(ngram) 104 | 105 | # Normalisiere Wahrscheinlichkeiten 106 | for src in rules: 107 | total = sum(rules[src].values()) 108 | rules[src] = {dst: cnt/total for dst, cnt in rules[src].items()} 109 | 110 | return dict(rules) 111 | 112 | def optimize_grammar(self, iterations=10): 113 | if not self.pcfg: 114 | messagebox.showwarning("Warning", "Generate grammar first!") 115 | return 116 | 117 | empirical_freq = self.calculate_frequencies([self.terminal_symbols]) 118 | 119 | for i in range(iterations): 120 | generated_chains = [] 121 | for _ in range(5): 122 | chain = self.simulate_chain(max_length=len(self.terminal_symbols)) 123 | generated_chains.append(chain) 124 | 125 | gen_freq = self.calculate_frequencies(generated_chains) 126 | corr, p_value = pearsonr(empirical_freq, gen_freq) 127 | 128 | self.log(f"Iteration {i+1}: Correlation = {corr:.3f}, p = {p_value:.3f}") 129 | 130 | if corr > 0.9: 131 | break 132 | 133 | self.adjust_probabilities(empirical_freq, gen_freq) 134 | 135 | def calculate_frequencies(self, chains): 136 | all_terminals = sorted(set(self.terminal_symbols)) 137 | freq = np.zeros(len(all_terminals)) 138 | term_to_idx = {t: i for i, t in enumerate(all_terminals)} 139 | 140 | for chain in chains: 141 | for term in chain: 142 | if term in term_to_idx: 143 | freq[term_to_idx[term]] += 1 144 | 145 | return freq / freq.sum() if freq.sum() > 0 else freq 146 | 147 | def adjust_probabilities(self, empirical_freq, gen_freq): 148 | all_terminals = sorted(set(self.terminal_symbols)) 149 | adjustment = empirical_freq - gen_freq 150 | 151 | for src in self.pcfg: 152 | new_rules = {} 153 | for dst in self.pcfg[src]: 154 | if dst in all_terminals: 155 | idx = all_terminals.index(dst) 156 | new_prob = max(0.01, min(0.99, self.pcfg[src][dst] + 0.1 * adjustment[idx])) 157 | new_rules[dst] = new_prob 158 | else: 159 | new_rules[dst] = self.pcfg[src][dst] 160 | 161 | total = sum(new_rules.values()) 162 | self.pcfg[src] = {k: v/total for k, v in new_rules.items()} 163 | 164 | def simulate_chain(self, max_length=10): 165 | chain = [] 166 | current = np.random.choice(list(self.pcfg.keys())) 167 | 168 | for _ in range(max_length): 169 | if current not in self.pcfg: 170 | break 171 | 172 | next_options = list(self.pcfg[current].keys()) 173 | probs = list(self.pcfg[current].values()) 174 | next_item = np.random.choice(next_options, p=probs) 175 | 176 | if next_item.startswith("NT_"): 177 | expanded = next_item[3:].replace("_", " ").split() 178 | chain.extend(expanded) 179 | current = expanded[-1] if expanded else None 180 | else: 181 | chain.append(next_item) 182 | current = next_item 183 | 184 | return chain 185 | 186 | def simulate_dialog(self): 187 | if not self.pcfg: 188 | messagebox.showwarning("Warning", "Generate grammar first!") 189 | return 190 | 191 | chain = self.simulate_chain() 192 | self.log("\nSimulated dialog sequence:") 193 | self.log(" → ".join(chain)) 194 | 195 | def export_grammar(self, 
format): 196 | if not self.pcfg: 197 | messagebox.showwarning("Warning", "No grammar to export!") 198 | return 199 | 200 | file = filedialog.asksaveasfilename( 201 | defaultextension=f".{format}", 202 | filetypes=[(f"{format.upper()} files", f"*.{format}")] 203 | ) 204 | 205 | if not file: 206 | return 207 | 208 | if format == "json": 209 | with open(file, 'w') as f: 210 | json.dump(self.pcfg, f, indent=2) 211 | elif format == "yaml": 212 | import yaml 213 | with open(file, 'w') as f: 214 | yaml.dump(self.pcfg, f) 215 | 216 | self.log(f"Grammar exported to {file}") 217 | 218 | if __name__ == "__main__": 219 | root = tk.Tk() 220 | app = ARSGUI(root) 221 | root.mainloop() 222 | -------------------------------------------------------------------------------- /ars6_gui_app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | from sklearn.cluster import HDBSCAN 5 | from sklearn.feature_extraction.text import TfidfVectorizer 6 | from sentence_transformers import SentenceTransformer 7 | from collections import defaultdict 8 | from scipy.stats import pearsonr 9 | import tkinter as tk 10 | from tkinter import filedialog, ttk, messagebox 11 | import networkx as nx 12 | import matplotlib.pyplot as plt 13 | from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg 14 | from nltk import ngrams 15 | from langdetect import detect 16 | from transformers import pipeline 17 | import threading 18 | from copy import deepcopy 19 | 20 | class EnhancedDialogAnalyzer: 21 | def __init__(self, root): 22 | self.root = root 23 | self.root.title("LLM-enhanced Dialog Analyzer") 24 | 25 | # Modelle initialisieren 26 | self.embedding_model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2") 27 | self.llm = pipeline("text2text-generation", model="google/flan-t5-base") 28 | 29 | # Datenstrukturen 30 | self.transcripts = [] 31 | self.interacts = [] 32 | self.pcfg = {} 33 | self.empirical_chain = [] 34 | 35 | # GUI 36 | self.setup_ui() 37 | self.setup_visualization() 38 | 39 | def setup_ui(self): 40 | main_frame = ttk.Frame(self.root, padding="10") 41 | main_frame.grid(row=0, column=0, sticky="nsew") 42 | 43 | ttk.Button(main_frame, text="1. Load & Preprocess", 44 | command=self.load_and_preprocess).grid(row=0, column=0, pady=5) 45 | ttk.Button(main_frame, text="2. Analyze Meanings", 46 | command=self.analyze_meanings).grid(row=1, column=0, pady=5) 47 | ttk.Button(main_frame, text="3. Build Semantic PCFG", 48 | command=self.build_semantic_pcfg).grid(row=2, column=0, pady=5) 49 | ttk.Button(main_frame, text="4. Optimize", 50 | command=self.optimize_grammar).grid(row=3, column=0, pady=5) 51 | ttk.Button(main_frame, text="5. 
Visualize", 52 | command=self.visualize_grammar).grid(row=4, column=0, pady=5) 53 | 54 | self.output_text = tk.Text(main_frame, height=20, width=80) 55 | self.output_text.grid(row=0, column=1, rowspan=5, padx=10) 56 | 57 | def setup_visualization(self): 58 | self.figure = plt.Figure(figsize=(8, 6), dpi=100) 59 | self.canvas = FigureCanvasTkAgg(self.figure, master=self.root) 60 | self.canvas.get_tk_widget().grid(row=0, column=2, rowspan=5, padx=10) 61 | 62 | def load_and_preprocess(self): 63 | files = filedialog.askopenfilenames(filetypes=[("Text files", "*.txt")]) 64 | if not files: 65 | return 66 | 67 | self.transcripts = [] 68 | for file in files: 69 | with open(file, 'r', encoding='utf-8') as f: 70 | self.transcripts.extend([line.strip() for line in f if line.strip()]) 71 | 72 | self.log(f"Loaded {len(self.transcripts)} utterances") 73 | threading.Thread(target=self._preprocess).start() 74 | 75 | def _preprocess(self): 76 | self.root.after(0, lambda: self.log("Detecting languages...")) 77 | languages = set() 78 | for utterance in self.transcripts: 79 | try: 80 | lang = detect(utterance) 81 | languages.add(lang) 82 | except: 83 | pass 84 | self.root.after(0, lambda: self.log(f"Detected languages: {', '.join(languages)}")) 85 | 86 | self.root.after(0, lambda: self.log("Creating embeddings...")) 87 | self.embeddings = self.embedding_model.encode(self.transcripts) 88 | 89 | def analyze_meanings(self): 90 | if not self.transcripts: 91 | messagebox.showwarning("Warning", "Load transcripts first!") 92 | return 93 | 94 | self.interacts = [] 95 | for i, utterance in enumerate(self.transcripts): 96 | # Zuerst manuelle Klassifikation versuchen 97 | manual_meaning = self._preprocess_utterance(utterance) 98 | if manual_meaning: 99 | self.interacts.append({ 100 | "utterance": utterance, 101 | "meanings": [manual_meaning], 102 | "selected_meaning": manual_meaning 103 | }) 104 | continue 105 | 106 | context = " | ".join([u["utterance"] for u in self.interacts[-3:]]) if i > 0 else "" 107 | meanings = self._generate_meanings(utterance, context) 108 | filtered_meanings = self._filter_meanings(meanings, i) 109 | 110 | self.interacts.append({ 111 | "utterance": utterance, 112 | "meanings": meanings, 113 | "selected_meaning": filtered_meanings[0] if filtered_meanings else "UNK" 114 | }) 115 | 116 | self.log(f"Utterance {i+1}: {utterance[:50]}...") 117 | self.log(f" Selected meaning: {filtered_meanings[0] if filtered_meanings else 'UNK'}") 118 | 119 | def _preprocess_utterance(self, utterance): 120 | # Kürzen und Normalisieren 121 | utterance = utterance[:100].lower().replace('\n', ' ') 122 | 123 | # Erweiterte Mustererkennung 124 | patterns = { 125 | 'order': ['bitte', 'nehme', 'hätte', 'kaufen', 'gramm', 'kilo'], 126 | 'question': ['?', 'wie', 'was', 'wo '], 127 | 'confirmation': ['ja', 'okay', 'genau', 'richtig'], 128 | 'greeting': ['guten tag', 'hallo', 'guten morgen'], 129 | 'thanks': ['danke', 'vielen dank', 'dankeschön'] 130 | } 131 | 132 | for category, keywords in patterns.items(): 133 | if any(kw in utterance for kw in keywords): 134 | return f"{category.upper()}" 135 | 136 | return None 137 | 138 | def _generate_meanings(self, utterance, context): 139 | manual_meaning = self._preprocess_utterance(utterance) 140 | if manual_meaning: 141 | return [manual_meaning] 142 | 143 | prompt = f"""Generate a SINGLE, concise interpretation for this dialog utterance in German: 144 | Context: '{context}' 145 | Utterance: '{utterance}' 146 | Interpretation: The speaker""" 147 | try: 148 | output = self.llm(prompt, 
max_length=50, num_return_sequences=1) 149 | return [output[0]["generated_text"].strip()] 150 | except Exception as e: 151 | return ["UNK"] 152 | 153 | def _filter_meanings(self, meanings, index): 154 | if index == 0 or not meanings: 155 | return meanings 156 | return [meanings[0]] # Immer erste Bedeutung akzeptieren 157 | 158 | def build_semantic_pcfg(self): 159 | if not self.interacts: 160 | messagebox.showwarning("Warning", "Analyze meanings first!") 161 | return 162 | 163 | meaning_embeddings = self.embedding_model.encode( 164 | [i["selected_meaning"] for i in self.interacts] 165 | ) 166 | 167 | clusterer = HDBSCAN(min_cluster_size=3, metric='cosine') 168 | clusters = clusterer.fit_predict(meaning_embeddings) 169 | 170 | terminal_symbols = [] 171 | for i, cluster_id in enumerate(clusters): 172 | if cluster_id == -1: 173 | terminal_symbols.append(f"T_{i}") 174 | else: 175 | terminal_symbols.append(f"C_{cluster_id}") 176 | 177 | self.pcfg = defaultdict(dict) 178 | self.empirical_chain = terminal_symbols 179 | 180 | # Stärkere Gewichtung häufiger Übergänge mit exponentieller Gewichtung 181 | for i in range(len(terminal_symbols)-1): 182 | src = terminal_symbols[i] 183 | dst = terminal_symbols[i+1] 184 | src_nt = f"NT_{src}" 185 | 186 | weight = np.exp(-0.1 * i) # Exponentielle Gewichtung für nahe Übergänge 187 | self.pcfg[src][src_nt] = 1.0 188 | self.pcfg[src_nt][dst] = self.pcfg[src_nt].get(dst, 0) + weight 189 | 190 | # Normalisierung der Wahrscheinlichkeiten 191 | for src in self.pcfg: 192 | total = sum(self.pcfg[src].values()) 193 | self.pcfg[src] = {dst: count/total for dst, count in self.pcfg[src].items()} 194 | 195 | self.log("\nGenerated Semantic PCFG:") 196 | for src in list(self.pcfg.keys())[:5]: 197 | for dst, prob in list(self.pcfg[src].items())[:3]: 198 | self.log(f" {src.ljust(10)} → {dst.ljust(15)} [{prob:.2f}]") 199 | 200 | def _calculate_frequencies(self, chains): 201 | freq_dict = defaultdict(lambda: defaultdict(int)) 202 | 203 | for chain in chains: 204 | for i in range(len(chain)-1): 205 | current = chain[i] 206 | next_symbol = chain[i+1] 207 | freq_dict[current][next_symbol] += 1 208 | 209 | all_transitions = [] 210 | for src in freq_dict: 211 | for dst in freq_dict[src]: 212 | all_transitions.append(freq_dict[src][dst]) 213 | 214 | return np.array(all_transitions) 215 | 216 | def _simulate_chain(self, max_length): 217 | if not self.pcfg: 218 | return [] 219 | 220 | chain = [] 221 | current = np.random.choice(list(self.pcfg.keys())) 222 | chain.append(current) 223 | 224 | while len(chain) < max_length: 225 | if current not in self.pcfg: 226 | break 227 | 228 | next_symbols = list(self.pcfg[current].keys()) 229 | probs = list(self.pcfg[current].values()) 230 | next_symbol = np.random.choice(next_symbols, p=probs) 231 | 232 | chain.append(next_symbol) 233 | current = next_symbol 234 | 235 | return chain 236 | 237 | def _adjust_probabilities(self, empirical, generated): 238 | # Dynamische Lernrate basierend auf Datensatzgröße 239 | adjustment_factor = max(0.01, 0.2 * (1 - np.exp(-len(self.empirical_chain)/100))) 240 | 241 | temp_freq = defaultdict(lambda: defaultdict(float)) 242 | 243 | for i in range(len(self.empirical_chain)-1): 244 | src = self.empirical_chain[i] 245 | dst = self.empirical_chain[i+1] 246 | temp_freq[src][dst] += 1 247 | 248 | for src in self.pcfg: 249 | for dst in self.pcfg[src]: 250 | base_prob = temp_freq[src].get(dst, 0) 251 | smoothed_prob = (base_prob + 0.1) / (sum(temp_freq[src].values()) + 0.1 * len(self.pcfg[src])) 252 | self.pcfg[src][dst] = 
(1 - adjustment_factor) * self.pcfg[src][dst] + \ 253 | adjustment_factor * smoothed_prob 254 | 255 | for src in self.pcfg: 256 | total = sum(self.pcfg[src].values()) 257 | if total > 0: 258 | for dst in self.pcfg[src]: 259 | self.pcfg[src][dst] /= total 260 | 261 | def optimize_grammar(self, iterations=20): 262 | if not self.pcfg: 263 | messagebox.showwarning("Warning", "Build PCFG first!") 264 | return 265 | 266 | empirical_freq = self._calculate_frequencies([self.empirical_chain]) 267 | best_corr = -1 268 | best_pcfg = deepcopy(self.pcfg) 269 | 270 | for i in range(iterations): 271 | generated_chains = [ 272 | self._simulate_chain(max_length=len(self.empirical_chain)) 273 | for _ in range(5) 274 | ] 275 | 276 | gen_freq = self._calculate_frequencies(generated_chains) 277 | 278 | min_length = min(len(empirical_freq), len(gen_freq)) 279 | empirical = empirical_freq[:min_length] 280 | generated = gen_freq[:min_length] 281 | 282 | try: 283 | if len(empirical) > 1 and len(generated) > 1: 284 | corr, p_value = pearsonr(empirical, generated) 285 | self.log(f"Iteration {i+1}: r = {corr:.3f}, p = {p_value:.3f}") 286 | 287 | if abs(corr) > 0.3 and p_value < 0.1: # Nur signifikante Anpassungen 288 | if corr > best_corr: 289 | best_corr = corr 290 | best_pcfg = deepcopy(self.pcfg) 291 | 292 | self._adjust_probabilities(empirical_freq, gen_freq) 293 | 294 | if corr > 0.9: 295 | break 296 | else: 297 | self.log(f"Iteration {i+1}: Not enough data for correlation") 298 | break 299 | 300 | except Exception as e: 301 | self.log(f"Error in iteration {i+1}: {str(e)}") 302 | break 303 | 304 | self.pcfg = best_pcfg # Restore best version 305 | self.log(f"Optimization finished. Best r = {best_corr:.3f}") 306 | self.evaluate_grammar() 307 | 308 | def evaluate_grammar(self): 309 | # Berechne Konsistenz der generierten Dialoge 310 | test_chains = [self._simulate_chain(10) for _ in range(10)] 311 | coherence_scores = [self._calculate_coherence(chain) for chain in test_chains] 312 | self.log(f"Average coherence: {np.mean(coherence_scores):.2f}") 313 | 314 | def _calculate_coherence(self, chain): 315 | # Einfache Kohärenzmetrik: Anteil gültiger Übergänge 316 | valid_transitions = 0 317 | for i in range(len(chain)-1): 318 | if chain[i] in self.pcfg and chain[i+1] in self.pcfg[chain[i]]: 319 | valid_transitions += 1 320 | return valid_transitions / max(1, len(chain)-1) 321 | 322 | def visualize_grammar(self): 323 | if not self.pcfg: 324 | messagebox.showwarning("Warning", "Build PCFG first!") 325 | return 326 | 327 | self.figure.clf() 328 | G = nx.DiGraph() 329 | 330 | for src, transitions in self.pcfg.items(): 331 | for dst, prob in transitions.items(): 332 | G.add_edge(src, dst, weight=prob) 333 | 334 | pos = nx.spring_layout(G, k=0.8, iterations=50, seed=42) 335 | 336 | node_size = 1200 337 | font_size = 8 338 | edge_width_scale = 2.5 339 | 340 | ax = self.figure.add_subplot(111) 341 | 342 | nx.draw_networkx_nodes( 343 | G, pos, 344 | node_size=node_size, 345 | node_color='skyblue', 346 | alpha=0.9, 347 | ax=ax 348 | ) 349 | 350 | edges = nx.draw_networkx_edges( 351 | G, pos, 352 | width=[d['weight']*edge_width_scale for (_, _, d) in G.edges(data=True)], 353 | edge_color='gray', 354 | alpha=0.7, 355 | arrowstyle='->', 356 | arrowsize=15, 357 | ax=ax 358 | ) 359 | 360 | nx.draw_networkx_labels( 361 | G, pos, 362 | font_size=font_size, 363 | font_family='sans-serif', 364 | ax=ax 365 | ) 366 | 367 | edge_labels = { 368 | (u, v): f"{d['weight']:.2f}" 369 | for u, v, d in G.edges(data=True) 370 | } 371 | 
nx.draw_networkx_edge_labels( 372 | G, pos, 373 | edge_labels=edge_labels, 374 | font_size=font_size-1, 375 | label_pos=0.5, 376 | ax=ax 377 | ) 378 | 379 | ax.set_title("Probabilistic Context-Free Grammar (PCFG)") 380 | ax.axis('off') 381 | plt.tight_layout() 382 | self.canvas.draw() 383 | 384 | def log(self, message): 385 | self.output_text.insert(tk.END, message + "\n") 386 | self.output_text.see(tk.END) 387 | 388 | if __name__ == "__main__": 389 | root = tk.Tk() 390 | app = EnhancedDialogAnalyzer(root) 391 | root.mainloop() 392 | --------------------------------------------------------------------------------
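
For reference, a minimal standalone sketch of the frequency-correlation check behind `optimize_grammar` (closest to `calculate_frequencies` in `ars4_gui_app.py`; `ars6_gui_app.py` applies the same idea to transition counts). The symbol alphabet and the two example chains below are purely illustrative: empirical and simulated terminal frequencies over a shared alphabet are compared with Pearson's r, and the optimization loop stops once the correlation is high enough.

```python
import numpy as np
from scipy.stats import pearsonr

def symbol_frequencies(chains, alphabet):
    # Relative frequency of each terminal symbol across all chains
    idx = {sym: i for i, sym in enumerate(alphabet)}
    freq = np.zeros(len(alphabet))
    for chain in chains:
        for sym in chain:
            if sym in idx:
                freq[idx[sym]] += 1
    return freq / freq.sum() if freq.sum() > 0 else freq

alphabet = ["T_1", "T_2", "T_3", "T_4"]
empirical = symbol_frequencies([["T_1", "T_4", "T_3", "T_3", "T_2"]], alphabet)
generated = symbol_frequencies([["T_1", "T_4", "T_3", "T_2", "T_1"]], alphabet)

r, p = pearsonr(empirical, generated)
print(f"r = {r:.3f}, p = {p:.3f}")  # adjust probabilities and repeat until r exceeds ~0.9
```
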