├── ARS3.png
├── ARS4.png
├── ARS6.png
├── requirements.txt
├── 8dateien.json
├── text4.json
├── pcfg-grafik.py
├── Text6.txt
├── Text3.txt
├── Text7.txt
├── 8dateienAppARS4.json
├── Text5.txt
├── Text2.txt
├── Text1.txt
├── setup.py
├── Text8.txt
├── text4.txt
├── ars_core.py
├── ars_gui_app.py
├── app.py
├── README.md
├── ars4_gui_app.py
└── ars6_gui_app.py
/ARS3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pkoopongithub/algorithmisch-rekursive-sequenzanalyse-3.0/main/ARS3.png
--------------------------------------------------------------------------------
/ARS4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pkoopongithub/algorithmisch-rekursive-sequenzanalyse-3.0/main/ARS4.png
--------------------------------------------------------------------------------
/ARS6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pkoopongithub/algorithmisch-rekursive-sequenzanalyse-3.0/main/ARS6.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | sentence-transformers>=2.2.2
2 | hdbscan>=0.8.33
3 | scikit-learn>=1.3  # sklearn.cluster.HDBSCAN (used by ars4_gui_app.py / ars6_gui_app.py) requires >= 1.3
4 | pandas>=1.5.3
5 | pyyaml>=6.0
6 | streamlit>=1.22.0
7 | networkx>=3.2.1
8 | matplotlib>=3.7.1
9 | # Additional packages imported by app.py, pcfg-grafik.py, ars4_gui_app.py and ars6_gui_app.py
10 | umap-learn
11 | openai<1.0  # app.py uses the pre-1.0 openai.ChatCompletion API
12 | scipy
13 | transformers
14 | langdetect
15 | nltk
16 | graphviz  # pcfg-grafik.py; also needs the Graphviz system binaries
--------------------------------------------------------------------------------
/8dateien.json:
--------------------------------------------------------------------------------
1 | {
2 | "0": {
3 | "2": 0.5333333333333333,
4 | "0": 0.4666666666666667
5 | },
6 | "2": {
7 | "1": 1.0
8 | },
9 | "1": {
10 | "1": 0.9230769230769231,
11 | "0": 0.07692307692307693
12 | }
13 | }
--------------------------------------------------------------------------------
/text4.json:
--------------------------------------------------------------------------------
1 | {
2 | "-1": {
3 | "-1": 0.2727272727272727,
4 | "0": 0.45454545454545453,
5 | "1": 0.2727272727272727
6 | },
7 | "0": {
8 | "1": 0.5,
9 | "-1": 0.5
10 | },
11 | "1": {
12 | "0": 0.16666666666666666,
13 | "-1": 0.8333333333333334
14 | }
15 | }
--------------------------------------------------------------------------------
/pcfg-grafik.py:
--------------------------------------------------------------------------------
1 | import graphviz
2 |
3 | def export_pcfg_to_dot(pcfg, filepath):
4 | dot = graphviz.Digraph()
5 | for rule in pcfg:
6 | lhs = rule['lhs']
7 | rhs = ' '.join(rule['rhs'])
8 | prob = rule['probability']
9 | dot.edge(lhs, rhs, label=f'{prob:.2f}')
10 | dot.render(filepath, format='png', cleanup=True)
11 |
12 |
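13 | # Example usage sketch: export_pcfg_to_dot expects a list of rules with
14 | # 'lhs', 'rhs' and 'probability' keys; the rule values below are illustrative.
15 | # Rendering requires the Graphviz system binaries in addition to the package.
16 | if __name__ == "__main__":
17 |     example_pcfg = [
18 |         {"lhs": "S", "rhs": ["GREETING", "ORDER"], "probability": 0.8},
19 |         {"lhs": "S", "rhs": ["ORDER"], "probability": 0.2},
20 |     ]
21 |     export_pcfg_to_dot(example_pcfg, "pcfg_example")  # writes pcfg_example.png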
--------------------------------------------------------------------------------
/Text6.txt:
--------------------------------------------------------------------------------
1 | Text 6
2 | Datum: 28. Juni 1994, Ort: Käseverkaufsstand, Aachen, Marktplatz
3 |
4 | (Marktatmosphäre, Begrüßungen)
5 |
6 | Kunde 1: Guten Morgen!
7 |
8 | Verkäufer: Guten Morgen!
9 |
10 | Kunde 1: Ich hätte gerne fünfhundert Gramm holländischen Gouda.
11 |
12 | Verkäufer: Am Stück?
13 |
14 | Kunde 1: Ja, am Stück, bitte.
15 |
16 | Ende Text 6
--------------------------------------------------------------------------------
/Text3.txt:
--------------------------------------------------------------------------------
1 | Text 3
2 | Datum: 28. Juni 1994, Ort: Fischstand, Marktplatz, Aachen
3 |
4 | (Marktatmosphäre, Gespräch im Hintergrund, teilweise unverständlich)
5 |
6 | Kunde: Ein Pfund Seelachs, bitte.
7 |
8 | Verkäufer: Seelachs, alles klar.
9 |
10 | (Geräusche von Verpackung und Verkaufsvorbereitungen)
11 |
12 | Verkäufer: Vier Mark neunzehn, bitte.
13 |
14 | (Geräusche von Verpackung, Münzen klimpern)
15 |
16 | Verkäufer: Schönen Dank!
17 |
18 | Kunde: Ja, danke schön!
19 |
20 | Ende Text 3
--------------------------------------------------------------------------------
/Text7.txt:
--------------------------------------------------------------------------------
1 | Text 7
2 | Datum: 28. Juni 1994, Ort: Bonbonstand, Aachen, Marktplatz, 11:30 Uhr
3 |
4 | (Geräusche von Stimmen und Marktatmosphäre, teilweise unverständlich)
5 |
6 | Kunde: Von den gemischten hätte ich gerne hundert Gramm.
7 |
8 | (Unverständliche Fragen und Antworten)
9 |
10 | Verkäufer: Für zu Hause oder zum Mitnehmen?
11 |
12 | Kunde: Zum Mitnehmen, bitte.
13 |
14 | Verkäufer: Fünfzig Pfennig, bitte.
15 |
16 | (Klimpern von Münzen, Geräusche von Verpackung)
17 |
18 | Kunde: Danke!
19 |
20 | Ende Text 7
--------------------------------------------------------------------------------
/8dateienAppARS4.json:
--------------------------------------------------------------------------------
1 | {
2 | "T_1": {
3 | "T_4": 1.0
4 | },
5 | "T_4": {
6 | "T_3": 1.0
7 | },
8 | "T_3": {
9 | "T_3": 0.9230769230769231,
10 | "T_2": 0.07692307692307693
11 | },
12 | "T_2": {
13 | "T_1": 1.0
14 | },
15 | "NT_T_1_T_4_T_3": {
16 | "NT_T_1_T_4_T_3": 1.0
17 | },
18 | "NT_T_4_T_3_T_3": {
19 | "NT_T_4_T_3_T_3": 1.0
20 | },
21 | "NT_T_3_T_3_T_3": {
22 | "NT_T_3_T_3_T_3": 1.0
23 | },
24 | "NT_T_3_T_3_T_2": {
25 | "NT_T_3_T_3_T_2": 1.0
26 | },
27 | "NT_T_3_T_2_T_1": {
28 | "NT_T_3_T_2_T_1": 1.0
29 | },
30 | "NT_T_2_T_1_T_4": {
31 | "NT_T_2_T_1_T_4": 1.0
32 | }
33 | }
--------------------------------------------------------------------------------
/Text5.txt:
--------------------------------------------------------------------------------
1 | Text 5
2 | Datum: 26. Juni 1994, Ort: Gemüsestand, Aachen, Marktplatz, 11:00 Uhr
3 |
4 | (Marktatmosphäre, teilweise unverständlich)
5 |
6 | Verkäufer: So, bitte schön.
7 |
8 | Kunde 1: Auf Wiedersehen!
9 |
10 | Kunde 2: Ich hätte gern ein Kilo von den Granny Smith Äpfeln hier.
11 |
12 | (Unverständliches Gespräch im Hintergrund)
13 |
14 | Verkäufer: Sonst noch etwas?
15 |
16 | Kunde 2: Ja, noch ein Kilo Zwiebeln.
17 |
18 | Verkäufer: Alles klar.
19 |
20 | (Unverständliches Gespräch, Hintergrundgeräusche)
21 |
22 | Kunde 2: Das war's.
23 |
24 | Verkäufer: Sechs Mark fünfundzwanzig, bitte.
25 |
26 | (Unverständliches Gespräch, Geräusche von Münzen und Verpackung)
27 |
28 | Verkäufer: Wiedersehen!
29 |
30 | Kunde 2: Wiedersehen!
31 |
32 | Ende Text 5
--------------------------------------------------------------------------------
/Text2.txt:
--------------------------------------------------------------------------------
1 | Text 2
2 | Datum: 28. Juni 1994, Ort: Marktplatz, Aachen
3 |
4 | (Ständige Hintergrundgeräusche von Stimmen und Marktatmosphäre)
5 |
6 | Verkäufer: Kirschen kann jeder probieren hier, Kirschen kann jeder probieren hier!
7 |
8 | Kunde 1: Ein halbes Kilo Kirschen, bitte.
9 |
10 | Verkäufer: Ein halbes Kilo? Oder ein Kilo?
11 |
12 | (Unverständliches Gespräch, Münzen klimpern)
13 |
14 | Verkäufer: Danke schön!
15 |
16 | Verkäufer: Kirschen kann jeder probieren hier! Drei Mark, bitte.
17 |
18 | Kunde 1: Danke schön!
19 |
20 | Verkäufer: Kirschen kann jeder probieren hier, Kirschen kann jeder probieren hier!
21 |
22 | (Weitere Stimmen im Hintergrund, unverständliches Gespräch, Münzen klimpern)
23 |
24 | Kunde 2: Ein halbes Kilo, bitte.
25 |
26 | (Unverständliches Gespräch)
27 |
28 | Ende Text 2
--------------------------------------------------------------------------------
/Text1.txt:
--------------------------------------------------------------------------------
1 | Text 1
2 | Datum: 28. Juni 1994, Ort: Metzgerei, Aachen, 11:00 Uhr
3 |
4 | (Die Geräusche eines geschäftigen Marktplatzes im Hintergrund, Stimmen und Gemurmel)
5 |
6 | Verkäuferin: Guten Tag, was darf es sein?
7 |
8 | Kunde: Einmal von der groben Leberwurst, bitte.
9 |
10 | Verkäuferin: Wie viel darf’s denn sein?
11 |
12 | Kunde: Zwei hundert Gramm.
13 |
14 | Verkäuferin: Zwei hundert Gramm. Sonst noch etwas?
15 |
16 | Kunde: Ja, dann noch ein Stück von dem Schwarzwälder Schinken.
17 |
18 | Verkäuferin: Wie groß soll das Stück sein?
19 |
20 | Kunde: So um die dreihundert Gramm.
21 |
22 | Verkäuferin: Alles klar. Kommt sofort. (Geräusche von Papier und Verpackung)
23 |
24 | Kunde: Danke schön.
25 |
26 | Verkäuferin: Das macht dann acht Mark zwanzig.
27 |
28 | Kunde: Bitte. (Klimpern von Münzen, Geräusche der Kasse)
29 |
30 | Verkäuferin: Danke und einen schönen Tag noch!
31 |
32 | Kunde: Danke, ebenfalls!
33 |
34 | Ende Text 1
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | setup(
4 | name="ars3",
5 | version="1.0.0",
6 | description="Algorithmisch Rekursive Sequenzanalyse 3.0 – Analyse und Simulation von Transkripten mit PCFG",
7 | author="Dein Name",
8 | author_email="dein.email@example.com",
9 | packages=find_packages(),
10 | include_package_data=True,
11 | install_requires=[
12 | "sentence-transformers",
13 | "hdbscan",
14 | "scikit-learn",
15 | "pandas",
16 | "pyyaml",
17 | "streamlit",
18 | "networkx",
19 | "matplotlib"
20 | ],
21 | entry_points={
22 | 'console_scripts': [
23 |             'ars-gui = app:main',  # prerequisite: app.py must provide a main() function
24 | ],
25 | },
26 | classifiers=[
27 | "Programming Language :: Python :: 3",
28 | "License :: OSI Approved :: MIT License",
29 | "Operating System :: OS Independent",
30 | ],
31 | python_requires='>=3.9',
32 | )
33 |
--------------------------------------------------------------------------------
/Text8.txt:
--------------------------------------------------------------------------------
1 | Text 8
2 | Datum: 9. Juli 1994, Ort: Bäckerei, Aachen, 12:00 Uhr
3 |
4 | (Schritte hörbar, Hintergrundgeräusche, teilweise unverständlich)
5 |
6 | Kunde: Guten Tag!
7 |
8 | (Unverständliche Begrüßung im Hintergrund)
9 |
10 | Verkäuferin: Einmal unser bester Kaffee, frisch gemahlen, bitte.
11 |
12 | (Geräusche der Kaffeemühle, Verpackungsgeräusche)
13 |
14 | Verkäuferin: Sonst noch etwas?
15 |
16 | Kunde: Ja, noch zwei Stück Obstsalat und ein Schälchen Sahne.
17 |
18 | Verkäuferin: In Ordnung!
19 |
20 | (Geräusche der Kaffeemühle, Papiergeräusche)
21 |
22 | Verkäuferin: Ein kleines Schälchen Sahne, ja?
23 |
24 | Kunde: Ja, danke.
25 |
26 | (Türgeräusch, Lachen, Papiergeräusche)
27 |
28 | Verkäuferin: Keiner kümmert sich darum, die Türen zu ölen.
29 |
30 | Kunde: Ja, das ist immer so.
31 |
32 | (Lachen, Geräusche von Münzen und Verpackung)
33 |
34 | Verkäuferin: Das macht vierzehn Mark und neunzehn Pfennig, bitte.
35 |
36 | Kunde: Ich zahle in Kleingeld.
37 |
38 | (Lachen und Geräusche von Münzen)
39 |
40 | Verkäuferin: Vielen Dank, schönen Sonntag noch!
41 |
42 | Kunde: Danke, Ihnen auch!
43 |
44 | Ende Text 8
--------------------------------------------------------------------------------
/text4.txt:
--------------------------------------------------------------------------------
1 | Text 4
2 | Datum: 28. Juni 1994, Ort: Gemüsestand, Aachen, Marktplatz, 11:00 Uhr
3 |
4 | (Marktatmosphäre, teilweise unverständlich)
5 |
6 | Kunde: Hören Sie, ich nehme ein paar Champignons mit.
7 |
8 | Verkäufer: Braune oder helle?
9 |
10 | Kunde: Nehmen wir die hellen.
11 |
12 | Verkäufer: Alles klar, die hellen.
13 |
14 | (Unverständliche Unterhaltung im Hintergrund)
15 |
16 | Verkäufer: Die sind beide frisch, keine Sorge.
17 |
18 | Kunde: Wie ist es mit Pfifferlingen?
19 |
20 | Verkäufer: Ah, die sind super!
21 |
22 | (Unverständliches Gespräch)
23 |
24 | Kunde: Kann ich die in Reissalat tun?
25 |
26 | Verkäufer: Eher kurz anbraten in der Pfanne.
27 |
28 | Kunde: Okay, mache ich.
29 |
30 | Verkäufer: Die können Sie roh verwenden, aber ein bisschen anbraten ist besser.
31 |
32 | Kunde: Verstanden.
33 |
34 | (Weitere Unterhaltung, unverständliche Kommentare)
35 |
36 | Verkäufer: Noch etwas anderes?
37 |
38 | Kunde: Ja, dann nehme ich noch Erdbeeren.
39 |
40 | (Pause, Hintergrundgeräusche von Verpackung und Stimmen)
41 |
42 | Verkäufer: Schönen Tag noch!
43 |
44 | Kunde: Gleichfalls!
45 |
46 | Ende Text 4
--------------------------------------------------------------------------------
/ars_core.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import csv
4 | import yaml
5 | import numpy as np
6 |
7 | from sklearn.feature_extraction.text import TfidfVectorizer
8 | from sklearn.decomposition import PCA
9 | from sentence_transformers import SentenceTransformer
10 | import hdbscan
11 | import random
12 |
13 | # Load the sentence-embedding model
14 | model = SentenceTransformer("all-MiniLM-L6-v2")
15 |
16 | def read_transcripts(file_paths):
17 | utterances = []
18 | for file in file_paths:
19 | with open(file, 'r', encoding='utf-8') as f:
20 | for line in f:
21 | line = line.strip()
22 | if line:
23 | utterances.append(line)
24 | return utterances
25 |
26 | def embed_utterances(utterances):
27 | return model.encode(utterances)
28 |
29 | def cluster_embeddings(embeddings):
30 | clusterer = hdbscan.HDBSCAN(min_cluster_size=3)
31 | labels = clusterer.fit_predict(embeddings)
32 | return labels
33 |
34 | def build_pcfg(labels, utterances):
35 | pcfg = {}
36 | terminal_chain = []
37 | for i in range(len(labels) - 1):
38 | src = str(labels[i])
39 | dst = str(labels[i + 1])
40 | terminal_chain.append(src)
41 | if src not in pcfg:
42 | pcfg[src] = {}
43 | pcfg[src][dst] = pcfg[src].get(dst, 0) + 1
44 |
45 |     # Normalize transition counts to probabilities
46 | for src in pcfg:
47 | total = sum(pcfg[src].values())
48 | for dst in pcfg[src]:
49 | pcfg[src][dst] /= total
50 |
51 | return pcfg, terminal_chain
52 |
53 | def process_multiple_dialogs(file_paths):
54 | utterances = read_transcripts(file_paths)
55 | embeddings = embed_utterances(utterances)
56 | labels = cluster_embeddings(embeddings)
57 | pcfg, terminal_chain = build_pcfg(labels, utterances)
58 | return {
59 | "utterances": utterances,
60 | "embeddings": embeddings,
61 | "labels": labels,
62 | "pcfg": pcfg,
63 | "terminal_chain": terminal_chain
64 | }
65 |
66 | def simulate_dialog(pcfg, length=6):
67 | if not pcfg:
68 | return []
69 | current = random.choice(list(pcfg.keys()))
70 | sequence = [current]
71 | for _ in range(length - 1):
72 | if current not in pcfg:
73 | break
74 | next_states = list(pcfg[current].keys())
75 | probs = list(pcfg[current].values())
76 | current = np.random.choice(next_states, p=probs)
77 | sequence.append(current)
78 | return sequence
79 |
80 | def export_pcfg_to_json(pcfg, filepath):
81 | with open(filepath, 'w', encoding='utf-8') as f:
82 | json.dump(pcfg, f, indent=2)
83 |
84 | def export_pcfg_to_csv(pcfg, filepath):
85 | with open(filepath, 'w', encoding='utf-8', newline='') as f:
86 | writer = csv.writer(f)
87 | writer.writerow(["Source", "Target", "Probability"])
88 | for src in pcfg:
89 | for dst in pcfg[src]:
90 | writer.writerow([src, dst, pcfg[src][dst]])
91 |
92 | def export_pcfg_to_yaml(pcfg, filepath):
93 | with open(filepath, 'w', encoding='utf-8') as f:
94 | yaml.dump(pcfg, f, sort_keys=False, allow_unicode=True)
95 |
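96 | 
97 | if __name__ == "__main__":
98 |     # Minimal smoke test (assumes the Text*.txt transcripts from this
99 |     # repository are present in the current working directory).
100 |     result = process_multiple_dialogs(["Text1.txt", "Text2.txt", "Text3.txt"])
101 |     print("Terminal chain:", result["terminal_chain"])
102 |     print("Simulated dialog:", simulate_dialog(result["pcfg"], length=6))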
--------------------------------------------------------------------------------
/ars_gui_app.py:
--------------------------------------------------------------------------------
1 | import tkinter as tk
2 | from tkinter import filedialog, messagebox, ttk
3 | import os
4 | import json
5 | import csv
6 | import yaml
7 | import numpy as np
8 |
9 | from ars_core import (
10 | process_multiple_dialogs,
11 | simulate_dialog,
12 | export_pcfg_to_json,
13 | export_pcfg_to_csv,
14 | export_pcfg_to_yaml
15 | )
16 |
17 | class ARSGUIApp:
18 | def __init__(self, root):
19 | self.root = root
20 | self.root.title("ARS 3.0 – Algorithmisch Rekursive Sequenzanalyse")
21 |
22 | self.dialog_files = []
23 | self.processed_data = None
24 |
25 | self.build_gui()
26 |
27 | def build_gui(self):
28 | frm = ttk.Frame(self.root, padding=10)
29 | frm.grid(row=0, column=0, sticky="nsew")
30 |
31 | ttk.Label(frm, text="ARS 3.0 Dialogverarbeitung", font=("Arial", 16)).grid(row=0, column=0, columnspan=3, pady=10)
32 |
33 | ttk.Button(frm, text="Transkripte laden", command=self.load_dialogs).grid(row=1, column=0, sticky="ew", pady=5)
34 | ttk.Button(frm, text="Verarbeiten", command=self.run_processing).grid(row=1, column=1, sticky="ew", pady=5)
35 | ttk.Button(frm, text="Dialog simulieren", command=self.run_simulation).grid(row=1, column=2, sticky="ew", pady=5)
36 |
37 | ttk.Separator(frm).grid(row=2, column=0, columnspan=3, pady=10, sticky="ew")
38 |
39 | ttk.Button(frm, text="PCFG → JSON", command=lambda: self.export_pcfg("json")).grid(row=3, column=0, pady=5)
40 | ttk.Button(frm, text="PCFG → CSV", command=lambda: self.export_pcfg("csv")).grid(row=3, column=1, pady=5)
41 | ttk.Button(frm, text="PCFG → YAML", command=lambda: self.export_pcfg("yaml")).grid(row=3, column=2, pady=5)
42 |
43 | self.text_output = tk.Text(frm, wrap="word", height=20)
44 | self.text_output.grid(row=4, column=0, columnspan=3, sticky="nsew", pady=10)
45 |
46 | def log(self, msg):
47 | self.text_output.insert(tk.END, msg + "\n")
48 | self.text_output.see(tk.END)
49 |
50 | def load_dialogs(self):
51 | files = filedialog.askopenfilenames(title="Transkriptdateien auswählen", filetypes=[("Textdateien", "*.txt")])
52 | if files:
53 | self.dialog_files = list(files)
54 | self.log(f"{len(self.dialog_files)} Dateien geladen.")
55 |
56 | def run_processing(self):
57 | if not self.dialog_files:
58 | messagebox.showwarning("Warnung", "Keine Dateien ausgewählt.")
59 | return
60 | self.log("Starte Verarbeitung...")
61 | self.processed_data = process_multiple_dialogs(self.dialog_files)
62 | self.log("Verarbeitung abgeschlossen.")
63 | self.log(f"Kategorien: {set(self.processed_data['terminal_chain'])}")
64 |
65 | def run_simulation(self):
66 | if not self.processed_data:
67 | messagebox.showwarning("Warnung", "Bitte zuerst Transkripte verarbeiten.")
68 | return
69 | chain = simulate_dialog(self.processed_data["pcfg"], length=6)
70 | self.log("Simulierter Dialog:")
71 | self.log(" → ".join(chain))
72 |
73 | def export_pcfg(self, fmt):
74 | if not self.processed_data:
75 | messagebox.showwarning("Warnung", "Bitte zuerst Transkripte verarbeiten.")
76 | return
77 | filepath = filedialog.asksaveasfilename(defaultextension=f".{fmt}", filetypes=[(fmt.upper(), f"*.{fmt}")])
78 | if filepath:
79 | if fmt == "json":
80 | export_pcfg_to_json(self.processed_data["pcfg"], filepath)
81 | elif fmt == "csv":
82 | export_pcfg_to_csv(self.processed_data["pcfg"], filepath)
83 | elif fmt == "yaml":
84 | export_pcfg_to_yaml(self.processed_data["pcfg"], filepath)
85 | self.log(f"PCFG exportiert als {filepath}")
86 |
87 |
88 | if __name__ == "__main__":
89 | root = tk.Tk()
90 | app = ARSGUIApp(root)
91 | root.mainloop()
92 |
93 |
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import os
3 | import json
4 | import yaml
5 | import numpy as np
6 | import hdbscan
7 | import umap
8 | import matplotlib.pyplot as plt
9 | from collections import defaultdict
10 | from sentence_transformers import SentenceTransformer
11 | import openai
12 | import random
13 |
14 | # === Configuration ===
15 | USE_GPT = st.sidebar.checkbox("GPT zur Clusterbenennung verwenden?", value=False)
16 | openai.api_key = st.sidebar.text_input("OpenAI API-Key", type="password")
17 |
18 | @st.cache_data(show_spinner=False)
19 | def embed_utterances(utterances, model_name="all-MiniLM-L6-v2"):
20 | model = SentenceTransformer(model_name)
21 | return model.encode(utterances, show_progress_bar=False)
22 |
23 | def cluster_utterances(embeddings):
24 | clusterer = hdbscan.HDBSCAN(min_cluster_size=5, prediction_data=True)
25 | return clusterer.fit_predict(embeddings)
26 |
27 | def gpt_category(samples):
28 |     prompt = ("Gib eine knappe Kategorienbezeichnung (1–2 Wörter) für folgende Aussagen:\n"
29 |               + "\n".join(f"- {s}" for s in samples[:5]))
30 | try:
31 | response = openai.ChatCompletion.create(
32 | model="gpt-4",
33 | messages=[{"role": "user", "content": prompt}],
34 | temperature=0.3
35 | )
36 | return response["choices"][0]["message"]["content"].strip()
37 | except Exception:
38 | return local_category(samples)
39 |
40 | def local_category(samples):
41 | fallback = ["Frage", "Antwort", "Befehl", "Hinweis", "Ironie", "Zweifel"]
42 | return random.choice(fallback)
43 |
44 | def assign_categories(utterances, labels):
45 | clusters = defaultdict(list)
46 | for u, l in zip(utterances, labels):
47 | clusters[l].append(u)
48 | label_to_name = {}
49 | for l, samples in clusters.items():
50 | label_to_name[l] = gpt_category(samples) if USE_GPT else local_category(samples)
51 | return [label_to_name[l] for l in labels], label_to_name
52 |
53 | def induce_pcfg(sequence):
54 | transitions = defaultdict(lambda: defaultdict(int))
55 | for i in range(len(sequence) - 1):
56 | transitions[sequence[i]][sequence[i + 1]] += 1
57 | return {k: {kk: vv / sum(v.values()) for kk, vv in v.items()} for k, v in transitions.items()}
58 |
59 | def simulate_dialog(pcfg, start=None, maxlen=15):
60 | if not pcfg: return []
61 | if not start:
62 | start = random.choice(list(pcfg.keys()))
63 | result = [start]
64 | for _ in range(maxlen - 1):
65 | if start not in pcfg:
66 | break
67 | next_items = list(pcfg[start].items())
68 | next_tokens, probs = zip(*next_items)
69 | start = random.choices(next_tokens, probs)[0]
70 | result.append(start)
71 | return result
72 |
73 | def render_umap(embeddings, labels):
74 | reducer = umap.UMAP(random_state=42)
75 | reduced = reducer.fit_transform(embeddings)
76 | fig, ax = plt.subplots()
77 | unique_labels = set(labels)
78 | for label in unique_labels:
79 | mask = labels == label
80 | ax.scatter(reduced[mask, 0], reduced[mask, 1], label=f"Cluster {label}", alpha=0.6)
81 | ax.set_title("UMAP + HDBSCAN-Cluster")
82 | ax.legend()
83 | return fig
84 |
85 | def pcfg_to_dot(pcfg):
86 | lines = ["digraph PCFG {"]
87 | for src, dsts in pcfg.items():
88 | for dst, prob in dsts.items():
89 | lines.append(f'"{src}" -> "{dst}" [label="{prob:.2f}"];')
90 | lines.append("}")
91 | return "\n".join(lines)
92 |
93 | st.title("🗣️ Algorithmisch-Rekursive Sequenzanalyse 3.0")
94 |
95 | uploaded_files = st.file_uploader("Lade Transkripte hoch (.txt)", type="txt", accept_multiple_files=True)
96 |
97 | if uploaded_files:
98 | for file in uploaded_files:
99 | st.subheader(f"📄 Datei: {file.name}")
100 | raw_text = file.read().decode("utf-8")
101 | utterances = [line.split(":", 1)[1].strip() for line in raw_text.splitlines() if ":" in line]
102 |
103 | if not utterances:
104 | st.warning("Keine dialogischen Äußerungen gefunden.")
105 | continue
106 |
107 | embeddings = embed_utterances(utterances)
108 | labels = cluster_utterances(embeddings)
109 | categories, label_map = assign_categories(utterances, labels)
110 | pcfg = induce_pcfg(categories)
111 |
112 | st.markdown("### 🔖 Kategorien")
113 | for cluster, name in label_map.items():
114 | st.write(f"**Cluster {cluster}** → {name}")
115 |
116 | st.markdown("### 🧠 PCFG-Simulation")
117 | if st.button(f"🎲 Simuliere Dialog ({file.name})"):
118 | dialog = simulate_dialog(pcfg)
119 | st.write(" → ".join(dialog))
120 |
121 | st.markdown("### 📊 Cluster-Visualisierung")
122 | st.pyplot(render_umap(embeddings, labels))
123 |
124 | st.markdown("### 📥 Export")
125 | col1, col2 = st.columns(2)
126 |
127 | with col1:
128 | st.download_button("📎 PCFG als YAML", yaml.dump(pcfg, allow_unicode=True), file_name=f"{file.name}_pcfg.yaml")
129 | with col2:
130 | st.download_button("📎 PCFG als DOT", pcfg_to_dot(pcfg), file_name=f"{file.name}_pcfg.dot")
131 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Algorithmic Recursive Sequence Analysis 3.0
2 | 
3 | 
4 | 
5 |
6 | This project provides a web-based application for the automatic analysis of dialog transcripts using Sentence-BERT, HDBSCAN clustering, and probabilistic context-free grammars (PCFG).
7 |
8 | ## Features
9 |
10 | * 📂 Upload multiple transcripts
11 | * 🧠 Sentence-BERT for embeddings
12 | * 📊 HDBSCAN for clustering
13 | * 🧾 Categorization via GPT or local methods
14 | * 📈 Visualization via UMAP
15 | * 🔁 PCFG induction from sequences
16 | * 🎲 Dialog simulation
17 | * 📎 Export PCFG as `.yaml`, `.json`, `.csv`, or `.dot`
18 |
19 | ## Files
20 |
21 | * `app.py` – Main file containing the Streamlit GUI (tkinter variants: `ars_gui_app.py`, `ars4_gui_app.py`, `ars6_gui_app.py`)
22 | * `requirements.txt` – Python dependencies
23 | * `README.md` – This documentation
24 |
25 | ## Usage
26 |
27 | 1. Open the web app in your browser
28 | 2. Upload `.txt` transcripts
29 | 3. The app clusters utterances and generates semantic categories
30 | 4. You can export the resulting PCFG or simulate new dialogues
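
Since `app.py` is a Streamlit script, the local web app is typically started with:

```bash
streamlit run app.py
```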
31 |
32 | ---
33 |
34 | # Algorithmic Recursive Sequence Analysis 3.0 (ARS 3.0)
35 |
36 | **Algorithmic Recursive Sequence Analysis (ARS 3.0)** is a modular system for the processing, analysis, and simulation of dialogical transcripts. It enables automatic clustering of semantically similar utterances, the generation of probabilistic context-free grammars (PCFG), and the creation of synthetic dialogues based on these structures.
37 |
38 | ---
39 |
40 | ## 🔧 Project Structure
41 |
42 | ```
43 | ars3/
44 | ├── ars_core.py # Core logic: processing, PCFG export, simulation
45 | ├── app.py # GUI (Streamlit or tkinter)
46 | ├── categories.json # Persistent storage of recognized categories
47 | ├── data/
48 | │ └── test_transcript.txt # Example transcript data for analysis
49 | ├── output/
50 | │ ├── pcfg.json # PCFG exported in JSON format
51 | │ ├── pcfg.csv # PCFG exported in CSV format
52 | │ ├── pcfg.yaml # PCFG exported in YAML format
53 | │ └── cluster_plot.png # Visualization of cluster structure
54 | ├── requirements.txt # Dependencies
55 | ├── setup.py # Installation script
56 | └── README.md # This file
57 | ```
58 |
59 | ---
60 |
61 | ## 🧠 Core Functions
62 |
63 | ### `ars_core.py`
64 |
65 | * **`process_multiple_dialogs(transcript_paths)`**
66 | Loads and processes multiple transcripts, clusters semantically similar statements with HDBSCAN, and builds a PCFG.
67 |
68 | * **`simulate_dialog(pcfg, length=6)`**
69 | Simulates a new dialog based on a given PCFG.
70 |
71 | * **`export_pcfg_to_json(pcfg, filepath)`**
72 | Exports the PCFG to a JSON file.
73 |
74 | * **`export_pcfg_to_csv(pcfg, filepath)`**
75 | Exports the PCFG to a CSV file for tabular analysis.
76 |
77 | * **`export_pcfg_to_yaml(pcfg, filepath)`**
78 | Exports the PCFG in YAML format for further processing or editing.
79 |
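A minimal usage sketch of this API (the file names refer to the transcripts shipped with this repository; the output path is only an example):

```python
from ars_core import process_multiple_dialogs, simulate_dialog, export_pcfg_to_json

# Cluster the utterances of several transcripts and induce a transition PCFG
result = process_multiple_dialogs(["Text1.txt", "Text2.txt", "Text3.txt"])

# Export the grammar and simulate a short dialog sequence
export_pcfg_to_json(result["pcfg"], "pcfg.json")
print(simulate_dialog(result["pcfg"], length=6))
```
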
80 | ---
81 |
82 | ## 🖥️ GUI (`app.py`)
83 |
84 | * Select transcripts for processing
85 | * Start analysis and clustering
86 | * Visualize the cluster structure
87 | * Export PCFG in various formats
88 | * Simulate new dialogues at the press of a button
89 |
90 | The GUI is modular and can be run via Streamlit (web) or tkinter (local desktop interface).
91 |
92 | ---
93 |
94 | ## 📦 Installation
95 |
96 | 1. Clone or unzip the repository
97 |
98 | 2. Install the required dependencies:
99 |
100 | ```bash
101 | pip install -r requirements.txt
102 | ```
103 |
104 | 3. Start the GUI (if `app.py` contains a `main()` function):
105 |
106 | ```bash
107 | python app.py
108 | ```
109 |
110 | Or via a command-line shortcut (if installed as CLI):
111 |
112 | ```bash
113 | ars-gui
114 | ```
115 |
116 | ---
117 |
118 | ## 📈 Export Formats
119 |
120 | * **JSON** – Structured, machine-readable format
121 | * **CSV** – For easy tabular analysis (e.g., in Excel or Pandas)
122 | * **YAML** – For readable configuration and external tool integration
123 | * **DOT** – For graph-based visualization via Graphviz or other tools
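
As an illustration, the transition table in `8dateien.json` corresponds to a DOT graph of the following form (the output of `pcfg_to_dot` in `app.py`, probabilities rounded to two decimals):

```dot
digraph PCFG {
"0" -> "2" [label="0.53"];
"0" -> "0" [label="0.47"];
"2" -> "1" [label="1.00"];
"1" -> "1" [label="0.92"];
"1" -> "0" [label="0.08"];
}
```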
124 |
125 |
126 |
127 | ---
128 |
129 |
130 |
131 |
132 | # Algorithmisch-Rekursive Sequenzanalyse 3.0
133 |
134 | Dieses Projekt bietet eine Webanwendung zur automatischen Analyse von dialogischen Transkripten mithilfe von Sentence-BERT, HDBSCAN-Clustering und probabilistischen kontextfreien Grammatiken (PCFG).
135 |
136 | ## Funktionen
137 |
138 | - 📂 Mehrfacher Transkript-Upload
139 | - 🧠 Sentence-BERT für Embedding
140 | - 📊 HDBSCAN für Clusterbildung
141 | - 🧾 Kategorien mit GPT oder lokal
142 | - 📈 Visualisierung via UMAP
143 | - 🔁 PCFG-Induktion aus Sequenzen
144 | - 🎲 Simulation von Dialogen
145 | - 📎 Export der PCFG als `.yaml` oder `.dot`
146 |
147 | ## Dateien
148 |
149 | - `app.py` – Die Hauptdatei mit der Streamlit-App
150 | - `requirements.txt` – Python-Abhängigkeiten
151 | - `README.md` – Dieses Dokument
152 |
153 |
154 |
155 | ## Nutzung
156 |
157 | 1. Öffne die Web-App im Browser
158 | 2. Lade `.txt`-Transkripte hoch
159 | 3. Die App clustert Äußerungen und generiert Kategorien
160 | 4. Du kannst die resultierende PCFG exportieren oder Dialoge simulieren
161 |
162 |
163 | # Algorithmisch Rekursive Sequenzanalyse 3.0
164 | 
165 | 
166 | 
167 |
168 | **Algorithmisch Rekursive Sequenzanalyse (ARS 3.0)** ist ein modulares System zur Verarbeitung, Analyse und Simulation von dialogischen Transkripten. Es ermöglicht die automatische Clusterung semantisch ähnlicher Aussagen, den Aufbau probabilistischer kontextfreier Grammatiken (PCFG), sowie die Generierung synthetischer Dialoge auf Basis dieser Strukturen.
169 |
170 | ---
171 |
172 | ## 🔧 Projektstruktur
173 |
174 | ```
175 | ars3/
176 | ├── ars_core.py              # Zentrale Logik: Verarbeitung, PCFG-Export, Simulation
177 | ├── app.py                   # GUI (Streamlit oder tkinter)
178 | ├── categories.json          # Persistente Speicherung erkannter Kategorien
179 | ├── data/
180 | │   └── test_transcript.txt  # Beispielhafte Dialogdaten zur Analyse
181 | ├── output/
182 | │   ├── pcfg.json            # Exportierte PCFG im JSON-Format
183 | │   ├── pcfg.csv             # Exportierte PCFG im CSV-Format
184 | │   ├── pcfg.yaml            # Exportierte PCFG im YAML-Format
185 | │   └── cluster_plot.png     # Visualisierung der Clusterstruktur
186 | ├── requirements.txt         # Abhängigkeiten
187 | ├── setup.py                 # Installationsskript
188 | └── README.md                # Diese Datei
189 | ```
193 |
194 | ---
195 |
196 | ## 🧠 Hauptfunktionen
197 |
198 | ### `ars_core.py`
199 |
200 | - **`process_multiple_dialogs(transcript_paths)`**
201 | Lädt mehrere Transkripte, analysiert semantisch ähnliche Aussagen, clustert mit HDBSCAN und erstellt eine PCFG.
202 |
203 | - **`simulate_dialog(pcfg, length=6)`**
204 | Simuliert einen plausiblen neuen Dialog basierend auf einer gegebenen PCFG.
205 |
206 | - **`export_pcfg_to_json(pcfg, filepath)`**
207 | Exportiert die PCFG in eine JSON-Datei.
208 |
209 | - **`export_pcfg_to_csv(pcfg, filepath)`**
210 | Exportiert die PCFG in eine CSV-Datei zur besseren tabellarischen Auswertung.
211 |
212 | - **`export_pcfg_to_yaml(pcfg, filepath)`**
213 | Exportiert die PCFG in das YAML-Format (z. B. für andere Tools oder manuelle Bearbeitung).
214 |
215 | ---
216 |
217 | ## 🖥️ GUI (`app.py`)
218 |
219 | - Wähle Transkripte zur Verarbeitung
220 | - Starte Analyse & Clustering
221 | - Visualisiere Clusterstruktur
222 | - Exportiere PCFG in verschiedenen Formaten
223 | - Simuliere neue Dialoge auf Knopfdruck
224 |
225 | Die GUI ist modular aufgebaut und kann wahlweise in Streamlit (Web) oder tkinter (lokal) betrieben werden.
226 |
227 |
228 | ## 📦 Installation
229 |
230 | 1. Klone oder entpacke das Repository:
231 |
232 | 2. Installiere alle Abhängigkeiten:
233 |
234 | ```bash
235 | pip install -r requirements.txt
236 | ```
237 |
238 | 3. Starte die GUI (wenn `app.py` eine `main()`-Funktion enthält):
239 |
240 | ```bash
241 | python app.py
242 | ```
243 |
244 | Oder über den Konsolenbefehl:
245 |
246 | ```bash
247 | ars-gui
248 | ```
249 |
250 |
251 |
252 | ---
253 |
254 | ## 📈 Exportformate
255 |
256 | * **JSON**: Für strukturierten maschinenlesbaren Export
257 | * **CSV**: Zur einfachen tabellarischen Analyse (z. B. in Excel oder Pandas)
258 | * **YAML**: Für lesbare Konfigurationen und Weiterverarbeitung in externen Tools
259 |
260 |
261 |
262 |
263 |
264 |
--------------------------------------------------------------------------------
/ars4_gui_app.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import numpy as np
4 | from sklearn.cluster import HDBSCAN
5 | from sentence_transformers import SentenceTransformer
6 | from collections import defaultdict
7 | from scipy.stats import pearsonr
8 | import tkinter as tk
9 | from tkinter import filedialog, ttk, messagebox
10 |
11 | # Embedding model
12 | model = SentenceTransformer("all-MiniLM-L6-v2")
13 |
14 | class ARSGUI:
15 | def __init__(self, root):
16 | self.root = root
17 | self.root.title("Auto-PCFG Generator")
18 | self.transcripts = []
19 | self.terminal_symbols = []
20 | self.pcfg = {}
21 |
22 | self.setup_ui()
23 |
24 | def setup_ui(self):
25 | main_frame = ttk.Frame(self.root, padding="10")
26 | main_frame.grid(row=0, column=0, sticky="nsew")
27 |
28 | ttk.Button(main_frame, text="Load Transcripts", command=self.load_transcripts).grid(row=0, column=0, pady=5)
29 | ttk.Button(main_frame, text="Generate Grammar", command=self.generate_grammar).grid(row=0, column=1, pady=5)
30 | ttk.Button(main_frame, text="Optimize Grammar", command=self.optimize_grammar).grid(row=0, column=2, pady=5)
31 | ttk.Button(main_frame, text="Simulate Dialog", command=self.simulate_dialog).grid(row=0, column=3, pady=5)
32 |
33 | self.output_text = tk.Text(main_frame, height=20, width=80)
34 | self.output_text.grid(row=1, column=0, columnspan=4, pady=10)
35 |
36 | ttk.Button(main_frame, text="Export JSON", command=lambda: self.export_grammar("json")).grid(row=2, column=0)
37 | ttk.Button(main_frame, text="Export YAML", command=lambda: self.export_grammar("yaml")).grid(row=2, column=1)
38 |
39 | def log(self, message):
40 | self.output_text.insert(tk.END, message + "\n")
41 | self.output_text.see(tk.END)
42 |
43 | def load_transcripts(self):
44 | files = filedialog.askopenfilenames(filetypes=[("Text files", "*.txt")])
45 | if not files:
46 | return
47 |
48 | self.transcripts = []
49 | for file in files:
50 | with open(file, 'r', encoding='utf-8') as f:
51 | self.transcripts.extend([line.strip() for line in f if line.strip()])
52 |
53 | self.log(f"Loaded {len(self.transcripts)} utterances from {len(files)} files.")
54 |
55 | def generate_grammar(self):
56 | if not self.transcripts:
57 | messagebox.showwarning("Warning", "No transcripts loaded!")
58 | return
59 |
60 |         # Step 1: generate terminal symbols
61 | embeddings = model.encode(self.transcripts)
62 |
63 |         # FIX: gen_min_span_tree parameter removed
64 | clusterer = HDBSCAN(min_cluster_size=3)
65 | clusters = clusterer.fit_predict(embeddings)
66 |
67 |         # Create unique terminal symbols
68 | self.terminal_symbols = [f"T_{c+1}" for c in clusters]
69 | unique_terminals = list(set(self.terminal_symbols))
70 |
71 | self.log(f"Generated {len(unique_terminals)} terminal symbols: {unique_terminals}")
72 |
73 |         # Step 2: derive nonterminals and rules
74 | self.pcfg = self.induce_grammar_rules(self.terminal_symbols)
75 | self.log("\nGenerated PCFG rules:")
76 | for nt, rules in self.pcfg.items():
77 | self.log(f"{nt} → {rules}")
78 |
79 | def induce_grammar_rules(self, terminals, n=3):
80 | rules = defaultdict(dict)
81 |
82 |         # Simple transitions between terminal symbols
83 | for i in range(len(terminals)-1):
84 | src = terminals[i]
85 | dst = terminals[i+1]
86 | rules[src][dst] = rules[src].get(dst, 0) + 1
87 |
88 |         # Nonterminals for frequent n-grams
89 | ngram_counts = defaultdict(int)
90 | for i in range(len(terminals)-n+1):
91 | ngram = " ".join(terminals[i:i+n])
92 | ngram_counts[ngram] += 1
93 |
94 |         # Add nonterminals for frequent n-grams
95 | for ngram, count in ngram_counts.items():
96 | if count > 1:
97 | nt = f"NT_{ngram.replace(' ', '_')}"
98 | rules[nt] = {ngram: 1.0}
99 |
100 |                 # Replace occurrences in the main PCFG
101 | for src in list(rules.keys()):
102 | if ngram in rules[src]:
103 | rules[src][nt] = rules[src].pop(ngram)
104 |
105 |         # Normalize probabilities
106 | for src in rules:
107 | total = sum(rules[src].values())
108 | rules[src] = {dst: cnt/total for dst, cnt in rules[src].items()}
109 |
110 | return dict(rules)
111 |
112 | def optimize_grammar(self, iterations=10):
113 | if not self.pcfg:
114 | messagebox.showwarning("Warning", "Generate grammar first!")
115 | return
116 |
117 | empirical_freq = self.calculate_frequencies([self.terminal_symbols])
118 |
119 | for i in range(iterations):
120 | generated_chains = []
121 | for _ in range(5):
122 | chain = self.simulate_chain(max_length=len(self.terminal_symbols))
123 | generated_chains.append(chain)
124 |
125 | gen_freq = self.calculate_frequencies(generated_chains)
126 | corr, p_value = pearsonr(empirical_freq, gen_freq)
127 |
128 | self.log(f"Iteration {i+1}: Correlation = {corr:.3f}, p = {p_value:.3f}")
129 |
130 | if corr > 0.9:
131 | break
132 |
133 | self.adjust_probabilities(empirical_freq, gen_freq)
134 |
135 | def calculate_frequencies(self, chains):
136 | all_terminals = sorted(set(self.terminal_symbols))
137 | freq = np.zeros(len(all_terminals))
138 | term_to_idx = {t: i for i, t in enumerate(all_terminals)}
139 |
140 | for chain in chains:
141 | for term in chain:
142 | if term in term_to_idx:
143 | freq[term_to_idx[term]] += 1
144 |
145 | return freq / freq.sum() if freq.sum() > 0 else freq
146 |
147 | def adjust_probabilities(self, empirical_freq, gen_freq):
148 | all_terminals = sorted(set(self.terminal_symbols))
149 | adjustment = empirical_freq - gen_freq
150 |
151 | for src in self.pcfg:
152 | new_rules = {}
153 | for dst in self.pcfg[src]:
154 | if dst in all_terminals:
155 | idx = all_terminals.index(dst)
156 | new_prob = max(0.01, min(0.99, self.pcfg[src][dst] + 0.1 * adjustment[idx]))
157 | new_rules[dst] = new_prob
158 | else:
159 | new_rules[dst] = self.pcfg[src][dst]
160 |
161 | total = sum(new_rules.values())
162 | self.pcfg[src] = {k: v/total for k, v in new_rules.items()}
163 |
164 | def simulate_chain(self, max_length=10):
165 | chain = []
166 | current = np.random.choice(list(self.pcfg.keys()))
167 |
168 | for _ in range(max_length):
169 | if current not in self.pcfg:
170 | break
171 |
172 | next_options = list(self.pcfg[current].keys())
173 | probs = list(self.pcfg[current].values())
174 | next_item = np.random.choice(next_options, p=probs)
175 |
176 | if next_item.startswith("NT_"):
177 | expanded = next_item[3:].replace("_", " ").split()
178 | chain.extend(expanded)
179 | current = expanded[-1] if expanded else None
180 | else:
181 | chain.append(next_item)
182 | current = next_item
183 |
184 | return chain
185 |
186 | def simulate_dialog(self):
187 | if not self.pcfg:
188 | messagebox.showwarning("Warning", "Generate grammar first!")
189 | return
190 |
191 | chain = self.simulate_chain()
192 | self.log("\nSimulated dialog sequence:")
193 | self.log(" → ".join(chain))
194 |
195 | def export_grammar(self, format):
196 | if not self.pcfg:
197 | messagebox.showwarning("Warning", "No grammar to export!")
198 | return
199 |
200 | file = filedialog.asksaveasfilename(
201 | defaultextension=f".{format}",
202 | filetypes=[(f"{format.upper()} files", f"*.{format}")]
203 | )
204 |
205 | if not file:
206 | return
207 |
208 | if format == "json":
209 | with open(file, 'w') as f:
210 | json.dump(self.pcfg, f, indent=2)
211 | elif format == "yaml":
212 | import yaml
213 | with open(file, 'w') as f:
214 | yaml.dump(self.pcfg, f)
215 |
216 | self.log(f"Grammar exported to {file}")
217 |
218 | if __name__ == "__main__":
219 | root = tk.Tk()
220 | app = ARSGUI(root)
221 | root.mainloop()
222 |
--------------------------------------------------------------------------------
/ars6_gui_app.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import numpy as np
4 | from sklearn.cluster import HDBSCAN
5 | from sklearn.feature_extraction.text import TfidfVectorizer
6 | from sentence_transformers import SentenceTransformer
7 | from collections import defaultdict
8 | from scipy.stats import pearsonr
9 | import tkinter as tk
10 | from tkinter import filedialog, ttk, messagebox
11 | import networkx as nx
12 | import matplotlib.pyplot as plt
13 | from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
14 | from nltk import ngrams
15 | from langdetect import detect
16 | from transformers import pipeline
17 | import threading
18 | from copy import deepcopy
19 |
20 | class EnhancedDialogAnalyzer:
21 | def __init__(self, root):
22 | self.root = root
23 | self.root.title("LLM-enhanced Dialog Analyzer")
24 |
25 |         # Initialize models
26 | self.embedding_model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
27 | self.llm = pipeline("text2text-generation", model="google/flan-t5-base")
28 |
29 |         # Data structures
30 | self.transcripts = []
31 | self.interacts = []
32 | self.pcfg = {}
33 | self.empirical_chain = []
34 |
35 | # GUI
36 | self.setup_ui()
37 | self.setup_visualization()
38 |
39 | def setup_ui(self):
40 | main_frame = ttk.Frame(self.root, padding="10")
41 | main_frame.grid(row=0, column=0, sticky="nsew")
42 |
43 | ttk.Button(main_frame, text="1. Load & Preprocess",
44 | command=self.load_and_preprocess).grid(row=0, column=0, pady=5)
45 | ttk.Button(main_frame, text="2. Analyze Meanings",
46 | command=self.analyze_meanings).grid(row=1, column=0, pady=5)
47 | ttk.Button(main_frame, text="3. Build Semantic PCFG",
48 | command=self.build_semantic_pcfg).grid(row=2, column=0, pady=5)
49 | ttk.Button(main_frame, text="4. Optimize",
50 | command=self.optimize_grammar).grid(row=3, column=0, pady=5)
51 | ttk.Button(main_frame, text="5. Visualize",
52 | command=self.visualize_grammar).grid(row=4, column=0, pady=5)
53 |
54 | self.output_text = tk.Text(main_frame, height=20, width=80)
55 | self.output_text.grid(row=0, column=1, rowspan=5, padx=10)
56 |
57 | def setup_visualization(self):
58 | self.figure = plt.Figure(figsize=(8, 6), dpi=100)
59 | self.canvas = FigureCanvasTkAgg(self.figure, master=self.root)
60 | self.canvas.get_tk_widget().grid(row=0, column=2, rowspan=5, padx=10)
61 |
62 | def load_and_preprocess(self):
63 | files = filedialog.askopenfilenames(filetypes=[("Text files", "*.txt")])
64 | if not files:
65 | return
66 |
67 | self.transcripts = []
68 | for file in files:
69 | with open(file, 'r', encoding='utf-8') as f:
70 | self.transcripts.extend([line.strip() for line in f if line.strip()])
71 |
72 | self.log(f"Loaded {len(self.transcripts)} utterances")
73 | threading.Thread(target=self._preprocess).start()
74 |
75 | def _preprocess(self):
76 | self.root.after(0, lambda: self.log("Detecting languages..."))
77 | languages = set()
78 | for utterance in self.transcripts:
79 | try:
80 | lang = detect(utterance)
81 | languages.add(lang)
82 | except:
83 | pass
84 | self.root.after(0, lambda: self.log(f"Detected languages: {', '.join(languages)}"))
85 |
86 | self.root.after(0, lambda: self.log("Creating embeddings..."))
87 | self.embeddings = self.embedding_model.encode(self.transcripts)
88 |
89 | def analyze_meanings(self):
90 | if not self.transcripts:
91 | messagebox.showwarning("Warning", "Load transcripts first!")
92 | return
93 |
94 | self.interacts = []
95 | for i, utterance in enumerate(self.transcripts):
96 |             # Try manual (rule-based) classification first
97 | manual_meaning = self._preprocess_utterance(utterance)
98 | if manual_meaning:
99 | self.interacts.append({
100 | "utterance": utterance,
101 | "meanings": [manual_meaning],
102 | "selected_meaning": manual_meaning
103 | })
104 | continue
105 |
106 | context = " | ".join([u["utterance"] for u in self.interacts[-3:]]) if i > 0 else ""
107 | meanings = self._generate_meanings(utterance, context)
108 | filtered_meanings = self._filter_meanings(meanings, i)
109 |
110 | self.interacts.append({
111 | "utterance": utterance,
112 | "meanings": meanings,
113 | "selected_meaning": filtered_meanings[0] if filtered_meanings else "UNK"
114 | })
115 |
116 | self.log(f"Utterance {i+1}: {utterance[:50]}...")
117 | self.log(f" Selected meaning: {filtered_meanings[0] if filtered_meanings else 'UNK'}")
118 |
119 | def _preprocess_utterance(self, utterance):
120 |         # Truncate and normalize the utterance
121 | utterance = utterance[:100].lower().replace('\n', ' ')
122 |
123 |         # Extended keyword-based pattern recognition
124 | patterns = {
125 | 'order': ['bitte', 'nehme', 'hätte', 'kaufen', 'gramm', 'kilo'],
126 | 'question': ['?', 'wie', 'was', 'wo '],
127 | 'confirmation': ['ja', 'okay', 'genau', 'richtig'],
128 | 'greeting': ['guten tag', 'hallo', 'guten morgen'],
129 | 'thanks': ['danke', 'vielen dank', 'dankeschön']
130 | }
131 |
132 | for category, keywords in patterns.items():
133 | if any(kw in utterance for kw in keywords):
134 | return f"{category.upper()}"
135 |
136 | return None
137 |
138 | def _generate_meanings(self, utterance, context):
139 | manual_meaning = self._preprocess_utterance(utterance)
140 | if manual_meaning:
141 | return [manual_meaning]
142 |
143 | prompt = f"""Generate a SINGLE, concise interpretation for this dialog utterance in German:
144 | Context: '{context}'
145 | Utterance: '{utterance}'
146 | Interpretation: The speaker"""
147 | try:
148 | output = self.llm(prompt, max_length=50, num_return_sequences=1)
149 | return [output[0]["generated_text"].strip()]
150 | except Exception as e:
151 | return ["UNK"]
152 |
153 | def _filter_meanings(self, meanings, index):
154 | if index == 0 or not meanings:
155 | return meanings
156 |         return [meanings[0]]  # always accept the first meaning
157 |
158 | def build_semantic_pcfg(self):
159 | if not self.interacts:
160 | messagebox.showwarning("Warning", "Analyze meanings first!")
161 | return
162 |
163 | meaning_embeddings = self.embedding_model.encode(
164 | [i["selected_meaning"] for i in self.interacts]
165 | )
166 |
167 | clusterer = HDBSCAN(min_cluster_size=3, metric='cosine')
168 | clusters = clusterer.fit_predict(meaning_embeddings)
169 |
170 | terminal_symbols = []
171 | for i, cluster_id in enumerate(clusters):
172 | if cluster_id == -1:
173 | terminal_symbols.append(f"T_{i}")
174 | else:
175 | terminal_symbols.append(f"C_{cluster_id}")
176 |
177 | self.pcfg = defaultdict(dict)
178 | self.empirical_chain = terminal_symbols
179 |
180 |         # Stronger weighting of frequent transitions via exponential weighting
181 | for i in range(len(terminal_symbols)-1):
182 | src = terminal_symbols[i]
183 | dst = terminal_symbols[i+1]
184 | src_nt = f"NT_{src}"
185 |
186 |             weight = np.exp(-0.1 * i)  # exponential weighting for nearby transitions
187 | self.pcfg[src][src_nt] = 1.0
188 | self.pcfg[src_nt][dst] = self.pcfg[src_nt].get(dst, 0) + weight
189 |
190 |         # Normalize the probabilities
191 | for src in self.pcfg:
192 | total = sum(self.pcfg[src].values())
193 | self.pcfg[src] = {dst: count/total for dst, count in self.pcfg[src].items()}
194 |
195 | self.log("\nGenerated Semantic PCFG:")
196 | for src in list(self.pcfg.keys())[:5]:
197 | for dst, prob in list(self.pcfg[src].items())[:3]:
198 | self.log(f" {src.ljust(10)} → {dst.ljust(15)} [{prob:.2f}]")
199 |
200 | def _calculate_frequencies(self, chains):
201 | freq_dict = defaultdict(lambda: defaultdict(int))
202 |
203 | for chain in chains:
204 | for i in range(len(chain)-1):
205 | current = chain[i]
206 | next_symbol = chain[i+1]
207 | freq_dict[current][next_symbol] += 1
208 |
209 | all_transitions = []
210 | for src in freq_dict:
211 | for dst in freq_dict[src]:
212 | all_transitions.append(freq_dict[src][dst])
213 |
214 | return np.array(all_transitions)
215 |
216 | def _simulate_chain(self, max_length):
217 | if not self.pcfg:
218 | return []
219 |
220 | chain = []
221 | current = np.random.choice(list(self.pcfg.keys()))
222 | chain.append(current)
223 |
224 | while len(chain) < max_length:
225 | if current not in self.pcfg:
226 | break
227 |
228 | next_symbols = list(self.pcfg[current].keys())
229 | probs = list(self.pcfg[current].values())
230 | next_symbol = np.random.choice(next_symbols, p=probs)
231 |
232 | chain.append(next_symbol)
233 | current = next_symbol
234 |
235 | return chain
236 |
237 | def _adjust_probabilities(self, empirical, generated):
238 |         # Dynamic learning rate based on the size of the data set
239 | adjustment_factor = max(0.01, 0.2 * (1 - np.exp(-len(self.empirical_chain)/100)))
240 |
241 | temp_freq = defaultdict(lambda: defaultdict(float))
242 |
243 | for i in range(len(self.empirical_chain)-1):
244 | src = self.empirical_chain[i]
245 | dst = self.empirical_chain[i+1]
246 | temp_freq[src][dst] += 1
247 |
248 | for src in self.pcfg:
249 | for dst in self.pcfg[src]:
250 | base_prob = temp_freq[src].get(dst, 0)
251 | smoothed_prob = (base_prob + 0.1) / (sum(temp_freq[src].values()) + 0.1 * len(self.pcfg[src]))
252 | self.pcfg[src][dst] = (1 - adjustment_factor) * self.pcfg[src][dst] + \
253 | adjustment_factor * smoothed_prob
254 |
255 | for src in self.pcfg:
256 | total = sum(self.pcfg[src].values())
257 | if total > 0:
258 | for dst in self.pcfg[src]:
259 | self.pcfg[src][dst] /= total
260 |
261 | def optimize_grammar(self, iterations=20):
262 | if not self.pcfg:
263 | messagebox.showwarning("Warning", "Build PCFG first!")
264 | return
265 |
266 | empirical_freq = self._calculate_frequencies([self.empirical_chain])
267 | best_corr = -1
268 | best_pcfg = deepcopy(self.pcfg)
269 |
270 | for i in range(iterations):
271 | generated_chains = [
272 | self._simulate_chain(max_length=len(self.empirical_chain))
273 | for _ in range(5)
274 | ]
275 |
276 | gen_freq = self._calculate_frequencies(generated_chains)
277 |
278 | min_length = min(len(empirical_freq), len(gen_freq))
279 | empirical = empirical_freq[:min_length]
280 | generated = gen_freq[:min_length]
281 |
282 | try:
283 | if len(empirical) > 1 and len(generated) > 1:
284 | corr, p_value = pearsonr(empirical, generated)
285 | self.log(f"Iteration {i+1}: r = {corr:.3f}, p = {p_value:.3f}")
286 |
287 |                     if abs(corr) > 0.3 and p_value < 0.1:  # only apply significant adjustments
288 | if corr > best_corr:
289 | best_corr = corr
290 | best_pcfg = deepcopy(self.pcfg)
291 |
292 | self._adjust_probabilities(empirical_freq, gen_freq)
293 |
294 | if corr > 0.9:
295 | break
296 | else:
297 | self.log(f"Iteration {i+1}: Not enough data for correlation")
298 | break
299 |
300 | except Exception as e:
301 | self.log(f"Error in iteration {i+1}: {str(e)}")
302 | break
303 |
304 | self.pcfg = best_pcfg # Restore best version
305 | self.log(f"Optimization finished. Best r = {best_corr:.3f}")
306 | self.evaluate_grammar()
307 |
308 | def evaluate_grammar(self):
309 | # Berechne Konsistenz der generierten Dialoge
310 | test_chains = [self._simulate_chain(10) for _ in range(10)]
311 | coherence_scores = [self._calculate_coherence(chain) for chain in test_chains]
312 | self.log(f"Average coherence: {np.mean(coherence_scores):.2f}")
313 |
314 | def _calculate_coherence(self, chain):
315 |         # Simple coherence metric: share of valid transitions
316 | valid_transitions = 0
317 | for i in range(len(chain)-1):
318 | if chain[i] in self.pcfg and chain[i+1] in self.pcfg[chain[i]]:
319 | valid_transitions += 1
320 | return valid_transitions / max(1, len(chain)-1)
321 |
322 | def visualize_grammar(self):
323 | if not self.pcfg:
324 | messagebox.showwarning("Warning", "Build PCFG first!")
325 | return
326 |
327 | self.figure.clf()
328 | G = nx.DiGraph()
329 |
330 | for src, transitions in self.pcfg.items():
331 | for dst, prob in transitions.items():
332 | G.add_edge(src, dst, weight=prob)
333 |
334 | pos = nx.spring_layout(G, k=0.8, iterations=50, seed=42)
335 |
336 | node_size = 1200
337 | font_size = 8
338 | edge_width_scale = 2.5
339 |
340 | ax = self.figure.add_subplot(111)
341 |
342 | nx.draw_networkx_nodes(
343 | G, pos,
344 | node_size=node_size,
345 | node_color='skyblue',
346 | alpha=0.9,
347 | ax=ax
348 | )
349 |
350 | edges = nx.draw_networkx_edges(
351 | G, pos,
352 | width=[d['weight']*edge_width_scale for (_, _, d) in G.edges(data=True)],
353 | edge_color='gray',
354 | alpha=0.7,
355 | arrowstyle='->',
356 | arrowsize=15,
357 | ax=ax
358 | )
359 |
360 | nx.draw_networkx_labels(
361 | G, pos,
362 | font_size=font_size,
363 | font_family='sans-serif',
364 | ax=ax
365 | )
366 |
367 | edge_labels = {
368 | (u, v): f"{d['weight']:.2f}"
369 | for u, v, d in G.edges(data=True)
370 | }
371 | nx.draw_networkx_edge_labels(
372 | G, pos,
373 | edge_labels=edge_labels,
374 | font_size=font_size-1,
375 | label_pos=0.5,
376 | ax=ax
377 | )
378 |
379 | ax.set_title("Probabilistic Context-Free Grammar (PCFG)")
380 | ax.axis('off')
381 | plt.tight_layout()
382 | self.canvas.draw()
383 |
384 | def log(self, message):
385 | self.output_text.insert(tk.END, message + "\n")
386 | self.output_text.see(tk.END)
387 |
388 | if __name__ == "__main__":
389 | root = tk.Tk()
390 | app = EnhancedDialogAnalyzer(root)
391 | root.mainloop()
392 |
--------------------------------------------------------------------------------