├── src
    └── python
    │   └── codequestion
    │       ├── __init__.py
    │       ├── etl
    │           ├── __init__.py
    │           └── stackexchange
    │           │   ├── __init__.py
    │           │   ├── decompress.py
    │           │   ├── sift.py
    │           │   ├── execute.py
    │           │   ├── xml2db.py
    │           │   └── db2qa.py
    │       ├── path.py
    │       ├── topics.py
    │       ├── tokenizer.py
    │       ├── models.py
    │       ├── download.py
    │       ├── console.py
    │       ├── index.py
    │       ├── vectors.py
    │       ├── search.py
    │       └── evaluate.py
├── demo.gif
├── logo.png
├── images
    ├── topics.gif
    ├── vscode.png
    ├── architecture.png
    ├── architecture-dark.png
    └── architecture.excalidraw
├── .gitignore
├── config
    ├── index.v1.yml
    └── index.yml
├── .coveragerc
├── test
    ├── python
    │   ├── utils.py
    │   ├── testdownload.py
    │   ├── testconsole.py
    │   └── testindex.py
    └── stackexchange
    │   └── query.txt
├── .pre-commit-config.yaml
├── .pylintrc
├── Makefile
├── .github
    └── workflows
    │   └── build.yml
├── setup.py
├── LICENSE
└── README.md


/src/python/codequestion/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/python/codequestion/etl/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neuml/codequestion/HEAD/demo.gif


--------------------------------------------------------------------------------
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neuml/codequestion/HEAD/logo.png


--------------------------------------------------------------------------------
/images/topics.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neuml/codequestion/HEAD/images/topics.gif


--------------------------------------------------------------------------------
/images/vscode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neuml/codequestion/HEAD/images/vscode.png


--------------------------------------------------------------------------------
/images/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neuml/codequestion/HEAD/images/architecture.png


--------------------------------------------------------------------------------
/images/architecture-dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neuml/codequestion/HEAD/images/architecture-dark.png


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | dist/
3 | htmlcov/
4 | *egg-info/
5 | __pycache__/
6 | .coverage
7 | .coverage.*
8 | *.pyc
9 | 


--------------------------------------------------------------------------------
/config/index.v1.yml:
--------------------------------------------------------------------------------
 1 | path: stackexchange-300d.magnitude
 2 | content: True
 3 | objects: True
 4 | quantize: True
 5 | storevectors: True
 6 | scoring:
 7 |   method: bm25
 8 |   k1: 0.1
 9 | pca: 3
10 | 


--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
 1 | [run]
 2 | source = src/python
 3 | concurrency = multiprocessing,thread
 4 | disable_warnings = no-data-collected
 5 | 
 6 | [combine]
 7 | disable_warnings = no-data-collected
 8 | 
 9 | [report]
10 | exclude_lines =
11 |     if __name__ == .__main__.:
12 | 


--------------------------------------------------------------------------------
/src/python/codequestion/etl/stackexchange/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Stack Exchange imports
 3 | """
 4 | 
 5 | from .db2qa import DB2QA
 6 | from .decompress import Decompress
 7 | from .execute import Execute
 8 | from .sift import Sift
 9 | from .xml2db import XML2DB
10 | 


--------------------------------------------------------------------------------
/test/python/utils.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Utils module
 3 | """
 4 | 
 5 | 
 6 | class Utils:
 7 |     """
 8 |     Utility constants and methods
 9 |     """
10 | 
11 |     PATH = "/tmp/codequestion"
12 |     STACKEXCHANGE = PATH + "/stackexchange"
13 |     QUESTIONS = STACKEXCHANGE + "/questions.db"
14 |     TESTS = PATH + "/test"
15 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 |   - repo: https://github.com/pycqa/pylint
 3 |     rev: v2.12.1
 4 |     hooks:
 5 |     - id: pylint
 6 |       args:
 7 |       - -d import-error
 8 |       - -d duplicate-code
 9 |   - repo: https://github.com/ambv/black
10 |     rev: 22.3.0
11 |     hooks:
12 |     - id: black
13 |       language_version: python3
14 | 


--------------------------------------------------------------------------------
/config/index.yml:
--------------------------------------------------------------------------------
 1 | path: sentence-transformers/all-MiniLM-L6-v2
 2 | content: True
 3 | objects: True
 4 | quantize: True
 5 | functions:
 6 |   - name: graph
 7 |     function: graph.attribute
 8 | expressions:
 9 |   - name: topic
10 |     expression: graph(indexid, 'topic')
11 |   - name: topicrank
12 |     expression: graph(indexid, 'topicrank')
13 | graph:
14 |   limit: 100
15 |   minscore: 0.2
16 |   topics:
17 |     stopwords: 
18 |       - stackoverflow
19 | 


--------------------------------------------------------------------------------
/.pylintrc:
--------------------------------------------------------------------------------
 1 | [BASIC]
 2 | module-rgx=[a-z_][a-zA-Z0-9_]{2,30}$
 3 | method-rgx=[a-z_][a-zA-Z0-9_]{2,30}$
 4 | function-rgx=[a-z_][a-zA-Z0-9_]{2,30}$
 5 | argument-rgx=[a-z_][a-zA-Z0-9_]{0,30}$
 6 | variable-rgx=[a-z_][a-zA-Z0-9_]{0,30}$
 7 | attr-rgx=[a-z_][a-zA-Z0-9_]{0,30}$
 8 | 
 9 | [DESIGN]
10 | max-args=10
11 | max-locals=40
12 | max-returns=10
13 | max-attributes=20
14 | min-public-methods=0
15 | 
16 | [FORMAT]
17 | max-line-length=150
18 | 
19 | [MESSAGES CONTROL]
20 | disable=R0201,W0621
21 | 


--------------------------------------------------------------------------------
/test/python/testdownload.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Download module tests
 3 | """
 4 | 
 5 | import os
 6 | import unittest
 7 | 
 8 | from codequestion.download import Download
 9 | 
10 | # pylint: disable=C0411
11 | from utils import Utils
12 | 
13 | 
class TestDownload(unittest.TestCase):
    """
    Tests for the Download action.
    """

    def testDownload(self):
        """
        Downloads the project source archive and verifies it was extracted.
        """

        url = "https://github.com/neuml/codequestion/archive/refs/heads/master.zip"

        fetch = Download()
        fetch(url, Utils.PATH)

        # The extracted archive is expected to contain the project's setup.py
        extracted = Utils.PATH + "/codequestion-master/setup.py"
        self.assertTrue(os.path.exists(extracted))
32 | 


--------------------------------------------------------------------------------
/src/python/codequestion/path.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Path module
 3 | """
 4 | 
 5 | from rich.console import Console
 6 | 
 7 | 
 8 | class Path:
 9 |     """
10 |     Traverse semantic graphs.
11 |     """
12 | 
13 |     def __init__(self, embeddings):
14 |         """
15 |         Creates a new path action.
16 | 
17 |         Args:
18 |             embeddings: embeddings instance
19 |         """
20 | 
21 |         self.embeddings = embeddings
22 |         self.graph = embeddings.graph
23 | 
24 |     def __call__(self, start, end):
25 |         """
26 |         Runs a path action.
27 | 
28 |         Args:
29 |             start: start node id
30 |             end: end node id
31 |         """
32 | 
33 |         console = Console()
34 | 
35 |         path = self.graph.showpath(start, end)
36 |         for x, uid in enumerate(path):
37 |             query = f"select question from txtai where id = '{uid}'"
38 |             question = self.embeddings.search(query, 1)[0]["question"]
39 |             console.print(f"{x + 1}. {question} ({uid})")
40 | 


--------------------------------------------------------------------------------
/src/python/codequestion/etl/stackexchange/decompress.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Decompress module
 3 | """
 4 | 
 5 | import shlex
 6 | import shutil
 7 | import subprocess
 8 | 
 9 | 
class Decompress:
    """
    Runs a 7zip extract command via an external process.
    """

    def __call__(self, path):
        """
        Runs the 7za extraction. Extracts Posts.xml from the 7z files in path and echoes the
        process output to stdout.

        Args:
            path: input directory path with 7z files
        """

        # Check for 7za, default to 7z
        binary = "7za" if shutil.which("7za") else "7z"

        # Normalize path separators
        path = path.replace("\\", "/")

        # Build the command as an argument list. Passing a list (instead of splitting a command string)
        # keeps paths containing spaces or quotes intact as single arguments.
        args = [binary, "e", f"{path}/*.7z", "Posts.xml", "-y", f"-o{path}"]
        print(" ".join(args))

        # Start command and stream output until the process closes stdout
        with subprocess.Popen(
            args, stdout=subprocess.PIPE, universal_newlines=True
        ) as process:
            for output in process.stdout:
                print(output.strip())

            # Wait for process to complete
            process.wait()
44 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Project utility scripts
 2 | .PHONY: test
 3 | 
 4 | # Setup environment
 5 | export SRC_DIR := ./src/python
 6 | export TEST_DIR := ./test/python
 7 | export PYTHONPATH := ${SRC_DIR}:${TEST_DIR}:${PYTHONPATH}
 8 | export PATH := ${TEST_DIR}:${PATH}
 9 | export PYTHONWARNINGS := ignore
10 | export TOKENIZERS_PARALLELISM := False
11 | 
12 | # Default python executable if not provided
13 | PYTHON ?= python
14 | 
15 | # Download test data
16 | data: 
17 | 	mkdir -p /tmp/codequestion
18 | 	wget -N https://archive.org/download/stackexchange_20220606/ai.stackexchange.com.7z -P /tmp/codequestion/stackexchange/ai
19 | 	wget -N https://raw.githubusercontent.com/neuml/codequestion/master/config/index.v1.yml -P /tmp/codequestion/
20 | 	wget -N https://raw.githubusercontent.com/neuml/codequestion/master/config/index.yml -P /tmp/codequestion/
21 | 
22 | 	wget https://raw.githubusercontent.com/neuml/codequestion/master/test/stackexchange/query.txt -P /tmp/codequestion/test/stackexchange
23 | 	wget http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz -P /tmp/codequestion
24 | 	tar -C /tmp/codequestion/test -xvzf /tmp/codequestion/Stsbenchmark.tar.gz
25 | 
26 | # Unit tests
27 | test:
28 | 	${PYTHON} -m unittest discover -v -s ${TEST_DIR}
29 | 
30 | # Run tests while calculating code coverage
31 | coverage:
32 | 	coverage run -m unittest discover -v -s ${TEST_DIR}
33 | 	coverage combine
34 | 


--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
 1 | # GitHub Actions build workflow
 2 | name: build
 3 | 
 4 | on: ["push", "pull_request"]
 5 | 
 6 | jobs:
 7 |   build:
 8 |     runs-on: ${{ matrix.os }}
 9 |     strategy:
10 |       matrix:
11 |         os: [ubuntu-latest, macos-latest, windows-latest]
12 | 
13 |     timeout-minutes: 60
14 |     steps:
15 |       - name: Checkout code
16 |         uses: actions/checkout@v3
17 | 
18 |       - name: Install Python - Linux
19 |         uses: actions/setup-python@v4
20 |         with:
21 |           python-version: 3.8
22 |         if: matrix.os == 'ubuntu-latest'
23 | 
24 |       - name: Install Python - macOS/Windows
25 |         uses: actions/setup-python@v4
26 |         with:
27 |           python-version: 3.9
28 |         if: matrix.os != 'ubuntu-latest'
29 | 
30 |       - name: Install dependencies - macOS
31 |         run: |
32 |           echo "OMP_NUM_THREADS=1" >> $GITHUB_ENV
33 |         if: matrix.os == 'macos-latest'
34 | 
35 |       - name: Install dependencies - Windows
36 |         run: choco install wget
37 |         if: matrix.os == 'windows-latest'
38 | 
39 |       - name: Build
40 |         run: |
41 |           pip install -U pip
42 |           pip install -U wheel coverage coveralls
43 |           pip install . txtai[similarity]
44 |           python --version
45 |           make data coverage
46 | 
47 |       - uses: pre-commit/action@v3.0.0
48 |         if: matrix.os == 'ubuntu-latest'
49 | 
50 |       - name: Test Coverage
51 |         run: coveralls --service=github
52 |         if: matrix.os == 'ubuntu-latest'
53 |         env:
54 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
55 | 


--------------------------------------------------------------------------------
/src/python/codequestion/topics.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Topics module
 3 | """
 4 | 
 5 | from rich.console import Console
 6 | 
 7 | from txtai.embeddings import Embeddings
 8 | 
 9 | 
class Topics:
    """
    Query topic models.
    """

    def __init__(self, embeddings):
        """
        Creates a new topics action.

        Args:
            embeddings: embeddings instance, topics are read from embeddings.graph.topics
        """

        self.embeddings = embeddings
        self.topics = embeddings.graph.topics

        # Build on-the-fly topics index, topic position is used as the index id
        self.topicembed = Embeddings({"path": "sentence-transformers/all-MiniLM-L6-v2"})
        self.topicembed.index((x, topic, None) for x, topic in enumerate(self.topics))

    def __call__(self, query=None):
        """
        Runs a topics action. Prints each selected topic followed by an example question.

        Args:
            query: optional query to filter topics, otherwise top topics are shown
        """

        console = Console()

        topics = list(self.topics.keys())
        if query:
            results = self.topicembed.search(query, 10)
        else:
            # Show up to the first 10 topics, bounded by the number of topics available
            results = [(x, 1.0) for x in range(min(10, len(topics)))]

        for uid, score in results:
            # Skip low scoring topic matches
            if score >= 0.1:
                topic = topics[uid]
                console.print(f"[bright_green]{topic}[/bright_green]")

                # Print example question
                sql = f"select id, question from txtai where similar('{topic}')"
                result = self.embeddings.search(sql, 1)[0]
                console.print(f"{result['question']} ({result['id']})\n")
55 | 


--------------------------------------------------------------------------------
/src/python/codequestion/tokenizer.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Tokenizer module
 3 | """
 4 | 
 5 | import re
 6 | import string
 7 | 
 8 | 
 9 | class Tokenizer:
10 |     """
11 |     Text tokenization methods
12 |     """
13 | 
14 |     # Use standard python punctuation chars but allow tokens to end in # (to allow c#, f#) and + to allow (c++ g++)
15 |     PUNCTUATION = string.punctuation.replace("#", "").replace("+", "")
16 | 
17 |     # fmt: off
18 |     # English Stop Word List (Standard stop words used by Apache Lucene)
19 |     STOP_WORDS = {"a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it",
20 |                   "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these",
21 |                   "they", "this", "to", "was", "will", "with"}
22 |     # fmt: on
23 | 
24 |     @staticmethod
25 |     def tokenize(text):
26 |         """
27 |         Tokenizes input text into a list of tokens. Filters tokens that match a specific pattern and removes stop words.
28 | 
29 |         Args:
30 |             text: input text
31 | 
32 |         Returns:
33 |             list of tokens
34 |         """
35 | 
36 |         # Convert to all lowercase, split on whitespace, strip punctuation
37 |         tokens = [token.strip(Tokenizer.PUNCTUATION) for token in text.lower().split()]
38 | 
39 |         # Filter tokens that are numbers or a valid string at least 2 characters long. Remove stop words.
40 |         # Assume tokens already are uncased (all lowercase)
41 |         return [
42 |             token
43 |             for token in tokens
44 |             if (re.match(r"^[#*+\-.0-9:@_a-z]{2,}$", token) or token.isdigit())
45 |             and token not in Tokenizer.STOP_WORDS
46 |         ]
47 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # pylint: disable = C0111
 2 | from setuptools import find_packages, setup
 3 | 
 4 | with open("README.md", "r", encoding="utf-8") as f:
 5 |     # Remove GitHub dark mode images
 6 |     DESCRIPTION = "".join([line for line in f if "gh-dark-mode-only" not in line])
 7 | 
 8 | setup(
 9 |     name="codequestion",
10 |     version="2.2.0",
11 |     author="NeuML",
12 |     description="Ask coding questions directly from the terminal",
13 |     long_description=DESCRIPTION,
14 |     long_description_content_type="text/markdown",
15 |     url="https://github.com/neuml/codequestion",
16 |     project_urls={
17 |         "Documentation": "https://github.com/neuml/codequestion",
18 |         "Issue Tracker": "https://github.com/neuml/codequestion/issues",
19 |         "Source Code": "https://github.com/neuml/codequestion",
20 |     },
21 |     license="Apache 2.0: http://www.apache.org/licenses/LICENSE-2.0",
22 |     packages=find_packages(where="src/python"),
23 |     package_dir={"": "src/python"},
24 |     keywords="search embedding machine-learning nlp",
25 |     python_requires=">=3.8",
26 |     entry_points={
27 |         "console_scripts": [
28 |             "codequestion = codequestion.console:main",
29 |         ],
30 |     },
31 |     install_requires=[
32 |         "html2markdown>=0.1.7",
33 |         "rich>=12.0.1",
34 |         "scipy>=1.4.1",
35 |         "tqdm>=4.48.0",
36 |         "txtai[graph]>=6.0.0",
37 |     ],
38 |     classifiers=[
39 |         "License :: OSI Approved :: Apache Software License",
40 |         "Operating System :: OS Independent",
41 |         "Programming Language :: Python :: 3",
42 |         "Topic :: Scientific/Engineering :: Artificial Intelligence",
43 |         "Topic :: Software Development",
44 |         "Topic :: Text Processing :: Indexing",
45 |         "Topic :: Utilities",
46 |     ],
47 | )
48 | 


--------------------------------------------------------------------------------
/src/python/codequestion/models.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Models module
 3 | """
 4 | 
 5 | import os
 6 | import os.path
 7 | 
 8 | 
 9 | class Models:
10 |     """
11 |     Common methods for generating data paths.
12 |     """
13 | 
14 |     @staticmethod
15 |     def basePath(create=False):
16 |         """
17 |         Base data path.
18 | 
19 |         Args:
20 |             create: if directory should be created
21 | 
22 |         Returns:
23 |             path
24 |         """
25 | 
26 |         # Derive base path
27 |         path = os.environ.get("CODEQUESTION_HOME")
28 | 
29 |         # Default model base path when environment variable is empty
30 |         path = path if path else os.path.join(os.path.expanduser("~"), ".codequestion")
31 | 
32 |         # Create directory if required
33 |         if create:
34 |             os.makedirs(path, exist_ok=True)
35 | 
36 |         return path
37 | 
38 |     @staticmethod
39 |     def modelPath(name, create=False):
40 |         """
41 |         Model path for name
42 | 
43 |         Args:
44 |             name: model name
45 |             create: if directory should be created
46 | 
47 |         Returns:
48 |             path
49 |         """
50 | 
51 |         path = os.path.join(Models.basePath(), "models", name)
52 | 
53 |         # Create directory if required
54 |         if create:
55 |             os.makedirs(path, exist_ok=True)
56 | 
57 |         return path
58 | 
59 |     @staticmethod
60 |     def vectorPath(name, create=False):
61 |         """
62 |         Vector path for name
63 | 
64 |         Args:
65 |             name: vectors name
66 |             create: if directory should be created
67 | 
68 |         Returns:
69 |             path
70 |         """
71 | 
72 |         path = os.path.join(Models.basePath(), "vectors")
73 | 
74 |         # Create directory path if required
75 |         if create:
76 |             os.makedirs(path, exist_ok=True)
77 | 
78 |         # Append file name to path
79 |         return os.path.join(path, name)
80 | 


--------------------------------------------------------------------------------
/src/python/codequestion/download.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Download module
 3 | """
 4 | 
 5 | import os.path
 6 | import tempfile
 7 | import zipfile
 8 | 
 9 | from urllib.request import urlopen
10 | 
11 | from tqdm import tqdm
12 | 
13 | from .models import Models
14 | 
15 | 
class Download:
    """
    Downloads and extracts a pre-trained model archive.
    """

    def __call__(self, url, path=None):
        """
        Downloads a zip archive from url into the system temp directory and extracts it.

        Args:
            url: url model path
            path: optional output directory, defaults to the base models path (created if needed)
        """

        # Default to base models path
        if not path:
            path = Models.basePath(True)

        # Archive is staged in the temp directory using the last url path component as the file name
        dest = os.path.join(tempfile.gettempdir(), os.path.basename(url))

        print(f"Downloading model from {url} to {dest}")
        self.download(url, dest)

        print(f"Decompressing model to {path}")

        # Only extract when the archive exists
        if os.path.exists(dest):
            with zipfile.ZipFile(dest, "r") as archive:
                archive.extractall(path)

        print("Download complete")

    def download(self, url, dest):
        """
        Streams a remote file to dest in 16 KB chunks while updating a progress bar.

        Args:
            url: remote url
            dest: destination file path
        """

        with urlopen(url) as response:
            headers = response.info()

            # Progress bar total, -1 when the response has no Content-Length header
            if "Content-Length" in headers:
                size = int(headers["Content-Length"])
            else:
                size = -1

            with tqdm(total=size, unit="B", unit_scale=True, unit_divisor=1024) as pbar:
                with open(dest, "wb") as output:
                    # Read until an empty chunk signals end of stream
                    for chunk in iter(lambda: response.read(16 * 1024), b""):
                        output.write(chunk)
                        pbar.update(len(chunk))
70 | 
71 | 
if __name__ == "__main__":
    # Download the model release archive using the default output path
    MODEL = "https://github.com/neuml/codequestion/releases/download/v2.0.0/cqmodel.zip"
    Download()(MODEL)
77 | 


--------------------------------------------------------------------------------
/src/python/codequestion/etl/stackexchange/sift.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Sift module
 3 | """
 4 | 
 5 | import re
 6 | 
 7 | 
 8 | class Sift:
 9 |     """
10 |     Filters a raw posts.xml file for matching results. Uses raw text processing to avoid overhead of parsing xml.
11 |     """
12 | 
13 |     def __call__(self, infile, outfile):
14 |         """
15 |         Processes a raw Posts.xml file. The Posts dump is in Id order ascending.
16 | 
17 |         Args:
18 |             infile: path to input file
19 |             outfile: path to output file
20 |         """
21 | 
22 |         print(f"Converting {infile} to {outfile}")
23 | 
24 |         # Set of answer ids
25 |         ids = set()
26 | 
27 |         with open(infile, encoding="utf-8") as xml:
28 |             with open(outfile, "w", encoding="utf-8") as output:
29 |                 # Write xml start
30 |                 output.write("<posts>\n")
31 | 
32 |                 for line in xml:
33 |                     # PostTypeId = 1 (Question) with accepted answer.
34 |                     if "AcceptedAnswerId" in line:
35 |                         # Parse answer id and score
36 |                         answer = self.parse(r"AcceptedAnswerId=\"([0-9]+)\"", line)
37 |                         score = self.parse(r"Score=\"([0-9]+)\"", line)
38 | 
39 |                         # Require a score of 10+.
40 |                         if score >= 10:
41 |                             # Add answer id to ids list
42 |                             ids.add(answer)
43 | 
44 |                             # Write accepted line
45 |                             output.write(line)
46 | 
47 |                     # PostTypeId = 2 (Answer)
48 |                     elif 'PostTypeId="2"' in line:
49 |                         # Parse post id
50 |                         pid = self.parse(r"Id=\"([0-9]+)\"", line)
51 | 
52 |                         if pid in ids:
53 |                             # Write output line and remove from ids list
54 |                             output.write(line)
55 |                             ids.remove(pid)
56 | 
57 |                 # Write xml end
58 |                 output.write("</posts>\n")
59 | 
60 |     def parse(self, pattern, line):
61 |         """
62 |         Parses an int field and returns the value if found. Returns -1 if no value found.
63 | 
64 |         Args:
65 |             pattern: regex pattern
66 |             line: input line
67 | 
68 |         Return:
69 |             field value
70 |         """
71 | 
72 |         field = re.search(pattern, line)
73 |         return int(field.group(1)) if field else -1
74 | 


--------------------------------------------------------------------------------
/test/python/testconsole.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Console module tests
  3 | """
  4 | 
  5 | import contextlib
  6 | import io
  7 | import os
  8 | import unittest
  9 | 
 10 | from codequestion.console import Console
 11 | from codequestion.etl.stackexchange import Execute
 12 | from codequestion.index import Index
 13 | 
 14 | # pylint: disable=C0411
 15 | from utils import Utils
 16 | 
 17 | 
 18 | class TestConsole(unittest.TestCase):
 19 |     """
 20 |     Console tests.
 21 |     """
 22 | 
 23 |     @classmethod
 24 |     def setUpClass(cls):
 25 |         """
 26 |         Initialize test data.
 27 |         """
 28 | 
 29 |         os.environ["CODEQUESTION_HOME"] = Utils.STACKEXCHANGE + ".console"
 30 | 
 31 |         # Run etl process
 32 |         Execute.SOURCES = ["ai"]
 33 | 
 34 |         execute = Execute()
 35 |         execute(Utils.STACKEXCHANGE)
 36 | 
 37 |         # Create embeddings index
 38 |         index = Index()
 39 |         index(Utils.PATH + "/index.yml", Utils.QUESTIONS)
 40 | 
 41 |         cls.console = Console()
 42 |         cls.console.preloop()
 43 | 
 44 |     def testHelp(self):
 45 |         """
 46 |         Test help command
 47 |         """
 48 | 
 49 |         self.assertIn(".limit", self.command("help"))
 50 |         self.assertIn(".limit", self.command("help .limit"))
 51 | 
 52 |     def testLimit(self):
 53 |         """
 54 |         Test .limit command
 55 |         """
 56 | 
 57 |         self.assertEqual(self.command(".limit 1"), "")
 58 | 
 59 |     def testPath(self):
 60 |         """
 61 |         Test .path command
 62 |         """
 63 | 
 64 |         self.assertIn("1. ", self.command(".path 0 1"))
 65 | 
 66 |     def testSearch(self):
 67 |         """
 68 |         Test search
 69 |         """
 70 | 
 71 |         self.assertIn("Question", self.command("ai"))
 72 | 
 73 |     def testShow(self):
 74 |         """
 75 |         Test .show command
 76 |         """
 77 | 
 78 |         self.assertIn("Question", self.command(".show 0"))
 79 | 
 80 |     def testtopics(self):
 81 |         """
 82 |         Test .topics command
 83 |         """
 84 | 
 85 |         self.assertNotIn("ERROR", self.command(".topics"))
 86 |         self.assertNotIn("ERROR", self.command(".topics ai"))
 87 | 
 88 |     def command(self, command):
 89 |         """
 90 |         Runs a console command.
 91 | 
 92 |         Args:
 93 |             command: command to run
 94 | 
 95 |         Returns:
 96 |             command output
 97 |         """
 98 | 
 99 |         # Run info
100 |         output = io.StringIO()
101 |         with contextlib.redirect_stdout(output):
102 |             self.console.onecmd(command)
103 | 
104 |         return output.getvalue()
105 | 


--------------------------------------------------------------------------------
/test/python/testindex.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Index module tests
  3 | """
  4 | 
  5 | import contextlib
  6 | import io
  7 | import os
  8 | import unittest
  9 | 
 10 | from codequestion.evaluate import StackExchange, STS
 11 | from codequestion.index import Index
 12 | from codequestion.search import Search
 13 | from codequestion.vectors import Vectors
 14 | 
 15 | # pylint: disable=C0411
 16 | from utils import Utils
 17 | 
 18 | 
 19 | class TestIndex(unittest.TestCase):
 20 |     """
 21 |     Index tests.
 22 |     """
 23 | 
 24 |     def testTransformers(self):
 25 |         """
 26 |         Test transformers-backed index
 27 |         """
 28 | 
 29 |         os.environ["CODEQUESTION_HOME"] = Utils.STACKEXCHANGE + ".transformers"
 30 | 
 31 |         # Create embeddings index
 32 |         index = Index()
 33 |         index(Utils.PATH + "/index.yml", Utils.QUESTIONS)
 34 | 
 35 |         # Run tests
 36 |         self.runTests()
 37 | 
 38 |     def testWordVectors(self):
 39 |         """
 40 |         Test word vector-backed index
 41 |         """
 42 | 
 43 |         os.environ["CODEQUESTION_HOME"] = Utils.STACKEXCHANGE + ".wv"
 44 | 
 45 |         # Build word vectors
 46 |         vectors = Vectors()
 47 |         vectors(Utils.QUESTIONS, 300, 3)
 48 | 
 49 |         # Create embeddings index
 50 |         index = Index()
 51 |         index(Utils.PATH + "/index.v1.yml", Utils.QUESTIONS)
 52 | 
 53 |         # Run tests
 54 |         self.runTests()
 55 | 
 56 |     def runTests(self):
 57 |         """
 58 |         Run index tests.
 59 |         """
 60 | 
 61 |         self.search()
 62 |         self.stackexchange()
 63 |         self.sts()
 64 | 
 65 |     def search(self):
 66 |         """
 67 |         Run search test.
 68 |         """
 69 | 
 70 |         # Test search
 71 |         search = Search()
 72 |         self.assertIn(
 73 |             "machine learning", self.command(lambda: search("machine learning"))
 74 |         )
 75 | 
 76 |     def stackexchange(self):
 77 |         """
 78 |         Run stack exchange test.
 79 |         """
 80 | 
 81 |         action = StackExchange()
 82 |         self.assertIn(
 83 |             "Mean Reciprocal Rank", self.command(lambda: action(Utils.TESTS, None))
 84 |         )
 85 |         self.assertIn(
 86 |             "Mean Reciprocal Rank", self.command(lambda: action(Utils.TESTS, "bm25"))
 87 |         )
 88 | 
 89 |     def sts(self):
 90 |         """
 91 |         Run STS test.
 92 |         """
 93 | 
 94 |         action = STS()
 95 |         self.assertIn("Pearson", self.command(lambda: action(Utils.TESTS, None)))
 96 | 
 97 |     def command(self, command):
 98 |         """
 99 |         Runs a console command.
100 | 
101 |         Args:
102 |             command: command to run
103 | 
104 |         Returns:
105 |             command output
106 |         """
107 | 
108 |         # Run info
109 |         output = io.StringIO()
110 |         with contextlib.redirect_stdout(output):
111 |             command()
112 | 
113 |         return output.getvalue()
114 | 


--------------------------------------------------------------------------------
/src/python/codequestion/etl/stackexchange/execute.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Execute module
  3 | """
  4 | 
  5 | import os
  6 | import os.path
  7 | import sys
  8 | 
  9 | from .db2qa import DB2QA
 10 | from .decompress import Decompress
 11 | from .sift import Sift
 12 | from .xml2db import XML2DB
 13 | 
 14 | 
 15 | class Execute:
 16 |     """
 17 |     Main execution method to build a consolidated questions.db file from Stack Exchange Data Dumps.
 18 |     """
 19 | 
 20 |     # List of sources
 21 |     SOURCES = [
 22 |         "ai",
 23 |         "android",
 24 |         "apple",
 25 |         "arduino",
 26 |         "askubuntu",
 27 |         "avp",
 28 |         "codereview",
 29 |         "cs",
 30 |         "datascience",
 31 |         "dba",
 32 |         "devops",
 33 |         "dsp",
 34 |         "raspberrypi",
 35 |         "reverseengineering",
 36 |         "scicomp",
 37 |         "serverfault",
 38 |         "security",
 39 |         "stackoverflow",
 40 |         "stats",
 41 |         "superuser",
 42 |         "unix",
 43 |         "vi",
 44 |         "wordpress",
 45 |     ]
 46 | 
 47 |     def __call__(self, path):
 48 |         """
 49 |         Converts a directory of raw sources to a single output questions database.
 50 | 
 51 |         Args:
 52 |             path: base directory path
 53 |         """
 54 | 
 55 |         # Iterates through a directory of raw sources and builds staging databases
 56 |         databases = self.process(path)
 57 | 
 58 |         # Output database file
 59 |         qafile = os.path.join(path, "questions.db")
 60 | 
 61 |         # Build consolidated SQLite questions database
 62 |         db2qa = DB2QA()
 63 |         db2qa(databases, qafile)
 64 | 
 65 |     def process(self, path):
 66 |         """
 67 |         Iterates through each source and converts raw xml to SQLite databases. Returns a list of
 68 |         output databases.
 69 | 
 70 |         Args:
 71 |             path: input directory path with raw source data directories
 72 | 
 73 |         Returns:
 74 |             paths to output databases
 75 |         """
 76 | 
 77 |         # Extract filtered content and build source databases to process
 78 |         for source in Execute.SOURCES:
 79 |             spath = os.path.join(path, source)
 80 | 
 81 |             # Extract Posts.xml from 7za file
 82 |             decompress = Decompress()
 83 |             decompress(spath)
 84 | 
 85 |             posts = os.path.join(spath, "Posts.xml")
 86 |             filtered = os.path.join(spath, "Filtered.xml")
 87 | 
 88 |             # Filter Posts.xml file for matching questions
 89 |             sift = Sift()
 90 |             sift(posts, filtered)
 91 | 
 92 |             dbfile = os.path.join(spath, f"{source}.db")
 93 | 
 94 |             # Convert filtered Posts.xml file to SQLite db file
 95 |             xml2db = XML2DB()
 96 |             xml2db(filtered, dbfile)
 97 | 
 98 |         # Get list of all databases to consolidate
 99 |         return [
100 |             os.path.join(path, source, f"{source}.db") for source in Execute.SOURCES
101 |         ]
102 | 
103 | 
if __name__ == "__main__":
    # Input data directory, None when no argument was passed. Without this guard
    # a missing argument raises IndexError instead of printing the message below.
    path = sys.argv[1] if len(sys.argv) > 1 else None
    if not path or not os.path.exists(path):
        print("Data directory does not exist, exiting")
        sys.exit()

    # Run ETL process
    execute = Execute()
    execute(path)
114 | 


--------------------------------------------------------------------------------
/src/python/codequestion/console.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Console module
  3 | """
  4 | 
  5 | from cmd import Cmd
  6 | 
  7 | from rich.console import Console as RichConsole
  8 | 
  9 | from .path import Path
 10 | from .search import Search
 11 | from .topics import Topics
 12 | 
 13 | 
 14 | class Console(Cmd):
 15 |     """
 16 |     codequestion console.
 17 |     """
 18 | 
 19 |     def __init__(self):
 20 |         """
 21 |         Creates a new codequestion console.
 22 |         """
 23 | 
 24 |         super().__init__()
 25 | 
 26 |         # Display configuration
 27 |         self.intro = "codequestion console"
 28 |         self.prompt = ">>> "
 29 |         self.console = RichConsole()
 30 | 
 31 |         # Search parameters
 32 |         self.search = None
 33 |         self.embeddings = None
 34 |         self.limit = 1
 35 | 
 36 |         # Topics action
 37 |         self.topics = None
 38 | 
 39 |         # Path traversal action
 40 |         self.path = None
 41 | 
 42 |     def preloop(self):
 43 |         """
 44 |         Loads initial configuration.
 45 |         """
 46 | 
 47 |         # Load query and embeddings
 48 |         self.search = Search()
 49 |         self.embeddings = self.search.embeddings
 50 | 
 51 |         # Load graph-based actions, if necessary
 52 |         if self.embeddings.graph:
 53 |             if self.embeddings.graph.topics:
 54 |                 self.topics = Topics(self.embeddings)
 55 | 
 56 |             self.path = Path(self.embeddings)
 57 | 
 58 |     def default(self, line):
 59 |         """
 60 |         Default event loop.
 61 | 
 62 |         Args:
 63 |             line: command line
 64 |         """
 65 | 
 66 |         # pylint: disable=W0703
 67 |         try:
 68 |             command = line.lower()
 69 |             if command.startswith(".limit"):
 70 |                 command = self.split(line)
 71 |                 self.limit = int(command[1])
 72 |             elif command.startswith(".path") and self.path:
 73 |                 command = self.split(line)
 74 |                 start, end = command[1].split()
 75 |                 self.path(int(start), int(end))
 76 |             elif command.startswith(".show"):
 77 |                 command = self.split(line)
 78 |                 self.search(uid=command[1])
 79 |             elif command.startswith(".topics") and self.topics:
 80 |                 command = self.split(line)
 81 |                 self.topics(command[1] if len(command) > 1 else None)
 82 |             else:
 83 |                 # Search is default action
 84 |                 self.search(line, self.limit)
 85 |         except Exception:
 86 |             self.console.print_exception()
 87 | 
 88 |     def do_help(self, arg):
 89 |         """
 90 |         Shows a help message.
 91 | 
 92 |         Args:
 93 |             arg: optional help message argument
 94 |         """
 95 | 
 96 |         commands = {
 97 |             ".limit": "(number)\t\tset the maximum number of query rows to return",
 98 |             ".path": "(start) (end)\tprints a semantic path between questions",
 99 |             ".show": "(id)\t\tprint question with specified id",
100 |             ".topics": "(query)\t\tshows topics best matching query. if query is empty, top topics are shown",
101 |         }
102 | 
103 |         if arg in commands:
104 |             self.console.print(f"{arg} {commands[arg]}")
105 |         else:
106 |             for command, message in commands.items():
107 |                 self.console.print(f"{command} {message}")
108 | 
109 |             self.console.print("\nDefault mode runs a search query")
110 | 
111 |     def split(self, command, default=None):
112 |         """
113 |         Splits command by whitespace.
114 | 
115 |         Args:
116 |             command: command line
117 |             default: default command action
118 | 
119 |         Returns:
120 |             command action
121 |         """
122 | 
123 |         values = command.split(" ", 1)
124 |         return values if len(values) > 1 else (command, default)
125 | 
126 | 
def main():
    """
    Creates a console and runs its command loop.
    """

    console = Console()
    console.cmdloop()
133 | 
134 | 
# Start the console only when this module is run as a script
if __name__ == "__main__":
    main()
137 | 


--------------------------------------------------------------------------------
/src/python/codequestion/index.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Index module
  3 | """
  4 | 
  5 | import os.path
  6 | import sqlite3
  7 | import sys
  8 | 
  9 | from tqdm import tqdm
 10 | from txtai.app import Application
 11 | from txtai.embeddings import Embeddings
 12 | 
 13 | from .models import Models
 14 | from .tokenizer import Tokenizer
 15 | 
 16 | 
 17 | class Index:
 18 |     """
 19 |     Builds a new embeddings index.
 20 |     """
 21 | 
 22 |     def __call__(self, config, dbfile):
 23 |         """
 24 |         Builds and saves an embeddings index.
 25 | 
 26 |         Args:
 27 |             config: input configuration file
 28 |             dbfile: input SQLite file
 29 |         """
 30 | 
 31 |         embeddings = self.build(config, dbfile)
 32 |         embeddings.save(Models.modelPath("stackexchange"))
 33 | 
 34 |     def build(self, config, dbfile):
 35 |         """
 36 |         Builds an embeddings index.
 37 | 
 38 |         Args:
 39 |             config: input configuration file
 40 |             dbfile: input SQLite file
 41 | 
 42 |         Returns:
 43 |             embeddings index
 44 |         """
 45 | 
 46 |         # Configure embeddings index
 47 |         config = Application.read(config)
 48 | 
 49 |         # Resolve full path to vectors file, if necessary
 50 |         if config.get("scoring"):
 51 |             config["path"] = os.path.join(Models.vectorPath(config["path"]))
 52 | 
 53 |         # Create embeddings index
 54 |         embeddings = Embeddings(config)
 55 | 
 56 |         # Build scoring index, if scoring method provided
 57 |         if embeddings.isweighted():
 58 |             embeddings.score(self.stream(dbfile, embeddings, "Building scoring index"))
 59 | 
 60 |         # Build embeddings index
 61 |         embeddings.index(self.stream(dbfile, embeddings, "Building embeddings index"))
 62 | 
 63 |         return embeddings
 64 | 
 65 |     def stream(self, dbfile, embeddings, message):
 66 |         """
 67 |         Streams questions from a questions.db file. This method is a generator and will yield a row at time.
 68 | 
 69 |         Args:
 70 |             dbfile: input SQLite file
 71 |             embeddings: embeddings instance
 72 |             message: progress bar message
 73 |         """
 74 | 
 75 |         # Connection to database file
 76 |         db = sqlite3.connect(dbfile)
 77 |         db.row_factory = sqlite3.Row
 78 |         cur = db.cursor()
 79 | 
 80 |         # Get total number of questions
 81 |         cur.execute("SELECT count(*) from Questions")
 82 |         total = cur.fetchone()[0]
 83 | 
 84 |         # Query for iterating over questions.db rows
 85 |         cur.execute(
 86 |             "SELECT Id, Source, SourceId, Date, Tags, Question, QuestionUser, Answer, AnswerUser, Reference FROM Questions"
 87 |         )
 88 | 
 89 |         for row in tqdm(cur, total=total, desc=message):
 90 |             # Transform all keys to lowercase
 91 |             row = {k.lower(): row[k] for k in row.keys()}
 92 | 
 93 |             # Store answer as object
 94 |             row["object"] = row.pop("answer")
 95 | 
 96 |             # Build text and yield (id, text, tags) tuple
 97 |             row["text"] = row["question"] + " " + row["source"] + " " + row["tags"]
 98 | 
 99 |             # Use custom tokenizer for word vector models
100 |             if embeddings.isweighted():
101 |                 row["text"] = Tokenizer.tokenize(row["text"])
102 | 
103 |             # Yield document
104 |             yield (row["id"], row, row["tags"])
105 | 
106 |         # Free database resources
107 |         db.close()
108 | 
109 | 
# pylint: disable=C0103
if __name__ == "__main__":
    # Path to index configuration file
    config = sys.argv[1] if len(sys.argv) > 1 else None
    if not config or not os.path.exists(config):
        print("Path to index configuration file does not exist, exiting")
        sys.exit()

    # Path to questions.db file. This is the second argument, so at least three
    # entries must be in sys.argv, otherwise reading sys.argv[2] raises IndexError.
    dbfile = sys.argv[2] if len(sys.argv) > 2 else None
    if not dbfile or not os.path.exists(dbfile):
        print("Path to questions.db file does not exist, exiting")
        sys.exit()

    # Build index
    index = Index()
    index(config, dbfile)
127 | 


--------------------------------------------------------------------------------
/src/python/codequestion/vectors.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Vectors module
  3 | """
  4 | 
  5 | import os
  6 | import os.path
  7 | import sqlite3
  8 | import sys
  9 | import tempfile
 10 | 
 11 | from tqdm import tqdm
 12 | from txtai.vectors import WordVectors
 13 | 
 14 | from .models import Models
 15 | from .tokenizer import Tokenizer
 16 | 
 17 | 
 18 | class RowIterator:
 19 |     """
 20 |     Iterates over rows in a database query. Allows for multiple iterations.
 21 |     """
 22 | 
 23 |     def __init__(self, dbfile):
 24 |         """
 25 |         Initializes RowIterator.
 26 | 
 27 |         Args:
 28 |             dbfile: path to SQLite file
 29 |         """
 30 | 
 31 |         # Store database file
 32 |         self.dbfile = dbfile
 33 | 
 34 |         self.rows = self.stream(self.dbfile)
 35 | 
 36 |     def __iter__(self):
 37 |         """
 38 |         Creates a database query generator.
 39 | 
 40 |         Returns:
 41 |             generator
 42 |         """
 43 | 
 44 |         # reset the generator
 45 |         self.rows = self.stream(self.dbfile)
 46 |         return self
 47 | 
 48 |     def __next__(self):
 49 |         """
 50 |         Gets the next result in the current generator.
 51 | 
 52 |         Returns:
 53 |             tokens
 54 |         """
 55 | 
 56 |         result = next(self.rows)
 57 |         if result is None:
 58 |             raise StopIteration
 59 | 
 60 |         return result
 61 | 
 62 |     def stream(self, dbfile):
 63 |         """
 64 |         Connects to SQLite file at dbfile and yields parsed tokens for each row.
 65 | 
 66 |         Args:
 67 |             dbfile: path to SQLite file
 68 |         """
 69 | 
 70 |         # Connection to database file
 71 |         db = sqlite3.connect(dbfile)
 72 |         cur = db.cursor()
 73 | 
 74 |         # Get total number of questions
 75 |         cur.execute("SELECT count(*) from Questions")
 76 |         total = cur.fetchone()[0]
 77 | 
 78 |         # Query for iterating over questions.db rows
 79 |         cur.execute("SELECT Question, Source, Tags FROM questions")
 80 | 
 81 |         for question in tqdm(cur, total=total, desc="Tokenizing input"):
 82 |             # Tokenize question, source and tags
 83 |             tokens = Tokenizer.tokenize(
 84 |                 question[0] + " " + question[1] + " " + question[2]
 85 |             )
 86 | 
 87 |             # Skip documents with no tokens parsed
 88 |             if tokens:
 89 |                 yield tokens
 90 | 
 91 |         # Free database resources
 92 |         db.close()
 93 | 
 94 | 
 95 | class Vectors:
 96 |     """
 97 |     Methods to build a FastText model.
 98 |     """
 99 | 
100 |     def __call__(self, dbfile, size, mincount):
101 |         """
102 |         Converts dbfile into a fastText model using pymagnitude's SQLite output format.
103 | 
104 |         Args:
105 |             dbfile: input SQLite file
106 |             size: dimensions for fastText model
107 |             mincount: minimum number of times a token must appear in input
108 |         """
109 | 
110 |         # Stream tokens to temporary file
111 |         tokens = self.tokens(dbfile)
112 | 
113 |         # Output file path
114 |         path = Models.vectorPath(f"stackexchange-{size}d", True)
115 | 
116 |         # Build word vectors model
117 |         WordVectors.build(tokens, size, mincount, path)
118 | 
119 |         # Remove temporary tokens file
120 |         os.remove(tokens)
121 | 
122 |     def tokens(self, dbfile):
123 |         """
124 |         Iterates over each row in dbfile and writes parsed tokens to a temporary file for processing.
125 | 
126 |         Args:
127 |             dbfile: SQLite file to read
128 | 
129 |         Returns:
130 |             path to output file
131 |         """
132 | 
133 |         tokens = None
134 | 
135 |         # Stream tokens to temp working file
136 |         with tempfile.NamedTemporaryFile(
137 |             mode="w", suffix=".txt", delete=False
138 |         ) as output:
139 |             # Save file path
140 |             tokens = output.name
141 | 
142 |             for row in RowIterator(dbfile):
143 |                 output.write(" ".join(row) + "\n")
144 | 
145 |         return tokens
146 | 
147 | 
# pylint: disable=C0103
if __name__ == "__main__":
    # First command line argument is the questions.db path, None when absent
    dbfile = None if len(sys.argv) <= 1 else sys.argv[1]

    # Stop when no usable path was given
    if not (dbfile and os.path.exists(dbfile)):
        print("Path to questions.db file does not exist, exiting")
        sys.exit()

    # Build vectors with size 300 and a minimum token count of 3
    Vectors()(dbfile, 300, 3)
159 | 


--------------------------------------------------------------------------------
/src/python/codequestion/search.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Search module
  3 | """
  4 | 
  5 | import os
  6 | import os.path
  7 | import re
  8 | 
  9 | import html2markdown
 10 | 
 11 | from rich.console import Console
 12 | from rich.markdown import Markdown
 13 | from txtai.embeddings import Embeddings
 14 | 
 15 | from .models import Models
 16 | from .tokenizer import Tokenizer
 17 | 
 18 | 
 19 | class Search:
 20 |     """
 21 |     Search an embeddings index.
 22 |     """
 23 | 
 24 |     def __init__(self):
 25 |         """
 26 |         Creates a new search action.
 27 |         """
 28 | 
 29 |         # Load embeddings index
 30 |         self.embeddings = self.load()
 31 |         self.console = Console()
 32 | 
 33 |     def __call__(self, query=None, limit=1, uid=None):
 34 |         """
 35 |         Runs a search action.
 36 | 
 37 |         Args:
 38 |             query: query string
 39 |             limit: number of results to return
 40 |             uid: id to show
 41 |         """
 42 | 
 43 |         # Query prefix
 44 |         prefix = "select id, score, questionuser, question, tags, date, answeruser, object answer, reference from txtai where"
 45 | 
 46 |         if uid is not None:
 47 |             # ID query
 48 |             query = f"{prefix} id = '{uid}'"
 49 |         elif self.embeddings.isweighted():
 50 |             # Use custom tokenizer for word vector models
 51 |             query = Tokenizer.tokenize(query)
 52 | 
 53 |             # Run search and build id query
 54 |             result = self.embeddings.search(query, 1)[0] if query else {}
 55 |             query = f"""
 56 |                 select id, {result.get('score')} score, questionuser, question, tags, date, answeruser, object answer, reference
 57 |                 from txtai
 58 |                 where id = '{result.get('id')}'
 59 |             """
 60 |         else:
 61 |             # Default similar clause query
 62 |             query = f"{prefix} similar('{query}')"
 63 | 
 64 |         # Render results
 65 |         for result in self.embeddings.search(query, limit):
 66 |             # Show result
 67 |             self.result(result, limit)
 68 | 
 69 |         self.console.print()
 70 | 
 71 |     def load(self):
 72 |         """
 73 |         Loads an embeddings model.
 74 | 
 75 |         Returns:
 76 |             embeddings
 77 |         """
 78 | 
 79 |         path = Models.modelPath("stackexchange")
 80 | 
 81 |         if os.path.isfile(os.path.join(path, "config")):
 82 |             print(f"Loading model from {path}")
 83 |             embeddings = Embeddings()
 84 |             embeddings.load(path)
 85 |         else:
 86 |             print("ERROR: loading model: ensure model is installed")
 87 |             print(
 88 |                 "ERROR: Pre-trained model can be installed by running python -m codequestion.download"
 89 |             )
 90 |             raise FileNotFoundError(f"Unable to load codequestion model from {path}")
 91 | 
 92 |         return embeddings
 93 | 
 94 |     def result(self, result, limit):
 95 |         """
 96 |         Renders a result row.
 97 | 
 98 |         Args:
 99 |             result: result row
100 |             limit: number of results
101 |         """
102 | 
103 |         # If score is empty, this a direct query
104 |         score = result["score"]
105 |         score = score if score is not None else 1.0
106 | 
107 |         self.console.print(
108 |             f"[bright_green]Question (by {result['questionuser']}): {result['question']} [{score:4f}][/bright_green]",
109 |             highlight=False,
110 |         )
111 |         self.console.print(f"Id: {result['id']}", highlight=False)
112 |         self.console.print(f"Last Activity: {result['date']}", highlight=False)
113 |         self.console.print(f"Tags: {result['tags']}")
114 |         self.console.print(f"Answer (by {result['answeruser']}):\n", highlight=False)
115 |         self.console.print(self.markdown(result["answer"]))
116 |         self.console.print(f"\nReference: {result['reference']}")
117 | 
118 |         # Print results divider
119 |         if limit > 1:
120 |             self.console.rule()
121 | 
122 |     def markdown(self, text):
123 |         """
124 |         Converts html text to markdown.
125 | 
126 |         Args:
127 |             text: html text
128 | 
129 |         Returns:
130 |             text as markdown
131 |         """
132 | 
133 |         # Remove rel attributes as they are not supported by html2markdown
134 |         text = re.sub(r' rel=".+?">', ">", text)
135 | 
136 |         # Convert html to markdown
137 |         text = html2markdown.convert(text)
138 | 
139 |         # Decode [<>&] characters
140 |         text = text.replace("&lt;", "<").replace("&gt;", ">").replace("&amp;", "&")
141 | 
142 |         # Wrap as Rich Markdown
143 |         return Markdown(text)
144 | 


--------------------------------------------------------------------------------
/src/python/codequestion/etl/stackexchange/xml2db.py:
--------------------------------------------------------------------------------
  1 | """
  2 | XML2DB module
  3 | """
  4 | 
  5 | import os
  6 | import xml.etree.cElementTree as etree
  7 | import sqlite3
  8 | 
  9 | 
 10 | class XML2DB:
 11 |     """
 12 |     Converts a filtered posts xml file to a staging SQLite database for processing.
 13 |     """
 14 | 
 15 |     # Questions schema
 16 |     QUESTIONS = {
 17 |         "Id": "INTEGER PRIMARY KEY",
 18 |         "AcceptedAnswerId": "INTEGER",
 19 |         "CreationDate": "DATETIME",
 20 |         "LastActivityDate": "DATETIME",
 21 |         "Score": "INTEGER",
 22 |         "ViewCount": "INTEGER",
 23 |         "OwnerUserId": "INTEGER",
 24 |         "OwnerDisplayName": "TEXT",
 25 |         "Title": "TEXT",
 26 |         "Tags": "TEXT",
 27 |         "AnswerCount": "INTEGER",
 28 |         "CommentCount": "INTEGER",
 29 |         "FavoriteCount": "INTEGER",
 30 |         "ClosedDate": "DATETIME",
 31 |     }
 32 | 
 33 |     # Answers schema
 34 |     ANSWERS = {
 35 |         "Id": "INTEGER PRIMARY KEY",
 36 |         "ParentId": "INTEGER",
 37 |         "CreationDate": "DATETIME",
 38 |         "Score": "INTEGER",
 39 |         "Body": "TEXT",
 40 |         "OwnerUserId": "INTEGER",
 41 |         "OwnerDisplayName": "TEXT",
 42 |     }
 43 | 
 44 |     # SQL statements
 45 |     CREATE_TABLE = "CREATE TABLE IF NOT EXISTS {table} ({fields})"
 46 |     INSERT_ROW = "INSERT INTO {table} ({columns}) VALUES ({values})"
 47 | 
 48 |     def __call__(self, infile, dbfile):
 49 |         """
 50 |         Converts xml infile to SQLite dbfile.
 51 | 
 52 |         Args:
 53 |             infile: input xml file
 54 |             dbfile: output sqlite file
 55 |         """
 56 | 
 57 |         print(f"Converting {infile} to {dbfile}")
 58 | 
 59 |         # Delete existing file
 60 |         if os.path.exists(dbfile):
 61 |             os.remove(dbfile)
 62 | 
 63 |         # Create new database
 64 |         db = sqlite3.connect(dbfile)
 65 | 
 66 |         # Create database tables if necessary
 67 |         self.create(db, XML2DB.QUESTIONS, "questions")
 68 |         self.create(db, XML2DB.ANSWERS, "answers")
 69 | 
 70 |         count = 0
 71 |         with open(infile, encoding="utf-8") as xml:
 72 |             context, root = self.xmlstream(xml)
 73 | 
 74 |             for event, row in context:
 75 |                 if event == "end":
 76 |                     # Execute insert statement
 77 |                     self.insert(db, row)
 78 | 
 79 |                     count += 1
 80 |                     if count % 10000 == 0:
 81 |                         print(f"Inserted {count} rows")
 82 | 
 83 |                     # Free memory
 84 |                     root.clear()
 85 | 
 86 |         print(f"Total rows inserted: {count}")
 87 | 
 88 |         # Commit changes
 89 |         db.commit()
 90 | 
 91 |     def create(self, db, table, name):
 92 |         """
 93 |         Creates a SQLite table.
 94 | 
 95 |         Args:
 96 |             db: database connection
 97 |             table: table schema
 98 |             name: table name
 99 |         """
100 | 
101 |         columns = [f"{name} {ctype}" for name, ctype in table.items()]
102 |         create = XML2DB.CREATE_TABLE.format(table=name, fields=", ".join(columns))
103 | 
104 |         # pylint: disable=W0703
105 |         try:
106 |             db.execute(create)
107 |         except Exception as e:
108 |             print(create)
109 |             print("Failed to create table: " + e)
110 | 
111 |     def xmlstream(self, xml):
112 |         """
113 |         Creates a xml stream for iterative parsing.
114 | 
115 |         Args:
116 |             xml: input file
117 | 
118 |         Returns:
119 |             context, root
120 |         """
121 | 
122 |         # Parse the tree
123 |         context = etree.iterparse(xml, events=("start", "end"))
124 | 
125 |         # turn it into an iterator
126 |         context = iter(context)
127 | 
128 |         # get the root element
129 |         _, root = next(context)
130 | 
131 |         return context, root
132 | 
133 |     def insert(self, db, row):
134 |         """
135 |         Inserts row into database.
136 | 
137 |         Args:
138 |             db: database connection
139 |             row: row tuple
140 |         """
141 | 
142 |         if "PostTypeId" in row.attrib:
143 |             # PostType="1" - Question, PostType="2" - Answer
144 |             table = (
145 |                 XML2DB.QUESTIONS if row.attrib["PostTypeId"] == "1" else XML2DB.ANSWERS
146 |             )
147 |             name = "questions" if row.attrib["PostTypeId"] == "1" else "answers"
148 | 
149 |             # Build insert prepared statement
150 |             columns = [name for name, _ in table.items()]
151 |             insert = XML2DB.INSERT_ROW.format(
152 |                 table=name,
153 |                 columns=", ".join(columns),
154 |                 values=("?, " * len(columns))[:-2],
155 |             )
156 | 
157 |             # Execute insert statement
158 |             db.execute(insert, self.values(table, row, columns))
159 | 
160 |     def values(self, table, row, columns):
161 |         """
162 |         Formats and converts row into database types based on table schema.
163 | 
164 |         Args:
165 |             table: table schema
166 |             row: row tuple
167 |             columns: column names
168 | 
169 |         Returns:
170 |             Database schema formatted row tuple
171 |         """
172 | 
173 |         values = []
174 |         for column in columns:
175 |             # Get column value
176 |             value = row.attrib[column] if column in row.attrib else None
177 | 
178 |             if table[column].startswith("INTEGER"):
179 |                 values.append(int(value) if value else 0)
180 |             elif table[column] == "BOOLEAN":
181 |                 values.append(1 if value == "TRUE" else 0)
182 |             else:
183 |                 values.append(value)
184 | 
185 |         return values
186 | 


--------------------------------------------------------------------------------
/src/python/codequestion/evaluate.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Evaluate module
  3 | """
  4 | 
  5 | import argparse
  6 | import csv
  7 | import os
  8 | 
  9 | from scipy.stats import pearsonr, spearmanr
 10 | from tqdm import tqdm
 11 | from txtai.embeddings import Embeddings
 12 | 
 13 | from .models import Models
 14 | from .tokenizer import Tokenizer
 15 | 
 16 | 
 17 | class StackExchange:
 18 |     """
 19 |     Stack Exchange query-answer dataset.
 20 |     """
 21 | 
 22 |     def __call__(self, path, method):
 23 |         """
 24 |         Evaluates a pre-trained model against the Stack Exchange query-answer dataset.
 25 | 
 26 |         Args:
 27 |             path: path to tests
 28 |             method: run method
 29 |         """
 30 | 
 31 |         # Load model
 32 |         embeddings = self.load()
 33 | 
 34 |         # Statistics
 35 |         mrr = []
 36 | 
 37 |         # Build scoring index
 38 |         if method in ("bm25", "tfidf", "sif"):
 39 |             scoring = Embeddings({"keyword": True, "content": True})
 40 |             scoring.index(self.stream(embeddings, "Building keyword index"))
 41 |             embeddings = scoring
 42 | 
 43 |         # Run test data
 44 |         with open(
 45 |             os.path.join(path, "stackexchange", "query.txt"), encoding="utf-8"
 46 |         ) as rows:
 47 |             for row in rows:
 48 |                 query, sourceid, source, _ = row.split("|", 3)
 49 |                 print(query, sourceid, source)
 50 | 
 51 |                 # Run search
 52 |                 results = self.search(embeddings, query)
 53 | 
 54 |                 # Get row index within results
 55 |                 index = -1
 56 |                 for x, result in enumerate(results):
 57 |                     if (
 58 |                         int(sourceid) == result["sourceid"]
 59 |                         and source == result["source"]
 60 |                     ):
 61 |                         index = x
 62 | 
 63 |                 # Calculate stats
 64 |                 calc = 1 / (1 + index) if index != -1 else 0.0
 65 |                 print(calc)
 66 |                 mrr.append(calc)
 67 | 
 68 |         mrr = sum(mrr) / len(mrr)
 69 |         print("Mean Reciprocal Rank = ", mrr)
 70 | 
 71 |     def load(self):
 72 |         """
 73 |         Loads a pre-trained embeddings model
 74 | 
 75 |         Returns:
 76 |             embeddings
 77 |         """
 78 | 
 79 |         # Loading embeddings model
 80 |         embeddings = Embeddings()
 81 |         embeddings.load(Models.modelPath("stackexchange"))
 82 | 
 83 |         return embeddings
 84 | 
 85 |     def stream(self, embeddings, message):
 86 |         """
 87 |         Streams content from an embeddings index. This method is a generator and will yield a row at time.
 88 | 
 89 |         Args:
 90 |             embeddings: embeddings index
 91 |             message: progress bar message
 92 |         """
 93 | 
 94 |         offset, batch = 0, 1000
 95 |         with tqdm(total=embeddings.count(), desc=message) as progress:
 96 |             for offset in range(0, embeddings.count(), batch):
 97 |                 for result in embeddings.search(
 98 |                     f"select id, text, tags, source, sourceid from txtai limit {batch} offset {offset}"
 99 |                 ):
100 |                     yield (result["id"], result, None)
101 | 
102 |                 progress.update(batch)
103 | 
104 |     def search(self, embeddings, query):
105 |         """
106 |         Executes a search.
107 | 
108 |         Args:
109 |             embeddings: embeddings instance
110 |             query: query to run
111 | 
112 |         Returns:
113 |             search results
114 |         """
115 | 
116 |         results = None
117 |         if embeddings.isweighted():
118 |             # Use custom tokenizer for word vector models
119 |             uids = [
120 |                 row["id"] for row in embeddings.search(Tokenizer.tokenize(query), 10)
121 |             ]
122 | 
123 |             # Get source id + source for each result
124 |             results = []
125 |             for uid in uids:
126 |                 results.append(
127 |                     embeddings.search(
128 |                         f"select sourceid, source from txtai where id = {uid}"
129 |                     )[0]
130 |                 )
131 |         else:
132 |             # Select source id + source with standard similar clause
133 |             results = embeddings.search(
134 |                 f"select sourceid, source from txtai where similar('{query}') limit 10"
135 |             )
136 | 
137 |         return results
138 | 
139 | 
class STS:
    """
    STS Benchmark Dataset
    Measures general text similarity.

    http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark
    """

    def __call__(self, path, method):
        """
        Loads the stackexchange model and evaluates it against the STS benchmark.

        Args:
            path: path to tests
            method: run method
        """

        # Embeddings instance computes the similarity scores
        model = Embeddings()
        model.load(Models.modelPath("stackexchange"))

        # Run evaluation
        self.test(model, path, method)

    def test(self, embeddings, path, method):
        """
        Scores each STS sentence pair with the input Embeddings model and prints the
        Pearson and Spearman correlations against the ground truth labels.

        Args:
            embeddings: embeddings instance
            path: path to tests
            method: run method, "dev" reads the dev file, anything else reads the test file
        """

        # Resolve test file
        path = os.path.join(
            path, "stsbenchmark", f"sts-{'dev' if method == 'dev' else 'test'}.csv"
        )

        # Model scores and matching ground truth labels
        scores, labels = [], []

        for _, label, text1, text2 in self.read(path):
            # Word vector models need tokenized input
            if embeddings.isweighted():
                text1 = Tokenizer.tokenize(text1)
                text2 = Tokenizer.tokenize(text2)

            # Only score pairs where both sides have content
            if not (text1 and text2):
                continue

            scores.append(embeddings.similarity(text1, [text2])[0][1])

            # Label is already normalized to 0 - 1 by read
            labels.append(label)

        print("Pearson score =", pearsonr(scores, labels))
        print("Spearman score =", spearmanr(scores, labels))

    def read(self, path):
        """
        Parses a tab-delimited STS data file.

        Args:
            path: full path to file

        Returns:
            list of (id, score, string 1, string 2) tuples, id is 1 indexed and score is scaled from 0-5 to 0-1
        """

        with open(path, encoding="utf-8") as handle:
            reader = csv.reader(handle, delimiter="\t", quoting=csv.QUOTE_NONE)

            # Columns used: 4 - score, 5 - string 1, 6 - string 2
            return [
                (number, float(fields[4]) / 5.0, fields[5], fields[6])
                for number, fields in enumerate(reader, 1)
            ]
229 | 
230 | 
if __name__ == "__main__":
    # Command line options as (flags, settings) pairs
    options = [
        (
            ("-s", "--source"),
            {"required": True, "help": "data source", "metavar": "SOURCE"},
        ),
        (
            ("-p", "--path"),
            {"required": True, "help": "path to test files", "metavar": "PATH"},
        ),
        (("-m", "--method"), {"help": "run method", "metavar": "METHOD"}),
    ]

    # Build parser and read command line arguments
    parser = argparse.ArgumentParser(description="Evaluate")
    for flags, settings in options:
        parser.add_argument(*flags, **settings)

    args = parser.parse_args()

    # "sts" (case-insensitive) selects the STS benchmark, any other source runs StackExchange
    evaluator = STS if args.source.lower() == "sts" else StackExchange

    # Run evaluation
    evaluator()(args.path, args.method)
250 | 


--------------------------------------------------------------------------------
/src/python/codequestion/etl/stackexchange/db2qa.py:
--------------------------------------------------------------------------------
  1 | """
  2 | DB2QA module
  3 | """
  4 | 
  5 | import os
  6 | import re
  7 | import sqlite3
  8 | 
  9 | 
 10 | class DB2QA:
 11 |     """
 12 |     Converts multiple staging SQLite database (questions, answers in separate tables per source) into a consolidated SQLite database
 13 |     with a single questions table.
 14 |     """
 15 | 
 16 |     # Questions schema
 17 |     QUESTIONS = {
 18 |         "Id": "INTEGER PRIMARY KEY",
 19 |         "Source": "TEXT",
 20 |         "SourceId": "INTEGER",
 21 |         "Date": "DATETIME",
 22 |         "Tags": "TEXT",
 23 |         "Question": "TEXT",
 24 |         "QuestionUser": "TEXT",
 25 |         "Answer": "TEXT",
 26 |         "AnswerUser": "TEXT",
 27 |         "Reference": "TEXT",
 28 |     }
 29 | 
 30 |     # List of sources
 31 |     SOURCES = {
 32 |         "ai": "https://ai.stackexchange.com",
 33 |         "android": "https://android.stackexchange.com",
 34 |         "apple": "https://apple.stackexchange.com",
 35 |         "arduino": "https://arduino.stackexchange.com",
 36 |         "askubuntu": "https://askubuntu.com",
 37 |         "avp": "https://avp.stackexchange.com",
 38 |         "codereview": "https://codereview.stackexchange.com",
 39 |         "cs": "https://cs.stackexchange.com",
 40 |         "datascience": "http://datascience.stackexchange.com",
 41 |         "dba": "https://dba.stackexchange.com",
 42 |         "devops": "https://devops.stackexchange.com",
 43 |         "dsp": "https://dsp.stackexchange.com",
 44 |         "raspberrypi": "https://raspberrypi.stackexchange.com",
 45 |         "reverseengineering": "https://reverseengineering.stackexchange.com",
 46 |         "scicomp": "https://scicomp.stackexchange.com",
 47 |         "security": "https://security.stackexchange.com",
 48 |         "serverfault": "https://serverfault.com",
 49 |         "stackoverflow": "https://stackoverflow.com",
 50 |         "stats": "https://stats.stackexchange.com",
 51 |         "superuser": "https://superuser.com",
 52 |         "unix": "https://unix.stackexchange.com",
 53 |         "vi": "https://vi.stackexchange.com",
 54 |         "wordpress": "https://wordpress.stackexchange.com",
 55 |     }
 56 | 
 57 |     # SQL statements
 58 |     CREATE_TABLE = "CREATE TABLE IF NOT EXISTS {table} ({fields})"
 59 |     INSERT_ROW = "INSERT INTO {table} ({columns}) VALUES ({values})"
 60 |     CREATE_SOURCE_INDEX = "CREATE INDEX source ON questions(Source, SourceId)"
 61 |     CREATE_TEXT_INDEX = "CREATE VIRTUAL TABLE search USING fts5(Id, Question, Tags)"
 62 |     INSERT_TEXT_ROWS = "INSERT INTO search SELECT Id, Question, Tags from questions"
 63 | 
 64 |     def __call__(self, databases, qafile):
 65 |         """
 66 |         Executes a run to convert a list of databases to a single consolidated questions db file.
 67 | 
 68 |         Args:
 69 |             databases: paths to input databases
 70 |             qafile: output database path
 71 |         """
 72 | 
 73 |         print(f"Converting {databases} to {qafile}")
 74 | 
 75 |         # Delete existing file
 76 |         if os.path.exists(qafile):
 77 |             os.remove(qafile)
 78 | 
 79 |         # Create output database
 80 |         qa = sqlite3.connect(qafile)
 81 | 
 82 |         # Create questions table
 83 |         self.create(qa, DB2QA.QUESTIONS, "questions")
 84 | 
 85 |         # Row index
 86 |         index = 0
 87 | 
 88 |         for dbfile in databases:
 89 |             print("Processing " + dbfile)
 90 | 
 91 |             # Create source name
 92 |             source = os.path.splitext(os.path.basename(dbfile))[0].lower()
 93 | 
 94 |             # Input database
 95 |             db = sqlite3.connect(dbfile)
 96 |             cur = db.cursor()
 97 | 
 98 |             cur.execute(
 99 |                 "SELECT Id, AcceptedAnswerId, OwnerUserId, OwnerDisplayName, LastActivityDate, Title, Tags FROM questions"
100 |             )
101 | 
102 |             # Need to select all rows to allow execution of insert statements
103 |             for question in cur.fetchall():
104 |                 # Find accepted answer
105 |                 answer = self.find(question, cur)
106 |                 if answer:
107 |                     # Combine into single question row
108 |                     self.insert(qa, index, source, question, answer)
109 | 
110 |                     index += 1
111 |                     if index % 10000 == 0:
112 |                         print(f"Inserted {index} rows")
113 | 
114 |             db.close()
115 | 
116 |         print(f"Total rows inserted: {index}")
117 | 
118 |         # Create indices
119 |         for statement in [
120 |             DB2QA.CREATE_SOURCE_INDEX,
121 |             DB2QA.CREATE_TEXT_INDEX,
122 |             DB2QA.INSERT_TEXT_ROWS,
123 |         ]:
124 |             qa.execute(statement)
125 | 
126 |         # Commit changes and close
127 |         qa.commit()
128 |         qa.close()
129 | 
130 |     def create(self, db, table, name):
131 |         """
132 |         Creates a SQLite table.
133 | 
134 |         Args:
135 |             db: database connection
136 |             table: table schema
137 |             name: table name
138 |         """
139 | 
140 |         columns = [f"{name} {ctype}" for name, ctype in table.items()]
141 |         create = DB2QA.CREATE_TABLE.format(table=name, fields=", ".join(columns))
142 | 
143 |         # pylint: disable=W0703
144 |         try:
145 |             db.execute(create)
146 |         except Exception as e:
147 |             print(create)
148 |             print("Failed to create table: " + e)
149 | 
150 |     def find(self, question, cur):
151 |         """
152 |         Finds a corresponding answer for the input question.
153 | 
154 |         Args:
155 |             question: input question row
156 |             cur: database cursor
157 | 
158 |         Returns:
159 |             Answer row if found, None otherwise
160 |         """
161 | 
162 |         # Query for accepted answer
163 |         cur.execute(
164 |             "SELECT Body, OwnerUserId, OwnerDisplayName from answers where Id = ?",
165 |             [question[1]],
166 |         )
167 |         answer = cur.fetchone()
168 | 
169 |         if answer and answer[0]:
170 |             # Check if answer has a message body
171 |             return answer
172 | 
173 |         return None
174 | 
175 |     def insert(self, db, index, source, question, answer):
176 |         """
177 |         Builds and inserts a consolidated question.
178 | 
179 |         Args:
180 |             db: database connection
181 |             index: row index
182 |             source: question source
183 |             question: question row
184 |             answer: answer row
185 |         """
186 | 
187 |         table = DB2QA.QUESTIONS
188 | 
189 |         # Build insert prepared statement
190 |         columns = [name for name, _ in table.items()]
191 |         insert = DB2QA.INSERT_ROW.format(
192 |             table="questions",
193 |             columns=", ".join(columns),
194 |             values=("?, " * len(columns))[:-2],
195 |         )
196 | 
197 |         # Build row of insert values
198 |         row = self.build(index, source, question, answer)
199 | 
200 |         # Execute insert statement
201 |         db.execute(insert, self.values(table, row, columns))
202 | 
203 |     def build(self, index, source, question, answer):
204 |         """
205 |         Builds a consolidated question row.
206 | 
207 |         Args:
208 |             index: row index
209 |             source: question source
210 |             question: question row
211 |             answer: answer row
212 | 
213 |         Returns:
214 |             row tuple
215 |         """
216 | 
217 |         # Parse tags into list of tags
218 |         tags = re.sub(r"[<>]", " ", question[6]).split() if question[6] else None
219 | 
220 |         # Get user display name, fallback to user id
221 |         quser = question[3] if question[3] else str(question[2])
222 |         auser = answer[2] if answer[2] else str(answer[1])
223 | 
224 |         # Create URL reference
225 |         reference = f"{DB2QA.SOURCES[source]}/questions/{question[0]}"
226 | 
227 |         # Id, Source, SourceId, Date, Tags, Question, QuestionUser, Answer, AnswerUser, Reference
228 |         return (
229 |             index,
230 |             source,
231 |             question[0],
232 |             question[4],
233 |             " ".join(tags),
234 |             question[5],
235 |             quser,
236 |             answer[0],
237 |             auser,
238 |             reference,
239 |         )
240 | 
241 |     def values(self, table, row, columns):
242 |         """
243 |         Formats and converts row into database types based on table schema.
244 | 
245 |         Args:
246 |             table: table schema
247 |             row: row tuple
248 |             columns: column names
249 | 
250 |         Returns:
251 |             Database schema formatted row tuple
252 |         """
253 | 
254 |         values = []
255 |         for x, column in enumerate(columns):
256 |             # Get value
257 |             value = row[x]
258 | 
259 |             if table[column].startswith("INTEGER"):
260 |                 values.append(int(value) if value else 0)
261 |             elif table[column] == "BOOLEAN":
262 |                 values.append(1 if value == "TRUE" else 0)
263 |             else:
264 |                 values.append(value)
265 | 
266 |         return values
267 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    Copyright 2020- NeuML LLC
179 | 
180 |    Licensed under the Apache License, Version 2.0 (the "License");
181 |    you may not use this file except in compliance with the License.
182 |    You may obtain a copy of the License at
183 | 
184 |        http://www.apache.org/licenses/LICENSE-2.0
185 | 
186 |    Unless required by applicable law or agreed to in writing, software
187 |    distributed under the License is distributed on an "AS IS" BASIS,
188 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
189 |    See the License for the specific language governing permissions and
190 |    limitations under the License.


--------------------------------------------------------------------------------
/test/stackexchange/query.txt:
--------------------------------------------------------------------------------
  1 | android get screen dimensions|4743116|stackoverflow|Get screen width and height in Android|android
  2 | android unique device id|2785485|stackoverflow|Is there a unique Android device ID?|android uniqueidentifier
  3 | apt list all installed packages|17823|askubuntu|How to list all installed packages|apt package-management
  4 | assembly pop stack|4584089|stackoverflow|What is the function of the push / pop instructions used on registers in x86 assembly?|assembly x86 stack terminology
  5 | bash create uuid|103359|serverfault|How to create a UUID in bash?|bash uuid
  6 | bash current time milliseconds|151109|serverfault|How do I get the current Unix time in milliseconds in Bash?|bash unix time
  7 | bash list files|7265272|stackoverflow|How to list files in directory using bash?|bash
  8 | bash sort du by output size|62411|serverfault|How can I sort du -h output by size|linux bash du gnu
  9 | c cast malloc|605845|stackoverflow|Do I cast the result of malloc?|c malloc casting
 10 | c++ convert string lower case|313970|stackoverflow|How to convert std::string to lower case?|c++ string c++-standard-library tolower
 11 | centos print current version|54987|unix|How to determine CentOS version?|centos version
 12 | centos upgrade 5 to 6|309053|superuser|How to upgrade CentOS 5.6 to 6.0?|linux centos
 13 | centos when does cron.daily run|135906|serverfault|When does `cron.daily` run?|linux centos redhat cron
 14 | c++ read file into ascii string|2602013|stackoverflow|Read whole ASCII file into C++ std::string|c++ string caching file-io standard-library
 15 | c++ sleep|4184468|stackoverflow|Sleep for milliseconds|c++ linux sleep
 16 | cuda gcc version|6622454|stackoverflow|CUDA incompatible with my gcc version|gcc cuda debian
 17 | cuda get version|9727688|stackoverflow|How to get the cuda version?|cuda
 18 | curl show http status|272265|superuser|Getting curl to output HTTP status code?|http curl status
 19 | database inner join vs outer join|38549|stackoverflow|What is the difference between "INNER JOIN" and "OUTER JOIN"?|sql database join inner-join outer-join
 20 | docker vs virtual machine|16047306|stackoverflow|How is Docker different from a virtual machine?|docker containers virtual-machine virtualization
 21 | dpkg install deb file|159094|unix|How to install a deb file, by dpkg -i or by apt?|software-installation apt dpkg deb
 22 | ec2 delete terminated instance|393417|serverfault|Delete Amazon EC2 terminated instance|amazon-ec2
 23 | fedora print current version|540603|stackoverflow|How can I find the version of the Fedora I use?|linux fedora
 24 | fedora test fstab|174181|serverfault|How do you validate fstab without rebooting?|fedora mount fstab
 25 | gcc get assembly output|137038|stackoverflow|How do you get assembler output from C/C++ source in gcc?|c++ c debugging gcc assembly
 26 | gcc vs g++|172587|stackoverflow|What is the difference between g++ and gcc?|c++ gcc g++
 27 | git add empty directory|115983|stackoverflow|How can I add an empty directory to a Git repository?|git directory git-add
 28 | git discard unstaged changes|52704|stackoverflow|How do I discard unstaged changes in Git?|git version-control
 29 | git pull vs fetch|292357|stackoverflow|What is the difference between 'git pull' and 'git fetch'?|git version-control git-pull git-fetch
 30 | git undo most recent local commit|927358|stackoverflow|How do I undo the most recent local commits in Git?|git version-control git-commit undo pre-commit
 31 | java fix nullpointerexception|218384|stackoverflow|What is a NullPointerException, and how do I fix it?|java nullpointerexception
 32 | java get random number|5887709|stackoverflow|Getting random numbers in Java|java random
 33 | java hashmap vs hashtable|40471|stackoverflow|What are the differences between a HashMap and a Hashtable in Java?|java collections hashmap hashtable
 34 | java heap space error|37335|stackoverflow|How to deal with "java.lang.OutOfMemoryError: Java heap space" error?|java java-ee jvm out-of-memory heap-memory
 35 | java outofmemoryerror poi|6069847|stackoverflow|java.lang.OutOfMemoryError: Java heap space while reading excel with Apache POI|java apache-poi
 36 | java print date|26717733|stackoverflow|print current date in java|java time
 37 | java round decimal|11701399|stackoverflow|round up to 2 decimal places in java?|java
 38 | javascript check if string contains substring|1789945|stackoverflow|How to check whether a string contains a substring in JavaScript?|javascript string substring string-matching
 39 | javascript create timestamp|221294|stackoverflow|How do you get a timestamp in JavaScript?|javascript date datetime timestamp unix-timestamp
 40 | javascript encode url|332872|stackoverflow|Encode URL in JavaScript?|javascript url encoding
 41 | javascript for-each in array|9329446|stackoverflow|For-each over an array in JavaScript?|javascript arrays loops foreach iteration
 42 | javascript generate uuid|105034|stackoverflow|Create GUID / UUID in JavaScript?|javascript guid uuid
 43 | javascript html5 local storage add object|2010892|stackoverflow|How to store objects in HTML5 localStorage/sessionStorage|javascript html local-storage
 44 | javascript include file in another file|950087|stackoverflow|How do I include a JavaScript file in another JavaScript file?|javascript file import include
 45 | javascript redirect to another website|503093|stackoverflow|How do I redirect to another webpage?|javascript jquery redirect
 46 | javascript remove element from array|5767325|stackoverflow|How can I remove a specific item from an array?|javascript arrays
 47 | javascript replace all occurrences in string|1144783|stackoverflow|How do I replace all occurrences of a string in JavaScript?|javascript string replace
  48 | javascript validate email address|46155|stackoverflow|How can I validate an email address in JavaScript?|javascript html regex email-validation
 49 | java array sort|8938235|stackoverflow|Sort an array in Java|java arrays
 50 | java split string|3481828|stackoverflow|How do I split a string in Java?|java string split
 51 | java stringbuilder vs stringbuffer|355089|stackoverflow|Difference between StringBuilder and StringBuffer|java stringbuilder stringbuffer
 52 | java string to int|5585779|stackoverflow|How do I convert a String to an int in Java?|java string int type-conversion
 53 | java wait vs sleep|1036754|stackoverflow|Difference between "wait()" and "sleep()" in Java|java multithreading wait sleep java-threads
 54 | java ways to iterate list|18410035|stackoverflow|Ways to iterate over a list in Java|java loops collections iteration
 55 | java write file|2885173|stackoverflow|How do I create a file and write to it in Java?|java file-io
 56 | json comments|244777|stackoverflow|Can comments be used in JSON?|json comments
 57 | json proper content type|477816|stackoverflow|Which JSON content type do I use?|json mime-types content-type
 58 | linux check if port open|309052|serverfault|Check if port is open or closed on a Linux server?|linux port telnet
 59 | linux find files with specific text|16956810|stackoverflow|How to find all files containing specific text (string) on Linux?|linux text grep directory find
 60 | ping specific port|309357|serverfault|Ping a Specific Port|ping
 61 | psql list all databases and tables|1285|dba|How do I list all databases and tables using psql?|postgresql tools psql command-line
 62 | python add new keys to dictionary|1024847|stackoverflow|How can I add new keys to a dictionary?|python dictionary lookup
 63 | python aes decryption|12524994|stackoverflow|Encrypt & Decrypt using PyCrypto AES 256|python encryption padding pycrypto initialization-vector
 64 | python call external script|9318581|stackoverflow|Python - how do I call external python programs?|python call external
 65 | python check if list empty|53513|stackoverflow|How do I check if a list is empty?|python list
 66 | python clone list|2612802|stackoverflow|How do I clone a list so that it doesn't change unexpectedly after assignment?|python list reference copy clone
 67 | python contains string|3437059|stackoverflow|Does Python have a string 'contains' substring method?|python string substring contains
 68 | python convert int to bytes|21017698|stackoverflow|Converting int to bytes in Python 3|python python-3.x
 69 | python google sheets api|56084171|stackoverflow|Accessing Google Sheets Api with Python|stackoverflow python google-sheets-api
 70 | python opencv draw rectangle|23720875|stackoverflow|How to draw a rectangle around a region of interest in python|python opencv computer-vision draw
 71 | python parse csv|12296585|stackoverflow|Python Parse CSV Correctly|python parsing csv
 72 | python parse float|379906|stackoverflow|How do I parse a string to a float or int?|python parsing floating-point type-conversion integer
 73 | python parse json|7771011|stackoverflow|How to parse data in JSON format?|python json parsing
 74 | python pdf extract text|15583535|stackoverflow|How to extract text from a PDF file in Python?|python pypdf
 75 | python read wav file|2060628|stackoverflow|Reading *.wav files in Python|python audio wav wave
 76 | python sort dictionary by value|613183|stackoverflow|How do I sort a dictionary by value?|python sorting dictionary
 77 | python staticmethod vs classmethod|136097|stackoverflow|Difference between @staticmethod and @classmethod|python oop methods python-decorators
 78 | python utc to localtime|4770297|stackoverflow|Convert UTC datetime string to local datetime|python datetime utc localtime
 79 | python yield generator|41136410|stackoverflow|Python `yield from`, or return a generator?|python function return generator
 80 | restful programming|671118|stackoverflow|What is RESTful programming?|rest http architecture definition
 81 | rest put vs post|630453|stackoverflow|What is the difference between POST and PUT in HTTP?|http rest post put
 82 | ruby shell command|2232|stackoverflow|How to call shell commands from Ruby|ruby shell interop
 83 | ruby switch statement|948135|stackoverflow|How to write a switch statement in Ruby|ruby switch-statement conditional-statements
 84 | scp recursive copy directories|264595|serverfault|Can scp copy directories recursively?|linux scp
 85 | server room identify burning smell|496139|serverfault|Something is burning in the server room; how can I quickly identify what it is?|hardware
 86 | sftp port|74176|serverfault|What port does SFTP use?|sftp
 87 | ssh automate script with password|241588|serverfault|How to automate SSH login with password?|ssh password automation
 88 | ssh diff|59140|serverfault|How do diff over ssh?|diff
 89 | svd for pca|134282|stats|Relationship between SVD and PCA. How to use SVD to perform PCA?|pca dimensionality-reduction matrix svd faq
 90 | tensorflow install ubuntu 14.04|41875915|stackoverflow|Install tensorflow on Ubuntu 14.04|python python-2.7 ubuntu tensorflow pip
 91 | ubuntu show current version|12493|askubuntu|How can I find the version of Ubuntu that is installed?|versions release-management
 92 | ubuntu install xfce|116602|askubuntu|How to install Xfce desktop environment?|xfce
 93 | unzip zip file terminal|86849|askubuntu|How to unzip a zip file from the Terminal?|command-line zip
 94 | vi copy text clipboard|84|vi|How can I copy text to the system clipboard from Vim?|cut-copy-paste os-clipboard
 95 | windows check cpu temperature|395434|superuser|How can I check the temperature of my CPU in Windows?|windows cpu temperature
 96 | windows compare pdf files|46123|superuser|How to compare the differences between two PDF files on Windows?|windows pdf file-comparison
 97 | windows ctrl+alt+delete remote desktop|57222|serverfault|How to send ctrl+alt+del using Remote Desktop?|windows remote-desktop
 98 | windows list running processes command line|914782|superuser|How do you list all processes on the command line in Windows?|windows command-line
 99 | windows sudo|9652720|stackoverflow|How to run 'sudo' command in windows|windows
100 | windows wireless keyboard toaster|792607|superuser|Why does Windows think that my wireless keyboard is a toaster?|windows-7 device-manager
101 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <p align="center">
  2 |     <img src="https://raw.githubusercontent.com/neuml/codequestion/master/logo.png"/>
  3 | </p>
  4 | 
  5 | <p align="center">
  6 |     <b>Semantic search for developers</b>
  7 | </p>
  8 | 
  9 | <p align="center">
 10 |     <a href="https://github.com/neuml/codequestion/releases">
 11 |         <img src="https://img.shields.io/github/release/neuml/codequestion.svg?style=flat&color=success" alt="Version"/>
 12 |     </a>
 13 |     <a href="https://github.com/neuml/codequestion">
 14 |         <img src="https://img.shields.io/github/last-commit/neuml/codequestion.svg?style=flat&color=blue" alt="GitHub last commit"/>
 15 |     </a>
 16 |     <a href="https://github.com/neuml/codequestion/issues">
 17 |         <img src="https://img.shields.io/github/issues/neuml/codequestion.svg?style=flat&color=success" alt="GitHub issues"/>
 18 |     </a>
 19 |     <a href="https://github.com/neuml/codequestion/actions?query=workflow%3Abuild">
 20 |         <img src="https://github.com/neuml/codequestion/workflows/build/badge.svg" alt="Build Status"/>
 21 |     </a>
 22 |     <a href="https://coveralls.io/github/neuml/codequestion?branch=master">
 23 |         <img src="https://img.shields.io/coverallsCoverage/github/neuml/codequestion" alt="Coverage Status">
 24 |     </a>
 25 | </p>
 26 | 
 27 | -------------------------------------------------------------------------------------------------------------------------------------------------------
 28 | 
 29 | codequestion is a semantic search application for developer questions.
 30 | 
 31 | ![demo](https://raw.githubusercontent.com/neuml/codequestion/master/demo.gif)
 32 | 
 33 | Developers typically have a web browser window open while they work and run web searches as questions arise. With codequestion, this can be done from a local context. This application executes similarity queries to find similar questions to the input query.
 34 | 
  35 | The default model for codequestion is built off the [Stack Exchange Dumps on archive.org](https://archive.org/details/stackexchange). Once a model is installed, codequestion runs locally; no network connection is required.
 36 | 
 37 | ![architecture](https://raw.githubusercontent.com/neuml/codequestion/master/images/architecture.png#gh-light-mode-only)
 38 | ![architecture](https://raw.githubusercontent.com/neuml/codequestion/master/images/architecture-dark.png#gh-dark-mode-only)
 39 | 
 40 | codequestion is built with Python 3.8+ and [txtai](https://github.com/neuml/txtai).
 41 | 
 42 | ## Installation
 43 | 
 44 | The easiest way to install is via pip and PyPI
 45 | 
 46 | ```
 47 | pip install codequestion
 48 | ```
 49 | 
 50 | Python 3.8+ is supported. Using a Python [virtual environment](https://docs.python.org/3/library/venv.html) is recommended.
 51 | 
 52 | codequestion can also be installed directly from GitHub to access the latest, unreleased features.
 53 | 
 54 | ```
 55 | pip install git+https://github.com/neuml/codequestion
 56 | ```
 57 | 
 58 | See [this link](https://neuml.github.io/txtai/install/#environment-specific-prerequisites) for environment-specific troubleshooting.
 59 | 
 60 | ## Download a model
 61 | 
 62 | Once codequestion is installed, a model needs to be downloaded.
 63 | 
 64 | ```
 65 | python -m codequestion.download
 66 | ```
 67 | 
 68 | The model will be stored in ~/.codequestion/
 69 | 
 70 | The model can also be manually installed if the machine doesn't have direct internet access. The default model is pulled from the [GitHub release page](https://github.com/neuml/codequestion/releases)
 71 | 
 72 | ```
 73 | unzip cqmodel.zip ~/.codequestion
 74 | ```
 75 | 
 76 | ## Search
 77 | 
 78 | Start up a codequestion shell to get started.
 79 | 
 80 | ```
 81 | codequestion
 82 | ```
 83 | 
 84 | A prompt will appear. Queries can be typed into the console. Type `help` to see all available commands.
 85 | 
 86 | ![demo](https://raw.githubusercontent.com/neuml/codequestion/master/demo.gif)
 87 | 
 88 | ## Topics
 89 | 
 90 | The latest release integrates [txtai 5.0](https://medium.com/neuml/whats-new-in-txtai-5-0-e5c75a13b101), which has support for semantic graphs.
 91 | 
 92 | Semantic graphs add support for topic modeling and path traversal. Topics organize questions into groups with similar concepts. Path traversal uses the semantic graph to show how two potentially disparate entries are connected. An example covering both topic and path traversal is shown below.
 93 | 
 94 | ![topics](https://raw.githubusercontent.com/neuml/codequestion/master/images/topics.gif)
 95 | 
 96 | ## VS Code
 97 | 
 98 | A codequestion prompt can be started within Visual Studio Code. This enables asking coding questions right from your IDE.
 99 | 
100 | Run `` Ctrl+` `` to open a new terminal then type `codequestion`.
101 | 
102 | ![vscode](https://raw.githubusercontent.com/neuml/codequestion/master/images/vscode.png)
103 | 
104 | ## API service
105 | 
106 | codequestion builds a standard txtai embeddings index. As such, it supports hosting the index via a [txtai API service](https://neuml.github.io/txtai/api).
107 | 
108 | Running the following:
109 | 
110 | _app.yml_
111 | ```yaml
112 | path: /home/user/.codequestion/models/stackexchange/
113 | embeddings:
114 | ```
115 | 
116 | ```
117 | # Install API extra
118 | pip install txtai[api]
119 | 
120 | # Start API
121 | CONFIG=app.yml uvicorn "txtai.api:app"
122 | 
123 | # Test API
124 | curl "http://127.0.0.1:8000/search?query=python+query+sqlite&limit=1"
125 | ```
126 | 
127 | Outputs:
128 | ```json
129 | [{
130 |     "id":"616429",
131 |     "text":"How to fetch data from sqlite using python? stackoverflow python sqlite",
132 |     "score":0.8401689529418945
133 | }]
134 | ```
135 | 
136 | Additional metadata fields can be pulled back with SQL statements.
137 | 
138 | ```
 139 | curl \
 140 |     --get \
 141 |     --data-urlencode "query=select id, date, tags, question, score from txtai where similar('python query sqlite')" \
 142 |     --data-urlencode "limit=1" \
 143 |     "http://127.0.0.1:8000/search"
144 | ```
145 | 
146 | ```json
147 | [{
148 |     "id":"616429",
149 |     "date":"2022-05-23T10:45:40.397",
150 |     "tags":"python sqlite",
151 |     "question":"How to fetch data from sqlite using python?",
152 |     "score":0.8401689529418945
153 | }]
154 | ```
155 | 
156 | ## Tech overview
157 | The following is an overview covering how this project works.
158 | 
159 | ### Process the raw data dumps
 160 | The raw 7z XML dumps from Stack Exchange are processed through a series of steps (see [build a model](#build-a-model)). Only highly scored questions with accepted answers are retrieved for storage in the model. Questions and answers are consolidated into a single SQLite file called questions.db. The schema for questions.db is below.
161 | 
162 | *questions.db schema*
163 | 
164 |     Id INTEGER PRIMARY KEY
165 |     Source TEXT
166 |     SourceId INTEGER
167 |     Date DATETIME
168 |     Tags TEXT
169 |     Question TEXT
170 |     QuestionUser TEXT
171 |     Answer TEXT
172 |     AnswerUser TEXT
173 |     Reference TEXT
174 | 
175 | ### Index
176 | codequestion builds a txtai embeddings index for questions.db. Each question in the questions.db schema is vectorized with a sentence-transformers model. Once questions.db is converted to a collection of sentence embeddings, the embeddings are normalized and stored in Faiss, which enables fast similarity searches.
177 | 
178 | ### Query
179 | codequestion tokenizes each query using the same method as during indexing. Those tokens are used to build a sentence embedding. That embedding is queried against the Faiss index to find the most similar questions.
180 | 
181 | ## Build a model
182 | The following steps show how to build a codequestion model using Stack Exchange archives.
183 | 
184 | _This is not necessary if using the default model from the [GitHub release page](https://github.com/neuml/codequestion/releases)_
185 | 
186 | 1.) Download files from Stack Exchange: https://archive.org/details/stackexchange
187 | 
 188 | 2.) Place selected files into a directory structure like the one shown below (current process requires all these files).
189 | 
190 | - stackexchange/ai/ai.stackexchange.com.7z
191 | - stackexchange/android/android.stackexchange.com.7z
192 | - stackexchange/apple/apple.stackexchange.com.7z
193 | - stackexchange/arduino/arduino.stackexchange.com.7z
194 | - stackexchange/askubuntu/askubuntu.com.7z
195 | - stackexchange/avp/avp.stackexchange.com.7z
196 | - stackexchange/codereview/codereview.stackexchange.com.7z
197 | - stackexchange/cs/cs.stackexchange.com.7z
198 | - stackexchange/datascience/datascience.stackexchange.com.7z
199 | - stackexchange/dba/dba.stackexchange.com.7z
200 | - stackexchange/devops/devops.stackexchange.com.7z
201 | - stackexchange/dsp/dsp.stackexchange.com.7z
202 | - stackexchange/raspberrypi/raspberrypi.stackexchange.com.7z
203 | - stackexchange/reverseengineering/reverseengineering.stackexchange.com.7z
204 | - stackexchange/scicomp/scicomp.stackexchange.com.7z
205 | - stackexchange/security/security.stackexchange.com.7z
206 | - stackexchange/serverfault/serverfault.com.7z
207 | - stackexchange/stackoverflow/stackoverflow.com-Posts.7z
208 | - stackexchange/stats/stats.stackexchange.com.7z
209 | - stackexchange/superuser/superuser.com.7z
210 | - stackexchange/unix/unix.stackexchange.com.7z
211 | - stackexchange/vi/vi.stackexchange.com.7z
212 | - stackexchange/wordpress/wordpress.stackexchange.com.7z
213 | 
214 | 3.) Run the ETL process
215 | 
216 | ```
217 | python -m codequestion.etl.stackexchange.execute stackexchange
218 | ```
219 | 
220 | This will create the file stackexchange/questions.db
221 | 
222 | 4.) __OPTIONAL:__ Build word vectors - only necessary if using a word vectors model. If using word vector models, make sure to run `pip install txtai[similarity]`
223 | 
224 | ```
225 | python -m codequestion.vectors stackexchange/questions.db
226 | ```
227 | 
228 | This will create the file ~/.codequestion/vectors/stackexchange-300d.magnitude
229 | 
230 | 5.) Build embeddings index
231 | 
232 | ```
233 | python -m codequestion.index index.yml stackexchange/questions.db
234 | ```
235 | 
236 | The [default index.yml](https://raw.githubusercontent.com/neuml/codequestion/master/config/index.yml) file is found on GitHub. Settings can be changed to customize how the index is built.
237 | 
238 | After this step, the index is created and all necessary files are ready to query.
239 | 
240 | ## Model accuracy
241 | The following sections show test results for codequestion v2 and codequestion v1 using the latest Stack Exchange dumps. Version 2 uses a sentence-transformers model. Version 1 uses a word vectors model with BM25 weighting. BM25 and TF-IDF are shown to establish a baseline score.
242 | 
243 | **StackExchange Query**
244 | 
245 | Models are scored using [Mean Reciprocal Rank (MRR)](https://en.wikipedia.org/wiki/Mean_reciprocal_rank).
246 | 
247 | | Model               | MRR   |
248 | | ------------------- | :---: |
249 | | all-MiniLM-L6-v2    | 85.0  |
250 | | SE 300d - BM25      | 77.1  |
251 | | BM25                | 67.7  |
252 | | TF-IDF              | 61.7  |
253 | 
254 | **STS Benchmark**
255 | 
256 | Models are scored using [Pearson Correlation](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient). Note that the word vectors model is only trained on Stack Exchange data, so it isn't expected to generalize as well against the STS dataset.
257 | 
258 | | Model            | Supervision   | Dev   | Test  |
259 | | ---------------- | :-----------: | :---: | :---: |
260 | | all-MiniLM-L6-v2 | Train         | 87.0  | 82.7  |
261 | | SE 300d - BM25   | Train         | 74.0  | 67.4  |
262 | 
263 | ## Tests
264 | To reproduce the tests above, run the following. Substitute $TEST_PATH with any local path.
265 | 
266 |     mkdir -p $TEST_PATH
267 |     wget https://raw.githubusercontent.com/neuml/codequestion/master/test/stackexchange/query.txt -P $TEST_PATH/stackexchange
268 |     wget http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz
269 |     tar -C $TEST_PATH -xvzf Stsbenchmark.tar.gz
270 |     python -m codequestion.evaluate -s test -p $TEST_PATH
271 | 
272 | ## Further reading
273 | 
274 | - [Find answers with codequestion 2.0](https://medium.com/neuml/find-answers-with-codequestion-2-0-50b2cfd8c8fe)
275 | - [Building a sentence embedding index with fastText and BM25 (codequestion 1.0)](https://towardsdatascience.com/building-a-sentence-embedding-index-with-fasttext-and-bm25-f07e7148d240)
276 | 


--------------------------------------------------------------------------------
/images/architecture.excalidraw:
--------------------------------------------------------------------------------
   1 | {
   2 |   "type": "excalidraw",
   3 |   "version": 2,
   4 |   "source": "https://excalidraw.com",
   5 |   "elements": [
   6 |     {
   7 |       "type": "text",
   8 |       "version": 780,
   9 |       "versionNonce": 881355380,
  10 |       "isDeleted": false,
  11 |       "id": "Buic2Lx427wuSIW8P_Rw5",
  12 |       "fillStyle": "hachure",
  13 |       "strokeWidth": 1,
  14 |       "strokeStyle": "solid",
  15 |       "roughness": 1,
  16 |       "opacity": 100,
  17 |       "angle": 0,
  18 |       "x": 736,
  19 |       "y": 179,
  20 |       "strokeColor": "#000000",
  21 |       "backgroundColor": "#228be6",
  22 |       "width": 658,
  23 |       "height": 46,
  24 |       "seed": 373648901,
  25 |       "groupIds": [],
  26 |       "roundness": null,
  27 |       "boundElements": [],
  28 |       "updated": 1674310362127,
  29 |       "link": null,
  30 |       "locked": false,
  31 |       "fontSize": 36,
  32 |       "fontFamily": 1,
  33 |       "text": "Semantic search for coding questions",
  34 |       "baseline": 32,
  35 |       "textAlign": "left",
  36 |       "verticalAlign": "top",
  37 |       "containerId": null,
  38 |       "originalText": "Semantic search for coding questions"
  39 |     },
  40 |     {
  41 |       "type": "rectangle",
  42 |       "version": 2366,
  43 |       "versionNonce": 106997452,
  44 |       "isDeleted": false,
  45 |       "id": "U2NgEIEiFpAlwmv5Xnyzr",
  46 |       "fillStyle": "hachure",
  47 |       "strokeWidth": 1,
  48 |       "strokeStyle": "solid",
  49 |       "roughness": 1,
  50 |       "opacity": 40,
  51 |       "angle": 0,
  52 |       "x": 536.6352719532887,
  53 |       "y": 425.4438888888891,
  54 |       "strokeColor": "#000000",
  55 |       "backgroundColor": "#000000",
  56 |       "width": 1018.3567583077031,
  57 |       "height": 410.1764927948917,
  58 |       "seed": 1946478225,
  59 |       "groupIds": [],
  60 |       "roundness": null,
  61 |       "boundElements": [],
  62 |       "updated": 1674310362127,
  63 |       "link": "",
  64 |       "locked": false
  65 |     },
  66 |     {
  67 |       "type": "rectangle",
  68 |       "version": 1576,
  69 |       "versionNonce": 175832052,
  70 |       "isDeleted": false,
  71 |       "id": "UO6MS3wSDu7yg2421__LI",
  72 |       "fillStyle": "hachure",
  73 |       "strokeWidth": 1,
  74 |       "strokeStyle": "solid",
  75 |       "roughness": 1,
  76 |       "opacity": 100,
  77 |       "angle": 0,
  78 |       "x": 934.1111111111111,
  79 |       "y": 267,
  80 |       "strokeColor": "#ffeb3b",
  81 |       "backgroundColor": "#ffeb3b",
  82 |       "width": 214,
  83 |       "height": 49,
  84 |       "seed": 1629565989,
  85 |       "groupIds": [
  86 |         "3sURMvhuRfR0M-Q3VRPbg"
  87 |       ],
  88 |       "roundness": null,
  89 |       "boundElements": [
  90 |         {
  91 |           "type": "text",
  92 |           "id": "8sp7H8ijWBlh6aMgZ0XTP"
  93 |         },
  94 |         {
  95 |           "id": "Qzp41i_jzQIBlAB_qFKFH",
  96 |           "type": "arrow"
  97 |         },
  98 |         {
  99 |           "id": "SJ0F0Y81z9hir5qQWAJjk",
 100 |           "type": "arrow"
 101 |         }
 102 |       ],
 103 |       "updated": 1674310362127,
 104 |       "link": null,
 105 |       "locked": false
 106 |     },
 107 |     {
 108 |       "type": "rectangle",
 109 |       "version": 2351,
 110 |       "versionNonce": 733605196,
 111 |       "isDeleted": false,
 112 |       "id": "qYd3q0Vjks7VOHUC9RR51",
 113 |       "fillStyle": "hachure",
 114 |       "strokeWidth": 1,
 115 |       "strokeStyle": "solid",
 116 |       "roughness": 1,
 117 |       "opacity": 100,
 118 |       "angle": 0,
 119 |       "x": 550.1111111111111,
 120 |       "y": 267.5,
 121 |       "strokeColor": "#03a9f4",
 122 |       "backgroundColor": "#03a9f4",
 123 |       "width": 219,
 124 |       "height": 52,
 125 |       "seed": 1441952427,
 126 |       "groupIds": [
 127 |         "3sURMvhuRfR0M-Q3VRPbg"
 128 |       ],
 129 |       "roundness": null,
 130 |       "boundElements": [
 131 |         {
 132 |           "type": "text",
 133 |           "id": "WPeWn6N4rCHf0jY16N9Ge"
 134 |         },
 135 |         {
 136 |           "id": "Qzp41i_jzQIBlAB_qFKFH",
 137 |           "type": "arrow"
 138 |         }
 139 |       ],
 140 |       "updated": 1674310362127,
 141 |       "link": null,
 142 |       "locked": false
 143 |     },
 144 |     {
 145 |       "type": "text",
 146 |       "version": 2088,
 147 |       "versionNonce": 1998544244,
 148 |       "isDeleted": false,
 149 |       "id": "WPeWn6N4rCHf0jY16N9Ge",
 150 |       "fillStyle": "hachure",
 151 |       "strokeWidth": 1,
 152 |       "strokeStyle": "solid",
 153 |       "roughness": 1,
 154 |       "opacity": 100,
 155 |       "angle": 0,
 156 |       "x": 629.6111111111111,
 157 |       "y": 274,
 158 |       "strokeColor": "#000",
 159 |       "backgroundColor": "#fa5252",
 160 |       "width": 60,
 161 |       "height": 39,
 162 |       "seed": 870516459,
 163 |       "groupIds": [
 164 |         "3sURMvhuRfR0M-Q3VRPbg"
 165 |       ],
 166 |       "roundness": null,
 167 |       "boundElements": [],
 168 |       "updated": 1674310362127,
 169 |       "link": null,
 170 |       "locked": false,
 171 |       "fontSize": 28,
 172 |       "fontFamily": 1,
 173 |       "text": "ETL",
 174 |       "baseline": 27,
 175 |       "textAlign": "center",
 176 |       "verticalAlign": "middle",
 177 |       "containerId": "qYd3q0Vjks7VOHUC9RR51",
 178 |       "originalText": "ETL"
 179 |     },
 180 |     {
 181 |       "type": "rectangle",
 182 |       "version": 1785,
 183 |       "versionNonce": 321283020,
 184 |       "isDeleted": false,
 185 |       "id": "5VuUdI_BsJ5pyE1nTqJUI",
 186 |       "fillStyle": "hachure",
 187 |       "strokeWidth": 1,
 188 |       "strokeStyle": "solid",
 189 |       "roughness": 1,
 190 |       "opacity": 100,
 191 |       "angle": 0,
 192 |       "x": 1333.111111111111,
 193 |       "y": 268,
 194 |       "strokeColor": "#00e676",
 195 |       "backgroundColor": "#00e676",
 196 |       "width": 218,
 197 |       "height": 49,
 198 |       "seed": 1044404613,
 199 |       "groupIds": [
 200 |         "3sURMvhuRfR0M-Q3VRPbg"
 201 |       ],
 202 |       "roundness": null,
 203 |       "boundElements": [
 204 |         {
 205 |           "id": "bJJ9SGsJsvT071qBBH0w5",
 206 |           "type": "text"
 207 |         },
 208 |         {
 209 |           "id": "SJ0F0Y81z9hir5qQWAJjk",
 210 |           "type": "arrow"
 211 |         }
 212 |       ],
 213 |       "updated": 1674310362127,
 214 |       "link": null,
 215 |       "locked": false
 216 |     },
 217 |     {
 218 |       "type": "text",
 219 |       "version": 1985,
 220 |       "versionNonce": 1673236212,
 221 |       "isDeleted": false,
 222 |       "id": "bJJ9SGsJsvT071qBBH0w5",
 223 |       "fillStyle": "hachure",
 224 |       "strokeWidth": 1,
 225 |       "strokeStyle": "solid",
 226 |       "roughness": 1,
 227 |       "opacity": 100,
 228 |       "angle": 0,
 229 |       "x": 1338.111111111111,
 230 |       "y": 274.5,
 231 |       "strokeColor": "#000",
 232 |       "backgroundColor": "#fa5252",
 233 |       "width": 208,
 234 |       "height": 36,
 235 |       "seed": 128953675,
 236 |       "groupIds": [
 237 |         "3sURMvhuRfR0M-Q3VRPbg"
 238 |       ],
 239 |       "roundness": null,
 240 |       "boundElements": [],
 241 |       "updated": 1674310362127,
 242 |       "link": null,
 243 |       "locked": false,
 244 |       "fontSize": 28,
 245 |       "fontFamily": 1,
 246 |       "text": "Search",
 247 |       "baseline": 25,
 248 |       "textAlign": "center",
 249 |       "verticalAlign": "middle",
 250 |       "containerId": "5VuUdI_BsJ5pyE1nTqJUI",
 251 |       "originalText": "Search"
 252 |     },
 253 |     {
 254 |       "type": "text",
 255 |       "version": 1602,
 256 |       "versionNonce": 1925188172,
 257 |       "isDeleted": false,
 258 |       "id": "8sp7H8ijWBlh6aMgZ0XTP",
 259 |       "fillStyle": "hachure",
 260 |       "strokeWidth": 1,
 261 |       "strokeStyle": "solid",
 262 |       "roughness": 1,
 263 |       "opacity": 100,
 264 |       "angle": 0,
 265 |       "x": 939.1111111111111,
 266 |       "y": 273.5,
 267 |       "strokeColor": "#000",
 268 |       "backgroundColor": "transparent",
 269 |       "width": 204,
 270 |       "height": 36,
 271 |       "seed": 1854823263,
 272 |       "groupIds": [
 273 |         "3sURMvhuRfR0M-Q3VRPbg"
 274 |       ],
 275 |       "roundness": null,
 276 |       "boundElements": [],
 277 |       "updated": 1674310362127,
 278 |       "link": null,
 279 |       "locked": false,
 280 |       "fontSize": 28,
 281 |       "fontFamily": 1,
 282 |       "text": "Index",
 283 |       "baseline": 25,
 284 |       "textAlign": "center",
 285 |       "verticalAlign": "middle",
 286 |       "containerId": "UO6MS3wSDu7yg2421__LI",
 287 |       "originalText": "Index"
 288 |     },
 289 |     {
 290 |       "type": "text",
 291 |       "version": 1134,
 292 |       "versionNonce": 550867060,
 293 |       "isDeleted": false,
 294 |       "id": "jWJpSXHkTCzRTCA4tbAgv",
 295 |       "fillStyle": "hachure",
 296 |       "strokeWidth": 1,
 297 |       "strokeStyle": "solid",
 298 |       "roughness": 1,
 299 |       "opacity": 100,
 300 |       "angle": 0,
 301 |       "x": 549.6111111111111,
 302 |       "y": 347.30499999999995,
 303 |       "strokeColor": "#000",
 304 |       "backgroundColor": "#03a9f4",
 305 |       "width": 270,
 306 |       "height": 42,
 307 |       "seed": 1241563487,
 308 |       "groupIds": [
 309 |         "3sURMvhuRfR0M-Q3VRPbg"
 310 |       ],
 311 |       "roundness": null,
 312 |       "boundElements": [],
 313 |       "updated": 1674310362127,
 314 |       "link": null,
 315 |       "locked": false,
 316 |       "fontSize": 16,
 317 |       "fontFamily": 1,
 318 |       "text": "- Parse and transform input\n- Filter down to \"popular\" answers",
 319 |       "baseline": 36,
 320 |       "textAlign": "left",
 321 |       "verticalAlign": "top",
 322 |       "containerId": null,
 323 |       "originalText": "- Parse and transform input\n- Filter down to \"popular\" answers"
 324 |     },
 325 |     {
 326 |       "type": "text",
 327 |       "version": 1121,
 328 |       "versionNonce": 761323724,
 329 |       "isDeleted": false,
 330 |       "id": "qEnmXs0P_MQE8r4c4OWGh",
 331 |       "fillStyle": "hachure",
 332 |       "strokeWidth": 1,
 333 |       "strokeStyle": "solid",
 334 |       "roughness": 1,
 335 |       "opacity": 100,
 336 |       "angle": 0,
 337 |       "x": 932.6111111111111,
 338 |       "y": 346.2074999999999,
 339 |       "strokeColor": "#000",
 340 |       "backgroundColor": "#f44336",
 341 |       "width": 245,
 342 |       "height": 42,
 343 |       "seed": 1038536465,
 344 |       "groupIds": [
 345 |         "3sURMvhuRfR0M-Q3VRPbg"
 346 |       ],
 347 |       "roundness": null,
 348 |       "boundElements": [],
 349 |       "updated": 1674310362127,
 350 |       "link": null,
 351 |       "locked": false,
 352 |       "fontSize": 16,
 353 |       "fontFamily": 1,
 354 |       "text": "- Transform input into numbers\n- Store content with vectors",
 355 |       "baseline": 36,
 356 |       "textAlign": "left",
 357 |       "verticalAlign": "top",
 358 |       "containerId": null,
 359 |       "originalText": "- Transform input into numbers\n- Store content with vectors"
 360 |     },
 361 |     {
 362 |       "type": "text",
 363 |       "version": 1185,
 364 |       "versionNonce": 2117296628,
 365 |       "isDeleted": false,
 366 |       "id": "1q8bzjK8lnKUZj8_A9v7D",
 367 |       "fillStyle": "hachure",
 368 |       "strokeWidth": 1,
 369 |       "strokeStyle": "solid",
 370 |       "roughness": 1,
 371 |       "opacity": 100,
 372 |       "angle": 0,
 373 |       "x": 1245.111111111111,
 374 |       "y": 349.2074999999999,
 375 |       "strokeColor": "#000",
 376 |       "backgroundColor": "#f44336",
 377 |       "width": 322,
 378 |       "height": 42,
 379 |       "seed": 304472945,
 380 |       "groupIds": [
 381 |         "3sURMvhuRfR0M-Q3VRPbg"
 382 |       ],
 383 |       "roundness": null,
 384 |       "boundElements": [],
 385 |       "updated": 1674310362127,
 386 |       "link": null,
 387 |       "locked": false,
 388 |       "fontSize": 16,
 389 |       "fontFamily": 1,
 390 |       "text": "- Find similar content with vector search\n- Explore topics and relationships",
 391 |       "baseline": 36,
 392 |       "textAlign": "left",
 393 |       "verticalAlign": "top",
 394 |       "containerId": null,
 395 |       "originalText": "- Find similar content with vector search\n- Explore topics and relationships"
 396 |     },
 397 |     {
 398 |       "type": "arrow",
 399 |       "version": 3387,
 400 |       "versionNonce": 983754572,
 401 |       "isDeleted": false,
 402 |       "id": "Qzp41i_jzQIBlAB_qFKFH",
 403 |       "fillStyle": "hachure",
 404 |       "strokeWidth": 1,
 405 |       "strokeStyle": "solid",
 406 |       "roughness": 1,
 407 |       "opacity": 100,
 408 |       "angle": 0,
 409 |       "x": 771.6111111111111,
 410 |       "y": 289.8470411964629,
 411 |       "strokeColor": "#000",
 412 |       "backgroundColor": "#f44336",
 413 |       "width": 158.1310513485223,
 414 |       "height": 0.5692601572380909,
 415 |       "seed": 660786897,
 416 |       "groupIds": [
 417 |         "3sURMvhuRfR0M-Q3VRPbg"
 418 |       ],
 419 |       "roundness": {
 420 |         "type": 2
 421 |       },
 422 |       "boundElements": [],
 423 |       "updated": 1674310362127,
 424 |       "link": null,
 425 |       "locked": false,
 426 |       "startBinding": {
 427 |         "elementId": "qYd3q0Vjks7VOHUC9RR51",
 428 |         "focus": -0.15367587596362536,
 429 |         "gap": 2.5
 430 |       },
 431 |       "endBinding": {
 432 |         "elementId": "UO6MS3wSDu7yg2421__LI",
 433 |         "focus": 0.027437144815141,
 434 |         "gap": 4.3689486514776945
 435 |       },
 436 |       "lastCommittedPoint": null,
 437 |       "startArrowhead": null,
 438 |       "endArrowhead": "arrow",
 439 |       "points": [
 440 |         [
 441 |           0,
 442 |           0
 443 |         ],
 444 |         [
 445 |           158.1310513485223,
 446 |           0.5692601572380909
 447 |         ]
 448 |       ]
 449 |     },
 450 |     {
 451 |       "type": "arrow",
 452 |       "version": 3907,
 453 |       "versionNonce": 1658520436,
 454 |       "isDeleted": false,
 455 |       "id": "SJ0F0Y81z9hir5qQWAJjk",
 456 |       "fillStyle": "hachure",
 457 |       "strokeWidth": 1,
 458 |       "strokeStyle": "solid",
 459 |       "roughness": 1,
 460 |       "opacity": 100,
 461 |       "angle": 0,
 462 |       "x": 1150.611111111111,
 463 |       "y": 292.6790761701911,
 464 |       "strokeColor": "#000",
 465 |       "backgroundColor": "#f44336",
 466 |       "width": 181.5,
 467 |       "height": 1.5898915058209013,
 468 |       "seed": 899541905,
 469 |       "groupIds": [
 470 |         "3sURMvhuRfR0M-Q3VRPbg"
 471 |       ],
 472 |       "roundness": {
 473 |         "type": 2
 474 |       },
 475 |       "boundElements": [],
 476 |       "updated": 1674310362127,
 477 |       "link": null,
 478 |       "locked": false,
 479 |       "startBinding": {
 480 |         "elementId": "UO6MS3wSDu7yg2421__LI",
 481 |         "focus": 0.08406032225724415,
 482 |         "gap": 2.5
 483 |       },
 484 |       "endBinding": {
 485 |         "elementId": "5VuUdI_BsJ5pyE1nTqJUI",
 486 |         "focus": 0.09327847520504394,
 487 |         "gap": 1
 488 |       },
 489 |       "lastCommittedPoint": null,
 490 |       "startArrowhead": null,
 491 |       "endArrowhead": "arrow",
 492 |       "points": [
 493 |         [
 494 |           0,
 495 |           0
 496 |         ],
 497 |         [
 498 |           181.5,
 499 |           -1.5898915058209013
 500 |         ]
 501 |       ]
 502 |     },
 503 |     {
 504 |       "type": "text",
 505 |       "version": 121,
 506 |       "versionNonce": 1369352592,
 507 |       "isDeleted": false,
 508 |       "id": "0S4gs8k1Aw_EE3epHrlwi",
 509 |       "fillStyle": "hachure",
 510 |       "strokeWidth": 1,
 511 |       "strokeStyle": "solid",
 512 |       "roughness": 1,
 513 |       "opacity": 100,
 514 |       "angle": 0,
 515 |       "x": 558.9603174603171,
 516 |       "y": 440.1485317460317,
 517 |       "strokeColor": "#000000",
 518 |       "backgroundColor": "transparent",
 519 |       "width": 520,
 520 |       "height": 52,
 521 |       "seed": 1521084272,
 522 |       "groupIds": [],
 523 |       "roundness": null,
 524 |       "boundElements": [],
 525 |       "updated": 1665016874534,
 526 |       "link": null,
 527 |       "locked": false,
 528 |       "fontSize": 20,
 529 |       "fontFamily": 1,
 530 |       "text": ">>> python build pdf\n---------------------------------------------------------------",
 531 |       "baseline": 44,
 532 |       "textAlign": "left",
 533 |       "verticalAlign": "top",
 534 |       "containerId": null,
 535 |       "originalText": ">>> python build pdf\n---------------------------------------------------------------"
 536 |     },
 537 |     {
 538 |       "type": "text",
 539 |       "version": 948,
 540 |       "versionNonce": 249568716,
 541 |       "isDeleted": false,
 542 |       "id": "hnqGO83Op144jMURaGlCf",
 543 |       "fillStyle": "hachure",
 544 |       "strokeWidth": 1,
 545 |       "strokeStyle": "solid",
 546 |       "roughness": 1,
 547 |       "opacity": 100,
 548 |       "angle": 0,
 549 |       "x": 567.8492063492062,
 550 |       "y": 489.03742063492075,
 551 |       "strokeColor": "#000000",
 552 |       "backgroundColor": "transparent",
 553 |       "width": 397,
 554 |       "height": 156,
 555 |       "seed": 1108820368,
 556 |       "groupIds": [],
 557 |       "roundness": null,
 558 |       "boundElements": [
 559 |         {
 560 |           "id": "f3vLDOpOTtgvPlvxSLtb6",
 561 |           "type": "arrow"
 562 |         }
 563 |       ],
 564 |       "updated": 1674310362127,
 565 |       "link": null,
 566 |       "locked": false,
 567 |       "fontSize": 20,
 568 |       "fontFamily": 1,
 569 |       "text": "\n\nId: 219570\nLast Activity: 2016-11-22T09:07:49.983\nTags: python pdf pdf-generation\nAnswer (by 772200):",
 570 |       "baseline": 148,
 571 |       "textAlign": "left",
 572 |       "verticalAlign": "top",
 573 |       "containerId": null,
 574 |       "originalText": "\n\nId: 219570\nLast Activity: 2016-11-22T09:07:49.983\nTags: python pdf pdf-generation\nAnswer (by 772200):"
 575 |     },
 576 |     {
 577 |       "type": "text",
 578 |       "version": 173,
 579 |       "versionNonce": 1895751536,
 580 |       "isDeleted": false,
 581 |       "id": "AKQSU-yPRu9JKbNgVRi9v",
 582 |       "fillStyle": "hachure",
 583 |       "strokeWidth": 1,
 584 |       "strokeStyle": "solid",
 585 |       "roughness": 1,
 586 |       "opacity": 100,
 587 |       "angle": 0,
 588 |       "x": 566.7380952380952,
 589 |       "y": 630.1485317460314,
 590 |       "strokeColor": "#000000",
 591 |       "backgroundColor": "transparent",
 592 |       "width": 322,
 593 |       "height": 130,
 594 |       "seed": 1679216,
 595 |       "groupIds": [],
 596 |       "roundness": null,
 597 |       "boundElements": [],
 598 |       "updated": 1665016981495,
 599 |       "link": null,
 600 |       "locked": false,
 601 |       "fontSize": 20.069228106611277,
 602 |       "fontFamily": 1,
 603 |       "text": "\nThe two that come to mind are:\n\n • pyPdf2\n • PDFMiner",
 604 |       "baseline": 122,
 605 |       "textAlign": "left",
 606 |       "verticalAlign": "top",
 607 |       "containerId": null,
 608 |       "originalText": "\nThe two that come to mind are:\n\n • pyPdf2\n • PDFMiner"
 609 |     },
 610 |     {
 611 |       "type": "text",
 612 |       "version": 85,
 613 |       "versionNonce": 1520293264,
 614 |       "isDeleted": false,
 615 |       "id": "oWhhEqzBly2k2HiDjxdzq",
 616 |       "fillStyle": "hachure",
 617 |       "strokeWidth": 1,
 618 |       "strokeStyle": "solid",
 619 |       "roughness": 1,
 620 |       "opacity": 100,
 621 |       "angle": 0,
 622 |       "x": 568.9603174603171,
 623 |       "y": 779.0374206349206,
 624 |       "strokeColor": "#03a9f4",
 625 |       "backgroundColor": "transparent",
 626 |       "width": 550,
 627 |       "height": 26,
 628 |       "seed": 1958330768,
 629 |       "groupIds": [],
 630 |       "roundness": null,
 631 |       "boundElements": [],
 632 |       "updated": 1665016908431,
 633 |       "link": null,
 634 |       "locked": false,
 635 |       "fontSize": 20,
 636 |       "fontFamily": 1,
 637 |       "text": "Reference: https://stackoverflow.com/questions/6413441",
 638 |       "baseline": 18,
 639 |       "textAlign": "left",
 640 |       "verticalAlign": "top",
 641 |       "containerId": null,
 642 |       "originalText": "Reference: https://stackoverflow.com/questions/6413441"
 643 |     },
 644 |     {
 645 |       "type": "text",
 646 |       "version": 584,
 647 |       "versionNonce": 1695708404,
 648 |       "isDeleted": false,
 649 |       "id": "E-1wDKL8ZmPnn0ONPMDti",
 650 |       "fillStyle": "hachure",
 651 |       "strokeWidth": 1,
 652 |       "strokeStyle": "solid",
 653 |       "roughness": 1,
 654 |       "opacity": 100,
 655 |       "angle": 0,
 656 |       "x": 566.738095238095,
 657 |       "y": 502.3707539682539,
 658 |       "strokeColor": "#000000",
 659 |       "backgroundColor": "transparent",
 660 |       "width": 517,
 661 |       "height": 26,
 662 |       "seed": 2028071280,
 663 |       "groupIds": [],
 664 |       "roundness": null,
 665 |       "boundElements": [
 666 |         {
 667 |           "id": "Nb5_4C9PyVmUjcNmwmtTf",
 668 |           "type": "arrow"
 669 |         }
 670 |       ],
 671 |       "updated": 1674310362127,
 672 |       "link": null,
 673 |       "locked": false,
 674 |       "fontSize": 20,
 675 |       "fontFamily": 1,
 676 |       "text": "Question (by 312251): Python PDF library [0.801761]",
 677 |       "baseline": 18,
 678 |       "textAlign": "left",
 679 |       "verticalAlign": "top",
 680 |       "containerId": null,
 681 |       "originalText": "Question (by 312251): Python PDF library [0.801761]"
 682 |     },
 683 |     {
 684 |       "type": "arrow",
 685 |       "version": 502,
 686 |       "versionNonce": 978951056,
 687 |       "isDeleted": false,
 688 |       "id": "Nb5_4C9PyVmUjcNmwmtTf",
 689 |       "fillStyle": "hachure",
 690 |       "strokeWidth": 1,
 691 |       "strokeStyle": "solid",
 692 |       "roughness": 1,
 693 |       "opacity": 100,
 694 |       "angle": 0,
 695 |       "x": 1307.8492063492063,
 696 |       "y": 480.8848231593677,
 697 |       "strokeColor": "#000000",
 698 |       "backgroundColor": "#000000",
 699 |       "width": 217.77777777777783,
 700 |       "height": 31.597041919997253,
 701 |       "seed": 1504292752,
 702 |       "groupIds": [],
 703 |       "roundness": {
 704 |         "type": 2
 705 |       },
 706 |       "boundElements": [],
 707 |       "updated": 1665017411344,
 708 |       "link": null,
 709 |       "locked": false,
 710 |       "startBinding": {
 711 |         "elementId": "bxrIQaIPIjEN65QVVyKjd",
 712 |         "focus": 0.6279335165098477,
 713 |         "gap": 3.8333333333332575
 714 |       },
 715 |       "endBinding": {
 716 |         "elementId": "E-1wDKL8ZmPnn0ONPMDti",
 717 |         "focus": 0.3581099106309167,
 718 |         "gap": 6.333333333333485
 719 |       },
 720 |       "lastCommittedPoint": null,
 721 |       "startArrowhead": null,
 722 |       "endArrowhead": "arrow",
 723 |       "points": [
 724 |         [
 725 |           0,
 726 |           0
 727 |         ],
 728 |         [
 729 |           -65.55555555555566,
 730 |           24.930375253330624
 731 |         ],
 732 |         [
 733 |           -217.77777777777783,
 734 |           31.597041919997253
 735 |         ]
 736 |       ]
 737 |     },
 738 |     {
 739 |       "type": "rectangle",
 740 |       "version": 2726,
 741 |       "versionNonce": 53642316,
 742 |       "isDeleted": false,
 743 |       "id": "bxrIQaIPIjEN65QVVyKjd",
 744 |       "fillStyle": "hachure",
 745 |       "strokeWidth": 1,
 746 |       "strokeStyle": "solid",
 747 |       "roughness": 1,
 748 |       "opacity": 100,
 749 |       "angle": 0,
 750 |       "x": 1311.6825396825395,
 751 |       "y": 454.2596428571427,
 752 |       "strokeColor": "#5f3dc4",
 753 |       "backgroundColor": "#5f3dc4",
 754 |       "width": 219,
 755 |       "height": 52,
 756 |       "seed": 1176432016,
 757 |       "groupIds": [
 758 |         "XHWKg8UL3ErDF5KJBC0TT"
 759 |       ],
 760 |       "roundness": null,
 761 |       "boundElements": [
 762 |         {
 763 |           "id": "Y6VADemG1rq3Yf1_X3Rkw",
 764 |           "type": "text"
 765 |         },
 766 |         {
 767 |           "id": "Qzp41i_jzQIBlAB_qFKFH",
 768 |           "type": "arrow"
 769 |         },
 770 |         {
 771 |           "id": "Nb5_4C9PyVmUjcNmwmtTf",
 772 |           "type": "arrow"
 773 |         }
 774 |       ],
 775 |       "updated": 1674310362127,
 776 |       "link": null,
 777 |       "locked": false
 778 |     },
 779 |     {
 780 |       "type": "text",
 781 |       "version": 1543,
 782 |       "versionNonce": 1775569264,
 783 |       "isDeleted": false,
 784 |       "id": "Y6VADemG1rq3Yf1_X3Rkw",
 785 |       "fillStyle": "hachure",
 786 |       "strokeWidth": 1,
 787 |       "strokeStyle": "solid",
 788 |       "roughness": 1,
 789 |       "opacity": 100,
 790 |       "angle": 0,
 791 |       "x": 1329.6825396825395,
 792 |       "y": 460.7596428571427,
 793 |       "strokeColor": "#000000",
 794 |       "backgroundColor": "#fa5252",
 795 |       "width": 183,
 796 |       "height": 39,
 797 |       "seed": 2097855376,
 798 |       "groupIds": [
 799 |         "XHWKg8UL3ErDF5KJBC0TT"
 800 |       ],
 801 |       "roundness": null,
 802 |       "boundElements": [],
 803 |       "updated": 1665017413674,
 804 |       "link": null,
 805 |       "locked": false,
 806 |       "fontSize": 28,
 807 |       "fontFamily": 1,
 808 |       "text": "Vector match",
 809 |       "baseline": 27,
 810 |       "textAlign": "center",
 811 |       "verticalAlign": "middle",
 812 |       "containerId": "bxrIQaIPIjEN65QVVyKjd",
 813 |       "originalText": "Vector match"
 814 |     },
 815 |     {
 816 |       "type": "rectangle",
 817 |       "version": 1089,
 818 |       "versionNonce": 691460496,
 819 |       "isDeleted": false,
 820 |       "id": "G-qwb8bQ8dozMQZmcq1Gl",
 821 |       "fillStyle": "hachure",
 822 |       "strokeWidth": 1,
 823 |       "strokeStyle": "solid",
 824 |       "roughness": 1,
 825 |       "opacity": 100,
 826 |       "angle": 0,
 827 |       "x": 1319.738095238095,
 828 |       "y": 685.7596428571427,
 829 |       "strokeColor": "#03a9f4",
 830 |       "backgroundColor": "#03a9f4",
 831 |       "width": 214,
 832 |       "height": 49,
 833 |       "seed": 1905111440,
 834 |       "groupIds": [
 835 |         "wf5G07CYJirX_YOSAFeDv"
 836 |       ],
 837 |       "roundness": null,
 838 |       "boundElements": [
 839 |         {
 840 |           "id": "T7Dd9_CUf4IoQLKnCySvb",
 841 |           "type": "text"
 842 |         },
 843 |         {
 844 |           "id": "Qzp41i_jzQIBlAB_qFKFH",
 845 |           "type": "arrow"
 846 |         },
 847 |         {
 848 |           "id": "SJ0F0Y81z9hir5qQWAJjk",
 849 |           "type": "arrow"
 850 |         },
 851 |         {
 852 |           "id": "6CxseRtxEY_xN1wGA8ahy",
 853 |           "type": "arrow"
 854 |         }
 855 |       ],
 856 |       "updated": 1665017719529,
 857 |       "link": null,
 858 |       "locked": false
 859 |     },
 860 |     {
 861 |       "type": "text",
 862 |       "version": 1104,
 863 |       "versionNonce": 1279521680,
 864 |       "isDeleted": false,
 865 |       "id": "T7Dd9_CUf4IoQLKnCySvb",
 866 |       "fillStyle": "hachure",
 867 |       "strokeWidth": 1,
 868 |       "strokeStyle": "solid",
 869 |       "roughness": 1,
 870 |       "opacity": 100,
 871 |       "angle": 0,
 872 |       "x": 1380.238095238095,
 873 |       "y": 690.7596428571427,
 874 |       "strokeColor": "#000000",
 875 |       "backgroundColor": "#03a9f4",
 876 |       "width": 93,
 877 |       "height": 39,
 878 |       "seed": 2089545584,
 879 |       "groupIds": [
 880 |         "wf5G07CYJirX_YOSAFeDv"
 881 |       ],
 882 |       "roundness": null,
 883 |       "boundElements": [],
 884 |       "updated": 1665017732426,
 885 |       "link": null,
 886 |       "locked": false,
 887 |       "fontSize": 28,
 888 |       "fontFamily": 1,
 889 |       "text": "Answer",
 890 |       "baseline": 27,
 891 |       "textAlign": "center",
 892 |       "verticalAlign": "middle",
 893 |       "containerId": "G-qwb8bQ8dozMQZmcq1Gl",
 894 |       "originalText": "Answer"
 895 |     },
 896 |     {
 897 |       "type": "arrow",
 898 |       "version": 616,
 899 |       "versionNonce": 1298745200,
 900 |       "isDeleted": false,
 901 |       "id": "6CxseRtxEY_xN1wGA8ahy",
 902 |       "fillStyle": "hachure",
 903 |       "strokeWidth": 1,
 904 |       "strokeStyle": "solid",
 905 |       "roughness": 1,
 906 |       "opacity": 100,
 907 |       "angle": 0,
 908 |       "x": 1303.9603174603174,
 909 |       "y": 708.4075937236478,
 910 |       "strokeColor": "#000000",
 911 |       "backgroundColor": "#000000",
 912 |       "width": 377.7777777777778,
 913 |       "height": 6.2911737786592425,
 914 |       "seed": 1277086096,
 915 |       "groupIds": [],
 916 |       "roundness": {
 917 |         "type": 2
 918 |       },
 919 |       "boundElements": [],
 920 |       "updated": 1665017719530,
 921 |       "link": null,
 922 |       "locked": false,
 923 |       "startBinding": {
 924 |         "elementId": "G-qwb8bQ8dozMQZmcq1Gl",
 925 |         "focus": 0.17330116418533134,
 926 |         "gap": 15.777777777777601
 927 |       },
 928 |       "endBinding": null,
 929 |       "lastCommittedPoint": null,
 930 |       "startArrowhead": null,
 931 |       "endArrowhead": "arrow",
 932 |       "points": [
 933 |         [
 934 |           0,
 935 |           0
 936 |         ],
 937 |         [
 938 |           -225.55555555555566,
 939 |           5.1800626675482135
 940 |         ],
 941 |         [
 942 |           -377.7777777777778,
 943 |           6.2911737786592425
 944 |         ]
 945 |       ]
 946 |     },
 947 |     {
 948 |       "type": "rectangle",
 949 |       "version": 1744,
 950 |       "versionNonce": 394840052,
 951 |       "isDeleted": false,
 952 |       "id": "mskx8L2KXgKOLHKahrjQI",
 953 |       "fillStyle": "hachure",
 954 |       "strokeWidth": 1,
 955 |       "strokeStyle": "solid",
 956 |       "roughness": 1,
 957 |       "opacity": 100,
 958 |       "angle": 0,
 959 |       "x": 1315.5158730158726,
 960 |       "y": 571.3151984126984,
 961 |       "strokeColor": "#fa5252",
 962 |       "backgroundColor": "#ff7043",
 963 |       "width": 218,
 964 |       "height": 49,
 965 |       "seed": 1444977008,
 966 |       "groupIds": [
 967 |         "Ow3OuCkl-1gnPf96uZhoJ"
 968 |       ],
 969 |       "roundness": null,
 970 |       "boundElements": [
 971 |         {
 972 |           "id": "dADm_k9Od8a9ANLvFiLsB",
 973 |           "type": "text"
 974 |         },
 975 |         {
 976 |           "id": "SJ0F0Y81z9hir5qQWAJjk",
 977 |           "type": "arrow"
 978 |         },
 979 |         {
 980 |           "id": "f3vLDOpOTtgvPlvxSLtb6",
 981 |           "type": "arrow"
 982 |         }
 983 |       ],
 984 |       "updated": 1674310379960,
 985 |       "link": null,
 986 |       "locked": false
 987 |     },
 988 |     {
 989 |       "type": "text",
 990 |       "version": 1496,
 991 |       "versionNonce": 406774092,
 992 |       "isDeleted": false,
 993 |       "id": "dADm_k9Od8a9ANLvFiLsB",
 994 |       "fillStyle": "hachure",
 995 |       "strokeWidth": 1,
 996 |       "strokeStyle": "solid",
 997 |       "roughness": 1,
 998 |       "opacity": 100,
 999 |       "angle": 0,
1000 |       "x": 1353.5158730158726,
1001 |       "y": 576.3151984126984,
1002 |       "strokeColor": "#000000",
1003 |       "backgroundColor": "#ff7043",
1004 |       "width": 142,
1005 |       "height": 39,
1006 |       "seed": 464251760,
1007 |       "groupIds": [
1008 |         "Ow3OuCkl-1gnPf96uZhoJ"
1009 |       ],
1010 |       "roundness": null,
1011 |       "boundElements": [],
1012 |       "updated": 1674310379960,
1013 |       "link": null,
1014 |       "locked": false,
1015 |       "fontSize": 28,
1016 |       "fontFamily": 1,
1017 |       "text": "Metadata",
1018 |       "baseline": 27,
1019 |       "textAlign": "center",
1020 |       "verticalAlign": "middle",
1021 |       "containerId": "mskx8L2KXgKOLHKahrjQI",
1022 |       "originalText": "Metadata"
1023 |     },
1024 |     {
1025 |       "type": "arrow",
1026 |       "version": 583,
1027 |       "versionNonce": 129907088,
1028 |       "isDeleted": false,
1029 |       "id": "f3vLDOpOTtgvPlvxSLtb6",
1030 |       "fillStyle": "hachure",
1031 |       "strokeWidth": 1,
1032 |       "strokeStyle": "solid",
1033 |       "roughness": 1,
1034 |       "opacity": 100,
1035 |       "angle": 0,
1036 |       "x": 1305.0714285714284,
1037 |       "y": 592.5642974832429,
1038 |       "strokeColor": "#000000",
1039 |       "backgroundColor": "#000000",
1040 |       "width": 321.1111111111113,
1041 |       "height": 15.467803352397482,
1042 |       "seed": 449970544,
1043 |       "groupIds": [],
1044 |       "roundness": {
1045 |         "type": 2
1046 |       },
1047 |       "boundElements": [],
1048 |       "updated": 1665017397537,
1049 |       "link": null,
1050 |       "locked": false,
1051 |       "startBinding": {
1052 |         "elementId": "mskx8L2KXgKOLHKahrjQI",
1053 |         "focus": 0.2891539308183631,
1054 |         "gap": 10.444444444444116
1055 |       },
1056 |       "endBinding": {
1057 |         "elementId": "hnqGO83Op144jMURaGlCf",
1058 |         "focus": 0.5894787674956782,
1059 |         "gap": 19.111111111110972
1060 |       },
1061 |       "lastCommittedPoint": null,
1062 |       "startArrowhead": null,
1063 |       "endArrowhead": "arrow",
1064 |       "points": [
1065 |         [
1066 |           0,
1067 |           0
1068 |         ],
1069 |         [
1070 |           -74.44444444444457,
1071 |           3.2455811301751964
1072 |         ],
1073 |         [
1074 |           -321.1111111111113,
1075 |           15.467803352397482
1076 |         ]
1077 |       ]
1078 |     }
1079 |   ],
1080 |   "appState": {
1081 |     "gridSize": null,
1082 |     "viewBackgroundColor": "#fff"
1083 |   },
1084 |   "files": {}
1085 | }


--------------------------------------------------------------------------------