├── src
└── python
│ └── codequestion
│ ├── __init__.py
│ ├── etl
│ ├── __init__.py
│ └── stackexchange
│ │ ├── __init__.py
│ │ ├── decompress.py
│ │ ├── sift.py
│ │ ├── execute.py
│ │ ├── xml2db.py
│ │ └── db2qa.py
│ ├── path.py
│ ├── topics.py
│ ├── tokenizer.py
│ ├── models.py
│ ├── download.py
│ ├── console.py
│ ├── index.py
│ ├── vectors.py
│ ├── search.py
│ └── evaluate.py
├── demo.gif
├── logo.png
├── images
├── topics.gif
├── vscode.png
├── architecture.png
├── architecture-dark.png
└── architecture.excalidraw
├── .gitignore
├── config
├── index.v1.yml
└── index.yml
├── .coveragerc
├── test
├── python
│ ├── utils.py
│ ├── testdownload.py
│ ├── testconsole.py
│ └── testindex.py
└── stackexchange
│ └── query.txt
├── .pre-commit-config.yaml
├── .pylintrc
├── Makefile
├── .github
└── workflows
│ └── build.yml
├── setup.py
├── LICENSE
└── README.md
/src/python/codequestion/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/python/codequestion/etl/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neuml/codequestion/HEAD/demo.gif
--------------------------------------------------------------------------------
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neuml/codequestion/HEAD/logo.png
--------------------------------------------------------------------------------
/images/topics.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neuml/codequestion/HEAD/images/topics.gif
--------------------------------------------------------------------------------
/images/vscode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neuml/codequestion/HEAD/images/vscode.png
--------------------------------------------------------------------------------
/images/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neuml/codequestion/HEAD/images/architecture.png
--------------------------------------------------------------------------------
/images/architecture-dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neuml/codequestion/HEAD/images/architecture-dark.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | dist/
3 | htmlcov/
4 | *egg-info/
5 | __pycache__/
6 | .coverage
7 | .coverage.*
8 | *.pyc
9 |
--------------------------------------------------------------------------------
/config/index.v1.yml:
--------------------------------------------------------------------------------
1 | path: stackexchange-300d.magnitude
2 | content: True
3 | objects: True
4 | quantize: True
5 | storevectors: True
6 | scoring:
7 | method: bm25
8 | k1: 0.1
9 | pca: 3
10 |
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | source = src/python
3 | concurrency = multiprocessing,thread
4 | disable_warnings = no-data-collected
5 |
6 | [combine]
7 | disable_warnings = no-data-collected
8 |
9 | [report]
10 | exclude_lines =
11 | if __name__ == .__main__.:
12 |
--------------------------------------------------------------------------------
/src/python/codequestion/etl/stackexchange/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Stack Exchange imports
3 | """
4 |
5 | from .db2qa import DB2QA
6 | from .decompress import Decompress
7 | from .execute import Execute
8 | from .sift import Sift
9 | from .xml2db import XML2DB
10 |
--------------------------------------------------------------------------------
/test/python/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Utils module
3 | """
4 |
5 |
class Utils:
    """
    Shared file system locations used by the tests.
    """

    # Root working directory for test data
    PATH = "/tmp/codequestion"

    # Directory holding the Stack Exchange source data
    STACKEXCHANGE = f"{PATH}/stackexchange"

    # Questions database stored in the Stack Exchange directory
    QUESTIONS = f"{STACKEXCHANGE}/questions.db"

    # Directory holding evaluation test data
    TESTS = f"{PATH}/test"
15 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pycqa/pylint
3 | rev: v2.12.1
4 | hooks:
5 | - id: pylint
6 | args:
7 | - -d import-error
8 | - -d duplicate-code
9 | - repo: https://github.com/ambv/black
10 | rev: 22.3.0
11 | hooks:
12 | - id: black
13 | language_version: python3
14 |
--------------------------------------------------------------------------------
/config/index.yml:
--------------------------------------------------------------------------------
1 | path: sentence-transformers/all-MiniLM-L6-v2
2 | content: True
3 | objects: True
4 | quantize: True
5 | functions:
6 | - name: graph
7 | function: graph.attribute
8 | expressions:
9 | - name: topic
10 | expression: graph(indexid, 'topic')
11 | - name: topicrank
12 | expression: graph(indexid, 'topicrank')
13 | graph:
14 | limit: 100
15 | minscore: 0.2
16 | topics:
17 | stopwords:
18 | - stackoverflow
19 |
--------------------------------------------------------------------------------
/.pylintrc:
--------------------------------------------------------------------------------
1 | [BASIC]
2 | module-rgx=[a-z_][a-zA-Z0-9_]{2,30}$
3 | method-rgx=[a-z_][a-zA-Z0-9_]{2,30}$
4 | function-rgx=[a-z_][a-zA-Z0-9_]{2,30}$
5 | argument-rgx=[a-z_][a-zA-Z0-9_]{0,30}$
6 | variable-rgx=[a-z_][a-zA-Z0-9_]{0,30}$
7 | attr-rgx=[a-z_][a-zA-Z0-9_]{0,30}$
8 |
9 | [DESIGN]
10 | max-args=10
11 | max-locals=40
12 | max-returns=10
13 | max-attributes=20
14 | min-public-methods=0
15 |
16 | [FORMAT]
17 | max-line-length=150
18 |
19 | [MESSAGES CONTROL]
20 | disable=R0201,W0621
21 |
--------------------------------------------------------------------------------
/test/python/testdownload.py:
--------------------------------------------------------------------------------
1 | """
2 | Download module tests
3 | """
4 |
5 | import os
6 | import unittest
7 |
8 | from codequestion.download import Download
9 |
10 | # pylint: disable=C0411
11 | from utils import Utils
12 |
13 |
class TestDownload(unittest.TestCase):
    """
    Tests for the Download action.
    """

    def testDownload(self):
        """
        Downloads a zip archive and checks that it was extracted.
        """

        url = "https://github.com/neuml/codequestion/archive/refs/heads/master.zip"

        # Download and extract the archive into the test directory
        action = Download()
        action(url, Utils.PATH)

        # Check archive uncompressed successfully
        extracted = Utils.PATH + "/codequestion-master/setup.py"
        self.assertTrue(os.path.exists(extracted))
32 |
--------------------------------------------------------------------------------
/src/python/codequestion/path.py:
--------------------------------------------------------------------------------
1 | """
2 | Path module
3 | """
4 |
5 | from rich.console import Console
6 |
7 |
class Path:
    """
    Prints the questions found along a path through the embeddings graph.
    """

    def __init__(self, embeddings):
        """
        Creates a new path action.

        Args:
            embeddings: embeddings instance, its graph attribute is used to find paths
        """

        self.graph = embeddings.graph
        self.embeddings = embeddings

    def __call__(self, start, end):
        """
        Finds the path between two nodes and prints each question along it.

        Args:
            start: start node id
            end: end node id
        """

        console = Console()

        # Output is numbered starting at 1
        for position, uid in enumerate(self.graph.showpath(start, end), 1):
            # Look up the question text for this node
            rows = self.embeddings.search(
                f"select question from txtai where id = '{uid}'", 1
            )
            question = rows[0]["question"]

            console.print(f"{position}. {question} ({uid})")
40 |
--------------------------------------------------------------------------------
/src/python/codequestion/etl/stackexchange/decompress.py:
--------------------------------------------------------------------------------
1 | """
2 | Decompress module
3 | """
4 |
5 | import shlex
6 | import shutil
7 | import subprocess
8 |
9 |
class Decompress:
    """
    Extracts Posts.xml from 7z archives by running an external 7zip process.
    """

    def __call__(self, path):
        """
        Runs the extraction and prints the process output as it is produced.

        Args:
            path: input directory path with 7z files
        """

        # Use 7za when it is available, otherwise fall back to 7z
        binary = "7z"
        if shutil.which("7za"):
            binary = "7za"

        # Backslashes are escape characters for shlex.split, use forward slashes
        path = path.replace("\\", "/")
        command = f"{binary} e {path}/*.7z Posts.xml -y -o{path}"
        print(command)

        args = shlex.split(command)
        with subprocess.Popen(
            args, stdout=subprocess.PIPE, universal_newlines=True
        ) as process:
            while True:
                line = process.stdout.readline()
                if line:
                    print(line.strip())
                elif process.poll() is not None:
                    # No more output and the process has exited
                    break

            # Call final poll on completion
            process.poll()
44 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Project utility scripts
2 | .PHONY: test
3 |
4 | # Setup environment
5 | export SRC_DIR := ./src/python
6 | export TEST_DIR := ./test/python
7 | export PYTHONPATH := ${SRC_DIR}:${TEST_DIR}:${PYTHONPATH}
8 | export PATH := ${TEST_DIR}:${PATH}
9 | export PYTHONWARNINGS := ignore
10 | export TOKENIZERS_PARALLELISM := False
11 |
12 | # Default python executable if not provided
13 | PYTHON ?= python
14 |
15 | # Download test data
16 | data:
17 | mkdir -p /tmp/codequestion
18 | wget -N https://archive.org/download/stackexchange_20220606/ai.stackexchange.com.7z -P /tmp/codequestion/stackexchange/ai
19 | wget -N https://raw.githubusercontent.com/neuml/codequestion/master/config/index.v1.yml -P /tmp/codequestion/
20 | wget -N https://raw.githubusercontent.com/neuml/codequestion/master/config/index.yml -P /tmp/codequestion/
21 |
22 | wget https://raw.githubusercontent.com/neuml/codequestion/master/test/stackexchange/query.txt -P /tmp/codequestion/test/stackexchange
23 | wget http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz -P /tmp/codequestion
24 | tar -C /tmp/codequestion/test -xvzf /tmp/codequestion/Stsbenchmark.tar.gz
25 |
26 | # Unit tests
27 | test:
28 | ${PYTHON} -m unittest discover -v -s ${TEST_DIR}
29 |
30 | # Run tests while calculating code coverage
31 | coverage:
32 | coverage run -m unittest discover -v -s ${TEST_DIR}
33 | coverage combine
34 |
--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
1 | # GitHub Actions build workflow
2 | name: build
3 |
4 | on: ["push", "pull_request"]
5 |
6 | jobs:
7 | build:
8 | runs-on: ${{ matrix.os }}
9 | strategy:
10 | matrix:
11 | os: [ubuntu-latest, macos-latest, windows-latest]
12 |
13 | timeout-minutes: 60
14 | steps:
15 | - name: Checkout code
16 | uses: actions/checkout@v3
17 |
18 | - name: Install Python - Linux
19 | uses: actions/setup-python@v4
20 | with:
21 | python-version: 3.8
22 | if: matrix.os == 'ubuntu-latest'
23 |
24 | - name: Install Python - macOS/Windows
25 | uses: actions/setup-python@v4
26 | with:
27 | python-version: 3.9
28 | if: matrix.os != 'ubuntu-latest'
29 |
30 | - name: Install dependencies - macOS
31 | run: |
32 | echo "OMP_NUM_THREADS=1" >> $GITHUB_ENV
33 | if: matrix.os == 'macos-latest'
34 |
35 | - name: Install dependencies - Windows
36 | run: choco install wget
37 | if: matrix.os == 'windows-latest'
38 |
39 | - name: Build
40 | run: |
41 | pip install -U pip
42 | pip install -U wheel coverage coveralls
43 | pip install . txtai[similarity]
44 | python --version
45 | make data coverage
46 |
47 | - uses: pre-commit/action@v3.0.0
48 | if: matrix.os == 'ubuntu-latest'
49 |
50 | - name: Test Coverage
51 | run: coveralls --service=github
52 | if: matrix.os == 'ubuntu-latest'
53 | env:
54 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
55 |
--------------------------------------------------------------------------------
/src/python/codequestion/topics.py:
--------------------------------------------------------------------------------
1 | """
2 | Topics module
3 | """
4 |
5 | from rich.console import Console
6 |
7 | from txtai.embeddings import Embeddings
8 |
9 |
class Topics:
    """
    Query topic models.
    """

    def __init__(self, embeddings):
        """
        Creates a new topics action.

        Args:
            embeddings: embeddings instance, topics are read from embeddings.graph.topics
        """

        self.embeddings = embeddings
        self.topics = embeddings.graph.topics

        # Build on-the-fly topics index, each topic is indexed by its position
        self.topicembed = Embeddings({"path": "sentence-transformers/all-MiniLM-L6-v2"})
        self.topicembed.index((x, topic, None) for x, topic in enumerate(self.topics))

    def __call__(self, query=None):
        """
        Runs a topics action. Prints each matching topic along with an example question.

        Args:
            query: optional query to filter topics, otherwise top topics are shown
        """

        console = Console()

        topics = list(self.topics.keys())
        if query:
            results = self.topicembed.search(query, 10)
        else:
            # Never reference more topics than exist
            results = self.top(len(topics))

        for uid, score in results:
            if score >= 0.1:
                topic = topics[uid]
                console.print(f"[bright_green]{topic}[/bright_green]")

                # Print example question
                sql = f"select id, question from txtai where similar('{topic}')"
                result = self.embeddings.search(sql, 1)[0]
                console.print(f"{result['question']} ({result['id']})\n")

    @staticmethod
    def top(total, limit=10):
        """
        Builds the results used when no query is provided. Each result has a fixed score of 1.0.

        Args:
            total: number of available topics
            limit: maximum number of results

        Returns:
            list of (topic index, score)
        """

        return [(x, 1.0) for x in range(min(total, limit))]
55 |
--------------------------------------------------------------------------------
/src/python/codequestion/tokenizer.py:
--------------------------------------------------------------------------------
1 | """
2 | Tokenizer module
3 | """
4 |
5 | import re
6 | import string
7 |
8 |
class Tokenizer:
    """
    Splits text into lowercase tokens, dropping stop words and tokens that don't look like words or numbers.
    """

    # Characters stripped from both ends of a token. # and + are kept so c#, f#, c++ and g++ stay intact.
    PUNCTUATION = "".join(char for char in string.punctuation if char not in "#+")

    # fmt: off
    # English Stop Word List (Standard stop words used by Apache Lucene)
    STOP_WORDS = {
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if",
        "in", "into", "is", "it", "no", "not", "of", "on", "or", "such",
        "that", "the", "their", "then", "there", "these", "they", "this", "to",
        "was", "will", "with",
    }
    # fmt: on

    @staticmethod
    def tokenize(text):
        """
        Tokenizes text. Text is lowercased and split on whitespace, then punctuation is stripped from
        the ends of each token. Stop words are removed and only numbers or tokens of 2+ allowed
        characters are kept.

        Args:
            text: input text

        Returns:
            list of tokens
        """

        tokens = []
        for word in text.lower().split():
            token = word.strip(Tokenizer.PUNCTUATION)

            # Remove stop words, tokens are already lowercase
            if token in Tokenizer.STOP_WORDS:
                continue

            # Keep numbers of any length and valid strings at least 2 characters long
            if token.isdigit() or re.match(r"^[#*+\-.0-9:@_a-z]{2,}$", token):
                tokens.append(token)

        return tokens
47 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # pylint: disable = C0111
2 | from setuptools import find_packages, setup
3 |
with open("README.md", "r", encoding="utf-8") as readme:
    # Long description is the README without the GitHub dark mode images
    DESCRIPTION = "".join(line for line in readme if "gh-dark-mode-only" not in line)

# Project links
URLS = {
    "Documentation": "https://github.com/neuml/codequestion",
    "Issue Tracker": "https://github.com/neuml/codequestion/issues",
    "Source Code": "https://github.com/neuml/codequestion",
}

# Runtime dependencies
REQUIRES = [
    "html2markdown>=0.1.7",
    "rich>=12.0.1",
    "scipy>=1.4.1",
    "tqdm>=4.48.0",
    "txtai[graph]>=6.0.0",
]

# Package classifiers
CLASSIFIERS = [
    "License :: OSI Approved :: Apache Software License",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Topic :: Software Development",
    "Topic :: Text Processing :: Indexing",
    "Topic :: Utilities",
]

setup(
    name="codequestion",
    version="2.2.0",
    author="NeuML",
    description="Ask coding questions directly from the terminal",
    long_description=DESCRIPTION,
    long_description_content_type="text/markdown",
    url="https://github.com/neuml/codequestion",
    project_urls=URLS,
    license="Apache 2.0: http://www.apache.org/licenses/LICENSE-2.0",
    # Packages live under src/python
    packages=find_packages(where="src/python"),
    package_dir={"": "src/python"},
    keywords="search embedding machine-learning nlp",
    python_requires=">=3.8",
    # Installs the codequestion command
    entry_points={
        "console_scripts": [
            "codequestion = codequestion.console:main",
        ],
    },
    install_requires=REQUIRES,
    classifiers=CLASSIFIERS,
)
48 |
--------------------------------------------------------------------------------
/src/python/codequestion/models.py:
--------------------------------------------------------------------------------
1 | """
2 | Models module
3 | """
4 |
5 | import os
6 | import os.path
7 |
8 |
class Models:
    """
    Builds the file system paths where models and vectors are stored.
    """

    @staticmethod
    def basePath(create=False):
        """
        Gets the base data path. This is the CODEQUESTION_HOME environment variable when set to a
        non-empty value, otherwise ~/.codequestion.

        Args:
            create: if directory should be created

        Returns:
            path
        """

        base = os.environ.get("CODEQUESTION_HOME") or os.path.join(
            os.path.expanduser("~"), ".codequestion"
        )

        if create:
            os.makedirs(base, exist_ok=True)

        return base

    @staticmethod
    def modelPath(name, create=False):
        """
        Gets the directory for a named model, stored under models in the base data path.

        Args:
            name: model name
            create: if directory should be created

        Returns:
            path
        """

        directory = os.path.join(Models.basePath(), "models", name)

        if create:
            os.makedirs(directory, exist_ok=True)

        return directory

    @staticmethod
    def vectorPath(name, create=False):
        """
        Gets the file path for named vectors, stored under vectors in the base data path.

        Args:
            name: vectors name
            create: if the vectors directory should be created

        Returns:
            path
        """

        directory = os.path.join(Models.basePath(), "vectors")

        # Only the parent directory is created, name is a file within it
        if create:
            os.makedirs(directory, exist_ok=True)

        return os.path.join(directory, name)
80 |
--------------------------------------------------------------------------------
/src/python/codequestion/download.py:
--------------------------------------------------------------------------------
1 | """
2 | Download module
3 | """
4 |
5 | import os.path
6 | import tempfile
7 | import zipfile
8 |
9 | from urllib.request import urlopen
10 |
11 | from tqdm import tqdm
12 |
13 | from .models import Models
14 |
15 |
class Download:
    """
    Downloads and extracts a pre-trained model archive.
    """

    def __call__(self, url, path=None):
        """
        Downloads a zip archive from url to the temp directory and extracts it.

        Args:
            url: url model path
            path: optional output directory, defaults to the base models path
        """

        # Extract into path when provided, otherwise the base models path (created if necessary)
        target = path or Models.basePath(True)

        # Archive is saved in the temp directory using the last segment of the url
        archive = os.path.join(tempfile.gettempdir(), os.path.basename(url))

        print(f"Downloading model from {url} to {archive}")
        self.download(url, archive)

        print(f"Decompressing model to {target}")

        # Ensure file was downloaded successfully
        if os.path.exists(archive):
            with zipfile.ZipFile(archive, "r") as zfile:
                zfile.extractall(target)

        print("Download complete")

    def download(self, url, dest):
        """
        Streams a remote file to dest while showing a progress bar.

        Args:
            url: remote url
            dest: destination file path
        """

        with urlopen(url) as response:
            # Total size is -1 when the server doesn't send a content length
            headers = response.info()
            size = -1
            if "Content-Length" in headers:
                size = int(headers["Content-Length"])

            with tqdm(total=size, unit="B", unit_scale=True, unit_divisor=1024) as pbar:
                with open(dest, "wb") as output:
                    # Read in 16 KB chunks until the response is exhausted
                    for chunk in iter(lambda: response.read(16 * 1024), b""):
                        output.write(chunk)
                        pbar.update(len(chunk))
70 |
71 |
if __name__ == "__main__":
    # Download the v2.0.0 model release into the default models path
    Download()(
        "https://github.com/neuml/codequestion/releases/download/v2.0.0/cqmodel.zip"
    )
77 |
--------------------------------------------------------------------------------
/src/python/codequestion/etl/stackexchange/sift.py:
--------------------------------------------------------------------------------
1 | """
2 | Sift module
3 | """
4 |
5 | import re
6 |
7 |
class Sift:
    """
    Filters a raw posts.xml file for matching results. Uses raw text processing to avoid overhead of parsing xml.
    """

    def __call__(self, infile, outfile):
        """
        Processes a raw Posts.xml file. The Posts dump is in Id order ascending.

        Writes questions with an accepted answer and a score of 10+, along with their accepted
        answers, to outfile. Matching lines are wrapped in a posts root element.

        Args:
            infile: path to input file
            outfile: path to output file
        """

        print(f"Converting {infile} to {outfile}")

        # Accepted answer ids that have not been written yet
        ids = set()

        with open(infile, encoding="utf-8") as xml:
            with open(outfile, "w", encoding="utf-8") as output:
                # Write xml start, a root element is required for the output to be well-formed
                output.write("<posts>\n")

                for line in xml:
                    # PostTypeId = 1 (Question) with accepted answer.
                    if "AcceptedAnswerId" in line:
                        # Parse answer id and score
                        answer = self.parse(r"AcceptedAnswerId=\"([0-9]+)\"", line)
                        score = self.parse(r"\bScore=\"([0-9]+)\"", line)

                        # Require a score of 10+. Negative or missing scores parse as -1.
                        if score >= 10:
                            # Add answer id to ids list
                            ids.add(answer)

                            # Write accepted line
                            output.write(line)

                    # PostTypeId = 2 (Answer)
                    elif 'PostTypeId="2"' in line:
                        # Parse post id. Word boundary prevents matching the end of PostTypeId="2".
                        pid = self.parse(r"\bId=\"([0-9]+)\"", line)

                        if pid in ids:
                            # Write output line and remove from ids list
                            output.write(line)
                            ids.remove(pid)

                # Write xml end
                output.write("</posts>\n")

    def parse(self, pattern, line):
        """
        Parses an int field and returns the value if found. Returns -1 if no value found.

        Args:
            pattern: regex pattern with a single group that captures digits
            line: input line

        Returns:
            field value
        """

        field = re.search(pattern, line)
        return int(field.group(1)) if field else -1
74 |
--------------------------------------------------------------------------------
/test/python/testconsole.py:
--------------------------------------------------------------------------------
1 | """
2 | Console module tests
3 | """
4 |
5 | import contextlib
6 | import io
7 | import os
8 | import unittest
9 |
10 | from codequestion.console import Console
11 | from codequestion.etl.stackexchange import Execute
12 | from codequestion.index import Index
13 |
14 | # pylint: disable=C0411
15 | from utils import Utils
16 |
17 |
class TestConsole(unittest.TestCase):
    """
    Tests for the codequestion console.
    """

    @classmethod
    def setUpClass(cls):
        """
        Builds a questions database and embeddings index, then loads a console.
        """

        os.environ["CODEQUESTION_HOME"] = Utils.STACKEXCHANGE + ".console"

        # Run etl process, limited to a single source
        Execute.SOURCES = ["ai"]
        Execute()(Utils.STACKEXCHANGE)

        # Create embeddings index
        Index()(Utils.PATH + "/index.yml", Utils.QUESTIONS)

        # Load console configuration
        console = Console()
        console.preloop()
        cls.console = console

    def testHelp(self):
        """
        Test help command, with and without an argument
        """

        for command in ("help", "help .limit"):
            self.assertIn(".limit", self.command(command))

    def testLimit(self):
        """
        Test .limit command, which prints no output
        """

        output = self.command(".limit 1")
        self.assertEqual(output, "")

    def testPath(self):
        """
        Test .path command
        """

        output = self.command(".path 0 1")
        self.assertIn("1. ", output)

    def testSearch(self):
        """
        Test search, the default console action
        """

        output = self.command("ai")
        self.assertIn("Question", output)

    def testShow(self):
        """
        Test .show command
        """

        output = self.command(".show 0")
        self.assertIn("Question", output)

    def testtopics(self):
        """
        Test .topics command, with and without a query
        """

        for command in (".topics", ".topics ai"):
            self.assertNotIn("ERROR", self.command(command))

    def command(self, command):
        """
        Runs a console command and captures what it writes to stdout.

        Args:
            command: command to run

        Returns:
            command output
        """

        captured = io.StringIO()
        with contextlib.redirect_stdout(captured):
            self.console.onecmd(command)

        return captured.getvalue()
105 |
--------------------------------------------------------------------------------
/test/python/testindex.py:
--------------------------------------------------------------------------------
1 | """
2 | Index module tests
3 | """
4 |
5 | import contextlib
6 | import io
7 | import os
8 | import unittest
9 |
10 | from codequestion.evaluate import StackExchange, STS
11 | from codequestion.index import Index
12 | from codequestion.search import Search
13 | from codequestion.vectors import Vectors
14 |
15 | # pylint: disable=C0411
16 | from utils import Utils
17 |
18 |
class TestIndex(unittest.TestCase):
    """
    Tests for building and querying embeddings indexes.
    """

    def testTransformers(self):
        """
        Test transformers-backed index
        """

        os.environ["CODEQUESTION_HOME"] = Utils.STACKEXCHANGE + ".transformers"

        # Create embeddings index
        Index()(Utils.PATH + "/index.yml", Utils.QUESTIONS)

        self.runTests()

    def testWordVectors(self):
        """
        Test word vector-backed index
        """

        os.environ["CODEQUESTION_HOME"] = Utils.STACKEXCHANGE + ".wv"

        # Build word vectors
        Vectors()(Utils.QUESTIONS, 300, 3)

        # Create embeddings index
        Index()(Utils.PATH + "/index.v1.yml", Utils.QUESTIONS)

        self.runTests()

    def runTests(self):
        """
        Runs the search and evaluation checks against the current index.
        """

        for check in (self.search, self.stackexchange, self.sts):
            check()

    def search(self):
        """
        Checks that search output contains the query.
        """

        search = Search()
        output = self.command(lambda: search("machine learning"))

        self.assertIn("machine learning", output)

    def stackexchange(self):
        """
        Checks the Stack Exchange evaluation, with the default method and with bm25.
        """

        action = StackExchange()

        output = self.command(lambda: action(Utils.TESTS, None))
        self.assertIn("Mean Reciprocal Rank", output)

        output = self.command(lambda: action(Utils.TESTS, "bm25"))
        self.assertIn("Mean Reciprocal Rank", output)

    def sts(self):
        """
        Checks the STS evaluation.
        """

        action = STS()
        output = self.command(lambda: action(Utils.TESTS, None))

        self.assertIn("Pearson", output)

    def command(self, command):
        """
        Runs a callable and captures what it writes to stdout.

        Args:
            command: callable to run, takes no arguments

        Returns:
            command output
        """

        captured = io.StringIO()
        with contextlib.redirect_stdout(captured):
            command()

        return captured.getvalue()
114 |
--------------------------------------------------------------------------------
/src/python/codequestion/etl/stackexchange/execute.py:
--------------------------------------------------------------------------------
1 | """
2 | Execute module
3 | """
4 |
5 | import os
6 | import os.path
7 | import sys
8 |
9 | from .db2qa import DB2QA
10 | from .decompress import Decompress
11 | from .sift import Sift
12 | from .xml2db import XML2DB
13 |
14 |
class Execute:
    """
    Builds a consolidated questions.db file from a directory of Stack Exchange Data Dumps.
    """

    # List of sources, each is a directory name under the base directory path
    SOURCES = [
        "ai",
        "android",
        "apple",
        "arduino",
        "askubuntu",
        "avp",
        "codereview",
        "cs",
        "datascience",
        "dba",
        "devops",
        "dsp",
        "raspberrypi",
        "reverseengineering",
        "scicomp",
        "serverfault",
        "security",
        "stackoverflow",
        "stats",
        "superuser",
        "unix",
        "vi",
        "wordpress",
    ]

    def __call__(self, path):
        """
        Converts a directory of raw sources to a single output questions database. The output
        is written to questions.db in path.

        Args:
            path: base directory path
        """

        # Build a staging database for each source
        databases = self.process(path)

        # Build consolidated SQLite questions database
        DB2QA()(databases, os.path.join(path, "questions.db"))

    def process(self, path):
        """
        Runs the decompress, sift and xml2db steps for each source. Returns a list of
        output databases.

        Args:
            path: input directory path with raw source data directories

        Returns:
            paths to output databases
        """

        databases = []

        for source in Execute.SOURCES:
            spath = os.path.join(path, source)

            # Extract Posts.xml from 7za file
            Decompress()(spath)

            # Filter Posts.xml file for matching questions
            filtered = os.path.join(spath, "Filtered.xml")
            Sift()(os.path.join(spath, "Posts.xml"), filtered)

            # Convert filtered Posts.xml file to SQLite db file
            dbfile = os.path.join(spath, f"{source}.db")
            XML2DB()(filtered, dbfile)

            databases.append(dbfile)

        return databases
102 |
103 |
if __name__ == "__main__":
    # Input data directory is a required argument
    if len(sys.argv) < 2:
        print("Usage: execute <data directory>")
        sys.exit(1)

    path = sys.argv[1]
    if not os.path.exists(path):
        # Exit with a non-zero status so callers can detect the failure
        print("Data directory does not exist, exiting")
        sys.exit(1)

    # Run ETL process
    execute = Execute()
    execute(path)
114 |
--------------------------------------------------------------------------------
/src/python/codequestion/console.py:
--------------------------------------------------------------------------------
1 | """
2 | Console module
3 | """
4 |
5 | from cmd import Cmd
6 |
7 | from rich.console import Console as RichConsole
8 |
9 | from .path import Path
10 | from .search import Search
11 | from .topics import Topics
12 |
13 |
class Console(Cmd):
    """
    Interactive codequestion shell.

    Input starting with .limit, .path, .show or .topics runs the matching action.
    Any other input is run as a search query.
    """

    def __init__(self):
        """
        Sets up the display configuration and empty action slots. Actions are loaded in preloop.
        """

        super().__init__()

        # Display configuration
        self.intro = "codequestion console"
        self.prompt = ">>> "
        self.console = RichConsole()

        # Search action, its embeddings and the number of rows a search returns
        self.search = None
        self.embeddings = None
        self.limit = 1

        # Optional graph-based actions (topics, path traversal)
        self.topics = None
        self.path = None

    def preloop(self):
        """
        Creates the search action and, when the embeddings have a graph, the graph-based actions.
        """

        self.search = Search()
        self.embeddings = self.search.embeddings

        graph = self.embeddings.graph
        if not graph:
            return

        # Topics are only available when the graph has them
        if graph.topics:
            self.topics = Topics(self.embeddings)

        self.path = Path(self.embeddings)

    def default(self, line):
        """
        Dispatches an input line to the matching action.

        Any exception raised by an action is printed and the loop continues.

        Args:
            line: command line
        """

        # pylint: disable=W0703
        try:
            lowered = line.lower()

            if lowered.startswith(".limit"):
                parts = self.split(line)
                self.limit = int(parts[1])
            elif lowered.startswith(".path") and self.path:
                parts = self.split(line)
                start, end = parts[1].split()
                self.path(int(start), int(end))
            elif lowered.startswith(".show"):
                parts = self.split(line)
                self.search(uid=parts[1])
            elif lowered.startswith(".topics") and self.topics:
                parts = self.split(line)
                argument = parts[1] if len(parts) > 1 else None
                self.topics(argument)
            else:
                # Everything else is a search query
                self.search(line, self.limit)
        except Exception:
            self.console.print_exception()

    def do_help(self, arg):
        """
        Prints help for a single command or for all commands.

        Args:
            arg: optional command name to show help for
        """

        commands = {
            ".limit": "(number)\t\tset the maximum number of query rows to return",
            ".path": "(start) (end)\tprints a semantic path between questions",
            ".show": "(id)\t\tprint question with specified id",
            ".topics": "(query)\t\tshows topics best matching query. if query is empty, top topics are shown",
        }

        # A known command prints only its own entry, anything else lists every command
        selected = {arg: commands[arg]} if arg in commands else commands
        for command, message in selected.items():
            self.console.print(f"{command} {message}")

        self.console.print("\nDefault mode runs a search query")

    def split(self, command, default=None):
        """
        Splits a command line into the action and its argument text at the first space.

        Args:
            command: command line
            default: value used as the argument when command has no space

        Returns:
            [action, argument] when command contains a space, (command, default) otherwise
        """

        action, separator, argument = command.partition(" ")
        if separator:
            return [action, argument]

        return (command, default)
125 |
126 |
def main():
    """
    Creates a console and runs its command loop.
    """

    console = Console()
    console.cmdloop()


if __name__ == "__main__":
    main()
137 |
--------------------------------------------------------------------------------
/src/python/codequestion/index.py:
--------------------------------------------------------------------------------
1 | """
2 | Index module
3 | """
4 |
5 | import os.path
6 | import sqlite3
7 | import sys
8 |
9 | from tqdm import tqdm
10 | from txtai.app import Application
11 | from txtai.embeddings import Embeddings
12 |
13 | from .models import Models
14 | from .tokenizer import Tokenizer
15 |
16 |
class Index:
    """
    Builds a new embeddings index.
    """

    def __call__(self, config, dbfile):
        """
        Builds and saves an embeddings index.

        Args:
            config: input configuration file
            dbfile: input SQLite file
        """

        embeddings = self.build(config, dbfile)
        embeddings.save(Models.modelPath("stackexchange"))

    def build(self, config, dbfile):
        """
        Builds an embeddings index.

        Args:
            config: input configuration file
            dbfile: input SQLite file

        Returns:
            embeddings index
        """

        # Configure embeddings index
        config = Application.read(config)

        # Resolve full path to vectors file, if necessary
        if config.get("scoring"):
            config["path"] = os.path.join(Models.vectorPath(config["path"]))

        # Create embeddings index
        embeddings = Embeddings(config)

        # Build scoring index, if scoring method provided
        if embeddings.isweighted():
            embeddings.score(self.stream(dbfile, embeddings, "Building scoring index"))

        # Build embeddings index
        embeddings.index(self.stream(dbfile, embeddings, "Building embeddings index"))

        return embeddings

    def stream(self, dbfile, embeddings, message):
        """
        Streams questions from a questions.db file. This method is a generator and will yield a row at a time.

        The database connection is closed when the generator finishes, is closed early or raises.

        Args:
            dbfile: input SQLite file
            embeddings: embeddings instance
            message: progress bar message

        Yields:
            (id, row dictionary, tags) tuple for each question
        """

        # Connection to database file
        db = sqlite3.connect(dbfile)

        try:
            db.row_factory = sqlite3.Row
            cur = db.cursor()

            # Get total number of questions
            cur.execute("SELECT count(*) from Questions")
            total = cur.fetchone()[0]

            # Query for iterating over questions.db rows
            cur.execute(
                "SELECT Id, Source, SourceId, Date, Tags, Question, QuestionUser, Answer, AnswerUser, Reference FROM Questions"
            )

            for row in tqdm(cur, total=total, desc=message):
                # Transform all keys to lowercase
                row = {k.lower(): row[k] for k in row.keys()}

                # Store answer as object
                row["object"] = row.pop("answer")

                # Text to index is the question followed by its source and tags
                row["text"] = row["question"] + " " + row["source"] + " " + row["tags"]

                # Use custom tokenizer for word vector models
                if embeddings.isweighted():
                    row["text"] = Tokenizer.tokenize(row["text"])

                # Yield document
                yield (row["id"], row, row["tags"])
        finally:
            # Free database resources, even when iteration stops early or fails
            db.close()
108 |
109 |
110 | # pylint: disable=C0103
if __name__ == "__main__":
    # Path to index configuration file
    config = sys.argv[1] if len(sys.argv) > 1 else None
    if not config or not os.path.exists(config):
        print("Path to index configuration file does not exist, exiting")
        sys.exit()

    # Path to questions.db file. This is the second argument, so at least
    # three argv entries are required before it can be read.
    dbfile = sys.argv[2] if len(sys.argv) > 2 else None
    if not dbfile or not os.path.exists(dbfile):
        print("Path to questions.db file does not exist, exiting")
        sys.exit()

    # Build index
    index = Index()
    index(config, dbfile)
127 |
--------------------------------------------------------------------------------
/src/python/codequestion/vectors.py:
--------------------------------------------------------------------------------
1 | """
2 | Vectors module
3 | """
4 |
5 | import os
6 | import os.path
7 | import sqlite3
8 | import sys
9 | import tempfile
10 |
11 | from tqdm import tqdm
12 | from txtai.vectors import WordVectors
13 |
14 | from .models import Models
15 | from .tokenizer import Tokenizer
16 |
17 |
class RowIterator:
    """
    Iterates over rows in a database query. Allows for multiple iterations.
    """

    def __init__(self, dbfile):
        """
        Initializes RowIterator.

        Args:
            dbfile: path to SQLite file
        """

        # Store database file
        self.dbfile = dbfile

        # Generator is lazy, the database is not opened until the first row is requested
        self.rows = self.stream(self.dbfile)

    def __iter__(self):
        """
        Restarts iteration with a new database query generator.

        Returns:
            this iterator
        """

        # reset the generator
        self.rows = self.stream(self.dbfile)
        return self

    def __next__(self):
        """
        Gets the next result in the current generator.

        Returns:
            tokens

        Raises:
            StopIteration: when the current generator is exhausted
        """

        # next() raises StopIteration itself once the generator is exhausted
        return next(self.rows)

    def stream(self, dbfile):
        """
        Connects to SQLite file at dbfile and yields parsed tokens for each row.

        The database connection is closed when the generator finishes, is closed early or raises.

        Args:
            dbfile: path to SQLite file

        Yields:
            tokens for each row that has at least one token
        """

        # Connection to database file
        db = sqlite3.connect(dbfile)

        try:
            cur = db.cursor()

            # Get total number of questions
            cur.execute("SELECT count(*) from Questions")
            total = cur.fetchone()[0]

            # Query for iterating over questions.db rows
            cur.execute("SELECT Question, Source, Tags FROM questions")

            for question in tqdm(cur, total=total, desc="Tokenizing input"):
                # Tokenize question, source and tags
                tokens = Tokenizer.tokenize(
                    question[0] + " " + question[1] + " " + question[2]
                )

                # Skip documents with no tokens parsed
                if tokens:
                    yield tokens
        finally:
            # Free database resources, even when iteration stops early or fails
            db.close()
93 |
94 |
class Vectors:
    """
    Methods to build a word vectors model from a questions database.
    """

    def __call__(self, dbfile, size, mincount):
        """
        Builds a word vectors model from the questions in dbfile.

        Tokens are first written to a temporary file, which is removed once the build
        finishes, whether it succeeds or fails.

        Args:
            dbfile: input SQLite file
            size: dimensions for the word vectors model
            mincount: minimum number of times a token must appear in input
        """

        # Stream tokens to temporary file
        tokens = self.tokens(dbfile)

        try:
            # Output file path
            path = Models.vectorPath(f"stackexchange-{size}d", True)

            # Build word vectors model
            WordVectors.build(tokens, size, mincount, path)
        finally:
            # Remove temporary tokens file, even when the build fails
            os.remove(tokens)

    def tokens(self, dbfile):
        """
        Iterates over each row in dbfile and writes parsed tokens to a temporary file for processing.

        Each row is written as one line of space separated tokens. The caller is responsible
        for deleting the returned file.

        Args:
            dbfile: SQLite file to read

        Returns:
            path to output file
        """

        tokens = None

        # Stream tokens to temp working file. delete=False keeps the file after it is closed.
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".txt", delete=False
        ) as output:
            # Save file path
            tokens = output.name

            for row in RowIterator(dbfile):
                output.write(" ".join(row) + "\n")

        return tokens
146 |
147 |
148 | # pylint: disable=C0103
if __name__ == "__main__":
    # First command line argument is the path to the questions.db file
    dbfile = None
    if len(sys.argv) > 1:
        dbfile = sys.argv[1]

    if not (dbfile and os.path.exists(dbfile)):
        print("Path to questions.db file does not exist, exiting")
        sys.exit()

    # Build 300 dimension vectors, keeping tokens that appear at least 3 times
    vectors = Vectors()
    vectors(dbfile, 300, 3)
159 |
--------------------------------------------------------------------------------
/src/python/codequestion/search.py:
--------------------------------------------------------------------------------
1 | """
2 | Search module
3 | """
4 |
5 | import os
6 | import os.path
7 | import re
8 |
9 | import html2markdown
10 |
11 | from rich.console import Console
12 | from rich.markdown import Markdown
13 | from txtai.embeddings import Embeddings
14 |
15 | from .models import Models
16 | from .tokenizer import Tokenizer
17 |
18 |
class Search:
    """
    Search an embeddings index.
    """

    def __init__(self):
        """
        Creates a new search action.
        """

        # Load embeddings index
        self.embeddings = self.load()
        self.console = Console()

    def __call__(self, query=None, limit=1, uid=None):
        """
        Runs a search action and prints each result.

        Args:
            query: query string
            limit: number of results to return
            uid: id to show, takes precedence over query when set
        """

        # Query prefix
        prefix = "select id, score, questionuser, question, tags, date, answeruser, object answer, reference from txtai where"

        if uid is not None:
            # ID query
            query = f"{prefix} id = '{uid}'"
        elif self.embeddings.isweighted():
            # Use custom tokenizer for word vector models
            query = Tokenizer.tokenize(query)

            # Run search and build id query
            result = self.embeddings.search(query, 1)[0] if query else {}
            query = f"""
                select id, {result.get('score')} score, questionuser, question, tags, date, answeruser, object answer, reference
                from txtai
                where id = '{result.get('id')}'
            """
        else:
            # Default similar clause query
            # NOTE(review): the query text is interpolated directly into the SQL string, a query
            # containing a single quote may break the similar() clause - confirm how txtai expects
            # quotes to be escaped
            query = f"{prefix} similar('{query}')"

        # Render results
        for result in self.embeddings.search(query, limit):
            # Show result
            self.result(result, limit)

        self.console.print()

    def load(self):
        """
        Loads an embeddings model.

        Returns:
            embeddings

        Raises:
            FileNotFoundError: when no model config file exists at the model path
        """

        path = Models.modelPath("stackexchange")

        if os.path.isfile(os.path.join(path, "config")):
            print(f"Loading model from {path}")
            embeddings = Embeddings()
            embeddings.load(path)
        else:
            print("ERROR: loading model: ensure model is installed")
            print(
                "ERROR: Pre-trained model can be installed by running python -m codequestion.download"
            )
            raise FileNotFoundError(f"Unable to load codequestion model from {path}")

        return embeddings

    def result(self, result, limit):
        """
        Renders a result row.

        Args:
            result: result row
            limit: number of results, a divider is printed after the row when greater than 1
        """

        # If score is empty, this is a direct query
        score = result["score"]
        score = score if score is not None else 1.0

        self.console.print(
            f"[bright_green]Question (by {result['questionuser']}): {result['question']} [{score:4f}][/bright_green]",
            highlight=False,
        )
        self.console.print(f"Id: {result['id']}", highlight=False)
        self.console.print(f"Last Activity: {result['date']}", highlight=False)
        self.console.print(f"Tags: {result['tags']}")
        self.console.print(f"Answer (by {result['answeruser']}):\n", highlight=False)
        self.console.print(self.markdown(result["answer"]))
        self.console.print(f"\nReference: {result['reference']}")

        # Print results divider
        if limit > 1:
            self.console.rule()

    def markdown(self, text):
        """
        Converts html text to markdown.

        Args:
            text: html text

        Returns:
            text as markdown
        """

        # Remove rel attributes as they are not supported by html2markdown
        text = re.sub(r' rel=".+?">', ">", text)

        # Convert html to markdown
        text = html2markdown.convert(text)

        # Decode [<>&] character entities. &amp; is decoded last so that an escaped
        # entity such as &amp;lt; is not decoded twice.
        text = text.replace("&lt;", "<").replace("&gt;", ">").replace("&amp;", "&")

        # Wrap as Rich Markdown
        return Markdown(text)
144 |
--------------------------------------------------------------------------------
/src/python/codequestion/etl/stackexchange/xml2db.py:
--------------------------------------------------------------------------------
1 | """
2 | XML2DB module
3 | """
4 |
5 | import os
6 | import xml.etree.cElementTree as etree
7 | import sqlite3
8 |
9 |
class XML2DB:
    """
    Converts a filtered posts xml file to a staging SQLite database for processing.
    """

    # Questions schema
    QUESTIONS = {
        "Id": "INTEGER PRIMARY KEY",
        "AcceptedAnswerId": "INTEGER",
        "CreationDate": "DATETIME",
        "LastActivityDate": "DATETIME",
        "Score": "INTEGER",
        "ViewCount": "INTEGER",
        "OwnerUserId": "INTEGER",
        "OwnerDisplayName": "TEXT",
        "Title": "TEXT",
        "Tags": "TEXT",
        "AnswerCount": "INTEGER",
        "CommentCount": "INTEGER",
        "FavoriteCount": "INTEGER",
        "ClosedDate": "DATETIME",
    }

    # Answers schema
    ANSWERS = {
        "Id": "INTEGER PRIMARY KEY",
        "ParentId": "INTEGER",
        "CreationDate": "DATETIME",
        "Score": "INTEGER",
        "Body": "TEXT",
        "OwnerUserId": "INTEGER",
        "OwnerDisplayName": "TEXT",
    }

    # SQL statements
    CREATE_TABLE = "CREATE TABLE IF NOT EXISTS {table} ({fields})"
    INSERT_ROW = "INSERT INTO {table} ({columns}) VALUES ({values})"

    def __call__(self, infile, dbfile):
        """
        Converts xml infile to SQLite dbfile. An existing dbfile is deleted first.

        Args:
            infile: input xml file
            dbfile: output sqlite file
        """

        print(f"Converting {infile} to {dbfile}")

        # Delete existing file
        if os.path.exists(dbfile):
            os.remove(dbfile)

        # Create new database
        db = sqlite3.connect(dbfile)

        try:
            # Create database tables if necessary
            self.create(db, XML2DB.QUESTIONS, "questions")
            self.create(db, XML2DB.ANSWERS, "answers")

            count = 0
            with open(infile, encoding="utf-8") as xml:
                context, root = self.xmlstream(xml)

                for event, row in context:
                    if event == "end":
                        # Execute insert statement
                        self.insert(db, row)

                        count += 1
                        if count % 10000 == 0:
                            print(f"Inserted {count} rows")

                        # Free memory
                        root.clear()

            print(f"Total rows inserted: {count}")

            # Commit changes
            db.commit()
        finally:
            # Release the connection, even when the conversion fails
            db.close()

    def create(self, db, table, name):
        """
        Creates a SQLite table. A failure is printed, it is not raised.

        Args:
            db: database connection
            table: table schema
            name: table name
        """

        columns = [f"{column} {ctype}" for column, ctype in table.items()]
        create = XML2DB.CREATE_TABLE.format(table=name, fields=", ".join(columns))

        # pylint: disable=W0703
        try:
            db.execute(create)
        except Exception as e:
            print(create)

            # Format the exception, str + Exception concatenation raises TypeError
            print(f"Failed to create table: {e}")

    def xmlstream(self, xml):
        """
        Creates a xml stream for iterative parsing.

        Args:
            xml: input file

        Returns:
            context, root
        """

        # Parse the tree
        context = etree.iterparse(xml, events=("start", "end"))

        # turn it into an iterator
        context = iter(context)

        # get the root element, this consumes the first (start) event
        _, root = next(context)

        return context, root

    def insert(self, db, row):
        """
        Inserts row into database. Rows without a PostTypeId attribute are ignored.

        Args:
            db: database connection
            row: xml element with the row values as attributes
        """

        if "PostTypeId" in row.attrib:
            # PostType="1" - Question, PostType="2" - Answer
            question = row.attrib["PostTypeId"] == "1"
            table = XML2DB.QUESTIONS if question else XML2DB.ANSWERS
            name = "questions" if question else "answers"

            # Build insert prepared statement
            columns = list(table)
            insert = XML2DB.INSERT_ROW.format(
                table=name,
                columns=", ".join(columns),
                values=", ".join("?" * len(columns)),
            )

            # Execute insert statement
            db.execute(insert, self.values(table, row, columns))

    def values(self, table, row, columns):
        """
        Formats and converts row into database types based on table schema.

        Missing or empty INTEGER values are stored as 0, other missing values as None.

        Args:
            table: table schema
            row: xml element with the row values as attributes
            columns: column names

        Returns:
            Database schema formatted list of row values
        """

        values = []
        for column in columns:
            # Get column value, None when the attribute is not present
            value = row.attrib.get(column)

            if table[column].startswith("INTEGER"):
                values.append(int(value) if value else 0)
            elif table[column] == "BOOLEAN":
                values.append(1 if value == "TRUE" else 0)
            else:
                values.append(value)

        return values
186 |
--------------------------------------------------------------------------------
/src/python/codequestion/evaluate.py:
--------------------------------------------------------------------------------
1 | """
2 | Evaluate module
3 | """
4 |
5 | import argparse
6 | import csv
7 | import os
8 |
9 | from scipy.stats import pearsonr, spearmanr
10 | from tqdm import tqdm
11 | from txtai.embeddings import Embeddings
12 |
13 | from .models import Models
14 | from .tokenizer import Tokenizer
15 |
16 |
class StackExchange:
    """
    Evaluation against the Stack Exchange query-answer dataset.
    """

    def __call__(self, path, method):
        """
        Runs each test query against the model and prints the mean reciprocal rank.

        Args:
            path: path to tests
            method: run method, bm25, tfidf or sif evaluate a keyword index built from the model content
        """

        # Load model
        embeddings = self.load()

        # Reciprocal rank for each test query
        ranks = []

        # Replace the model with a keyword index over the same content
        if method in ("bm25", "tfidf", "sif"):
            scoring = Embeddings({"keyword": True, "content": True})
            scoring.index(self.stream(embeddings, "Building keyword index"))
            embeddings = scoring

        # Run test data
        queries = os.path.join(path, "stackexchange", "query.txt")
        with open(queries, encoding="utf-8") as rows:
            for row in rows:
                query, sourceid, source, _ = row.split("|", 3)
                print(query, sourceid, source)

                # Run search
                results = self.search(embeddings, query)

                # Position of the expected row within results, -1 when not found
                index = -1
                for x, result in enumerate(results):
                    matches = (
                        int(sourceid) == result["sourceid"]
                        and source == result["source"]
                    )
                    if matches:
                        index = x

                # Reciprocal rank, 0.0 when the expected row is not in the results
                if index != -1:
                    calc = 1 / (1 + index)
                else:
                    calc = 0.0

                print(calc)
                ranks.append(calc)

        mrr = sum(ranks) / len(ranks)
        print("Mean Reciprocal Rank = ", mrr)

    def load(self):
        """
        Loads the stackexchange embeddings model.

        Returns:
            embeddings
        """

        model = Embeddings()
        model.load(Models.modelPath("stackexchange"))

        return model

    def stream(self, embeddings, message):
        """
        Streams content from an embeddings index in batches. This method is a generator.

        Args:
            embeddings: embeddings index
            message: progress bar message
        """

        batch = 1000
        with tqdm(total=embeddings.count(), desc=message) as progress:
            for offset in range(0, embeddings.count(), batch):
                page = embeddings.search(
                    f"select id, text, tags, source, sourceid from txtai limit {batch} offset {offset}"
                )
                for result in page:
                    yield (result["id"], result, None)

                progress.update(batch)

    def search(self, embeddings, query):
        """
        Runs query and returns the source id and source for up to 10 results.

        Args:
            embeddings: embeddings instance
            query: query to run

        Returns:
            search results
        """

        if not embeddings.isweighted():
            # Select source id + source with standard similar clause
            return embeddings.search(
                f"select sourceid, source from txtai where similar('{query}') limit 10"
            )

        # Use custom tokenizer for word vector models
        uids = [row["id"] for row in embeddings.search(Tokenizer.tokenize(query), 10)]

        # Get source id + source for each result
        return [
            embeddings.search(f"select sourceid, source from txtai where id = {uid}")[0]
            for uid in uids
        ]
138 |
139 |
class STS:
    """
    STS Benchmark Dataset
    General text similarity

    http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark
    """

    def __call__(self, path, method):
        """
        Loads the stackexchange model and tests it against the STS benchmark.

        Args:
            path: path to tests
            method: run method
        """

        # Embeddings instance used to calculate similarity
        model = Embeddings()
        model.load(Models.modelPath("stackexchange"))

        self.test(model, path, method)

    def test(self, embeddings, path, method):
        """
        Scores each STS sentence pair with embeddings and prints the correlation with the ground truth.

        Args:
            embeddings: embeddings instance
            path: path to tests
            method: run method, "dev" reads the dev file, anything else reads the test file
        """

        # Test file path
        split = "dev" if method == "dev" else "test"
        path = os.path.join(path, "stsbenchmark", f"sts-{split}.csv")

        # Calculated scores and ground truth labels
        scores, labels = [], []

        for _, label, text1, text2 in self.read(path):
            # Use custom tokenizer for word vector models
            if embeddings.isweighted():
                text1 = Tokenizer.tokenize(text1)
                text2 = Tokenizer.tokenize(text2)

            # Pairs with an empty side are left out of both lists
            if not (text1 and text2):
                continue

            scores.append(embeddings.similarity(text1, [text2])[0][1])

            # Ground truth score normalized between 0 - 1
            labels.append(label)

        print("Pearson score =", pearsonr(scores, labels))
        print("Spearman score =", spearmanr(scores, labels))

    def read(self, path):
        """
        Reads a tab separated STS data file.

        Args:
            path: full path to file

        Returns:
            list of (id, normalized score, string 1, string 2) tuples, ids start at 1
        """

        with open(path, encoding="utf-8") as f:
            data = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)

            # Column Index-Name: 4-score, 5-string 1, 6-string 2
            # Score is normalized from 0-5 to 0-1. 1 being most similar.
            return [
                (x, float(row[4]) / 5.0, row[5], row[6])
                for x, row in enumerate(data, 1)
            ]
229 |
230 |
if __name__ == "__main__":
    # Command line parser
    parser = argparse.ArgumentParser(description="Evaluate")
    parser.add_argument(
        "-s",
        "--source",
        required=True,
        help="data source",
        metavar="SOURCE",
    )
    parser.add_argument(
        "-p",
        "--path",
        required=True,
        help="path to test files",
        metavar="PATH",
    )
    parser.add_argument(
        "-m",
        "--method",
        help="run method",
        metavar="METHOD",
    )

    # Parse command line arguments
    args = parser.parse_args()

    # "sts" (any case) selects the STS benchmark, every other source runs StackExchange
    if args.source.lower() == "sts":
        action = STS()
    else:
        action = StackExchange()

    # Run eval action
    action(args.path, args.method)
250 |
--------------------------------------------------------------------------------
/src/python/codequestion/etl/stackexchange/db2qa.py:
--------------------------------------------------------------------------------
1 | """
2 | DB2QA module
3 | """
4 |
5 | import os
6 | import re
7 | import sqlite3
8 |
9 |
10 | class DB2QA:
11 | """
12 | Converts multiple staging SQLite database (questions, answers in separate tables per source) into a consolidated SQLite database
13 | with a single questions table.
14 | """
15 |
16 | # Questions schema
17 | QUESTIONS = {
18 | "Id": "INTEGER PRIMARY KEY",
19 | "Source": "TEXT",
20 | "SourceId": "INTEGER",
21 | "Date": "DATETIME",
22 | "Tags": "TEXT",
23 | "Question": "TEXT",
24 | "QuestionUser": "TEXT",
25 | "Answer": "TEXT",
26 | "AnswerUser": "TEXT",
27 | "Reference": "TEXT",
28 | }
29 |
30 | # List of sources
31 | SOURCES = {
32 | "ai": "https://ai.stackexchange.com",
33 | "android": "https://android.stackexchange.com",
34 | "apple": "https://apple.stackexchange.com",
35 | "arduino": "https://arduino.stackexchange.com",
36 | "askubuntu": "https://askubuntu.com",
37 | "avp": "https://avp.stackexchange.com",
38 | "codereview": "https://codereview.stackexchange.com",
39 | "cs": "https://cs.stackexchange.com",
40 | "datascience": "http://datascience.stackexchange.com",
41 | "dba": "https://dba.stackexchange.com",
42 | "devops": "https://devops.stackexchange.com",
43 | "dsp": "https://dsp.stackexchange.com",
44 | "raspberrypi": "https://raspberrypi.stackexchange.com",
45 | "reverseengineering": "https://reverseengineering.stackexchange.com",
46 | "scicomp": "https://scicomp.stackexchange.com",
47 | "security": "https://security.stackexchange.com",
48 | "serverfault": "https://serverfault.com",
49 | "stackoverflow": "https://stackoverflow.com",
50 | "stats": "https://stats.stackexchange.com",
51 | "superuser": "https://superuser.com",
52 | "unix": "https://unix.stackexchange.com",
53 | "vi": "https://vi.stackexchange.com",
54 | "wordpress": "https://wordpress.stackexchange.com",
55 | }
56 |
57 | # SQL statements
58 | CREATE_TABLE = "CREATE TABLE IF NOT EXISTS {table} ({fields})"
59 | INSERT_ROW = "INSERT INTO {table} ({columns}) VALUES ({values})"
60 | CREATE_SOURCE_INDEX = "CREATE INDEX source ON questions(Source, SourceId)"
61 | CREATE_TEXT_INDEX = "CREATE VIRTUAL TABLE search USING fts5(Id, Question, Tags)"
62 | INSERT_TEXT_ROWS = "INSERT INTO search SELECT Id, Question, Tags from questions"
63 |
64 | def __call__(self, databases, qafile):
65 | """
66 | Executes a run to convert a list of databases to a single consolidated questions db file.
67 |
68 | Args:
69 | databases: paths to input databases
70 | qafile: output database path
71 | """
72 |
73 | print(f"Converting {databases} to {qafile}")
74 |
75 | # Delete existing file
76 | if os.path.exists(qafile):
77 | os.remove(qafile)
78 |
79 | # Create output database
80 | qa = sqlite3.connect(qafile)
81 |
82 | # Create questions table
83 | self.create(qa, DB2QA.QUESTIONS, "questions")
84 |
85 | # Row index
86 | index = 0
87 |
88 | for dbfile in databases:
89 | print("Processing " + dbfile)
90 |
91 | # Create source name
92 | source = os.path.splitext(os.path.basename(dbfile))[0].lower()
93 |
94 | # Input database
95 | db = sqlite3.connect(dbfile)
96 | cur = db.cursor()
97 |
98 | cur.execute(
99 | "SELECT Id, AcceptedAnswerId, OwnerUserId, OwnerDisplayName, LastActivityDate, Title, Tags FROM questions"
100 | )
101 |
102 | # Need to select all rows to allow execution of insert statements
103 | for question in cur.fetchall():
104 | # Find accepted answer
105 | answer = self.find(question, cur)
106 | if answer:
107 | # Combine into single question row
108 | self.insert(qa, index, source, question, answer)
109 |
110 | index += 1
111 | if index % 10000 == 0:
112 | print(f"Inserted {index} rows")
113 |
114 | db.close()
115 |
116 | print(f"Total rows inserted: {index}")
117 |
118 | # Create indices
119 | for statement in [
120 | DB2QA.CREATE_SOURCE_INDEX,
121 | DB2QA.CREATE_TEXT_INDEX,
122 | DB2QA.INSERT_TEXT_ROWS,
123 | ]:
124 | qa.execute(statement)
125 |
126 | # Commit changes and close
127 | qa.commit()
128 | qa.close()
129 |
130 | def create(self, db, table, name):
131 | """
132 | Creates a SQLite table.
133 |
134 | Args:
135 | db: database connection
136 | table: table schema
137 | name: table name
138 | """
139 |
140 | columns = [f"{name} {ctype}" for name, ctype in table.items()]
141 | create = DB2QA.CREATE_TABLE.format(table=name, fields=", ".join(columns))
142 |
143 | # pylint: disable=W0703
144 | try:
145 | db.execute(create)
146 | except Exception as e:
147 | print(create)
148 | print("Failed to create table: " + e)
149 |
150 | def find(self, question, cur):
151 | """
152 | Finds a corresponding answer for the input question.
153 |
154 | Args:
155 | question: input question row
156 | cur: database cursor
157 |
158 | Returns:
159 | Answer row if found, None otherwise
160 | """
161 |
162 | # Query for accepted answer
163 | cur.execute(
164 | "SELECT Body, OwnerUserId, OwnerDisplayName from answers where Id = ?",
165 | [question[1]],
166 | )
167 | answer = cur.fetchone()
168 |
169 | if answer and answer[0]:
170 | # Check if answer has a message body
171 | return answer
172 |
173 | return None
174 |
175 | def insert(self, db, index, source, question, answer):
176 | """
177 | Builds and inserts a consolidated question.
178 |
179 | Args:
180 | db: database connection
181 | index: row index
182 | source: question source
183 | question: question row
184 | answer: answer row
185 | """
186 |
187 | table = DB2QA.QUESTIONS
188 |
189 | # Build insert prepared statement
190 | columns = [name for name, _ in table.items()]
191 | insert = DB2QA.INSERT_ROW.format(
192 | table="questions",
193 | columns=", ".join(columns),
194 | values=("?, " * len(columns))[:-2],
195 | )
196 |
197 | # Build row of insert values
198 | row = self.build(index, source, question, answer)
199 |
200 | # Execute insert statement
201 | db.execute(insert, self.values(table, row, columns))
202 |
def build(self, index, source, question, answer):
    """
    Builds a consolidated question row.

    Args:
        index: row index
        source: question source, used as a key into DB2QA.SOURCES to build the reference URL
        question: question row. Index 0 is the source id, 2 the user id, 3 the user display name,
                  4 the date, 5 the question text and 6 the raw tag string
        answer: answer row of (body, user id, user display name)

    Returns:
        row tuple of (Id, Source, SourceId, Date, Tags, Question, QuestionUser, Answer, AnswerUser, Reference)
    """

    # Parse tags into list of tags. A question without tags gets an empty list, which joins to an
    # empty Tags string below. The previous None fallback made " ".join(tags) raise a TypeError.
    tags = re.sub(r"[<>]", " ", question[6]).split() if question[6] else []

    # Get user display name, fallback to user id
    quser = question[3] if question[3] else str(question[2])
    auser = answer[2] if answer[2] else str(answer[1])

    # Create URL reference
    reference = f"{DB2QA.SOURCES[source]}/questions/{question[0]}"

    # Id, Source, SourceId, Date, Tags, Question, QuestionUser, Answer, AnswerUser, Reference
    return (
        index,
        source,
        question[0],
        question[4],
        " ".join(tags),
        question[5],
        quser,
        answer[0],
        auser,
        reference,
    )
240 |
def values(self, table, row, columns):
    """
    Converts a row into database types using the table schema.

    Args:
        table: table schema, maps column name to column type
        row: row tuple, ordered the same way as columns
        columns: column names

    Returns:
        list of row values converted to the schema types
    """

    def convert(position, name):
        # Read the raw value first, then look up the declared column type
        raw = row[position]
        datatype = table[name]

        # Integer columns: empty/missing values are stored as 0
        if datatype.startswith("INTEGER"):
            return int(raw) if raw else 0

        # Boolean columns: only the literal "TRUE" maps to 1
        if datatype == "BOOLEAN":
            return 1 if raw == "TRUE" else 0

        # All other column types are passed through unchanged
        return raw

    return [convert(position, name) for position, name in enumerate(columns)]
267 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | Copyright 2020- NeuML LLC
179 |
180 | Licensed under the Apache License, Version 2.0 (the "License");
181 | you may not use this file except in compliance with the License.
182 | You may obtain a copy of the License at
183 |
184 | http://www.apache.org/licenses/LICENSE-2.0
185 |
186 | Unless required by applicable law or agreed to in writing, software
187 | distributed under the License is distributed on an "AS IS" BASIS,
188 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
189 | See the License for the specific language governing permissions and
190 | limitations under the License.
--------------------------------------------------------------------------------
/test/stackexchange/query.txt:
--------------------------------------------------------------------------------
1 | android get screen dimensions|4743116|stackoverflow|Get screen width and height in Android|android
2 | android unique device id|2785485|stackoverflow|Is there a unique Android device ID?|android uniqueidentifier
3 | apt list all installed packages|17823|askubuntu|How to list all installed packages|apt package-management
4 | assembly pop stack|4584089|stackoverflow|What is the function of the push / pop instructions used on registers in x86 assembly?|assembly x86 stack terminology
5 | bash create uuid|103359|serverfault|How to create a UUID in bash?|bash uuid
6 | bash current time milliseconds|151109|serverfault|How do I get the current Unix time in milliseconds in Bash?|bash unix time
7 | bash list files|7265272|stackoverflow|How to list files in directory using bash?|bash
8 | bash sort du by output size|62411|serverfault|How can I sort du -h output by size|linux bash du gnu
9 | c cast malloc|605845|stackoverflow|Do I cast the result of malloc?|c malloc casting
10 | c++ convert string lower case|313970|stackoverflow|How to convert std::string to lower case?|c++ string c++-standard-library tolower
11 | centos print current version|54987|unix|How to determine CentOS version?|centos version
12 | centos upgrade 5 to 6|309053|superuser|How to upgrade CentOS 5.6 to 6.0?|linux centos
13 | centos when does cron.daily run|135906|serverfault|When does `cron.daily` run?|linux centos redhat cron
14 | c++ read file into ascii string|2602013|stackoverflow|Read whole ASCII file into C++ std::string|c++ string caching file-io standard-library
15 | c++ sleep|4184468|stackoverflow|Sleep for milliseconds|c++ linux sleep
16 | cuda gcc version|6622454|stackoverflow|CUDA incompatible with my gcc version|gcc cuda debian
17 | cuda get version|9727688|stackoverflow|How to get the cuda version?|cuda
18 | curl show http status|272265|superuser|Getting curl to output HTTP status code?|http curl status
19 | database inner join vs outer join|38549|stackoverflow|What is the difference between "INNER JOIN" and "OUTER JOIN"?|sql database join inner-join outer-join
20 | docker vs virtual machine|16047306|stackoverflow|How is Docker different from a virtual machine?|docker containers virtual-machine virtualization
21 | dpkg install deb file|159094|unix|How to install a deb file, by dpkg -i or by apt?|software-installation apt dpkg deb
22 | ec2 delete terminated instance|393417|serverfault|Delete Amazon EC2 terminated instance|amazon-ec2
23 | fedora print current version|540603|stackoverflow|How can I find the version of the Fedora I use?|linux fedora
24 | fedora test fstab|174181|serverfault|How do you validate fstab without rebooting?|fedora mount fstab
25 | gcc get assembly output|137038|stackoverflow|How do you get assembler output from C/C++ source in gcc?|c++ c debugging gcc assembly
26 | gcc vs g++|172587|stackoverflow|What is the difference between g++ and gcc?|c++ gcc g++
27 | git add empty directory|115983|stackoverflow|How can I add an empty directory to a Git repository?|git directory git-add
28 | git discard unstaged changes|52704|stackoverflow|How do I discard unstaged changes in Git?|git version-control
29 | git pull vs fetch|292357|stackoverflow|What is the difference between 'git pull' and 'git fetch'?|git version-control git-pull git-fetch
30 | git undo most recent local commit|927358|stackoverflow|How do I undo the most recent local commits in Git?|git version-control git-commit undo pre-commit
31 | java fix nullpointerexception|218384|stackoverflow|What is a NullPointerException, and how do I fix it?|java nullpointerexception
32 | java get random number|5887709|stackoverflow|Getting random numbers in Java|java random
33 | java hashmap vs hashtable|40471|stackoverflow|What are the differences between a HashMap and a Hashtable in Java?|java collections hashmap hashtable
34 | java heap space error|37335|stackoverflow|How to deal with "java.lang.OutOfMemoryError: Java heap space" error?|java java-ee jvm out-of-memory heap-memory
35 | java outofmemoryerror poi|6069847|stackoverflow|java.lang.OutOfMemoryError: Java heap space while reading excel with Apache POI|java apache-poi
36 | java print date|26717733|stackoverflow|print current date in java|java time
37 | java round decimal|11701399|stackoverflow|round up to 2 decimal places in java?|java
38 | javascript check if string contains substring|1789945|stackoverflow|How to check whether a string contains a substring in JavaScript?|javascript string substring string-matching
39 | javascript create timestamp|221294|stackoverflow|How do you get a timestamp in JavaScript?|javascript date datetime timestamp unix-timestamp
40 | javascript encode url|332872|stackoverflow|Encode URL in JavaScript?|javascript url encoding
41 | javascript for-each in array|9329446|stackoverflow|For-each over an array in JavaScript?|javascript arrays loops foreach iteration
42 | javascript generate uuid|105034|stackoverflow|Create GUID / UUID in JavaScript?|javascript guid uuid
43 | javascript html5 local storage add object|2010892|stackoverflow|How to store objects in HTML5 localStorage/sessionStorage|javascript html local-storage
44 | javascript include file in another file|950087|stackoverflow|How do I include a JavaScript file in another JavaScript file?|javascript file import include
45 | javascript redirect to another website|503093|stackoverflow|How do I redirect to another webpage?|javascript jquery redirect
46 | javascript remove element from array|5767325|stackoverflow|How can I remove a specific item from an array?|javascript arrays
47 | javascript replace all occurrences in string|1144783|stackoverflow|How do I replace all occurrences of a string in JavaScript?|javascript string replace
48 | javascript validate email address|46155|stackoverflow|How can I validate an email address in JavaScript?|javascrit html regex email-validation
49 | java array sort|8938235|stackoverflow|Sort an array in Java|java arrays
50 | java split string|3481828|stackoverflow|How do I split a string in Java?|java string split
51 | java stringbuilder vs stringbuffer|355089|stackoverflow|Difference between StringBuilder and StringBuffer|java stringbuilder stringbuffer
52 | java string to int|5585779|stackoverflow|How do I convert a String to an int in Java?|java string int type-conversion
53 | java wait vs sleep|1036754|stackoverflow|Difference between "wait()" and "sleep()" in Java|java multithreading wait sleep java-threads
54 | java ways to iterate list|18410035|stackoverflow|Ways to iterate over a list in Java|java loops collections iteration
55 | java write file|2885173|stackoverflow|How do I create a file and write to it in Java?|java file-io
56 | json comments|244777|stackoverflow|Can comments be used in JSON?|json comments
57 | json proper content type|477816|stackoverflow|Which JSON content type do I use?|json mime-types content-type
58 | linux check if port open|309052|serverfault|Check if port is open or closed on a Linux server?|linux port telnet
59 | linux find files with specific text|16956810|stackoverflow|How to find all files containing specific text (string) on Linux?|linux text grep directory find
60 | ping specific port|309357|serverfault|Ping a Specific Port|ping
61 | psql list all databases and tables|1285|dba|How do I list all databases and tables using psql?|postgresql tools psql command-line
62 | python add new keys to dictionary|1024847|stackoverflow|How can I add new keys to a dictionary?|python dictionary lookup
63 | python aes decryption|12524994|stackoverflow|Encrypt & Decrypt using PyCrypto AES 256|python encryption padding pycrypto initialization-vector
64 | python call external script|9318581|stackoverflow|Python - how do I call external python programs?|python call external
65 | python check if list empty|53513|stackoverflow|How do I check if a list is empty?|python list
66 | python clone list|2612802|stackoverflow|How do I clone a list so that it doesn't change unexpectedly after assignment?|python list reference copy clone
67 | python contains string|3437059|stackoverflow|Does Python have a string 'contains' substring method?|python string substring contains
68 | python convert int to bytes|21017698|stackoverflow|Converting int to bytes in Python 3|python python-3.x
69 | python google sheets api|56084171|stackoverflow|Accessing Google Sheets Api with Python|stackoverflow python google-sheets-api
70 | python opencv draw rectangle|23720875|stackoverflow|How to draw a rectangle around a region of interest in python|python opencv computer-vision draw
71 | python parse csv|12296585|stackoverflow|Python Parse CSV Correctly|python parsing csv
72 | python parse float|379906|stackoverflow|How do I parse a string to a float or int?|python parsing floating-point type-conversion integer
73 | python parse json|7771011|stackoverflow|How to parse data in JSON format?|python json parsing
74 | python pdf extract text|15583535|stackoverflow|How to extract text from a PDF file in Python?|python pypdf
75 | python read wav file|2060628|stackoverflow|Reading *.wav files in Python|python audio wav wave
76 | python sort dictionary by value|613183|stackoverflow|How do I sort a dictionary by value?|python sorting dictionary
77 | python staticmethod vs classmethod|136097|stackoverflow|Difference between @staticmethod and @classmethod|python oop methods python-decorators
78 | python utc to localtime|4770297|stackoverflow|Convert UTC datetime string to local datetime|python datetime utc localtime
79 | python yield generator|41136410|stackoverflow|Python `yield from`, or return a generator?|python function return generator
80 | restful programming|671118|stackoverflow|What is RESTful programming?|rest http architecture definition
81 | rest put vs post|630453|stackoverflow|What is the difference between POST and PUT in HTTP?|http rest post put
82 | ruby shell command|2232|stackoverflow|How to call shell commands from Ruby|ruby shell interop
83 | ruby switch statement|948135|stackoverflow|How to write a switch statement in Ruby|ruby switch-statement conditional-statements
84 | scp recursive copy directories|264595|serverfault|Can scp copy directories recursively?|linux scp
85 | server room identify burning smell|496139|serverfault|Something is burning in the server room; how can I quickly identify what it is?|hardware
86 | sftp port|74176|serverfault|What port does SFTP use?|sftp
87 | ssh automate script with password|241588|serverfault|How to automate SSH login with password?|ssh password automation
88 | ssh diff|59140|serverfault|How do diff over ssh?|diff
89 | svd for pca|134282|stats|Relationship between SVD and PCA. How to use SVD to perform PCA?|pca dimensionality-reduction matrix svd faq
90 | tensorflow install ubuntu 14.04|41875915|stackoverflow|Install tensorflow on Ubuntu 14.04|python python-2.7 ubuntu tensorflow pip
91 | ubuntu show current version|12493|askubuntu|How can I find the version of Ubuntu that is installed?|versions release-management
92 | ubuntu install xfce|116602|askubuntu|How to install Xfce desktop environment?|xfce
93 | unzip zip file terminal|86849|askubuntu|How to unzip a zip file from the Terminal?|command-line zip
94 | vi copy text clipboard|84|vi|How can I copy text to the system clipboard from Vim?|cut-copy-paste os-clipboard
95 | windows check cpu temperature|395434|superuser|How can I check the temperature of my CPU in Windows?|windows cpu temperature
96 | windows compare pdf files|46123|superuser|How to compare the differences between two PDF files on Windows?|windows pdf file-comparison
97 | windows ctrl+alt+delete remote desktop|57222|serverfault|How to send ctrl+alt+del using Remote Desktop?|windows remote-desktop
98 | windows list running processes command line|914782|superuser|How do you list all processes on the command line in Windows?|windows command-line
99 | windows sudo|9652720|stackoverflow|How to run 'sudo' command in windows|windows
100 | windows wireless keyboard toaster|792607|superuser|Why does Windows think that my wireless keyboard is a toaster?|windows-7 device-manager
101 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Semantic search for developers
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 | -------------------------------------------------------------------------------------------------------------------------------------------------------
28 |
29 | codequestion is a semantic search application for developer questions.
30 |
31 | 
32 |
33 | Developers typically have a web browser window open while they work and run web searches as questions arise. With codequestion, this can be done from a local context. This application executes similarity queries to find similar questions to the input query.
34 |
35 | The default model for codequestion is built off the [Stack Exchange Dumps on archive.org](https://archive.org/details/stackexchange). Once a model is installed, codequestion runs locally, no network connection is required.
36 |
37 | 
38 | 
39 |
40 | codequestion is built with Python 3.8+ and [txtai](https://github.com/neuml/txtai).
41 |
42 | ## Installation
43 |
44 | The easiest way to install is via pip and PyPI
45 |
46 | ```
47 | pip install codequestion
48 | ```
49 |
50 | Python 3.8+ is supported. Using a Python [virtual environment](https://docs.python.org/3/library/venv.html) is recommended.
51 |
52 | codequestion can also be installed directly from GitHub to access the latest, unreleased features.
53 |
54 | ```
55 | pip install git+https://github.com/neuml/codequestion
56 | ```
57 |
58 | See [this link](https://neuml.github.io/txtai/install/#environment-specific-prerequisites) for environment-specific troubleshooting.
59 |
60 | ## Download a model
61 |
62 | Once codequestion is installed, a model needs to be downloaded.
63 |
64 | ```
65 | python -m codequestion.download
66 | ```
67 |
68 | The model will be stored in ~/.codequestion/
69 |
70 | The model can also be manually installed if the machine doesn't have direct internet access. The default model is pulled from the [GitHub release page](https://github.com/neuml/codequestion/releases)
71 |
72 | ```
73 | unzip cqmodel.zip -d ~/.codequestion
74 | ```
75 |
76 | ## Search
77 |
78 | Start up a codequestion shell to get started.
79 |
80 | ```
81 | codequestion
82 | ```
83 |
84 | A prompt will appear. Queries can be typed into the console. Type `help` to see all available commands.
85 |
86 | 
87 |
88 | ## Topics
89 |
90 | The latest release integrates [txtai 5.0](https://medium.com/neuml/whats-new-in-txtai-5-0-e5c75a13b101), which has support for semantic graphs.
91 |
92 | Semantic graphs add support for topic modeling and path traversal. Topics organize questions into groups with similar concepts. Path traversal uses the semantic graph to show how two potentially disparate entries are connected. An example covering both topic and path traversal is shown below.
93 |
94 | 
95 |
96 | ## VS Code
97 |
98 | A codequestion prompt can be started within Visual Studio Code. This enables asking coding questions right from your IDE.
99 |
100 | Run `` Ctrl+` `` to open a new terminal then type `codequestion`.
101 |
102 | 
103 |
104 | ## API service
105 |
106 | codequestion builds a standard txtai embeddings index. As such, it supports hosting the index via a [txtai API service](https://neuml.github.io/txtai/api).
107 |
108 | Running the following:
109 |
110 | _app.yml_
111 | ```yaml
112 | path: /home/user/.codequestion/models/stackexchange/
113 | embeddings:
114 | ```
115 |
116 | ```
117 | # Install API extra
118 | pip install txtai[api]
119 |
120 | # Start API
121 | CONFIG=app.yml uvicorn "txtai.api:app"
122 |
123 | # Test API
124 | curl "http://127.0.0.1:8000/search?query=python+query+sqlite&limit=1"
125 | ```
126 |
127 | Outputs:
128 | ```json
129 | [{
130 | "id":"616429",
131 | "text":"How to fetch data from sqlite using python? stackoverflow python sqlite",
132 | "score":0.8401689529418945
133 | }]
134 | ```
135 |
136 | Additional metadata fields can be pulled back with SQL statements.
137 |
138 | ```
139 | curl \
140 | --get \
141 | --data-urlencode "query=select id, date, tags, question, score from txtai where similar('python query sqlite')" \
142 | --data-urlencode "limit=1" \
143 | "http://127.0.0.1:8000/search"
144 | ```
145 |
146 | ```json
147 | [{
148 | "id":"616429",
149 | "date":"2022-05-23T10:45:40.397",
150 | "tags":"python sqlite",
151 | "question":"How to fetch data from sqlite using python?",
152 | "score":0.8401689529418945
153 | }]
154 | ```
155 |
156 | ## Tech overview
157 | The following is an overview covering how this project works.
158 |
159 | ### Process the raw data dumps
160 | The raw 7z XML dumps from Stack Exchange are processed through a series of steps (see [building a model](#build-a-model)). Only highly scored questions with accepted answers are retrieved for storage in the model. Questions and answers are consolidated into a single SQLite file called questions.db. The schema for questions.db is below.
161 |
162 | *questions.db schema*
163 |
164 | Id INTEGER PRIMARY KEY
165 | Source TEXT
166 | SourceId INTEGER
167 | Date DATETIME
168 | Tags TEXT
169 | Question TEXT
170 | QuestionUser TEXT
171 | Answer TEXT
172 | AnswerUser TEXT
173 | Reference TEXT
174 |
175 | ### Index
176 | codequestion builds a txtai embeddings index for questions.db. Each question in the questions.db schema is vectorized with a sentence-transformers model. Once questions.db is converted to a collection of sentence embeddings, the embeddings are normalized and stored in Faiss, which enables fast similarity searches.
177 |
178 | ### Query
179 | codequestion tokenizes each query using the same method as during indexing. Those tokens are used to build a sentence embedding. That embedding is queried against the Faiss index to find the most similar questions.
180 |
181 | ## Build a model
182 | The following steps show how to build a codequestion model using Stack Exchange archives.
183 |
184 | _This is not necessary if using the default model from the [GitHub release page](https://github.com/neuml/codequestion/releases)_
185 |
186 | 1.) Download files from Stack Exchange: https://archive.org/details/stackexchange
187 |
188 | 2.) Place the selected files into a directory structure like the one shown below (the current process requires all of these files).
189 |
190 | - stackexchange/ai/ai.stackexchange.com.7z
191 | - stackexchange/android/android.stackexchange.com.7z
192 | - stackexchange/apple/apple.stackexchange.com.7z
193 | - stackexchange/arduino/arduino.stackexchange.com.7z
194 | - stackexchange/askubuntu/askubuntu.com.7z
195 | - stackexchange/avp/avp.stackexchange.com.7z
196 | - stackexchange/codereview/codereview.stackexchange.com.7z
197 | - stackexchange/cs/cs.stackexchange.com.7z
198 | - stackexchange/datascience/datascience.stackexchange.com.7z
199 | - stackexchange/dba/dba.stackexchange.com.7z
200 | - stackexchange/devops/devops.stackexchange.com.7z
201 | - stackexchange/dsp/dsp.stackexchange.com.7z
202 | - stackexchange/raspberrypi/raspberrypi.stackexchange.com.7z
203 | - stackexchange/reverseengineering/reverseengineering.stackexchange.com.7z
204 | - stackexchange/scicomp/scicomp.stackexchange.com.7z
205 | - stackexchange/security/security.stackexchange.com.7z
206 | - stackexchange/serverfault/serverfault.com.7z
207 | - stackexchange/stackoverflow/stackoverflow.com-Posts.7z
208 | - stackexchange/stats/stats.stackexchange.com.7z
209 | - stackexchange/superuser/superuser.com.7z
210 | - stackexchange/unix/unix.stackexchange.com.7z
211 | - stackexchange/vi/vi.stackexchange.com.7z
212 | - stackexchange/wordpress/wordpress.stackexchange.com.7z
213 |
214 | 3.) Run the ETL process
215 |
216 | ```
217 | python -m codequestion.etl.stackexchange.execute stackexchange
218 | ```
219 |
220 | This will create the file stackexchange/questions.db
221 |
222 | 4.) __OPTIONAL:__ Build word vectors - only necessary if using a word vectors model. If using a word vectors model, make sure to run `pip install txtai[similarity]` first.
223 |
224 | ```
225 | python -m codequestion.vectors stackexchange/questions.db
226 | ```
227 |
228 | This will create the file ~/.codequestion/vectors/stackexchange-300d.magnitude
229 |
230 | 5.) Build embeddings index
231 |
232 | ```
233 | python -m codequestion.index index.yml stackexchange/questions.db
234 | ```
235 |
236 | The [default index.yml](https://raw.githubusercontent.com/neuml/codequestion/master/config/index.yml) file is found on GitHub. Settings can be changed to customize how the index is built.
237 |
238 | After this step, the index is created and all necessary files are ready to query.
239 |
240 | ## Model accuracy
241 | The following sections show test results for codequestion v2 and codequestion v1 using the latest Stack Exchange dumps. Version 2 uses a sentence-transformers model. Version 1 uses a word vectors model with BM25 weighting. BM25 and TF-IDF are shown to establish a baseline score.
242 |
243 | **StackExchange Query**
244 |
245 | Models are scored using [Mean Reciprocal Rank (MRR)](https://en.wikipedia.org/wiki/Mean_reciprocal_rank).
246 |
247 | | Model | MRR |
248 | | ------------------- | :---: |
249 | | all-MiniLM-L6-v2 | 85.0 |
250 | | SE 300d - BM25 | 77.1 |
251 | | BM25 | 67.7 |
252 | | TF-IDF | 61.7 |
253 |
254 | **STS Benchmark**
255 |
256 | Models are scored using [Pearson Correlation](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient). Note that the word vectors model is only trained on Stack Exchange data, so it isn't expected to generalize as well against the STS dataset.
257 |
258 | | Model | Supervision | Dev | Test |
259 | | ---------------- | :-----------: | :---: | :---: |
260 | | all-MiniLM-L6-v2 | Train | 87.0 | 82.7 |
261 | | SE 300d - BM25 | Train | 74.0 | 67.4 |
262 |
263 | ## Tests
264 | To reproduce the tests above, run the following commands. Replace $TEST_PATH with any local path.
265 |
266 | mkdir -p $TEST_PATH
267 | wget https://raw.githubusercontent.com/neuml/codequestion/master/test/stackexchange/query.txt -P $TEST_PATH/stackexchange
268 | wget http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz
269 | tar -C $TEST_PATH -xvzf Stsbenchmark.tar.gz
270 | python -m codequestion.evaluate -s test -p $TEST_PATH
271 |
272 | ## Further reading
273 |
274 | - [Find answers with codequestion 2.0](https://medium.com/neuml/find-answers-with-codequestion-2-0-50b2cfd8c8fe)
275 | - [Building a sentence embedding index with fastText and BM25 (codequestion 1.0)](https://towardsdatascience.com/building-a-sentence-embedding-index-with-fasttext-and-bm25-f07e7148d240)
276 |
--------------------------------------------------------------------------------
/images/architecture.excalidraw:
--------------------------------------------------------------------------------
1 | {
2 | "type": "excalidraw",
3 | "version": 2,
4 | "source": "https://excalidraw.com",
5 | "elements": [
6 | {
7 | "type": "text",
8 | "version": 780,
9 | "versionNonce": 881355380,
10 | "isDeleted": false,
11 | "id": "Buic2Lx427wuSIW8P_Rw5",
12 | "fillStyle": "hachure",
13 | "strokeWidth": 1,
14 | "strokeStyle": "solid",
15 | "roughness": 1,
16 | "opacity": 100,
17 | "angle": 0,
18 | "x": 736,
19 | "y": 179,
20 | "strokeColor": "#000000",
21 | "backgroundColor": "#228be6",
22 | "width": 658,
23 | "height": 46,
24 | "seed": 373648901,
25 | "groupIds": [],
26 | "roundness": null,
27 | "boundElements": [],
28 | "updated": 1674310362127,
29 | "link": null,
30 | "locked": false,
31 | "fontSize": 36,
32 | "fontFamily": 1,
33 | "text": "Semantic search for coding questions",
34 | "baseline": 32,
35 | "textAlign": "left",
36 | "verticalAlign": "top",
37 | "containerId": null,
38 | "originalText": "Semantic search for coding questions"
39 | },
40 | {
41 | "type": "rectangle",
42 | "version": 2366,
43 | "versionNonce": 106997452,
44 | "isDeleted": false,
45 | "id": "U2NgEIEiFpAlwmv5Xnyzr",
46 | "fillStyle": "hachure",
47 | "strokeWidth": 1,
48 | "strokeStyle": "solid",
49 | "roughness": 1,
50 | "opacity": 40,
51 | "angle": 0,
52 | "x": 536.6352719532887,
53 | "y": 425.4438888888891,
54 | "strokeColor": "#000000",
55 | "backgroundColor": "#000000",
56 | "width": 1018.3567583077031,
57 | "height": 410.1764927948917,
58 | "seed": 1946478225,
59 | "groupIds": [],
60 | "roundness": null,
61 | "boundElements": [],
62 | "updated": 1674310362127,
63 | "link": "",
64 | "locked": false
65 | },
66 | {
67 | "type": "rectangle",
68 | "version": 1576,
69 | "versionNonce": 175832052,
70 | "isDeleted": false,
71 | "id": "UO6MS3wSDu7yg2421__LI",
72 | "fillStyle": "hachure",
73 | "strokeWidth": 1,
74 | "strokeStyle": "solid",
75 | "roughness": 1,
76 | "opacity": 100,
77 | "angle": 0,
78 | "x": 934.1111111111111,
79 | "y": 267,
80 | "strokeColor": "#ffeb3b",
81 | "backgroundColor": "#ffeb3b",
82 | "width": 214,
83 | "height": 49,
84 | "seed": 1629565989,
85 | "groupIds": [
86 | "3sURMvhuRfR0M-Q3VRPbg"
87 | ],
88 | "roundness": null,
89 | "boundElements": [
90 | {
91 | "type": "text",
92 | "id": "8sp7H8ijWBlh6aMgZ0XTP"
93 | },
94 | {
95 | "id": "Qzp41i_jzQIBlAB_qFKFH",
96 | "type": "arrow"
97 | },
98 | {
99 | "id": "SJ0F0Y81z9hir5qQWAJjk",
100 | "type": "arrow"
101 | }
102 | ],
103 | "updated": 1674310362127,
104 | "link": null,
105 | "locked": false
106 | },
107 | {
108 | "type": "rectangle",
109 | "version": 2351,
110 | "versionNonce": 733605196,
111 | "isDeleted": false,
112 | "id": "qYd3q0Vjks7VOHUC9RR51",
113 | "fillStyle": "hachure",
114 | "strokeWidth": 1,
115 | "strokeStyle": "solid",
116 | "roughness": 1,
117 | "opacity": 100,
118 | "angle": 0,
119 | "x": 550.1111111111111,
120 | "y": 267.5,
121 | "strokeColor": "#03a9f4",
122 | "backgroundColor": "#03a9f4",
123 | "width": 219,
124 | "height": 52,
125 | "seed": 1441952427,
126 | "groupIds": [
127 | "3sURMvhuRfR0M-Q3VRPbg"
128 | ],
129 | "roundness": null,
130 | "boundElements": [
131 | {
132 | "type": "text",
133 | "id": "WPeWn6N4rCHf0jY16N9Ge"
134 | },
135 | {
136 | "id": "Qzp41i_jzQIBlAB_qFKFH",
137 | "type": "arrow"
138 | }
139 | ],
140 | "updated": 1674310362127,
141 | "link": null,
142 | "locked": false
143 | },
144 | {
145 | "type": "text",
146 | "version": 2088,
147 | "versionNonce": 1998544244,
148 | "isDeleted": false,
149 | "id": "WPeWn6N4rCHf0jY16N9Ge",
150 | "fillStyle": "hachure",
151 | "strokeWidth": 1,
152 | "strokeStyle": "solid",
153 | "roughness": 1,
154 | "opacity": 100,
155 | "angle": 0,
156 | "x": 629.6111111111111,
157 | "y": 274,
158 | "strokeColor": "#000",
159 | "backgroundColor": "#fa5252",
160 | "width": 60,
161 | "height": 39,
162 | "seed": 870516459,
163 | "groupIds": [
164 | "3sURMvhuRfR0M-Q3VRPbg"
165 | ],
166 | "roundness": null,
167 | "boundElements": [],
168 | "updated": 1674310362127,
169 | "link": null,
170 | "locked": false,
171 | "fontSize": 28,
172 | "fontFamily": 1,
173 | "text": "ETL",
174 | "baseline": 27,
175 | "textAlign": "center",
176 | "verticalAlign": "middle",
177 | "containerId": "qYd3q0Vjks7VOHUC9RR51",
178 | "originalText": "ETL"
179 | },
180 | {
181 | "type": "rectangle",
182 | "version": 1785,
183 | "versionNonce": 321283020,
184 | "isDeleted": false,
185 | "id": "5VuUdI_BsJ5pyE1nTqJUI",
186 | "fillStyle": "hachure",
187 | "strokeWidth": 1,
188 | "strokeStyle": "solid",
189 | "roughness": 1,
190 | "opacity": 100,
191 | "angle": 0,
192 | "x": 1333.111111111111,
193 | "y": 268,
194 | "strokeColor": "#00e676",
195 | "backgroundColor": "#00e676",
196 | "width": 218,
197 | "height": 49,
198 | "seed": 1044404613,
199 | "groupIds": [
200 | "3sURMvhuRfR0M-Q3VRPbg"
201 | ],
202 | "roundness": null,
203 | "boundElements": [
204 | {
205 | "id": "bJJ9SGsJsvT071qBBH0w5",
206 | "type": "text"
207 | },
208 | {
209 | "id": "SJ0F0Y81z9hir5qQWAJjk",
210 | "type": "arrow"
211 | }
212 | ],
213 | "updated": 1674310362127,
214 | "link": null,
215 | "locked": false
216 | },
217 | {
218 | "type": "text",
219 | "version": 1985,
220 | "versionNonce": 1673236212,
221 | "isDeleted": false,
222 | "id": "bJJ9SGsJsvT071qBBH0w5",
223 | "fillStyle": "hachure",
224 | "strokeWidth": 1,
225 | "strokeStyle": "solid",
226 | "roughness": 1,
227 | "opacity": 100,
228 | "angle": 0,
229 | "x": 1338.111111111111,
230 | "y": 274.5,
231 | "strokeColor": "#000",
232 | "backgroundColor": "#fa5252",
233 | "width": 208,
234 | "height": 36,
235 | "seed": 128953675,
236 | "groupIds": [
237 | "3sURMvhuRfR0M-Q3VRPbg"
238 | ],
239 | "roundness": null,
240 | "boundElements": [],
241 | "updated": 1674310362127,
242 | "link": null,
243 | "locked": false,
244 | "fontSize": 28,
245 | "fontFamily": 1,
246 | "text": "Search",
247 | "baseline": 25,
248 | "textAlign": "center",
249 | "verticalAlign": "middle",
250 | "containerId": "5VuUdI_BsJ5pyE1nTqJUI",
251 | "originalText": "Search"
252 | },
253 | {
254 | "type": "text",
255 | "version": 1602,
256 | "versionNonce": 1925188172,
257 | "isDeleted": false,
258 | "id": "8sp7H8ijWBlh6aMgZ0XTP",
259 | "fillStyle": "hachure",
260 | "strokeWidth": 1,
261 | "strokeStyle": "solid",
262 | "roughness": 1,
263 | "opacity": 100,
264 | "angle": 0,
265 | "x": 939.1111111111111,
266 | "y": 273.5,
267 | "strokeColor": "#000",
268 | "backgroundColor": "transparent",
269 | "width": 204,
270 | "height": 36,
271 | "seed": 1854823263,
272 | "groupIds": [
273 | "3sURMvhuRfR0M-Q3VRPbg"
274 | ],
275 | "roundness": null,
276 | "boundElements": [],
277 | "updated": 1674310362127,
278 | "link": null,
279 | "locked": false,
280 | "fontSize": 28,
281 | "fontFamily": 1,
282 | "text": "Index",
283 | "baseline": 25,
284 | "textAlign": "center",
285 | "verticalAlign": "middle",
286 | "containerId": "UO6MS3wSDu7yg2421__LI",
287 | "originalText": "Index"
288 | },
289 | {
290 | "type": "text",
291 | "version": 1134,
292 | "versionNonce": 550867060,
293 | "isDeleted": false,
294 | "id": "jWJpSXHkTCzRTCA4tbAgv",
295 | "fillStyle": "hachure",
296 | "strokeWidth": 1,
297 | "strokeStyle": "solid",
298 | "roughness": 1,
299 | "opacity": 100,
300 | "angle": 0,
301 | "x": 549.6111111111111,
302 | "y": 347.30499999999995,
303 | "strokeColor": "#000",
304 | "backgroundColor": "#03a9f4",
305 | "width": 270,
306 | "height": 42,
307 | "seed": 1241563487,
308 | "groupIds": [
309 | "3sURMvhuRfR0M-Q3VRPbg"
310 | ],
311 | "roundness": null,
312 | "boundElements": [],
313 | "updated": 1674310362127,
314 | "link": null,
315 | "locked": false,
316 | "fontSize": 16,
317 | "fontFamily": 1,
318 | "text": "- Parse and transform input\n- Filter down to \"popular\" answers",
319 | "baseline": 36,
320 | "textAlign": "left",
321 | "verticalAlign": "top",
322 | "containerId": null,
323 | "originalText": "- Parse and transform input\n- Filter down to \"popular\" answers"
324 | },
325 | {
326 | "type": "text",
327 | "version": 1121,
328 | "versionNonce": 761323724,
329 | "isDeleted": false,
330 | "id": "qEnmXs0P_MQE8r4c4OWGh",
331 | "fillStyle": "hachure",
332 | "strokeWidth": 1,
333 | "strokeStyle": "solid",
334 | "roughness": 1,
335 | "opacity": 100,
336 | "angle": 0,
337 | "x": 932.6111111111111,
338 | "y": 346.2074999999999,
339 | "strokeColor": "#000",
340 | "backgroundColor": "#f44336",
341 | "width": 245,
342 | "height": 42,
343 | "seed": 1038536465,
344 | "groupIds": [
345 | "3sURMvhuRfR0M-Q3VRPbg"
346 | ],
347 | "roundness": null,
348 | "boundElements": [],
349 | "updated": 1674310362127,
350 | "link": null,
351 | "locked": false,
352 | "fontSize": 16,
353 | "fontFamily": 1,
354 | "text": "- Transform input into numbers\n- Store content with vectors",
355 | "baseline": 36,
356 | "textAlign": "left",
357 | "verticalAlign": "top",
358 | "containerId": null,
359 | "originalText": "- Transform input into numbers\n- Store content with vectors"
360 | },
361 | {
362 | "type": "text",
363 | "version": 1185,
364 | "versionNonce": 2117296628,
365 | "isDeleted": false,
366 | "id": "1q8bzjK8lnKUZj8_A9v7D",
367 | "fillStyle": "hachure",
368 | "strokeWidth": 1,
369 | "strokeStyle": "solid",
370 | "roughness": 1,
371 | "opacity": 100,
372 | "angle": 0,
373 | "x": 1245.111111111111,
374 | "y": 349.2074999999999,
375 | "strokeColor": "#000",
376 | "backgroundColor": "#f44336",
377 | "width": 322,
378 | "height": 42,
379 | "seed": 304472945,
380 | "groupIds": [
381 | "3sURMvhuRfR0M-Q3VRPbg"
382 | ],
383 | "roundness": null,
384 | "boundElements": [],
385 | "updated": 1674310362127,
386 | "link": null,
387 | "locked": false,
388 | "fontSize": 16,
389 | "fontFamily": 1,
390 | "text": "- Find similar content with vector search\n- Explore topics and relationships",
391 | "baseline": 36,
392 | "textAlign": "left",
393 | "verticalAlign": "top",
394 | "containerId": null,
395 | "originalText": "- Find similar content with vector search\n- Explore topics and relationships"
396 | },
397 | {
398 | "type": "arrow",
399 | "version": 3387,
400 | "versionNonce": 983754572,
401 | "isDeleted": false,
402 | "id": "Qzp41i_jzQIBlAB_qFKFH",
403 | "fillStyle": "hachure",
404 | "strokeWidth": 1,
405 | "strokeStyle": "solid",
406 | "roughness": 1,
407 | "opacity": 100,
408 | "angle": 0,
409 | "x": 771.6111111111111,
410 | "y": 289.8470411964629,
411 | "strokeColor": "#000",
412 | "backgroundColor": "#f44336",
413 | "width": 158.1310513485223,
414 | "height": 0.5692601572380909,
415 | "seed": 660786897,
416 | "groupIds": [
417 | "3sURMvhuRfR0M-Q3VRPbg"
418 | ],
419 | "roundness": {
420 | "type": 2
421 | },
422 | "boundElements": [],
423 | "updated": 1674310362127,
424 | "link": null,
425 | "locked": false,
426 | "startBinding": {
427 | "elementId": "qYd3q0Vjks7VOHUC9RR51",
428 | "focus": -0.15367587596362536,
429 | "gap": 2.5
430 | },
431 | "endBinding": {
432 | "elementId": "UO6MS3wSDu7yg2421__LI",
433 | "focus": 0.027437144815141,
434 | "gap": 4.3689486514776945
435 | },
436 | "lastCommittedPoint": null,
437 | "startArrowhead": null,
438 | "endArrowhead": "arrow",
439 | "points": [
440 | [
441 | 0,
442 | 0
443 | ],
444 | [
445 | 158.1310513485223,
446 | 0.5692601572380909
447 | ]
448 | ]
449 | },
450 | {
451 | "type": "arrow",
452 | "version": 3907,
453 | "versionNonce": 1658520436,
454 | "isDeleted": false,
455 | "id": "SJ0F0Y81z9hir5qQWAJjk",
456 | "fillStyle": "hachure",
457 | "strokeWidth": 1,
458 | "strokeStyle": "solid",
459 | "roughness": 1,
460 | "opacity": 100,
461 | "angle": 0,
462 | "x": 1150.611111111111,
463 | "y": 292.6790761701911,
464 | "strokeColor": "#000",
465 | "backgroundColor": "#f44336",
466 | "width": 181.5,
467 | "height": 1.5898915058209013,
468 | "seed": 899541905,
469 | "groupIds": [
470 | "3sURMvhuRfR0M-Q3VRPbg"
471 | ],
472 | "roundness": {
473 | "type": 2
474 | },
475 | "boundElements": [],
476 | "updated": 1674310362127,
477 | "link": null,
478 | "locked": false,
479 | "startBinding": {
480 | "elementId": "UO6MS3wSDu7yg2421__LI",
481 | "focus": 0.08406032225724415,
482 | "gap": 2.5
483 | },
484 | "endBinding": {
485 | "elementId": "5VuUdI_BsJ5pyE1nTqJUI",
486 | "focus": 0.09327847520504394,
487 | "gap": 1
488 | },
489 | "lastCommittedPoint": null,
490 | "startArrowhead": null,
491 | "endArrowhead": "arrow",
492 | "points": [
493 | [
494 | 0,
495 | 0
496 | ],
497 | [
498 | 181.5,
499 | -1.5898915058209013
500 | ]
501 | ]
502 | },
503 | {
504 | "type": "text",
505 | "version": 121,
506 | "versionNonce": 1369352592,
507 | "isDeleted": false,
508 | "id": "0S4gs8k1Aw_EE3epHrlwi",
509 | "fillStyle": "hachure",
510 | "strokeWidth": 1,
511 | "strokeStyle": "solid",
512 | "roughness": 1,
513 | "opacity": 100,
514 | "angle": 0,
515 | "x": 558.9603174603171,
516 | "y": 440.1485317460317,
517 | "strokeColor": "#000000",
518 | "backgroundColor": "transparent",
519 | "width": 520,
520 | "height": 52,
521 | "seed": 1521084272,
522 | "groupIds": [],
523 | "roundness": null,
524 | "boundElements": [],
525 | "updated": 1665016874534,
526 | "link": null,
527 | "locked": false,
528 | "fontSize": 20,
529 | "fontFamily": 1,
530 | "text": ">>> python build pdf\n---------------------------------------------------------------",
531 | "baseline": 44,
532 | "textAlign": "left",
533 | "verticalAlign": "top",
534 | "containerId": null,
535 | "originalText": ">>> python build pdf\n---------------------------------------------------------------"
536 | },
537 | {
538 | "type": "text",
539 | "version": 948,
540 | "versionNonce": 249568716,
541 | "isDeleted": false,
542 | "id": "hnqGO83Op144jMURaGlCf",
543 | "fillStyle": "hachure",
544 | "strokeWidth": 1,
545 | "strokeStyle": "solid",
546 | "roughness": 1,
547 | "opacity": 100,
548 | "angle": 0,
549 | "x": 567.8492063492062,
550 | "y": 489.03742063492075,
551 | "strokeColor": "#000000",
552 | "backgroundColor": "transparent",
553 | "width": 397,
554 | "height": 156,
555 | "seed": 1108820368,
556 | "groupIds": [],
557 | "roundness": null,
558 | "boundElements": [
559 | {
560 | "id": "f3vLDOpOTtgvPlvxSLtb6",
561 | "type": "arrow"
562 | }
563 | ],
564 | "updated": 1674310362127,
565 | "link": null,
566 | "locked": false,
567 | "fontSize": 20,
568 | "fontFamily": 1,
569 | "text": "\n\nId: 219570\nLast Activity: 2016-11-22T09:07:49.983\nTags: python pdf pdf-generation\nAnswer (by 772200):",
570 | "baseline": 148,
571 | "textAlign": "left",
572 | "verticalAlign": "top",
573 | "containerId": null,
574 | "originalText": "\n\nId: 219570\nLast Activity: 2016-11-22T09:07:49.983\nTags: python pdf pdf-generation\nAnswer (by 772200):"
575 | },
576 | {
577 | "type": "text",
578 | "version": 173,
579 | "versionNonce": 1895751536,
580 | "isDeleted": false,
581 | "id": "AKQSU-yPRu9JKbNgVRi9v",
582 | "fillStyle": "hachure",
583 | "strokeWidth": 1,
584 | "strokeStyle": "solid",
585 | "roughness": 1,
586 | "opacity": 100,
587 | "angle": 0,
588 | "x": 566.7380952380952,
589 | "y": 630.1485317460314,
590 | "strokeColor": "#000000",
591 | "backgroundColor": "transparent",
592 | "width": 322,
593 | "height": 130,
594 | "seed": 1679216,
595 | "groupIds": [],
596 | "roundness": null,
597 | "boundElements": [],
598 | "updated": 1665016981495,
599 | "link": null,
600 | "locked": false,
601 | "fontSize": 20.069228106611277,
602 | "fontFamily": 1,
603 | "text": "\nThe two that come to mind are:\n\n • pyPdf2\n • PDFMiner",
604 | "baseline": 122,
605 | "textAlign": "left",
606 | "verticalAlign": "top",
607 | "containerId": null,
608 | "originalText": "\nThe two that come to mind are:\n\n • pyPdf2\n • PDFMiner"
609 | },
610 | {
611 | "type": "text",
612 | "version": 85,
613 | "versionNonce": 1520293264,
614 | "isDeleted": false,
615 | "id": "oWhhEqzBly2k2HiDjxdzq",
616 | "fillStyle": "hachure",
617 | "strokeWidth": 1,
618 | "strokeStyle": "solid",
619 | "roughness": 1,
620 | "opacity": 100,
621 | "angle": 0,
622 | "x": 568.9603174603171,
623 | "y": 779.0374206349206,
624 | "strokeColor": "#03a9f4",
625 | "backgroundColor": "transparent",
626 | "width": 550,
627 | "height": 26,
628 | "seed": 1958330768,
629 | "groupIds": [],
630 | "roundness": null,
631 | "boundElements": [],
632 | "updated": 1665016908431,
633 | "link": null,
634 | "locked": false,
635 | "fontSize": 20,
636 | "fontFamily": 1,
637 | "text": "Reference: https://stackoverflow.com/questions/6413441",
638 | "baseline": 18,
639 | "textAlign": "left",
640 | "verticalAlign": "top",
641 | "containerId": null,
642 | "originalText": "Reference: https://stackoverflow.com/questions/6413441"
643 | },
644 | {
645 | "type": "text",
646 | "version": 584,
647 | "versionNonce": 1695708404,
648 | "isDeleted": false,
649 | "id": "E-1wDKL8ZmPnn0ONPMDti",
650 | "fillStyle": "hachure",
651 | "strokeWidth": 1,
652 | "strokeStyle": "solid",
653 | "roughness": 1,
654 | "opacity": 100,
655 | "angle": 0,
656 | "x": 566.738095238095,
657 | "y": 502.3707539682539,
658 | "strokeColor": "#000000",
659 | "backgroundColor": "transparent",
660 | "width": 517,
661 | "height": 26,
662 | "seed": 2028071280,
663 | "groupIds": [],
664 | "roundness": null,
665 | "boundElements": [
666 | {
667 | "id": "Nb5_4C9PyVmUjcNmwmtTf",
668 | "type": "arrow"
669 | }
670 | ],
671 | "updated": 1674310362127,
672 | "link": null,
673 | "locked": false,
674 | "fontSize": 20,
675 | "fontFamily": 1,
676 | "text": "Question (by 312251): Python PDF library [0.801761]",
677 | "baseline": 18,
678 | "textAlign": "left",
679 | "verticalAlign": "top",
680 | "containerId": null,
681 | "originalText": "Question (by 312251): Python PDF library [0.801761]"
682 | },
683 | {
684 | "type": "arrow",
685 | "version": 502,
686 | "versionNonce": 978951056,
687 | "isDeleted": false,
688 | "id": "Nb5_4C9PyVmUjcNmwmtTf",
689 | "fillStyle": "hachure",
690 | "strokeWidth": 1,
691 | "strokeStyle": "solid",
692 | "roughness": 1,
693 | "opacity": 100,
694 | "angle": 0,
695 | "x": 1307.8492063492063,
696 | "y": 480.8848231593677,
697 | "strokeColor": "#000000",
698 | "backgroundColor": "#000000",
699 | "width": 217.77777777777783,
700 | "height": 31.597041919997253,
701 | "seed": 1504292752,
702 | "groupIds": [],
703 | "roundness": {
704 | "type": 2
705 | },
706 | "boundElements": [],
707 | "updated": 1665017411344,
708 | "link": null,
709 | "locked": false,
710 | "startBinding": {
711 | "elementId": "bxrIQaIPIjEN65QVVyKjd",
712 | "focus": 0.6279335165098477,
713 | "gap": 3.8333333333332575
714 | },
715 | "endBinding": {
716 | "elementId": "E-1wDKL8ZmPnn0ONPMDti",
717 | "focus": 0.3581099106309167,
718 | "gap": 6.333333333333485
719 | },
720 | "lastCommittedPoint": null,
721 | "startArrowhead": null,
722 | "endArrowhead": "arrow",
723 | "points": [
724 | [
725 | 0,
726 | 0
727 | ],
728 | [
729 | -65.55555555555566,
730 | 24.930375253330624
731 | ],
732 | [
733 | -217.77777777777783,
734 | 31.597041919997253
735 | ]
736 | ]
737 | },
738 | {
739 | "type": "rectangle",
740 | "version": 2726,
741 | "versionNonce": 53642316,
742 | "isDeleted": false,
743 | "id": "bxrIQaIPIjEN65QVVyKjd",
744 | "fillStyle": "hachure",
745 | "strokeWidth": 1,
746 | "strokeStyle": "solid",
747 | "roughness": 1,
748 | "opacity": 100,
749 | "angle": 0,
750 | "x": 1311.6825396825395,
751 | "y": 454.2596428571427,
752 | "strokeColor": "#5f3dc4",
753 | "backgroundColor": "#5f3dc4",
754 | "width": 219,
755 | "height": 52,
756 | "seed": 1176432016,
757 | "groupIds": [
758 | "XHWKg8UL3ErDF5KJBC0TT"
759 | ],
760 | "roundness": null,
761 | "boundElements": [
762 | {
763 | "id": "Y6VADemG1rq3Yf1_X3Rkw",
764 | "type": "text"
765 | },
766 | {
767 | "id": "Qzp41i_jzQIBlAB_qFKFH",
768 | "type": "arrow"
769 | },
770 | {
771 | "id": "Nb5_4C9PyVmUjcNmwmtTf",
772 | "type": "arrow"
773 | }
774 | ],
775 | "updated": 1674310362127,
776 | "link": null,
777 | "locked": false
778 | },
779 | {
780 | "type": "text",
781 | "version": 1543,
782 | "versionNonce": 1775569264,
783 | "isDeleted": false,
784 | "id": "Y6VADemG1rq3Yf1_X3Rkw",
785 | "fillStyle": "hachure",
786 | "strokeWidth": 1,
787 | "strokeStyle": "solid",
788 | "roughness": 1,
789 | "opacity": 100,
790 | "angle": 0,
791 | "x": 1329.6825396825395,
792 | "y": 460.7596428571427,
793 | "strokeColor": "#000000",
794 | "backgroundColor": "#fa5252",
795 | "width": 183,
796 | "height": 39,
797 | "seed": 2097855376,
798 | "groupIds": [
799 | "XHWKg8UL3ErDF5KJBC0TT"
800 | ],
801 | "roundness": null,
802 | "boundElements": [],
803 | "updated": 1665017413674,
804 | "link": null,
805 | "locked": false,
806 | "fontSize": 28,
807 | "fontFamily": 1,
808 | "text": "Vector match",
809 | "baseline": 27,
810 | "textAlign": "center",
811 | "verticalAlign": "middle",
812 | "containerId": "bxrIQaIPIjEN65QVVyKjd",
813 | "originalText": "Vector match"
814 | },
815 | {
816 | "type": "rectangle",
817 | "version": 1089,
818 | "versionNonce": 691460496,
819 | "isDeleted": false,
820 | "id": "G-qwb8bQ8dozMQZmcq1Gl",
821 | "fillStyle": "hachure",
822 | "strokeWidth": 1,
823 | "strokeStyle": "solid",
824 | "roughness": 1,
825 | "opacity": 100,
826 | "angle": 0,
827 | "x": 1319.738095238095,
828 | "y": 685.7596428571427,
829 | "strokeColor": "#03a9f4",
830 | "backgroundColor": "#03a9f4",
831 | "width": 214,
832 | "height": 49,
833 | "seed": 1905111440,
834 | "groupIds": [
835 | "wf5G07CYJirX_YOSAFeDv"
836 | ],
837 | "roundness": null,
838 | "boundElements": [
839 | {
840 | "id": "T7Dd9_CUf4IoQLKnCySvb",
841 | "type": "text"
842 | },
843 | {
844 | "id": "Qzp41i_jzQIBlAB_qFKFH",
845 | "type": "arrow"
846 | },
847 | {
848 | "id": "SJ0F0Y81z9hir5qQWAJjk",
849 | "type": "arrow"
850 | },
851 | {
852 | "id": "6CxseRtxEY_xN1wGA8ahy",
853 | "type": "arrow"
854 | }
855 | ],
856 | "updated": 1665017719529,
857 | "link": null,
858 | "locked": false
859 | },
860 | {
861 | "type": "text",
862 | "version": 1104,
863 | "versionNonce": 1279521680,
864 | "isDeleted": false,
865 | "id": "T7Dd9_CUf4IoQLKnCySvb",
866 | "fillStyle": "hachure",
867 | "strokeWidth": 1,
868 | "strokeStyle": "solid",
869 | "roughness": 1,
870 | "opacity": 100,
871 | "angle": 0,
872 | "x": 1380.238095238095,
873 | "y": 690.7596428571427,
874 | "strokeColor": "#000000",
875 | "backgroundColor": "#03a9f4",
876 | "width": 93,
877 | "height": 39,
878 | "seed": 2089545584,
879 | "groupIds": [
880 | "wf5G07CYJirX_YOSAFeDv"
881 | ],
882 | "roundness": null,
883 | "boundElements": [],
884 | "updated": 1665017732426,
885 | "link": null,
886 | "locked": false,
887 | "fontSize": 28,
888 | "fontFamily": 1,
889 | "text": "Answer",
890 | "baseline": 27,
891 | "textAlign": "center",
892 | "verticalAlign": "middle",
893 | "containerId": "G-qwb8bQ8dozMQZmcq1Gl",
894 | "originalText": "Answer"
895 | },
896 | {
897 | "type": "arrow",
898 | "version": 616,
899 | "versionNonce": 1298745200,
900 | "isDeleted": false,
901 | "id": "6CxseRtxEY_xN1wGA8ahy",
902 | "fillStyle": "hachure",
903 | "strokeWidth": 1,
904 | "strokeStyle": "solid",
905 | "roughness": 1,
906 | "opacity": 100,
907 | "angle": 0,
908 | "x": 1303.9603174603174,
909 | "y": 708.4075937236478,
910 | "strokeColor": "#000000",
911 | "backgroundColor": "#000000",
912 | "width": 377.7777777777778,
913 | "height": 6.2911737786592425,
914 | "seed": 1277086096,
915 | "groupIds": [],
916 | "roundness": {
917 | "type": 2
918 | },
919 | "boundElements": [],
920 | "updated": 1665017719530,
921 | "link": null,
922 | "locked": false,
923 | "startBinding": {
924 | "elementId": "G-qwb8bQ8dozMQZmcq1Gl",
925 | "focus": 0.17330116418533134,
926 | "gap": 15.777777777777601
927 | },
928 | "endBinding": null,
929 | "lastCommittedPoint": null,
930 | "startArrowhead": null,
931 | "endArrowhead": "arrow",
932 | "points": [
933 | [
934 | 0,
935 | 0
936 | ],
937 | [
938 | -225.55555555555566,
939 | 5.1800626675482135
940 | ],
941 | [
942 | -377.7777777777778,
943 | 6.2911737786592425
944 | ]
945 | ]
946 | },
947 | {
948 | "type": "rectangle",
949 | "version": 1744,
950 | "versionNonce": 394840052,
951 | "isDeleted": false,
952 | "id": "mskx8L2KXgKOLHKahrjQI",
953 | "fillStyle": "hachure",
954 | "strokeWidth": 1,
955 | "strokeStyle": "solid",
956 | "roughness": 1,
957 | "opacity": 100,
958 | "angle": 0,
959 | "x": 1315.5158730158726,
960 | "y": 571.3151984126984,
961 | "strokeColor": "#fa5252",
962 | "backgroundColor": "#ff7043",
963 | "width": 218,
964 | "height": 49,
965 | "seed": 1444977008,
966 | "groupIds": [
967 | "Ow3OuCkl-1gnPf96uZhoJ"
968 | ],
969 | "roundness": null,
970 | "boundElements": [
971 | {
972 | "id": "dADm_k9Od8a9ANLvFiLsB",
973 | "type": "text"
974 | },
975 | {
976 | "id": "SJ0F0Y81z9hir5qQWAJjk",
977 | "type": "arrow"
978 | },
979 | {
980 | "id": "f3vLDOpOTtgvPlvxSLtb6",
981 | "type": "arrow"
982 | }
983 | ],
984 | "updated": 1674310379960,
985 | "link": null,
986 | "locked": false
987 | },
988 | {
989 | "type": "text",
990 | "version": 1496,
991 | "versionNonce": 406774092,
992 | "isDeleted": false,
993 | "id": "dADm_k9Od8a9ANLvFiLsB",
994 | "fillStyle": "hachure",
995 | "strokeWidth": 1,
996 | "strokeStyle": "solid",
997 | "roughness": 1,
998 | "opacity": 100,
999 | "angle": 0,
1000 | "x": 1353.5158730158726,
1001 | "y": 576.3151984126984,
1002 | "strokeColor": "#000000",
1003 | "backgroundColor": "#ff7043",
1004 | "width": 142,
1005 | "height": 39,
1006 | "seed": 464251760,
1007 | "groupIds": [
1008 | "Ow3OuCkl-1gnPf96uZhoJ"
1009 | ],
1010 | "roundness": null,
1011 | "boundElements": [],
1012 | "updated": 1674310379960,
1013 | "link": null,
1014 | "locked": false,
1015 | "fontSize": 28,
1016 | "fontFamily": 1,
1017 | "text": "Metadata",
1018 | "baseline": 27,
1019 | "textAlign": "center",
1020 | "verticalAlign": "middle",
1021 | "containerId": "mskx8L2KXgKOLHKahrjQI",
1022 | "originalText": "Metadata"
1023 | },
1024 | {
1025 | "type": "arrow",
1026 | "version": 583,
1027 | "versionNonce": 129907088,
1028 | "isDeleted": false,
1029 | "id": "f3vLDOpOTtgvPlvxSLtb6",
1030 | "fillStyle": "hachure",
1031 | "strokeWidth": 1,
1032 | "strokeStyle": "solid",
1033 | "roughness": 1,
1034 | "opacity": 100,
1035 | "angle": 0,
1036 | "x": 1305.0714285714284,
1037 | "y": 592.5642974832429,
1038 | "strokeColor": "#000000",
1039 | "backgroundColor": "#000000",
1040 | "width": 321.1111111111113,
1041 | "height": 15.467803352397482,
1042 | "seed": 449970544,
1043 | "groupIds": [],
1044 | "roundness": {
1045 | "type": 2
1046 | },
1047 | "boundElements": [],
1048 | "updated": 1665017397537,
1049 | "link": null,
1050 | "locked": false,
1051 | "startBinding": {
1052 | "elementId": "mskx8L2KXgKOLHKahrjQI",
1053 | "focus": 0.2891539308183631,
1054 | "gap": 10.444444444444116
1055 | },
1056 | "endBinding": {
1057 | "elementId": "hnqGO83Op144jMURaGlCf",
1058 | "focus": 0.5894787674956782,
1059 | "gap": 19.111111111110972
1060 | },
1061 | "lastCommittedPoint": null,
1062 | "startArrowhead": null,
1063 | "endArrowhead": "arrow",
1064 | "points": [
1065 | [
1066 | 0,
1067 | 0
1068 | ],
1069 | [
1070 | -74.44444444444457,
1071 | 3.2455811301751964
1072 | ],
1073 | [
1074 | -321.1111111111113,
1075 | 15.467803352397482
1076 | ]
1077 | ]
1078 | }
1079 | ],
1080 | "appState": {
1081 | "gridSize": null,
1082 | "viewBackgroundColor": "#fff"
1083 | },
1084 | "files": {}
1085 | }
--------------------------------------------------------------------------------