├── .gitignore ├── BaseXClient.py ├── LICENSE ├── README.md ├── __init__.py ├── bibtidy.py ├── init.sh └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | bibtidy.egg-info 3 | __pycache__ 4 | .vscode -------------------------------------------------------------------------------- /BaseXClient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Python 2.7.3 and 3.x client for BaseX. 4 | Works with BaseX 7.0 and later 5 | 6 | Requires Python 3.x or Python 2.x having some backports like bytearray. 7 | (I've tested Python 3.2.3, and Python 2.7.3 on Fedora 16 linux x86_64.) 8 | 9 | LIMITATIONS: 10 | 11 | * binary content would corrupt, maybe. (I didn't test it) 12 | * also, will fail to extract stored binary content, maybe. 13 | (both my code, and original don't care escaped 0xff.) 14 | 15 | Documentation: http://docs.basex.org/wiki/Clients 16 | 17 | (C) 2012, Hiroaki Itoh. 
class SocketWrapper(object):
    """Buffered, encoding-aware wrapper around a connected socket object.

    Incoming bytes are read in chunks into an internal 4 KiB buffer and are
    handed out either one byte at a time or as terminator-delimited strings.
    Attributes that are not defined here are looked up on the wrapped socket,
    so ``connect()``, ``close()`` etc. can be called on the wrapper directly.
    """

    def __init__(self, sock,
                 receive_bytes_encoding='utf-8',
                 send_bytes_encoding='utf-8'):
        """Wrap *sock*.

        :param sock: object providing ``recv_into`` and ``sendall``.
        :param receive_bytes_encoding: codec used to decode received strings.
        :param send_bytes_encoding: codec used to encode outgoing strings.
        """
        self.receive_bytes_encoding = receive_bytes_encoding
        self.send_bytes_encoding = send_bytes_encoding

        self.terminator = bytearray(chr(0), self.receive_bytes_encoding)
        self.__s = sock
        # Fixed-size receive buffer; __bpos is the read cursor and __bsize the
        # number of valid bytes currently stored in it.
        self.__buf = bytearray(0x1000)
        self.__bpos = 0
        self.__bsize = 0

    def clear_buffer(self):
        """Discard buffered bytes before the next ``recv_until_terminator()``
        or ``recv_single_byte()`` call."""
        self.__bpos = 0
        self.__bsize = 0

    def __fill_buffer(self):
        """Fetch the next chunk from the socket if the buffer is exhausted.

        :raises IOError: if the peer closed the connection (``recv_into``
            reported zero bytes); without this check readers would loop
            forever or return stale buffer contents.
        """
        if self.__bpos >= self.__bsize:
            received = self.__s.recv_into(self.__buf)
            self.__bpos = 0
            self.__bsize = max(received, 0)
            if received <= 0:
                raise IOError('Connection closed by the server.')

    def recv_single_byte(self):
        """Return the next received byte as an ``int``."""
        self.__fill_buffer()
        result_byte = self.__buf[self.__bpos]
        self.__bpos += 1
        return result_byte

    def recv_until_terminator(self):
        """Return the next terminator-delimited string, decoded with
        ``receive_bytes_encoding`` (the terminator itself is consumed but not
        returned).

        :raises IOError: if the connection ends before a terminator arrives.
        """
        result_bytes = bytearray()
        while True:
            self.__fill_buffer()
            pos = self.__buf.find(self.terminator, self.__bpos, self.__bsize)
            if pos >= 0:
                result_bytes.extend(self.__buf[self.__bpos:pos])
                self.__bpos = pos + len(self.terminator)
                break
            # No terminator in this chunk: keep everything and read more.
            result_bytes.extend(self.__buf[self.__bpos:self.__bsize])
            self.__bpos = self.__bsize
        return result_bytes.decode(self.receive_bytes_encoding)

    def sendall(self, data):
        """Send *data* through the wrapped socket.

        ``bytes``/``bytearray`` are passed through unchanged; anything else is
        treated as text and encoded with ``send_bytes_encoding`` first.
        """
        if isinstance(data, (bytearray, bytes)):
            return self.__s.sendall(data)
        return self.__s.sendall(bytearray(data, self.send_bytes_encoding))

    def __getattr__(self, name):
        """Delegate unknown attributes to the wrapped socket."""
        # __getattr__ only runs when normal lookup failed.  If one of our own
        # private attributes is missing (object not initialised yet) we must
        # not touch self.__s again, or the lookup would recurse endlessly.
        if name.startswith('_SocketWrapper__'):
            raise AttributeError(name)
        return getattr(self.__s, name)


# ---------------------------------
#
class Session(object):
    """A client session talking to a BaseX server.

    see http://docs.basex.org/wiki/Server_Protocol
    """

    def __init__(self, host, port, user, password,
                 receive_bytes_encoding='utf-8',
                 send_bytes_encoding='utf-8'):
        """Connect to ``host:port`` and authenticate as *user*.

        :raises IOError: with message ``'Access Denied.'`` if the server
            rejects the credentials, or if the connection drops during the
            handshake.  The socket is closed before the error propagates.
        """
        self.__info = None

        # create server connection
        self.__swrapper = SocketWrapper(
            socket.socket(socket.AF_INET, socket.SOCK_STREAM),
            receive_bytes_encoding=receive_bytes_encoding,
            send_bytes_encoding=send_bytes_encoding)

        try:
            self.__swrapper.connect((host, port))

            # The greeting is either "<first>:<nonce>" or a single token.
            response = self.recv_c_str().split(':')
            if len(response) > 1:
                code = "%s:%s:%s" % (user, response[0], password)
                nonce = response[1]
            else:
                code = password
                nonce = response[0]

            # reply = md5( md5(code) + nonce ), sent together with the user.
            hfun = hashlib.md5()
            hfun.update(hashlib.md5(code.encode('us-ascii')).hexdigest().encode('us-ascii'))
            hfun.update(nonce.encode('us-ascii'))
            self.send(user + chr(0) + hfun.hexdigest())

            # evaluate success flag
            if not self.server_response_success():
                raise IOError('Access Denied.')
        except Exception:
            # Do not leak the socket when connecting or logging in fails.
            self.__swrapper.close()
            raise

    def __enter__(self):
        """Allow ``with Session(...) as session:`` usage."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the session when leaving a ``with`` block."""
        self.close()
        return False

    def execute(self, com):
        """Execute the command *com* and return its result string.

        :raises IOError: carrying the server's info text if the command fails.
        """
        # send command to server
        self.send(com)

        # receive result
        result = self.receive()
        self.__info = self.recv_c_str()
        if not self.server_response_success():
            raise IOError(self.__info)
        return result

    def query(self, querytxt):
        """Create a new :class:`Query` (having id returned from server)."""
        return Query(self, querytxt)

    def create(self, name, content):
        """Create a new database with the specified input (may be empty)."""
        self.__send_input(8, name, content)

    def add(self, path, content):
        """Add a new resource to the opened database."""
        self.__send_input(9, path, content)

    def replace(self, path, content):
        """Replace a resource with the specified input."""
        self.__send_input(12, path, content)

    def store(self, path, content):
        """Store a binary resource in the opened database.

        The API does not escape 0x00 / 0xff automatically, so the caller must
        do that explicitly.  *content* must be ``bytes`` or ``bytearray``.
        """
        # chr(13) + path + chr(0) + content + chr(0)
        self.__send_binary_input(13, path, content)

    def info(self):
        """Return the info string of the last executed command."""
        return self.__info

    def close(self):
        """Send ``exit`` and close the socket (even if sending fails)."""
        try:
            self.send('exit')
        finally:
            self.__swrapper.close()

    def recv_c_str(self):
        """Read one NUL-terminated string from the socket."""
        return self.__swrapper.recv_until_terminator()

    def send(self, value):
        """Send the string *value* followed by a NUL terminator."""
        self.__swrapper.sendall(value + chr(0))

    def __send_input(self, code, arg, content):
        """Send a ``code + arg + NUL + content + NUL`` text request and check
        the reply; raises IOError with the server info on failure."""
        self.__swrapper.sendall(chr(code) + arg + chr(0) + content + chr(0))
        self.__info = self.recv_c_str()
        if not self.server_response_success():
            raise IOError(self.info())

    def __send_binary_input(self, code, path, content):
        """Binary counterpart of ``__send_input``.

        The request is assembled as raw bytes because *content* must not pass
        through text encoding.

        :raises ValueError: if *content* is not ``bytes``/``bytearray``.
        :raises IOError: with the server info if the server reports failure.
        """
        if not isinstance(content, (bytearray, bytes)):
            raise ValueError("Sorry, content must be bytearray or bytes, not " +
                             str(type(content)))

        # chr(code) + path + chr(0) + content + chr(0)
        data = bytearray([code])
        try:
            data.extend(path)
        except TypeError:
            # Text paths cannot be appended to a bytearray directly.
            data.extend(path.encode('utf-8'))
        data.extend([0])
        data.extend(content)
        data.extend([0])

        self.__swrapper.sendall(data)
        self.__info = self.recv_c_str()
        if not self.server_response_success():
            raise IOError(self.info())

    def server_response_success(self):
        """Read the status byte; ``True`` when it is 0 (success)."""
        return self.__swrapper.recv_single_byte() == 0

    def receive(self):
        """Reset the receive buffer and return the next received string."""
        self.__swrapper.clear_buffer()
        return self.recv_c_str()

    def iter_receive(self):
        """iter_receive() -> (typecode, item)

        Yield ``(typecode, string)`` pairs while the query returns items; a
        typecode of 0 ends the sequence.
        typecode list is in http://docs.basex.org/wiki/Server_Protocol:_Types

        :raises IOError: with the server's message if the final status byte
            reports a failure.
        """
        self.__swrapper.clear_buffer()
        typecode = self.__swrapper.recv_single_byte()
        while typecode > 0:
            string = self.recv_c_str()
            yield (typecode, string)
            typecode = self.__swrapper.recv_single_byte()
        if not self.server_response_success():
            raise IOError(self.recv_c_str())


# ---------------------------------
#
class Query(object):
    """A server-side query bound to a :class:`Session`.

    see http://docs.basex.org/wiki/Server_Protocol
    """

    def __init__(self, session, querytxt):
        """Register *querytxt* on the server and remember the returned id."""
        self.__session = session
        self.__id = self.__exc(chr(0), querytxt)

    def bind(self, name, value, datatype=''):
        """Bind *value* to the variable *name*.
        An empty string can be specified as data type."""
        self.__exc(chr(3), self.__id + chr(0) + name + chr(0) + value + chr(0) + datatype)

    def context(self, value, datatype=''):
        """Bind the context item."""
        self.__exc(chr(14), self.__id + chr(0) + value + chr(0) + datatype)

    def iter(self):
        """Return an iterator of ``(typecode, item)`` pairs for the results."""
        self.__session.send(chr(4) + self.__id)
        return self.__session.iter_receive()

    def execute(self):
        """Execute the query and return the result string."""
        return self.__exc(chr(5), self.__id)

    def info(self):
        """Return query information."""
        return self.__exc(chr(6), self.__id)

    def options(self):
        """Return serialization parameters."""
        return self.__exc(chr(7), self.__id)

    def updating(self):
        """Return the server's answer to whether the query may perform
        updates (the raw response string is returned unchanged)."""
        return self.__exc(chr(30), self.__id)

    def full(self):
        """Return all resulting items as strings, prefixed by XDM Meta Data."""
        return self.__exc(chr(31), self.__id)

    def close(self):
        """Close the query on the server."""
        self.__exc(chr(2), self.__id)

    def __exc(self, cmd, arg):
        """Send ``cmd + arg``, return the reply string, and raise IOError with
        the server's message if the status byte signals failure."""
        self.__session.send(cmd + arg)
        result = self.__session.receive()
        if not self.__session.server_response_success():
            raise IOError(self.__session.recv_c_str())
        return result
*Changes: We changed the online retrieving mode into an offline mode because of query rate limits.* 4 | 5 | ## Welcome Stars 6 | If you like this tool, don't forget to star it! Your support is my motivation to keep updating this tool. 7 | 8 | ## Features 9 | Checking: Automatically fix incorrect entries and fill in incomplete ones with reference to DBLP. We adopt offline databases to retrieve BibTeX entries, because all online APIs have rate limitations. 10 | 11 | ## Requirements 12 | Python 3.10.0 13 | Ubuntu 22.04 14 | (Other versions are untested; test reports are welcome.) 15 | 16 | ## Installation 17 | Download this repo. Then, we need to install and configure the BaseX database as the backend database. It may require more than 10 minutes to initialize the database. 18 | ```bash 19 | ./init.sh 20 | ``` 21 | 22 | 23 | ## Usage: Checking a BibTeX file 24 | Suppose we have a file named "test.bib" with some BibTeX entries from unknown sources: 25 | ```bibtex 26 | @misc{ba2023qpg, 27 | title={Testing Database Engines via Query Plan Guidance}, 28 | author={Jinsheng Ba and Rigger, Manuel}, 29 | year={2023}, 30 | eprint={2312.17510}, 31 | archivePrefix={arXiv}, 32 | primaryClass={cs.CR} 33 | } 34 | 35 | ``` 36 | We can check and autofix it by running the following command: 37 | ```bash 38 | bibtidy test.bib 39 | ``` 40 | 41 | The output will be in the file `test_revised.bib`: 42 | ```bibtex 43 | @inproceedings{qpg, 44 | author = {Jinsheng Ba and Manuel Rigger}, 45 | booktitle = {ICSE}, 46 | crossref = {conf/icse/2023}, 47 | doi = {10.1109/ICSE48619.2023.00174}, 48 | month = {May}, 49 | pages = {2060-2071}, 50 | title = {Testing Database Engines via Query Plan Guidance.}, 51 | url = {https://doi.org/10.1109/ICSE48619.2023.00174}, 52 | year = {2023} 53 | } 54 | ``` 55 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from . 
def _xquery_string_literal(text):
    """Return *text* as a single-quoted XQuery string literal."""
    # Inside an XQuery string literal "&" starts an entity reference and the
    # delimiting quote is escaped by doubling it.
    return "'" + text.replace("&", "&amp;").replace("'", "''") + "'"


def _element_text(element):
    """Return the full text of an XML element ('' if *element* is None).

    ``element.text`` alone is ``None`` or truncated when the text starts with
    or contains child markup, so the text of all descendants is joined.
    """
    if element is None:
        return ""
    return "".join(element.itertext())


def _output_path(args):
    """Return the path the revised entries are written to.

    An explicit ``--output`` wins; otherwise ``paper.bib`` maps to
    ``paper_revised.bib``.
    """
    explicit = getattr(args, 'output', None)
    if explicit and explicit != 'stdout':
        return explicit
    return os.path.splitext(args.file)[0] + "_revised.bib"


def dblp_search(title):
    """Return the DBLP records whose title matches *title*.

    The records returned by the database are wrapped in a synthetic ``root``
    element (a sequence of sibling elements is not a well-formed document),
    so ``len(result)`` is the number of matches and iterating the result
    yields the individual records.
    """
    query = ("xquery for $x in doc('dblp')/dblp/* where $x/title contains text "
             + _xquery_string_literal(title) + " return $x")
    data = session.execute(query)
    return etree.fromstring("<root>\n" + data + "\n</root>")


def bibtex_prioritize(entries, title):
    """
    Prioritize the entries according to the similarity of the title.

    Returns a list of ``{"entry": element, "similarity": float}`` dicts:
    first all non-CoRR records by descending similarity, then the CoRR
    (arXiv) records by descending similarity.  Records without a title get a
    similarity of 0.
    """
    published = []
    preprints = []
    for entry in entries:
        candidate_title = _element_text(entry.find('title'))
        similarity = SequenceMatcher(None, candidate_title, title).ratio()
        ranked = {"entry": entry, "similarity": similarity}
        journal = entry.find('journal')
        if journal is not None and journal.text == 'CoRR':  # Lower the priorities of the Arxiv papers
            preprints.append(ranked)
        else:
            published.append(ranked)

    published.sort(key=lambda item: item["similarity"], reverse=True)
    preprints.sort(key=lambda item: item["similarity"], reverse=True)
    return published + preprints


def write_bibtex(entry, output):
    """
    Append the bibtex entry to the file *output*.
    """
    dblp_library = bibtexparser.bibdatabase.BibDatabase()
    dblp_library.entries = [entry]
    writer = bibtexparser.bwriter.BibTexWriter()
    writer.order_entries_by = None
    result = bibtexparser.dumps(dblp_library, writer=writer)
    with open(output, "a", encoding="utf-8") as f:
        f.write(result)


def _apply_dblp_record(entry, record):
    """Overwrite the fields of the bibtex *entry* with those of *record*."""
    authors = []
    for field in record:
        tag = field.tag
        if not isinstance(tag, str):  # comments / processing instructions
            continue
        if tag == 'url' or tag == 'crossref':  # Ignore internal tags
            continue
        text = _element_text(field)
        if not text.strip():
            continue
        if tag == 'author':
            # Remove the number at the end of the author name ("Wei Wang 0001").
            authors.append(re.sub(r'\s+\d+$', '', text))
        elif tag == 'ee':  # Convert DOI
            entry['url'] = text
            if text.startswith('https://doi.org/'):  # Some publishers, such as USENIX, do not provide DOI
                entry['doi'] = text.split('https://doi.org/')[1].rstrip('}')
        else:
            entry[tag] = text
    if authors:
        entry['author'] = " and ".join(authors)
    entry['ENTRYTYPE'] = record.tag  # Update the entry type


def bibtex_checking(bibtex_library, args):
    '''
    correct the bibtex file with the dblp database and write the results.

    Every entry with a DBLP match is updated in place and appended to the
    output file.  NOTE(review): entries without a match (or without a title)
    are only reported in debug mode and are not written to the output -
    confirm that dropping them is intended.
    '''
    output_path = _output_path(args)
    for entry in tqdm(bibtex_library.entries, desc='Processing bibtex', unit='entry'):
        old_title = entry.get('title')
        if not old_title:
            if args.debug:
                print("[Debug] Entry \"" + str(entry.get('ID', '')) + "\" has no title and is skipped.")
            continue

        bibtex_matched = dblp_search(old_title)
        if len(bibtex_matched) == 0:
            if args.debug:
                print("[Debug] \"" + old_title + "\" is not found in the DBLP database.")
            continue

        bibtex_best_match = bibtex_prioritize(bibtex_matched, old_title)[0]
        _apply_dblp_record(entry, bibtex_best_match['entry'])

        # Reporting results
        similarity = bibtex_best_match['similarity']
        if similarity < 0.5 and args.debug:
            print("[Debug] \"" + entry['title'] + "\" has no similar entires.")
        elif similarity < 0.8:
            print("[Warning] Suspicious update: \"" + old_title + "\" -> \"" + entry['title'] + "\". Please check whether it is correct.")
        elif similarity < 1 and args.debug:
            print("[Debug] \"" + old_title + "\" -> \"" + entry['title'] + "\".")
        write_bibtex(entry, output_path)
    return


def main():
    """Command-line entry point: check one .bib file against DBLP."""
    arg_parser = argparse.ArgumentParser(description='bibtidy: Make your research easier with correct citations!')
    arg_parser.add_argument('file')
    arg_parser.add_argument('-o', '--output', type=str, default='stdout', help='the file path of the output')
    arg_parser.add_argument('-d', '--debug', action='store_true', default=False, help='enable debug mode')
    args = arg_parser.parse_args()

    try:
        if os.path.isfile(_output_path(args)):
            print("The output file already exists! Please delete it first.")
            return

        with open(args.file, encoding="utf-8") as bibtex_file:
            bib_parser = BibTexParser()
            bib_parser.ignore_nonstandard_types = False
            bibtex_library = bibtexparser.load(bibtex_file, bib_parser)
        bibtex_checking(bibtex_library, args)
    finally:
        # close session on every exit path, including errors
        if session:
            session.close()


if __name__ == "__main__":
    main()
def load_readme():
    """Return the text of README.md, used as the package's long description."""
    # README.md contains non-ASCII-safe Markdown; do not depend on the locale.
    with open("README.md", encoding="utf-8") as f:
        return f.read()


setuptools.setup(
    name="bibtidy",
    version=version,
    author='Jinsheng Ba',
    author_email='bajinsheng@gmail.com',
    description="A tool for simplifying BiBTex creation.",
    long_description=load_readme(),
    long_description_content_type="text/markdown",
    url="https://github.com/bajinsheng/bibtidy",
    # bibtidy.py does ``import BaseXClient`` at top level, so that module has
    # to be installed next to it or the console script fails on start-up.
    py_modules=["bibtidy", "BaseXClient"],
    packages=setuptools.find_packages(),
    # lxml and tqdm are imported by bibtidy.py and must be declared.
    install_requires=[
        'bibtexparser==1.4.0',
        'lxml',
        'tqdm',
        'requests',
    ],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.6',
    entry_points={
        "console_scripts": [
            "bibtidy = bibtidy:main"
        ]
    }
)