├── data
│   ├── processed
│   │   └── .gitkeep
│   └── unprocessed
│       └── .gitkeep
├── db_scripts
│   ├── db_openlibrary.sql
│   ├── tbl_edition_isbns.sql
│   ├── tbl_works.sql
│   ├── tbl_author_works.sql
│   ├── tbl_authors.sql
│   ├── tbl_editions.sql
│   ├── tbl_fileinfo.sql
│   ├── tbl_authors_indexes.sql
│   ├── tbl_edition_isbns_indexes.sql
│   ├── tbl_author_works_indexes.sql
│   ├── tbl_works_indexes.sql
│   ├── tbl_editions_indexes.sql
│   ├── openlibrary-data-loader.sql
│   └── load.sql
├── create_db.bat.sample
├── .gitignore
├── openlibrary-db.sql
├── openlibrary_data_process.py
└── README.md

/data/processed/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/data/unprocessed/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/db_scripts/db_openlibrary.sql:
--------------------------------------------------------------------------------
create database openlibrary;
--------------------------------------------------------------------------------
/create_db.bat.sample:
--------------------------------------------------------------------------------
psql --set=sslmode=require -f openlibrary-db.sql -h localhost -p 5432 -U username postgres
--------------------------------------------------------------------------------
/db_scripts/tbl_edition_isbns.sql:
--------------------------------------------------------------------------------
create table edition_isbns (
    edition_key text not null,
    isbn text not null,
    constraint pk_editionisbns_editionkey_isbn primary key (edition_key, isbn)
);
--------------------------------------------------------------------------------
/db_scripts/tbl_works.sql:
--------------------------------------------------------------------------------
create table works (
    type text,
    key text not null,
    revision integer,
    last_modified date,
    data jsonb,
    constraint pk_works_key primary key (key)
);
--------------------------------------------------------------------------------
/db_scripts/tbl_author_works.sql:
--------------------------------------------------------------------------------
create table author_works (
    author_key text not null,
    work_key text not null,
    constraint pk_authorworks_authorkey_workkey primary key (author_key, work_key)
);
--------------------------------------------------------------------------------
/db_scripts/tbl_authors.sql:
--------------------------------------------------------------------------------
create table authors (
    type text,
    key text not null,
    revision integer,
    last_modified date,
    data jsonb,
    constraint pk_author_key primary key (key)
);
--------------------------------------------------------------------------------
/db_scripts/tbl_editions.sql:
--------------------------------------------------------------------------------
create table editions (
    type text,
    key text not null,
    revision integer,
    last_modified date,
    data jsonb,
    constraint pk_editions_key primary key (key)
);
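
-- the data column holds the full edition record from the dump as jsonb;
-- an illustrative query against it (not used by the load scripts):
-- select key, data->>'title' from editions limit 10;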
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.csv
*.txt
*.xlsx

data/unprocessed/ol_dump_works*.txt.gz
data/unprocessed/ol_dump_authors*.txt.gz
data/unprocessed/ol_dump_editions*.txt.gz

create_db.bat

copy_commands.sql

.vscode/settings.json
--------------------------------------------------------------------------------
/db_scripts/tbl_fileinfo.sql:
--------------------------------------------------------------------------------
create table fileinfo (
    name_of_table text,
    id int,
    loaded boolean,
    filenames text ARRAY,
    constraint key primary key (id)
);

-- load all the file names into the database
\copy fileinfo from './data/processed/filenames.txt' delimiter E'\t' quote '|' csv;
--------------------------------------------------------------------------------
/db_scripts/tbl_authors_indexes.sql:
--------------------------------------------------------------------------------
create unique index cuix_authors_key on authors (key);
alter table authors cluster on cuix_authors_key;

create index ix_authors_data on authors using gin (data jsonb_path_ops);

-- index name from the jsonb data
create index ix_authors_name on authors using gin ((data->>'name') gin_trgm_ops);
--------------------------------------------------------------------------------
/db_scripts/tbl_edition_isbns_indexes.sql:
--------------------------------------------------------------------------------
create unique index cuix_editionisbns_editionkey_isbn on edition_isbns (edition_key, isbn);
alter table edition_isbns cluster on cuix_editionisbns_editionkey_isbn;

create index ix_editionisbns_isbn on edition_isbns (isbn);
create index ix_editionisbns_editionkey on edition_isbns (edition_key);
--------------------------------------------------------------------------------
/db_scripts/tbl_author_works_indexes.sql:
--------------------------------------------------------------------------------
create unique index cuix_authorworks_authorkey_workkey on author_works (author_key, work_key);
alter table author_works cluster on cuix_authorworks_authorkey_workkey;

create index ix_authorworks_workkey on author_works (work_key);
create index ix_authorworks_authorkey on author_works (author_key);
--------------------------------------------------------------------------------
/db_scripts/tbl_works_indexes.sql:
--------------------------------------------------------------------------------
create unique index cuix_works_key on works (key);
alter table works cluster on cuix_works_key;

create index ix_works_data on works using gin (data jsonb_path_ops);

-- index title and subtitle from the jsonb data
create index ix_works_title on works using gin ((data->>'title') gin_trgm_ops);
create index ix_works_subtitle on works using gin ((data->>'subtitle') gin_trgm_ops);
--------------------------------------------------------------------------------
/db_scripts/tbl_editions_indexes.sql:
--------------------------------------------------------------------------------
create unique index cuix_editions_key on editions (key);
alter table editions cluster on cuix_editions_key;

create index ix_editions_workkey on editions (work_key);
create index ix_editions_data on editions using gin (data jsonb_path_ops);

-- index title and subtitle from the jsonb data
create index ix_editions_title on editions using gin ((data->>'title') gin_trgm_ops);
create index ix_editions_subtitle on editions using gin ((data->>'subtitle') gin_trgm_ops);
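
-- illustrative title search of the kind these trigram indexes are intended to speed up
-- (example only, not run as part of the load):
-- select key, data->>'title' as title
-- from editions
-- where data->>'title' ilike '%prisoner of azkaban%';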
--------------------------------------------------------------------------------
/db_scripts/openlibrary-data-loader.sql:
--------------------------------------------------------------------------------
\a
\t

-- spool the output into copy_commands.sql to generate all the copy commands
\o copy_commands.sql
select format('\copy %s from ''./data/processed/%s'' delimiter E''\t'' quote ''|'' csv;', name_of_table, filename)
from (
    select id, name_of_table, filename
    from fileinfo, unnest(filenames) AS filename
    where loaded = false
    and name_of_table = (select name_of_table from fileinfo where loaded = false order by id limit 1)
    order by id) t;

-- turn spooling off
\o
\a
\t

-- run the copy commands file
\i copy_commands.sql

update fileinfo set loaded = 't'
where id = (
    select id
    from fileinfo
    where loaded = false
    and name_of_table = (select name_of_table from fileinfo where loaded = false order by id limit 1)
    order by id);
--------------------------------------------------------------------------------
/openlibrary-db.sql:
--------------------------------------------------------------------------------
-- create the database
\i 'db_scripts/db_openlibrary.sql';

-- switch to using the database
\c openlibrary;

-- additional extensions
create extension pg_trgm;

-- set client encoding
set client_encoding = 'UTF8';

-- create tables
\i 'db_scripts/tbl_authors.sql';
\i 'db_scripts/tbl_works.sql';
\i 'db_scripts/tbl_author_works.sql';
\i 'db_scripts/tbl_editions.sql';
\i 'db_scripts/tbl_edition_isbns.sql';

-- create filenames that can be accessed in lieu of parameters
\i 'db_scripts/tbl_fileinfo.sql';

-- load in data
\i 'db_scripts/load.sql';

-- finally remove temp table
drop table fileinfo;

-- vacuum analyze will remove dead tuples and try to regain some space
-- if you have enough room, you can use vacuum full analyze, which gains the most space back, but it requires enough free disk space to make a complete second copy of the db
-- if you add verbose it will explain what it is doing (vacuum verbose analyze)
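-- for example, the more thorough but space-hungry variant described above would be:
-- vacuum full verbose analyze;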
vacuum analyze;
--------------------------------------------------------------------------------
/db_scripts/load.sql:
--------------------------------------------------------------------------------
select now();
-- import authors csv
alter table authors set unlogged;
\i 'db_scripts/openlibrary-data-loader.sql'
alter table authors set logged;

select now();
\i 'db_scripts/tbl_authors_indexes.sql';

select now();
-- import works csv
alter table works set unlogged;
\i 'db_scripts/openlibrary-data-loader.sql'
alter table works set logged;

select now();
\i 'db_scripts/tbl_works_indexes.sql';

select now();
-- set author_key and work_key for author_works from the data embedded in works
alter table author_works set unlogged;
insert into author_works (author_key, work_key)
select distinct author_key, work_key
from (
    select
        jsonb_array_elements(data->'authors')->'author'->>'key' as author_key,
        key as work_key
    from works
    where key is not null
    and data->'authors'->0->'author' is not null) authorship
where author_key is not null
and work_key is not null;
alter table author_works set logged;

select now();
\i 'db_scripts/tbl_author_works_indexes.sql';

select now();
-- import editions csv
alter table editions set unlogged;
select now();
\i 'db_scripts/openlibrary-data-loader.sql'
alter table editions
    add column work_key text;

select now();
update editions
set work_key = data->'works'->0->>'key';
alter table editions set logged;

select now();
\i 'db_scripts/tbl_editions_indexes.sql';
select now();
-- set isbn for edition_isbns from the embedded json

select now();
alter table edition_isbns set unlogged;
insert into edition_isbns (edition_key, isbn)
select
    distinct edition_key,
    isbn
from (
    select
        key as edition_key,
        jsonb_array_elements_text(data->'isbn_13') as isbn
    from editions
    where jsonb_array_length(data->'isbn_13') > 0
    and key is not null
    union all
    select
        key as edition_key,
        jsonb_array_elements_text(data->'isbn_10') as isbn
    from editions
    where jsonb_array_length(data->'isbn_10') > 0
    and key is not null
    union all
    select
        key as edition_key,
        jsonb_array_elements_text(data->'isbn') as isbn
    from editions
    where jsonb_array_length(data->'isbn') > 0
    and key is not null) isbns;
alter table edition_isbns set logged;


select now();
-- create isbn indexes
\i 'db_scripts/tbl_edition_isbns_indexes.sql';
select now();
--------------------------------------------------------------------------------
/openlibrary_data_process.py:
--------------------------------------------------------------------------------
"""
This script processes the bulk download data from the Open Library project.
It converts the large text files into smaller csv files, which are easier to load into the db.
Decide how large you would like to make each chunk using LINES_PER_FILE.
For editions, 3 million lines came to about 3.24 GB and took about an hour to load.
"""
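
# Rough sizing estimate (extrapolated from the editions figure above, not measured):
# at that rate (~1.1 KB per line), the default LINES_PER_FILE of 2,000,000 gives
# chunk files of roughly 2 GB each.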
6 | """ 7 | 8 | import csv 9 | import ctypes as ct 10 | from multiprocessing import Pool 11 | import os 12 | 13 | # Optional if you want to make a smaller copy from the unzipped version for testing 14 | # sed -i '' '100000,$ d' ./data/unprocessed/ol_dump_editions.txt 15 | 16 | # You can run this file once with all 3 downloaded and unzipped files or run it as they come in. 17 | # Just make sure the end product in filenames.txt looks like this 18 | # authors 0 False {authors_2000.csv,authors_4000.csv,authors_6000.csv} 19 | # works 1 False {works_2000.csv,works_4000.csv,works_6000.csv,works_8000.csv} 20 | # editions 2 False {editions_2000.csv,editions_4000.csv,editions_6000.csv} 21 | 22 | # Field size limit: See https://stackoverflow.com/a/54517228 for more info on this setting 23 | csv.field_size_limit(int(ct.c_ulong(-1).value // 2)) 24 | 25 | LINES_PER_FILE = 2000000 26 | 27 | INPUT_PATH = "./data/unprocessed/" 28 | OUTPUT_PATH = "./data/processed/" 29 | FILE_IDENTIFIERS = ['authors', 'works', 'editions'] 30 | 31 | def process_file(source_file: str, file_id) -> None: 32 | """ 33 | Processes a single file by chunking it into smaller csv files. 34 | 35 | :param source_file: The name of the file being processed 36 | :param file_id: The id of the file to process 37 | """ 38 | print(f"Currently processing {source_file}") 39 | 40 | filenames = [] 41 | file_path = os.path.join(INPUT_PATH, (f"ol_dump_{source_file}.txt")) 42 | with open(file_path, encoding="utf-8") as csv_input_file: 43 | reader = csv.reader(csv_input_file, delimiter='\t') 44 | 45 | for line, row in enumerate(reader): 46 | # Every time the row limit is reached, write the chunked csv file 47 | if line % LINES_PER_FILE == 0: 48 | chunked_filename = source_file + f"_{line + LINES_PER_FILE}.csv" 49 | filenames.append(chunked_filename) 50 | 51 | # Open a new file for writing 52 | output = open(os.path.join(OUTPUT_PATH, chunked_filename), 53 | "w", newline="", encoding="utf-8") 54 | writer = csv.writer( 55 | output, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL) 56 | 57 | if len(row) > 4: 58 | writer.writerow( 59 | [row[0], row[1], row[2], row[3], row[4]]) 60 | 61 | with open(os.path.join(OUTPUT_PATH, "filenames.txt"), "a", newline="", encoding="utf-8") as filenames_output: 62 | filenames_writer = csv.writer(filenames_output, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL) 63 | filenames_writer.writerow( 64 | [source_file, file_id, False, '{' + ','.join(filenames).strip("'") + '}']) 65 | 66 | print(f"{source_file} text file has now been processed") 67 | 68 | 69 | if __name__ == '__main__': 70 | """ 71 | Main driver for the data processor. For each file, a new process is created that 72 | performs data processing in parallel. 73 | """ 74 | with Pool() as pool: 75 | results = [] 76 | for i, filename in enumerate(FILE_IDENTIFIERS): 77 | results.append(pool.apply_async(process_file, args=(filename, i))) 78 | # Wait for the processes to finish before exiting the main python program 79 | for res in results: 80 | res.wait() 81 | print("Process complete") 82 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Open Library database 2 | 3 | Open Library is an online library of bibliographic data. The library publishes [full data dumps](https://openlibrary.org/developers/dumps) of all authors, works, and editions. 

This project provides instructions and scripts for importing this data into a PostgreSQL database, plus some sample queries to test the database.

The database is primarily aimed at querying by ISBN and includes tables specifically for those identifiers. It could be extended to use other identifiers, such as Open Library IDs, or to query by title or author.

## Getting started

The following steps should get you up and running with a working database.

1. Install the [required prerequisites](#prerequisites) so that you have a database server.
2. [Download the data](#downloading-the-data) from Open Library.
3. Run the [processing the data](#processing-the-data) script to make the data easier to import.
4. [Import the data](#import-into-database) into the database.

## Prerequisites

- [Python 3](https://www.python.org/downloads/) - Tested with 3.10
- [PostgreSQL](https://www.postgresql.org/) - Version 15 is tested, but all recent versions should work
- Disk space - The data files are large; the uncompressed editions file alone is around 45GB. You will need at least 250GB of free space to import all the data.

## Downloading the data

Open Library offers bulk downloads on its website, available from the [data dumps page](https://openlibrary.org/developers/dumps).

These are updated every month. The downloads available include (compressed sizes shown):

- Editions (~10.5GB)
- Works (~3.5GB)
- Authors (~0.5GB)
- All types (~10GB)

Download the Editions, Works, and Authors data dumps.

```console
wget https://openlibrary.org/data/ol_dump_editions_latest.txt.gz -P ~/downloads
wget https://openlibrary.org/data/ol_dump_works_latest.txt.gz -P ~/downloads
wget https://openlibrary.org/data/ol_dump_authors_latest.txt.gz -P ~/downloads
```

Move the data from your downloads folder.

```console
mv ~/downloads/ol_dump_authors_*txt.gz ./data/unprocessed/ol_dump_authors.txt.gz
mv ~/downloads/ol_dump_works_*txt.gz ./data/unprocessed/ol_dump_works.txt.gz
mv ~/downloads/ol_dump_editions_*txt.gz ./data/unprocessed/ol_dump_editions.txt.gz
```

Then uncompress the data files.

```console
gzip -d -c data/unprocessed/ol_dump_editions.txt.gz > data/unprocessed/ol_dump_editions.txt
gzip -d -c data/unprocessed/ol_dump_works.txt.gz > data/unprocessed/ol_dump_works.txt
gzip -d -c data/unprocessed/ol_dump_authors.txt.gz > data/unprocessed/ol_dump_authors.txt
```

### Processing the data

Unfortunately the downloads provided don't play nicely with direct importing into PostgreSQL. The dump files error on import because the number of columns varies between rows, and cleaning them up by hand is difficult when the editions text file alone is over 45GB.

_To do note: Check if this is still the case, and if so whether standard Linux tools could handle the clean-up - maybe try `sed` and `awk`_

Instead, the [openlibrary_data_process.py](openlibrary_data_process.py) script reads the text files and writes each row back out, keeping only rows that have the expected 5 columns.

Because the files are huge and are only going to grow (editions is now 45GB+), the script also splits the data into smaller files that are loaded sequentially. You can change the number of lines in each chunk with the `LINES_PER_FILE` setting in the script; the default is 2 million lines.
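
If you want a quick trial run on a smaller sample first, the comments in the processing script suggest truncating one of the uncompressed dumps in place before running it (shown here for the editions file; the `-i ''` form is BSD/macOS `sed` syntax - on GNU/Linux use `sed -i` without the empty string argument):

```console
sed -i '' '100000,$ d' ./data/unprocessed/ol_dump_editions.txt
```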

Once the files are split, you can delete the 3 .txt files in the `data/unprocessed` folder, because you will need around 250GB of free space to load all 3 files into the database without running out of space. If you have plenty of space you can keep the files!

```console
python openlibrary_data_process.py
```

This generates multiple files in the `data/processed` directory.

One of those files, `filenames.txt`, will be used to access the rest of them when loading the data.

### Import into database

We import the data directly into PostgreSQL tables and can then do complex searches with SQL.

There are a series of database scripts which will create the database and tables, and then import the data. These live in the [db_scripts](db_scripts) folder and are driven by the top-level [openlibrary-db.sql](openlibrary-db.sql) script. The data files (created in the previous step) need to be in the `data/processed` folder for this to work.

The PostgreSQL command line tool `psql` is used to run the scripts. The following command will create the database and tables, then load the data. Replace `username` with your database server username.

```console
psql --set=sslmode=require -f openlibrary-db.sql -h localhost -p 5432 -U username postgres
```

### Database details

The database is split into 5 main tables:

| Data          | Description                                                     |
| :------------ | :-------------------------------------------------------------- |
| Authors       | The individuals who write the works                             |
| Works         | The works as created by the authors, with titles and subtitles  |
| Author Works  | A link table connecting authors with their works                |
| Editions      | The particular editions of the works, including ISBNs           |
| Edition ISBNs | The ISBNs for the editions                                      |

## Query the data

That's the database set up - it can now be queried using SQL.

Get details for a single item using the ISBN13 9781551922461 (Harry Potter and the Prisoner of Azkaban):

```sql
select
    e.data->>'title' "EditionTitle",
    w.data->>'title' "WorkTitle",
    a.data->>'name' "Name",
    e.data->>'subtitle' "EditionSubtitle",
    w.data->>'subtitle' "WorkSubtitle",
    e.data->>'subjects' "Subjects",
    e.data->'description'->>'value' "EditionDescription",
    w.data->'description'->>'value' "WorkDescription",
    e.data->'notes'->>'value' "EditionNotes",
    w.data->'notes'->>'value' "WorkNotes"
from editions e
join edition_isbns ei
    on ei.edition_key = e.key
join works w
    on w.key = e.work_key
join author_works a_w
    on a_w.work_key = w.key
join authors a
    on a_w.author_key = a.key
where ei.isbn = '9781551922461';
```
--------------------------------------------------------------------------------