├── data
    ├── processed
    │   └── .gitkeep
    └── unprocessed
    │   └── .gitkeep
├── db_scripts
    ├── db_openlibrary.sql
    ├── tbl_edition_isbns.sql
    ├── tbl_works.sql
    ├── tbl_author_works.sql
    ├── tbl_authors.sql
    ├── tbl_editions.sql
    ├── tbl_fileinfo.sql
    ├── tbl_authors_indexes.sql
    ├── tbl_edition_isbns_indexes.sql
    ├── tbl_author_works_indexes.sql
    ├── tbl_works_indexes.sql
    ├── tbl_editions_indexes.sql
    ├── openlibrary-data-loader.sql
    └── load.sql
├── create_db.bat.sample
├── .gitignore
├── openlibrary-db.sql
├── openlibrary_data_process.py
└── README.md


/data/processed/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/data/unprocessed/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/db_scripts/db_openlibrary.sql:
--------------------------------------------------------------------------------
1 | create database openlibrary;


--------------------------------------------------------------------------------
/create_db.bat.sample:
--------------------------------------------------------------------------------
1 | psql --set=sslmode=require -f openlibrary-db.sql -h localhost -p 5432 -U username postgres


--------------------------------------------------------------------------------
/db_scripts/tbl_edition_isbns.sql:
--------------------------------------------------------------------------------
1 | create table edition_isbns (
2 |   edition_key text not null,
3 |   isbn text not null,
4 |   constraint pk_editionisbns_editionkey_isbn primary key (edition_key, isbn)
5 | );
6 | 


--------------------------------------------------------------------------------
/db_scripts/tbl_works.sql:
--------------------------------------------------------------------------------
1 | create table works (
2 |   type text,
3 |   key text NOT NULL,
4 |   revision integer,
5 |   last_modified date,
6 |   data jsonb,
7 |   constraint pk_works_key primary key(key)
8 | );
9 | 


--------------------------------------------------------------------------------
/db_scripts/tbl_author_works.sql:
--------------------------------------------------------------------------------
1 | create table author_works (
2 |   author_key text not null,
3 |   work_key text not null,
4 |   constraint pk_authorworks_authorkey_workkey primary key (author_key, work_key)
5 | );
6 | 


--------------------------------------------------------------------------------
/db_scripts/tbl_authors.sql:
--------------------------------------------------------------------------------
1 | create table authors (
2 |   type text,
3 |   key text not null,
4 |   revision integer,
5 |   last_modified date,
6 |   data jsonb,
7 |   constraint pk_author_key primary key (key)
8 | );
9 | 


--------------------------------------------------------------------------------
/db_scripts/tbl_editions.sql:
--------------------------------------------------------------------------------
1 | create table editions (
2 |   type text,
3 |   key text not null,
4 |   revision integer,
5 |   last_modified date,
6 |   data jsonb,
7 |   constraint pk_editions_key primary key (key)
8 | );
9 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.csv
 2 | *.txt
 3 | *.xlsx
 4 | 
 5 | data/unprocessed/ol_dump_works*.txt.gz
 6 | data/unprocessed/ol_dump_authors*.txt.gz
 7 | data/unprocessed/ol_dump_editions*.txt.gz
 8 | 
 9 | create_db.bat
10 | 
11 | copy_commands.sql
12 | 
13 | .vscode/settings.json
14 | 


--------------------------------------------------------------------------------
/db_scripts/tbl_fileinfo.sql:
--------------------------------------------------------------------------------
 1 | create table fileinfo (
 2 |   name_of_table text,
 3 |   id int,
 4 |   loaded boolean,
 5 |   filenames text ARRAY,
 6 |   constraint key primary key (id)
 7 | );
 8 | 
 9 | -- load all the file names into the database
10 | \copy fileinfo from './data/processed/filenames.txt' delimiter E'\t' quote '|' csv;
11 | 


--------------------------------------------------------------------------------
/db_scripts/tbl_authors_indexes.sql:
--------------------------------------------------------------------------------
1 | create unique index cuix_authors_key on authors (key);
2 | alter table authors cluster on cuix_authors_key;
3 | 
4 | create index ix_authors_data on authors using gin (data jsonb_path_ops);
5 | 
6 | -- index name from the jsonb data
7 | create index ix_authors_name on authors using gin ((data->>'name') gin_trgm_ops);
8 | 


--------------------------------------------------------------------------------
/db_scripts/tbl_edition_isbns_indexes.sql:
--------------------------------------------------------------------------------
1 | create unique index cuix_editionisbns_editionkey_isbn on edition_isbns (edition_key, isbn);
2 | alter table edition_isbns cluster on cuix_editionisbns_editionkey_isbn;
3 | 
4 | create index ix_editionisbns_isbn on edition_isbns (isbn);
5 | create index ix_editionisbns_editionkey on edition_isbns (edition_key);
6 | 


--------------------------------------------------------------------------------
/db_scripts/tbl_author_works_indexes.sql:
--------------------------------------------------------------------------------
1 | create unique index cuix_authorworks_authorkey_workkey on author_works (author_key, work_key);
2 | alter table author_works cluster on cuix_authorworks_authorkey_workkey;
3 | 
4 | create index ix_authorworks_workkey on author_works (work_key);
5 | create index ix_authorworks_authorkey on author_works (author_key);
6 | 


--------------------------------------------------------------------------------
/db_scripts/tbl_works_indexes.sql:
--------------------------------------------------------------------------------
1 | create unique index cuix_works_key on works (key);
2 | alter table works cluster on cuix_works_key;
3 | 
4 | create index ix_works_data on works using gin (data jsonb_path_ops);
5 | 
6 | -- index title and subtitle from jsonb data
7 | create index ix_works_title on works using gin ((data->>'title') gin_trgm_ops);
8 | create index ix_works_subtitle on works using gin ((data->>'subtitle') gin_trgm_ops);


--------------------------------------------------------------------------------
/db_scripts/tbl_editions_indexes.sql:
--------------------------------------------------------------------------------
1 | create unique index cuix_editions_key on editions (key);
2 | alter table editions cluster on cuix_editions_key;
3 | 
4 | create index ix_editions_workkey on editions (work_key);
5 | create index ix_editions_data on editions using gin (data jsonb_path_ops);
6 | 
7 | -- index title and subtitle from the jsonb data
8 | create index ix_editions_title on editions using gin ((data->>'title') gin_trgm_ops);
9 | create index ix_editions_subtitle on editions using gin ((data->>'subtitle') gin_trgm_ops);


--------------------------------------------------------------------------------
/db_scripts/openlibrary-data-loader.sql:
--------------------------------------------------------------------------------
 1 | \a
 2 | \t
 3 | 
 4 | -- spool the output into copy_commands.sql to generate all the copy commands
 5 | \o copy_commands.sql
 6 | select format('\copy %s from ''./data/processed/%s'' delimiter E''\t'' quote ''|'' csv;', name_of_table, filename)
 7 | from (
 8 |   select id, name_of_table, filename 
 9 |   from fileinfo, unnest(filenames) AS filename 
10 |   where loaded = false 
11 |   and name_of_table = (select name_of_table from fileinfo where loaded = false order by id limit 1)
12 |   order by id) t;
13 | 
14 | -- turn spooling off
15 | \o
16 | \a
17 | \t
18 | 
19 | -- run the copy commands file
20 | \i copy_commands.sql
21 | 
22 | update fileinfo set loaded = 't'
23 | where id = (
24 |   select id 
25 |   from fileinfo 
26 |   where loaded = false
27 |   and name_of_table = (select name_of_table from fileinfo where loaded = false order by id limit 1)
28 |   order by id);
29 | 


--------------------------------------------------------------------------------
/openlibrary-db.sql:
--------------------------------------------------------------------------------
 1 | -- create the database
 2 | \i 'db_scripts/db_openlibrary.sql';
 3 | 
 4 | -- -- switch to using the database
 5 | \c openlibrary;
 6 | 
 7 | -- additional extensions
 8 | create extension pg_trgm;
 9 | 
10 | -- -- set client encoding
11 | set client_encoding = 'UTF8';
12 | 
13 | -- -- create tables
14 | \i 'db_scripts/tbl_authors.sql';
15 | \i 'db_scripts/tbl_works.sql';
16 | \i 'db_scripts/tbl_author_works.sql';
17 | \i 'db_scripts/tbl_editions.sql';
18 | \i 'db_scripts/tbl_edition_isbns.sql';
19 | 
20 | -- create filenames that can be accessed in lieu of parameters
21 | \i 'db_scripts/tbl_fileinfo.sql';
22 | 
23 | -- load in data
24 | \i 'db_scripts/load.sql';
25 | 
26 | -- finally remove temp table
27 | drop table fileinfo;
28 | 
29 | -- vaccuum analyze will remove dead tuples and try to regain some space
30 | -- if you have enough room, you can use vacuum full analyze which will gain the most space back, but it requires enough space on your computer to make a complete second copy of the db
31 | -- if you add verbose it will explain what it is trying to do.  (vacuum verbose analyze)
32 | vacuum analyze;
33 | 


--------------------------------------------------------------------------------
/db_scripts/load.sql:
--------------------------------------------------------------------------------
 1 | select now();
 2 | -- import authors csv
 3 | alter table authors set unlogged;
 4 | \i 'db_scripts/openlibrary-data-loader.sql'
 5 | alter table authors set logged;
 6 | 
 7 | select now();
 8 | \i 'db_scripts/tbl_authors_indexes.sql';
 9 | 
10 | select now();
11 | -- import works csv
12 | alter table works set unlogged;
13 | \i 'db_scripts/openlibrary-data-loader.sql'
14 | alter table works set logged;
15 | 
16 | select now();
17 | \i 'db_scripts/tbl_works_indexes.sql';
18 | 
19 | select now();
20 | -- set author and work_key for author_works from the data embedded in works
21 | alter table author_works set unlogged;
22 | insert into author_works (author_key, work_key)
23 | select distinct author_key, work_key
24 | from (
25 |   select
26 |     jsonb_array_elements(data->'authors')->'author'->>'key' as author_key,
27 |     key as work_key
28 |   from works
29 |   where key is not null
30 |   and data->'authors'->0->'author' is not null) authorship
31 | where author_key is not null
32 | and work_key is not null;
33 | alter table author_works set logged;
34 | 
35 | select now();
36 | \i 'db_scripts/tbl_author_works_indexes.sql';
37 | 
38 | select now();
39 | -- import editions csv
40 | alter table editions set unlogged;
41 | select now();
42 | \i 'db_scripts/openlibrary-data-loader.sql'
43 | alter table editions
44 | add column work_key text;
45 | 
46 | select now();
47 | update editions
48 | set work_key = data->'works'->0->>'key';
49 | alter table editions set logged;
50 | 
51 | select now();
52 | \i 'db_scripts/tbl_editions_indexes.sql';
53 | select now();
54 | -- set isbn for edition_isbns from the embedded json
55 | 
56 | select now();
57 | alter table edition_isbns set unlogged;
58 | insert into edition_isbns (edition_key, isbn)
59 | select
60 |   distinct edition_key,
61 |   isbn
62 | from (
63 |   select
64 |     key as edition_key,
65 |     jsonb_array_elements_text(data->'isbn_13') as isbn
66 |   from editions
67 |   where jsonb_array_length(data->'isbn_13') > 0
68 |   and key is not null
69 |   union all
70 |   select
71 |     key as edition_key,
72 |     jsonb_array_elements_text(data->'isbn_10') as isbn
73 |   from editions
74 |   where jsonb_array_length(data->'isbn_10') > 0
75 |   and key is not null
76 |   union all
77 |   select
78 |     key as edition_key,
79 |     jsonb_array_elements_text(data->'isbn') as isbn
80 |   from editions
81 |   where jsonb_array_length(data->'isbn') > 0
82 |   and key is not null) isbns;
83 | alter table edition_isbns set logged;
84 | 
85 | 
86 | select now();
87 | -- create isbn indexes
88 | \i 'db_scripts/tbl_edition_isbns_indexes.sql';
89 | select now();
90 | 


--------------------------------------------------------------------------------
/openlibrary_data_process.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This script processes the bulk download data from the Open Library project.
 3 | It converts the large text files into smaller csv files which are easier to load into the db.
 4 | Decide how large you would like to make each chunk using LINES_PER_FILE
 5 | For editions, 3 million lines was about 3.24 gigs and about an hour to load.
 6 | """
 7 | 
 8 | import csv
 9 | import ctypes as ct
10 | from multiprocessing import Pool
11 | import os
12 | 
13 | # Optional if you want to make a smaller copy from the unzipped version for testing
14 | # sed -i '' '100000,$ d' ./data/unprocessed/ol_dump_editions.txt
15 | 
16 | # You can run this file once with all 3 downloaded and unzipped files or run it as they come in.
17 | # Just make sure the end product in filenames.txt  looks like this
18 | # authors	0	False	{authors_2000.csv,authors_4000.csv,authors_6000.csv}
19 | # works	1	False	{works_2000.csv,works_4000.csv,works_6000.csv,works_8000.csv}
20 | # editions	2	False	{editions_2000.csv,editions_4000.csv,editions_6000.csv}
21 | 
22 | # Field size limit: See https://stackoverflow.com/a/54517228 for more info on this setting
23 | csv.field_size_limit(int(ct.c_ulong(-1).value // 2))
24 | 
25 | LINES_PER_FILE = 2000000
26 | 
27 | INPUT_PATH = "./data/unprocessed/"
28 | OUTPUT_PATH = "./data/processed/"
29 | FILE_IDENTIFIERS = ['authors', 'works', 'editions']
30 | 
31 | def process_file(source_file: str, file_id) -> None:
32 |     """
33 |     Processes a single file by chunking it into smaller csv files.
34 | 
35 |     :param source_file: The name of the file being processed
36 |     :param file_id: The id of the file to process
37 |     """
38 |     print(f"Currently processing {source_file}")
39 | 
40 |     filenames = []
41 |     file_path = os.path.join(INPUT_PATH, (f"ol_dump_{source_file}.txt"))
42 |     with open(file_path, encoding="utf-8") as csv_input_file:
43 |         reader = csv.reader(csv_input_file, delimiter='\t')
44 | 
45 |         for line, row in enumerate(reader):
46 |             # Every time the row limit is reached, write the chunked csv file
47 |             if line % LINES_PER_FILE == 0:
48 |                 chunked_filename = source_file + f"_{line + LINES_PER_FILE}.csv"
49 |                 filenames.append(chunked_filename)
50 | 
51 |                 # Open a new file for writing
52 |                 output = open(os.path.join(OUTPUT_PATH, chunked_filename),
53 |                               "w", newline="", encoding="utf-8")
54 |                 writer = csv.writer(
55 |                     output, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
56 | 
57 |             if len(row) > 4:
58 |                 writer.writerow(
59 |                     [row[0], row[1], row[2], row[3], row[4]])
60 | 
61 |     with open(os.path.join(OUTPUT_PATH, "filenames.txt"), "a", newline="", encoding="utf-8") as filenames_output:
62 |         filenames_writer = csv.writer(filenames_output, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
63 |         filenames_writer.writerow(
64 |             [source_file, file_id, False, '{' + ','.join(filenames).strip("'") + '}'])
65 | 
66 |         print(f"{source_file} text file has now been processed")
67 | 
68 | 
69 | if __name__ == '__main__':
70 |     """
71 |     Main driver for the data processor. For each file, a new process is created that
72 |     performs data processing in parallel.
73 |     """
74 |     with Pool() as pool:
75 |         results = []
76 |         for i, filename in enumerate(FILE_IDENTIFIERS):
77 |             results.append(pool.apply_async(process_file, args=(filename, i)))
78 |         # Wait for the processes to finish before exiting the main python program
79 |         for res in results:
80 |             res.wait()
81 |     print("Process complete")
82 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Open Library database
  2 | 
  3 | Open Library is an online library of bibliographic data. The library publishes [full data dumps](https://openlibrary.org/developers/dumps) of all authors, works, and editions.
  4 | 
  5 | This project provides instructions and scripts for importing this data into a PostgreSQL database, and some sample queries to test the database.
  6 | 
  7 | The database is primarily aimed at querying the database using ISBN and includes tables specifically for these identifiers. It could be extended to change this to other identifiers, such as Open Library ID, or to query by title or author.
  8 | 
  9 | ## Getting started
 10 | 
 11 | The following steps should get you up and running with a working database.
 12 | 
 13 | 1. Install the [required prerequisites](#prerequisites) so that you have a database server.
 14 | 2. [Download the data](#downloading-the-data) from Open Library.
 15 | 3. Run the [processing the data](#processing-the-data) scripts to make it easier to import.
 16 | 4. [Import the data](#import-into-database) into the database.
 17 | 
 18 | ## Prerequisites
 19 | 
 20 | - [Python 3](https://www.python.org/downloads/) - Tested with 3.10
 21 | - [PostgreSQL](https://www.postgresql.org/) - Version 15 is tested but all recent versions should work
 22 | - Disk space - The data files are large, and the uncompressed editions file is 45GB. You will need at least 250GB of free space to import all the data.
 23 | 
 24 | ## Downloading the data
 25 | 
 26 | Open Library offer bulk downloads on their website, available from the [data dumps page](https://openlibrary.org/developers/dumps).
 27 | 
 28 | These are updated every month. The downloads available include (compressed sizes shown):
 29 | 
 30 | - Editions (~10.5GB)
 31 | - Works (~3.5GB)
 32 | - Authors (~0.5GB)
 33 | - All types (~10GB)
 34 | 
 35 | Download the Editions, Works, and Authors data dumps.
 36 | 
 37 | ```console
 38 | wget https://openlibrary.org/data/ol_dump_editions_latest.txt.gz -P ~/downloads
 39 | wget https://openlibrary.org/data/ol_dump_works_latest.txt.gz -P ~/downloads
 40 | wget https://openlibrary.org/data/ol_dump_authors_latest.txt.gz -P ~/downloads
 41 | ```
 42 | 
 43 | Move the data from your downloads folder.
 44 | 
 45 | ```console
 46 | mv ~/downloads/ol_dump_authors_*txt.gz ./data/unprocessed/ol_dump_authors.txt.gz
 47 | mv ~/downloads/ol_dump_works_*txt.gz ./data/unprocessed/ol_dump_works.txt.gz
 48 | mv ~/downloads/ol_dump_editions_*txt.gz ./data/unprocessed/ol_dump_editions.txt.gz
 49 | ```
 50 | 
 51 | Then uncompress the data files.
 52 | 
 53 | ```console
 54 | gzip -d -c data/unprocessed/ol_dump_editions.txt.gz > data/unprocessed/ol_dump_editions.txt
 55 | gzip -d -c data/unprocessed/ol_dump_works.txt.gz > data/unprocessed/ol_dump_works.txt
 56 | gzip -d -c data/unprocessed/ol_dump_authors.txt.gz > data/unprocessed/ol_dump_authors.txt
 57 | ```
 58 | 
 59 | ### Processing the data
 60 | 
 61 | Unfortunately the downloads provided don't play nicely for direct importing into PostgreSQL. The open library file errors on import as the number of columns provided varies. Cleaning it up is difficult as just the text file for editions is 25GB.
 62 | 
 63 | _To do note: Check if this is still the case and if so there could be some Linux tools to do this - maybe try `sed` and `awk`_
 64 | 
 65 | Instead, the file [openlibrary_data_process.py](openlibrary_data_process.py) reads in the text file and writes it out again for each row, but only where there are 5 columns.
 66 | 
 67 | Because the files are huge and are only going to grow (editions is now 45gb+) the `openlibrary_data_process.py` file also splits the data into smaller files to load sequentially. You can change the number of lines in each chunk by using the `LINES_PER_FILE` setting in the script. The default is 2 million.
 68 | 
 69 | Once the files are split you can delete the 3 .txt files in the uncompressed folder because you will need around 250 Gb of freespace to load all 3 files into the database without encountering lack of space errors. If you have plenty of space you can keep the files!
 70 | 
 71 | ```console
 72 | python openlibrary_data_process_chunked.py
 73 | ```
 74 | 
 75 | This generates multiple files into the `data/processed` directory.
 76 | 
 77 | One of those files will be used to access the rest of them when loading the data.
 78 | 
 79 | ### Import into database
 80 | 
 81 | We import the data directly into PostgreSQL tables and can then do complex searches with SQL.
 82 | 
 83 | There are a series of database scripts which will create the database and tables, and then import the data. These are in the [database](database) folder. The data files (created in the previous process) need to be within the `data/processed` folder for this to work.
 84 | 
 85 | The PostgreSQL database command line tool `psql` is used to run the scripts. The following command will create the database and tables. Replace `username` with your database server username.
 86 | 
 87 | ```console
 88 | psql --set=sslmode=require -f openlibrary-db.sql -h localhost -p 5432 -U username postgres
 89 | ```
 90 | 
 91 | ### Database details
 92 | 
 93 | The database is split into 5 main tables
 94 | 
 95 | | Data          | Description                                                     |
 96 | | :------------ | :-------------------------------------------------------------- |
 97 | | Authors       | Authors are the individuals who write the works                 |
 98 | | Works         | The works as created by the authors, with titles, and subtitles |
 99 | | Author Works  | A table linking the works with authors                          |
100 | | Editions      | The particular editions of the works, including ISBNs           |
101 | | Edition ISBNs | The ISBNs for the editions                                      |
102 | 
103 | ## Query the data
104 | 
105 | That's the database set up - it can now be queried using SQL.
106 | 
107 | Get details for a single item using the ISBN13 9781551922461 (Harry Potter and the Prisoner of Azkaban):
108 | 
109 | ```sql
110 | select
111 |     e.data->>'title' "EditionTitle",
112 |     w.data->>'title' "WorkTitle",
113 | 	a.data->>'name' "Name",
114 |     e.data->>'subtitle' "EditionSubtitle",
115 |     w.data->>'subtitle' "WorkSubtitle",
116 |     e.data->>'subjects' "Subjects",
117 |     e.data->'description'->>'value' "EditionDescription",
118 |     w.data->'description'->>'value' "WorkDescription",
119 |     e.data->'notes'->>'value' "EditionNotes",
120 |     w.data->'notes'->>'value' "WorkNotes"
121 | from editions e
122 | join edition_isbns ei
123 |     on ei.edition_key = e.key
124 | join works w
125 |     on w.key = e.work_key
126 | join author_works a_w
127 | 	on a_w.work_key = w.key
128 | join authors a
129 | 	on a_w.author_key = a.key
130 | where ei.isbn = '9781551922461'
131 | ```
132 | 


--------------------------------------------------------------------------------