├── .dockerignore ├── .github ├── FUNDING.yml └── workflows │ ├── build.yml │ └── release-pgxn.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── META.json ├── Makefile ├── README.md ├── pgpdf.c ├── pgpdf.control ├── sql └── pgpdf--0.1.0.sql └── test ├── expected └── pgpdf.out ├── pgintro.pdf └── sql └── pgpdf.sql /.dockerignore: -------------------------------------------------------------------------------- 1 | /.git/ 2 | /dist/ 3 | /results/ 4 | /tmp_check/ 5 | regression.* 6 | *.o 7 | *.so -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [Florents-Tselai] 2 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | on: [ push, pull_request ] 3 | jobs: 4 | ubuntu: 5 | runs-on: ${{ matrix.os }} 6 | strategy: 7 | fail-fast: false 8 | matrix: 9 | include: 10 | - postgres: 18 11 | os: ubuntu-24.04 12 | - postgres: 17 13 | os: ubuntu-24.04 14 | - postgres: 16 15 | os: ubuntu-24.04 16 | - postgres: 15 17 | os: ubuntu-22.04 18 | - postgres: 14 19 | os: ubuntu-22.04 20 | 21 | steps: 22 | - uses: actions/checkout@v4 23 | 24 | - run: | 25 | sudo apt update 26 | sudo apt install -y libpoppler-glib-dev pkg-config wget 27 | 28 | - uses: ankane/setup-postgres@v1 29 | with: 30 | postgres-version: ${{ matrix.postgres }} 31 | dev-files: true 32 | 33 | - run: make 34 | - run: sudo make install 35 | - run: make installcheck 36 | - if: ${{ failure() }} 37 | run: cat regression.diffs 38 | 39 | -------------------------------------------------------------------------------- /.github/workflows/release-pgxn.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | on: 3 | push: 4 | tags: [v*] 5 | jobs: 6 | release: 7 | name: Release on PGXN 
8 | runs-on: ubuntu-latest 9 | container: pgxn/pgxn-tools 10 | env: 11 | PGXN_USERNAME: ${{ secrets.PGXN_USERNAME }} 12 | PGXN_PASSWORD: ${{ secrets.PGXN_PASSWORD }} 13 | steps: 14 | - name: Check out the repo 15 | uses: actions/checkout@v4 16 | - name: Bundle the Release 17 | id: bundle 18 | run: pgxn-bundle 19 | - name: Release on PGXN 20 | run: pgxn-release 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .deps/ 2 | .idea/ 3 | *.o 4 | *.so 5 | build/ 6 | dist/ 7 | *.egg-info/ 8 | archive/ 9 | 10 | /*.png 11 | /results/ 12 | /pgdata/ 13 | /docs/ 14 | /.readthedocs.yaml 15 | /pgpdf.dylib 16 | /regression.diffs 17 | /regression.out 18 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PG_MAJOR=17 2 | FROM postgres:$PG_MAJOR 3 | ARG PG_MAJOR 4 | 5 | COPY . 
/tmp/pgpdf 6 | 7 | RUN apt-get update && \ 8 | apt-mark hold locales && \ 9 | apt-get install -y --no-install-recommends libpoppler-glib-dev pkg-config wget build-essential postgresql-server-dev-$PG_MAJOR && \ 10 | cd /tmp/pgpdf && \ 11 | make clean && \ 12 | make install && \ 13 | mkdir /usr/share/doc/pgpdf && \ 14 | cp LICENSE README.md /usr/share/doc/pgpdf && \ 15 | rm -r /tmp/pgpdf && \ 16 | apt-get remove -y pkg-config wget build-essential postgresql-server-dev-$PG_MAJOR && \ 17 | apt-get autoremove -y && \ 18 | apt-mark unhold locales && \ 19 | rm -rf /var/lib/apt/lists/* -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | <https://fsf.org/> 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. 
Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 
55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. 
You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. 
But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. 
(This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. 
These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 
214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. 
If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | <one line to give the program's name and a brief idea of what it does.> 294 | Copyright (C) <year> <name of author> 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, see <https://www.gnu.org/licenses/>. 
308 | 309 | Also add information on how to contact you by electronic and paper mail. 310 | 311 | If the program is interactive, make it output a short notice like this 312 | when it starts in an interactive mode: 313 | 314 | Gnomovision version 69, Copyright (C) year name of author 315 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 316 | This is free software, and you are welcome to redistribute it 317 | under certain conditions; type `show c' for details. 318 | 319 | The hypothetical commands `show w' and `show c' should show the appropriate 320 | parts of the General Public License. Of course, the commands you use may 321 | be called something other than `show w' and `show c'; they could even be 322 | mouse-clicks or menu items--whatever suits your program. 323 | 324 | You should also get your employer (if you work as a programmer) or your 325 | school, if any, to sign a "copyright disclaimer" for the program, if 326 | necessary. Here is a sample; alter the names: 327 | 328 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 329 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 330 | 331 | <signature of Moe Ghoul>, 1 April 1989 332 | Moe Ghoul, President of Vice 333 | 334 | This General Public License does not permit incorporating your program into 335 | proprietary programs. If your program is a subroutine library, you may 336 | consider it more useful to permit linking proprietary applications with the 337 | library. If this is what you want to do, use the GNU Lesser General 338 | Public License instead of this License. 
339 | -------------------------------------------------------------------------------- /META.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pgpdf", 3 | "abstract": "PDF Type for Postgres", 4 | "version": "0.1.0", 5 | "maintainer": [ 6 | "Florents Tselai " 7 | ], 8 | "license": "gpl_2", 9 | "generated_by": "Florents Tselai", 10 | "release_status": "stable", 11 | "prereqs": { 12 | "runtime": { 13 | "requires": { 14 | "PostgreSQL": "14.0.0" 15 | } 16 | } 17 | }, 18 | "provides": { 19 | "pgpdf": { 20 | "file": "sql/pgpdf--0.1.0.sql", 21 | "docfile": "README.md", 22 | "version": "0.1.0", 23 | "abstract": "PDF Type for Postgres" 24 | } 25 | }, 26 | "resources": { 27 | "bugtracker": { 28 | "web": "https://github.com/Florents-Tselai/pgpdf/issues" 29 | }, 30 | "repository": { 31 | "url": "git://github.com/Florents-Tselai/pgpdf.git" , 32 | "web": "https://github.com/Florents-Tselai/pgpdf", 33 | "type": "git" 34 | } 35 | }, 36 | "meta-spec": { 37 | "version": "1.0.0", 38 | "url": "http://pgxn.org/meta/spec.txt" 39 | }, 40 | "tags": [ 41 | "pdf", 42 | "documents" 43 | ] 44 | } 45 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PG_CONFIG = pg_config 2 | PKG_CONFIG = pkg-config 3 | 4 | EXTENSION = pgpdf 5 | EXTVERSION = 0.1.0 6 | MODULE_big = $(EXTENSION) 7 | 8 | OBJS = pgpdf.o 9 | 10 | DATA = $(wildcard sql/*--*.sql) 11 | 12 | PG_CPPFLAGS = $(shell $(PKG_CONFIG) --cflags poppler poppler-glib) 13 | PG_LDFLAGS = $(shell $(PKG_CONFIG) --libs poppler poppler-glib) 14 | SHLIB_LINK =-lpoppler -lpoppler-glib 15 | 16 | TESTS = $(wildcard test/sql/*.sql) 17 | REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS)) 18 | REGRESS_OPTS = --inputdir=test --load-extension=$(EXTENSION) 19 | 20 | TEST_FILES = /tmp/pgintro.pdf /tmp/bad.pdf /tmp/big.pdf 21 | /tmp/pgintro.pdf: 22 | cp test/pgintro.pdf $@ 23 | 
/tmp/bad.pdf: 24 | echo 'not a pdf' >> $@ 25 | /tmp/big.pdf: 26 | wget https://www.postgresql.org/files/documentation/pdf/10/postgresql-10-A4.pdf -O $@ 27 | 28 | installcheck: $(TEST_FILES) 29 | 30 | EXTRA_CLEAN = $(TEST_FILES) 31 | 32 | PGXS := $(shell $(PG_CONFIG) --pgxs) 33 | include $(PGXS) 34 | 35 | dev: clean all install installcheck 36 | 37 | .PHONY: dist 38 | 39 | dist: 40 | mkdir -p dist 41 | git archive --format zip --prefix=$(EXTENSION)-$(EXTVERSION)/ --output dist/$(EXTENSION)-$(EXTVERSION).zip main 42 | 43 | # for Docker 44 | PG_MAJOR ?= 17 45 | 46 | .PHONY: docker-build 47 | 48 | docker-build: 49 | docker build --pull --no-cache --build-arg PG_MAJOR=$(PG_MAJOR) -t florents/pgpdf:pg$(PG_MAJOR) -t florents/pgpdf:$(EXTVERSION)-pg$(PG_MAJOR) . 50 | 51 | .PHONY: docker-release 52 | 53 | docker-release: 54 | docker buildx build --push --pull --no-cache --platform linux/amd64,linux/arm64 --build-arg PG_MAJOR=$(PG_MAJOR) -t florents/pgpdf:pg$(PG_MAJOR) -t florents/pgpdf:$(EXTVERSION)-pg$(PG_MAJOR) . -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pgPDF: `pdf` type for Postgres 2 | 3 | [![build](https://github.com/Florents-Tselai/pgpdf/actions/workflows/build.yml/badge.svg)](https://github.com/Florents-Tselai/pgpdf/actions/workflows/build.yml) 4 | ![GitHub Repo stars](https://img.shields.io/github/stars/Florents-Tselai/pgpdf) 5 | Docker Pulls 6 | 7 | This extension for PostgreSQL provides a `pdf` data type and assorted functions. 8 | 9 | You can create a `pdf` type, by casting either a `text` filepath or `bytea` column. 
10 | 11 | ```tsql 12 | SELECT '/tmp/pgintro.pdf'::pdf; 13 | ``` 14 | 15 | ```tsql 16 | pdf 17 | ---------------------------------------------------------------------------------- 18 | PostgreSQL Introduction + 19 | Digoal.Zhou + 20 | 7/20/2011Catalog + 21 |  PostgreSQL Origin 22 | ``` 23 | 24 | If you don’t have the PDF file in your filesystem, 25 | but have already stored its content in a `bytea` column, 26 | you can just cast it to `pdf`. 27 | 28 | ```tsql 29 | SELECT pg_read_binary_file('/tmp/pgintro.pdf')::bytea::pdf; 30 | ``` 31 | 32 | **Why?**: 33 | This allows you to work with PDFs in an ACID-compliant way. 34 | The usual alternative relies on external scripts or services which can easily 35 | make your data ingestion pipeline brittle and leave your raw data out-of-sync. 36 | 37 | The actual PDF parsing is done by [poppler](https://poppler.freedesktop.org). 38 | 39 | Also check the blog posts: 40 | - [Full Text Search on PDFs With Postgres](https://tselai.com/full-text-search-pdf-postgres) 41 | - [pgpdf: pdf type for Postgres](https://tselai.com/pgpdf-pdf-type-postgres) 42 | 43 | ## Usage 44 | 45 | Download some PDFs. 46 | 47 | ```sh 48 | wget https://wiki.postgresql.org/images/e/ea/PostgreSQL_Introduction.pdf -O /tmp/pgintro.pdf 49 | wget https://pdfobject.com/pdf/sample.pdf -O /tmp/sample.pdf 50 | ``` 51 | 52 | Create a table with a `pdf` column: 53 | 54 | ```tsql 55 | CREATE TABLE pdfs(name text primary key, doc pdf); 56 | 57 | INSERT INTO pdfs VALUES ('pgintro', '/tmp/pgintro.pdf'); 58 | INSERT INTO pdfs VALUES ('sample', '/tmp/sample.pdf'); 59 | ``` 60 | 61 | Parsing and validation should happen automatically. 62 | The files will be read from the disk only once! 63 | 64 | > [!NOTE] 65 | > The filepath should be accessible by the `postgres` process / user! 66 | > That's different than the user running psql. 67 | > If you don't understand what this means, ask your DBA! 
68 | 69 | ### Reading from URLs 70 | 71 | You can combine pgpdf with [pgsql-http](https://github.com/pramsey/pgsql-http) 72 | to quickly grab remote PDFs into Postgres, 73 | by fetching the remote content as `bytea` and then treating it as a PDF. 74 | 75 | ```tsql 76 | CREATE EXTENSION pgpdf; 77 | CREATE EXTENSION http; 78 | 79 | SELECT pdf_read_bytes(text_to_bytea(content)) 80 | FROM http_get('https://wiki.postgresql.org/images/e/e3/Hooks_in_postgresql.pdf'); 81 | ``` 82 | 83 | ### String Functions and Operators 84 | 85 | Standard Postgres [String Functions and Operators](https://www.postgresql.org/docs/17/functions-string.html) 86 | should work as usual: 87 | 88 | ```tsql 89 | SELECT 'Below is the PDF we received ' || '/tmp/pgintro.pdf'::pdf; 90 | ``` 91 | 92 | ```tsql 93 | SELECT upper('/tmp/pgintro.pdf'::pdf::text); 94 | ``` 95 | 96 | ```tsql 97 | SELECT name 98 | FROM pdfs 99 | WHERE doc::text LIKE '%Postgres%'; 100 | ``` 101 | 102 | ### Full-Text Search (FTS) 103 | 104 | You can also perform full-text search (FTS), since you can work on a `pdf` file like normal text. 105 | 106 | ```tsql 107 | SELECT '/tmp/pgintro.pdf'::pdf::text @@ to_tsquery('postgres'); 108 | ``` 109 | 110 | ```tsql 111 | ?column? 112 | ---------- 113 | t 114 | (1 row) 115 | ``` 116 | 117 | ```tsql 118 | SELECT '/tmp/pgintro.pdf'::pdf::text @@ to_tsquery('oracle'); 119 | ``` 120 | 121 | ```tsql 122 | ?column? 
123 | ---------- 124 | f 125 | (1 row) 126 | ``` 127 | 128 | ### Document similarity with `pg_trgm` 129 | 130 | You can use [pg_trgm](https://postgresql.org/docs/17/interactive/pgtrgm.html) 131 | to get the similarity between two documents: 132 | 133 | ```tsql 134 | CREATE EXTENSION pg_trgm; 135 | 136 | SELECT similarity('/tmp/pgintro.pdf'::pdf::text, '/tmp/sample.pdf'::pdf::text); 137 | ``` 138 | 139 | ### Metadata 140 | 141 | The following functions are available: 142 | 143 | - `pdf_title(pdf) → text` 144 | - `pdf_author(pdf) → text` 145 | - `pdf_num_pages(pdf) → integer` 146 | 147 | Total number of pages in the document 148 | - `pdf_page(pdf, integer) → text` 149 | 150 | Get the i-th page as text 151 | - `pdf_creator(pdf) → text` 152 | - `pdf_keywords(pdf) → text` 153 | - `pdf_metadata(pdf) → text` 154 | - `pdf_version(pdf) → text` 155 | - `pdf_subject(pdf) → text` 156 | - `pdf_creation(pdf) → timestamp` 157 | - `pdf_modification(pdf) → timestamp` 158 | 159 | ```tsql 160 | SELECT pdf_title('/tmp/pgintro.pdf'); 161 | ``` 162 | 163 | ```tsql 164 | pdf_title 165 | ------------------------- 166 | PostgreSQL Introduction 167 | (1 row) 168 | ``` 169 | 170 | ```tsql 171 | SELECT pdf_author('/tmp/pgintro.pdf'); 172 | ``` 173 | 174 | ```tsql 175 | pdf_author 176 | ------------ 177 | 周正中 178 | (1 row) 179 | ``` 180 | 181 | Getting a subset of pages 182 | 183 | ```tsql 184 | SELECT pdf_num_pages('/tmp/pgintro.pdf'); 185 | ``` 186 | 187 | ```tsql 188 | pdf_num_pages 189 | --------------- 190 | 24 191 | (1 row) 192 | ``` 193 | 194 | ```tsql 195 | SELECT pdf_page('/tmp/pgintro.pdf', 1); 196 | ``` 197 | 198 | ```tsql 199 | pdf_page 200 | ------------------------------ 201 | Catalog + 202 |  PostgreSQL Origin + 203 |  Layout + 204 |  Features + 205 |  Enterprise Class Attribute+ 206 |  Case 207 | (1 row) 208 | ``` 209 | 210 | ```tsql 211 | SELECT pdf_subject('/tmp/pgintro.pdf'); 212 | ``` 213 | 214 | ```tsql 215 | pdf_subject 216 | ------------- 217 | 218 | (1 row) 219 | 
``` 220 | 221 | ```tsql 222 | SELECT pdf_creation('/tmp/pgintro.pdf'); 223 | ``` 224 | 225 | ```tsql 226 | pdf_creation 227 | -------------------------- 228 | Wed Jul 20 11:13:37 2011 229 | (1 row) 230 | ``` 231 | 232 | ```tsql 233 | SELECT pdf_modification('/tmp/pgintro.pdf'); 234 | ``` 235 | 236 | ```tsql 237 | pdf_modification 238 | -------------------------- 239 | Wed Jul 20 11:13:37 2011 240 | (1 row) 241 | ``` 242 | 243 | ```tsql 244 | SELECT pdf_creator('/tmp/pgintro.pdf'); 245 | ``` 246 | 247 | ```tsql 248 | pdf_creator 249 | ------------------------------------ 250 | Microsoft® Office PowerPoint® 2007 251 | (1 row) 252 | ``` 253 | 254 | ```tsql 255 | SELECT pdf_metadata('/tmp/pgintro.pdf'); 256 | ``` 257 | 258 | ```tsql 259 | pdf_metadata 260 | -------------- 261 | 262 | (1 row) 263 | ``` 264 | 265 | ```tsql 266 | SELECT pdf_version('/tmp/pgintro.pdf'); 267 | ``` 268 | 269 | ```tsql 270 | pdf_version 271 | ------------- 272 | PDF-1.5 273 | (1 row) 274 | ``` 275 | 276 | ## Installation 277 | 278 | Install [poppler](https://poppler.freedesktop.org) dependencies 279 | 280 | **Linux** 281 | ``` 282 | sudo apt install -y libpoppler-glib-dev pkg-config 283 | ``` 284 | 285 | **Homebrew/MacOS** 286 | 287 | ``` 288 | brew install poppler pkgconf 289 | ``` 290 | 291 | ``` 292 | cd /tmp 293 | git clone https://github.com/Florents-Tselai/pgpdf.git 294 | cd pgpdf 295 | make 296 | make install # may need sudo 297 | ``` 298 | 299 | After the installation, in a session: 300 | 301 | ```tsql 302 | CREATE EXTENSION pgpdf; 303 | ``` 304 | 305 | ### Docker 306 | 307 | Get the [Docker image](https://hub.docker.com/r/florents/pgpdf) with: 308 | 309 | ```sh 310 | docker pull florents/pgpdf:pg17 311 | ``` 312 | 313 | This adds pgpdf to the [Postgres image](https://hub.docker.com/_/postgres) (replace `17` with your Postgres server version, and run it the same way). 314 | 315 | Run the image in a container. 
316 | 317 | ```sh 318 | docker run --name pgpdf -p 5432:5432 -e POSTGRES_PASSWORD=pass florents/pgpdf:pg17 319 | ``` 320 | 321 | Through another terminal, connect to the running server (container). 322 | 323 | ```sh 324 | PGPASSWORD=pass psql -h localhost -p 5432 -U postgres 325 | ``` 326 | 327 | > [!WARNING] 328 | > Reading arbitrary binary data (PDF) into your database can pose security risks. 329 | > Only use this for files you trust. 330 | 331 | -------------------------------------------------------------------------------- /pgpdf.c: -------------------------------------------------------------------------------- 1 | #include "postgres.h" 2 | 3 | #include "fmgr.h" 4 | #include "utils/builtins.h" 5 | #include "utils/jsonb.h" 6 | #include "utils/datetime.h" 7 | #include "utils/date.h" 8 | 9 | #include "poppler.h" 10 | #include 11 | #include 12 | #include 13 | 14 | #if PG_VERSION_NUM >= 160000 15 | #include 16 | #include "varatt.h" 17 | 18 | #endif 19 | 20 | #if PG_VERSION_NUM < 130000 21 | #define TYPALIGN_DOUBLE 'd' 22 | #define TYPALIGN_INT 'i' 23 | #endif 24 | 25 | PG_MODULE_MAGIC; 26 | 27 | typedef struct varlena pdftype; 28 | 29 | #define DatumGetPdfP(X) ((pdftype *) PG_DETOAST_DATUM(X)) 30 | #define DatumGetPdfPP(X) ((pdftype *) PG_DETOAST_DATUM_PACKED(X)) 31 | #define PdfPGetDatum(X) PointerGetDatum(X) 32 | 33 | #define PG_GETARG_PDF_P(n) DatumGetPdfP(PG_GETARG_DATUM(n)) 34 | #define PG_GETARG_PDF_PP(n) DatumGetPdfPP(PG_GETARG_DATUM(n)) 35 | #define PG_RETURN_PDF_P(x) PG_RETURN_POINTER(x) 36 | 37 | #define PG_GETARG_POPPLER_DOCUMENT(X) ({ \ 38 | pdftype* pdf = PG_GETARG_PDF_P(X); \ 39 | GError* error = NULL; \ 40 | GBytes* pdf_data = g_bytes_new(VARDATA(pdf), VARSIZE_ANY_EXHDR(pdf)); \ 41 | PopplerDocument* doc = poppler_document_new_from_bytes(pdf_data, NULL, &error); \ 42 | g_bytes_unref(pdf_data); \ 43 | if (!doc) { \ 44 | elog(ERROR, "Error parsing PDF document: %s", error->message); \ 45 | g_clear_error(&error); \ 46 | } \ 47 | doc; \ 48 | }) 49 | 50 
| PG_FUNCTION_INFO_V1(pdf_in); 51 | 52 | Datum 53 | pdf_in(PG_FUNCTION_ARGS) 54 | { 55 | Datum filename_t = CStringGetTextDatum(PG_GETARG_CSTRING(0)); 56 | Datum pdf_bytes; 57 | int32 pdf_bytes_len; 58 | 59 | pdftype* result; 60 | GBytes* g_bytes = NULL; 61 | PopplerDocument* doc = NULL; 62 | GError* error = NULL; 63 | 64 | pdf_bytes = DirectFunctionCall1(pg_read_binary_file_all, filename_t); 65 | pdf_bytes_len = VARSIZE_ANY_EXHDR(pdf_bytes); 66 | 67 | result = (pdftype*)palloc(VARHDRSZ + pdf_bytes_len); 68 | SET_VARSIZE(result, VARHDRSZ + pdf_bytes_len); 69 | 70 | memcpy(VARDATA(result), VARDATA_ANY(pdf_bytes), pdf_bytes_len); 71 | 72 | g_bytes = g_bytes_new(VARDATA(result), pdf_bytes_len); 73 | 74 | doc = poppler_document_new_from_bytes(g_bytes, NULL, &error); 75 | g_bytes_unref(g_bytes); 76 | 77 | if (!doc) 78 | { 79 | elog(ERROR, "Error parsing PDF document: %s", error->message); 80 | pfree(result); 81 | g_clear_error(&error); 82 | PG_RETURN_NULL(); 83 | } 84 | 85 | g_object_unref(doc); 86 | 87 | PG_RETURN_POINTER(result); 88 | } 89 | 90 | PG_FUNCTION_INFO_V1(pdf_out); 91 | 92 | Datum 93 | pdf_out(PG_FUNCTION_ARGS) 94 | { 95 | PopplerDocument* doc = PG_GETARG_POPPLER_DOCUMENT(0); 96 | StringInfo strinfo = makeStringInfo(); 97 | 98 | int num_pages = poppler_document_get_n_pages(doc); 99 | for (int i = 0; i < num_pages; i++) 100 | { 101 | PopplerPage* page = poppler_document_get_page(doc, i); 102 | if (!page) 103 | { 104 | elog(WARNING, "Failed to get page %d\n", i); 105 | continue; 106 | } 107 | 108 | gchar* page_text = poppler_page_get_text(page); 109 | if (page_text) 110 | { 111 | appendStringInfo(strinfo, "%s", page_text); 112 | g_free(page_text); 113 | } 114 | else 115 | { 116 | elog(WARNING, "Failed to extract text from page %d\n", i); 117 | } 118 | 119 | g_object_unref(page); 120 | } 121 | 122 | PG_RETURN_CSTRING(strinfo->data); 123 | } 124 | 125 | 126 | PG_FUNCTION_INFO_V1(pdf_title); 127 | Datum 128 | pdf_title(PG_FUNCTION_ARGS) 129 | { 130 | 
PopplerDocument *doc = PG_GETARG_POPPLER_DOCUMENT(0); 131 | const char *title = poppler_document_get_title(doc); 132 | 133 | if (title == NULL) 134 | PG_RETURN_NULL(); 135 | 136 | PG_RETURN_TEXT_P(cstring_to_text(title)); 137 | } 138 | 139 | PG_FUNCTION_INFO_V1(pdf_author); 140 | Datum 141 | pdf_author(PG_FUNCTION_ARGS) 142 | { 143 | PopplerDocument *doc = PG_GETARG_POPPLER_DOCUMENT(0); 144 | const char *author = poppler_document_get_author(doc); 145 | 146 | if (author == NULL) 147 | PG_RETURN_NULL(); 148 | 149 | PG_RETURN_TEXT_P(cstring_to_text(author)); 150 | } 151 | 152 | PG_FUNCTION_INFO_V1(pdf_creator); 153 | Datum 154 | pdf_creator(PG_FUNCTION_ARGS) 155 | { 156 | PopplerDocument *doc = PG_GETARG_POPPLER_DOCUMENT(0); 157 | const char *creator = poppler_document_get_creator(doc); 158 | 159 | if (creator == NULL) 160 | PG_RETURN_NULL(); 161 | 162 | PG_RETURN_TEXT_P(cstring_to_text(creator)); 163 | } 164 | 165 | PG_FUNCTION_INFO_V1(pdf_keywords); 166 | Datum 167 | pdf_keywords(PG_FUNCTION_ARGS) 168 | { 169 | PopplerDocument *doc = PG_GETARG_POPPLER_DOCUMENT(0); 170 | const char *keywords = poppler_document_get_keywords(doc); 171 | 172 | if (keywords == NULL) 173 | PG_RETURN_NULL(); 174 | 175 | PG_RETURN_TEXT_P(cstring_to_text(keywords)); 176 | } 177 | 178 | PG_FUNCTION_INFO_V1(pdf_metadata); 179 | Datum 180 | pdf_metadata(PG_FUNCTION_ARGS) 181 | { 182 | PopplerDocument *doc = PG_GETARG_POPPLER_DOCUMENT(0); 183 | const char *metadata = poppler_document_get_metadata(doc); 184 | 185 | if (metadata == NULL) 186 | PG_RETURN_NULL(); 187 | 188 | PG_RETURN_TEXT_P(cstring_to_text(metadata)); 189 | } 190 | 191 | PG_FUNCTION_INFO_V1(pdf_version); 192 | Datum 193 | pdf_version(PG_FUNCTION_ARGS) 194 | { 195 | PopplerDocument *doc = PG_GETARG_POPPLER_DOCUMENT(0); 196 | const char *version = poppler_document_get_pdf_version_string(doc); 197 | 198 | if (version == NULL) 199 | PG_RETURN_NULL(); 200 | 201 | PG_RETURN_TEXT_P(cstring_to_text(version)); 202 | } 203 | 204 | 
PG_FUNCTION_INFO_V1(pdf_subject); 205 | Datum 206 | pdf_subject(PG_FUNCTION_ARGS) 207 | { 208 | PopplerDocument *doc = PG_GETARG_POPPLER_DOCUMENT(0); 209 | const char *subject = poppler_document_get_subject(doc); 210 | 211 | if (subject == NULL) 212 | PG_RETURN_NULL(); 213 | 214 | PG_RETURN_TEXT_P(cstring_to_text(subject)); 215 | } 216 | 217 | PG_FUNCTION_INFO_V1(pdf_num_pages); 218 | 219 | Datum 220 | pdf_num_pages(PG_FUNCTION_ARGS) 221 | { 222 | PopplerDocument* doc = PG_GETARG_POPPLER_DOCUMENT(0); 223 | PG_RETURN_INT32(poppler_document_get_n_pages(doc)); 224 | } 225 | 226 | 227 | PG_FUNCTION_INFO_V1(pdf_page); 228 | 229 | Datum 230 | pdf_page(PG_FUNCTION_ARGS) 231 | { 232 | PopplerDocument* doc = PG_GETARG_POPPLER_DOCUMENT(0); 233 | int32 i = PG_GETARG_INT32(1); 234 | PopplerPage* page = poppler_document_get_page(doc, i); 235 | PG_RETURN_TEXT_P(cstring_to_text(poppler_page_get_text(page))); 236 | } 237 | 238 | PG_FUNCTION_INFO_V1(pdf_creation); 239 | 240 | Datum 241 | pdf_creation(PG_FUNCTION_ARGS) 242 | { 243 | PopplerDocument *doc = PG_GETARG_POPPLER_DOCUMENT(0); 244 | GDateTime *dt = poppler_document_get_creation_date_time(doc); 245 | 246 | if (dt == NULL) 247 | PG_RETURN_NULL(); 248 | 249 | gint year = g_date_time_get_year(dt); 250 | gint month = g_date_time_get_month(dt); 251 | gint day = g_date_time_get_day_of_month(dt); 252 | gint hour = g_date_time_get_hour(dt); 253 | gint minute = g_date_time_get_minute(dt); 254 | gint second = g_date_time_get_second(dt); 255 | 256 | g_date_time_unref(dt); 257 | 258 | TimestampTz ts = DatumGetTimestamp(DirectFunctionCall6( 259 | make_timestamp, 260 | Int32GetDatum(year), 261 | Int32GetDatum(month), 262 | Int32GetDatum(day), 263 | Int32GetDatum(hour), 264 | Int32GetDatum(minute), 265 | Float8GetDatum((double)second) 266 | )); 267 | 268 | PG_RETURN_TIMESTAMPTZ(ts); 269 | } 270 | 271 | PG_FUNCTION_INFO_V1(pdf_modification); 272 | 273 | Datum 274 | pdf_modification(PG_FUNCTION_ARGS) 275 | { 276 | PopplerDocument *doc = 
PG_GETARG_POPPLER_DOCUMENT(0); 277 | GDateTime *dt = poppler_document_get_modification_date_time(doc); 278 | 279 | if (dt == NULL) 280 | PG_RETURN_NULL(); 281 | 282 | gint year = g_date_time_get_year(dt); 283 | gint month = g_date_time_get_month(dt); 284 | gint day = g_date_time_get_day_of_month(dt); 285 | gint hour = g_date_time_get_hour(dt); 286 | gint minute = g_date_time_get_minute(dt); 287 | gint second = g_date_time_get_second(dt); 288 | 289 | g_date_time_unref(dt); 290 | 291 | TimestampTz ts = DatumGetTimestamp(DirectFunctionCall6( 292 | make_timestamp, 293 | Int32GetDatum(year), 294 | Int32GetDatum(month), 295 | Int32GetDatum(day), 296 | Int32GetDatum(hour), 297 | Int32GetDatum(minute), 298 | Float8GetDatum((double)second) 299 | )); 300 | 301 | PG_RETURN_TIMESTAMPTZ(ts); 302 | } 303 | 304 | PG_FUNCTION_INFO_V1(pdf_from_bytea); 305 | 306 | Datum 307 | pdf_from_bytea(PG_FUNCTION_ARGS) 308 | { 309 | bytea* input_bytes = PG_GETARG_BYTEA_P(0); // Get the binary PDF data 310 | int32 bytes_len = VARSIZE_ANY_EXHDR(input_bytes); 311 | pdftype* result; 312 | 313 | result = (pdftype*)palloc(VARHDRSZ + bytes_len); 314 | SET_VARSIZE(result, VARHDRSZ + bytes_len); 315 | 316 | memcpy(VARDATA(result), VARDATA_ANY(input_bytes), bytes_len); 317 | 318 | PG_RETURN_POINTER(result); 319 | } 320 | 321 | 322 | PG_FUNCTION_INFO_V1(pdf_to_bytea); 323 | 324 | Datum 325 | pdf_to_bytea(PG_FUNCTION_ARGS) 326 | { 327 | pdftype *pdf = PG_GETARG_PDF_P(0); 328 | bytea *result; 329 | 330 | int32 data_len = VARSIZE_ANY_EXHDR(pdf); 331 | 332 | result = (bytea *) palloc(VARHDRSZ + data_len); 333 | SET_VARSIZE(result, VARHDRSZ + data_len); 334 | 335 | memcpy(VARDATA(result), VARDATA(pdf), data_len); 336 | 337 | PG_RETURN_BYTEA_P(result); 338 | } 339 | 340 | PG_FUNCTION_INFO_V1(bytea_to_pdf); 341 | 342 | 343 | Datum 344 | bytea_to_pdf(PG_FUNCTION_ARGS) 345 | { 346 | bytea *bytes = PG_GETARG_BYTEA_PP(0); 347 | pdftype *result; 348 | GError *error = NULL; 349 | 350 | GBytes *pdf_data = 
g_bytes_new(VARDATA_ANY(bytes), VARSIZE_ANY_EXHDR(bytes)); 351 | 352 | PopplerDocument *doc = poppler_document_new_from_bytes(pdf_data, NULL, &error); 353 | if (!doc) { 354 | g_bytes_unref(pdf_data); 355 | elog(ERROR, "Error parsing PDF document: %s", error->message); 356 | } 357 | 358 | result = (pdftype*) palloc(VARHDRSZ + VARSIZE_ANY_EXHDR(bytes)); 359 | SET_VARSIZE(result, VARHDRSZ + VARSIZE_ANY_EXHDR(bytes)); 360 | 361 | memcpy(VARDATA(result), g_bytes_get_data(pdf_data, NULL), VARSIZE_ANY_EXHDR(bytes)); 362 | 363 | g_object_unref(doc); 364 | g_bytes_unref(pdf_data); 365 | 366 | PG_RETURN_POINTER(result); 367 | } 368 | -------------------------------------------------------------------------------- /pgpdf.control: -------------------------------------------------------------------------------- 1 | default_version = '0.1.0' 2 | module_pathname = '$libdir/pgpdf' 3 | comment = 'pdf type' 4 | relocatable = true -------------------------------------------------------------------------------- /sql/pgpdf--0.1.0.sql: -------------------------------------------------------------------------------- 1 | CREATE TYPE pdf; 2 | 3 | CREATE FUNCTION pdf_in(cstring) RETURNS pdf 4 | IMMUTABLE 5 | STRICT 6 | LANGUAGE C 7 | AS 8 | 'MODULE_PATHNAME'; 9 | 10 | CREATE FUNCTION pdf_out(pdf) RETURNS cstring 11 | IMMUTABLE 12 | STRICT 13 | LANGUAGE C 14 | AS 15 | 'MODULE_PATHNAME'; 16 | 17 | CREATE TYPE pdf 18 | ( 19 | INTERNALLENGTH = -1, 20 | INPUT = pdf_in, 21 | OUTPUT = pdf_out, 22 | STORAGE = extended 23 | ); 24 | 25 | CREATE CAST (pdf AS text) WITH INOUT AS ASSIGNMENT; 26 | CREATE CAST (text AS pdf) WITH INOUT AS ASSIGNMENT; 27 | 28 | 29 | /* API */ 30 | 31 | CREATE FUNCTION pdf_title(pdf) RETURNS TEXT 32 | IMMUTABLE 33 | STRICT 34 | LANGUAGE C 35 | AS 36 | 'MODULE_PATHNAME'; 37 | 38 | CREATE FUNCTION pdf_author(pdf) RETURNS TEXT 39 | IMMUTABLE 40 | STRICT 41 | LANGUAGE C 42 | AS 43 | 'MODULE_PATHNAME'; 44 | 45 | CREATE FUNCTION pdf_num_pages(pdf) RETURNS INTEGER 46 | IMMUTABLE 47 
| STRICT 48 | LANGUAGE C 49 | AS 50 | 'MODULE_PATHNAME'; 51 | 52 | CREATE FUNCTION pdf_page(pdf, integer) RETURNS TEXT 53 | IMMUTABLE 54 | STRICT 55 | LANGUAGE C 56 | AS 57 | 'MODULE_PATHNAME'; 58 | 59 | CREATE FUNCTION pdf_creator(pdf) RETURNS TEXT 60 | IMMUTABLE 61 | STRICT 62 | LANGUAGE C 63 | AS 64 | 'MODULE_PATHNAME'; 65 | 66 | CREATE FUNCTION pdf_keywords(pdf) RETURNS TEXT 67 | IMMUTABLE 68 | STRICT 69 | LANGUAGE C 70 | AS 71 | 'MODULE_PATHNAME'; 72 | 73 | CREATE FUNCTION pdf_metadata(pdf) RETURNS TEXT 74 | IMMUTABLE 75 | STRICT 76 | LANGUAGE C 77 | AS 78 | 'MODULE_PATHNAME'; 79 | 80 | CREATE FUNCTION pdf_version(pdf) RETURNS TEXT 81 | IMMUTABLE 82 | STRICT 83 | LANGUAGE C 84 | AS 85 | 'MODULE_PATHNAME'; 86 | 87 | CREATE FUNCTION pdf_subject(pdf) RETURNS TEXT 88 | IMMUTABLE 89 | STRICT 90 | LANGUAGE C 91 | AS 92 | 'MODULE_PATHNAME'; 93 | 94 | CREATE FUNCTION pdf_creation(pdf) RETURNS TIMESTAMP 95 | IMMUTABLE 96 | STRICT 97 | LANGUAGE C 98 | AS 99 | 'MODULE_PATHNAME'; 100 | 101 | CREATE FUNCTION pdf_modification(pdf) RETURNS TIMESTAMP 102 | IMMUTABLE 103 | STRICT 104 | LANGUAGE C 105 | AS 106 | 'MODULE_PATHNAME'; 107 | 108 | 109 | CREATE FUNCTION bytea_to_pdf(bytea) RETURNS pdf 110 | LANGUAGE C 111 | IMMUTABLE 112 | STRICT 113 | AS 114 | 'MODULE_PATHNAME'; 115 | 116 | 117 | CREATE FUNCTION pdf_to_bytea(pdf) RETURNS bytea 118 | LANGUAGE C 119 | IMMUTABLE 120 | STRICT 121 | AS 122 | 'MODULE_PATHNAME'; 123 | 124 | CREATE CAST (bytea AS pdf) WITH FUNCTION bytea_to_pdf(bytea) AS ASSIGNMENT; 125 | CREATE CAST (pdf AS bytea) WITH FUNCTION pdf_to_bytea(pdf) AS ASSIGNMENT; 126 | 127 | -------------------- 128 | 129 | CREATE FUNCTION pdf_read_file(text) RETURNS text 130 | LANGUAGE SQL 131 | IMMUTABLE 132 | STRICT 133 | AS 'SELECT $1::pdf::text'; 134 | 135 | 136 | CREATE FUNCTION pdf_read_bytes(bytea) RETURNS text 137 | LANGUAGE SQL 138 | IMMUTABLE 139 | STRICT 140 | AS 'SELECT $1::bytea::pdf::text'; 141 | 142 | 143 | 
-------------------------------------------------------------------------------- /test/expected/pgpdf.out: -------------------------------------------------------------------------------- 1 | /* Errors */ 2 | SELECT 'notexists.pdf'::pdf; 3 | ERROR: could not open file "notexists.pdf" for reading: No such file or directory 4 | LINE 2: SELECT 'notexists.pdf'::pdf; 5 | ^ 6 | SELECT '/tmp/bad.pdf'::pdf; 7 | ERROR: Error parsing PDF document: PDF document is damaged 8 | LINE 1: SELECT '/tmp/bad.pdf'::pdf; 9 | ^ 10 | /* OK */ 11 | SELECT '/tmp/pgintro.pdf'::pdf; 12 | pdf 13 | ---------------------------------------------------------------------------------- 14 | PostgreSQL Introduction + 15 | Digoal.Zhou + 16 | 7/20/2011Catalog + 17 |  PostgreSQL Origin + 18 |  Layout + 19 |  Features + 20 |  Enterprise Class Attribute + 21 |  CaseOrigin + 22 | Extract From Wiki + 23 | 1973 + 24 | Postgres95 + 25 | 1995 + 26 | POSTGRES 1985 + 27 | 1996 + 28 | OLTP + 29 | Michael Stonebraker + 30 | H-Store + 31 | C-Store + 32 | DW + 33 | DWPortion ContributersLogical Layout + 34 | InstanceCluster + 35 | DatabaseDatabase(s) + 36 | SchemaSchema(s) + 37 | Object + 38 | Field + 39 | Table(s) + 40 | Row(s) + 41 | Index(s) + 42 | Column + 43 | (s) + 44 | View(s) + 45 | Function(s) + 46 | Sequence(s) + 47 | Other(s)Process Introduction + 48 | Shared Memory Area + 49 | IPC + 50 | APP + 51 | Handshake + 52 | & + 53 | authentication + 54 | postmaster + 55 | fork + 56 | autovacuum launcher + 57 | autovacuum worker + 58 | Shared buffer + 59 | backend process + 60 | WAL buffer + 61 | WAL writer + 62 | XLOGs + 63 | Datafiles + 64 | Archiver + 65 | ARCH FILEs + 66 | bgwriterPotion Features + 67 | GEQO + 68 | WAL + 69 | Online + 70 | Backup + 71 | CBO + 72 | MVCC + 73 | PITR + 74 | Open Source + 75 | & Free + 76 | ACID + 77 | RDBMS + 78 | Stream + 79 | ReplicationPowerful Localization + 80 | Support + 81 |  Supported Character Sets + 82 |  http://www.postgresql.org/docs/9.1/static/multibyte.html + 
83 |  Support Database and Column level COLLATE + 84 |  Example : CREATE TABLE test1 ( a text COLLATE "de_DE", b + 85 | text COLLATE "es_ES", ... );Powerful Platform Support + 86 | X86 + 87 | X86_64 + 88 | IA64 + 89 | PowerPC + 90 | PowerPC 64 + 91 | S/390 + 92 | S/390x + 93 | Sparc + 94 | Sparc 64 + 95 | Alpha + 96 | ARM + 97 | MIPS + 98 | MIPSEL + 99 | M68K + 100 | PA-RISC + 101 | Linux + 102 | Windows + 103 | FreeBSD + 104 | OpenBSD + 105 | NetBSD + 106 | Mac OS X + 107 | AIX + 108 | HP/UX + 109 | IRIX + 110 | Solaris + 111 | Tru64 Unix + 112 | UnixWareRich Extensions + 113 | adminpackisnpgrowlocks + 114 | auto_explainlopgstattuple + 115 | btree_ginltreeseg + 116 | btree_gistoid2namesepgsql + 117 | chkpasspageinspectspi + 118 | citextpasswordchecksslinfo + 119 | cubepg_buffercachestart-scripts + 120 | dblinkpg_freespacemaptablefunc + 121 | dict_intpg_standbytest_parser + 122 | dict_xsynpg_stat_statementstsearch2 + 123 | earthdistancepg_test_fsyncunaccent + 124 | fuzzystrmatchpg_trgmuuid-ossp + 125 | hstorepg_upgradevacuumlo + 126 | intaggpgbenchxml2 + 127 | intarraypgcryptoPotion Compare + 128 | 1. Language + 129 | SQL/Plsql + 130 | 2. Index + 131 | Global / Partition + 132 | 3. DDL Rollback + 133 | Cann’t rollback but can recovery from + 134 | Backup or Flash Recovery Area. + 135 | 4. Compress + 136 | Table Level + 137 | 5. Trigger + 138 | 6. Data Type + 139 | …… + 140 | 1. Language + 141 | SQL/Plpgsql/Pltcl/Plperl/Plpython… + 142 | 2. Index + 143 | Global(non-partition TABLE) + 144 | Partition + 145 | Partial Index + 146 | 3. DDL Rollback + 147 | Can rollback every ddl sql. + 148 | 4. Compress + 149 | Column Level(Limited) + 150 | 5. Trigger / Rule + 151 | 6. 
Data Type extention + 152 | IP / MAC / XML / UUID / … + 153 | ……LimitReliability + 154 |  ACID + 155 |  Atomicity + 156 |  All Success or All Fail + 157 |  Consistency + 158 |  Only valid data will be written to the database + 159 |  Example:check (age>=0) + 160 |  Isolation + 161 |  SERIALIZABLE | REPEATABLE READ | READ COMMITTED | + 162 | READ UNCOMMITTED + 163 |  Durability + 164 |  The ability of the DBMS to recover the committed transaction + 165 | updates against any kind of system failure (hardware or software).Recoverability+ 166 |  Requirement + 167 |  Baseline Backup + 168 |  Parameter + 169 |  Open fsync,full_page_writes + 170 |  Optional open synchronous_commit + 171 |  Open WAL BackupRecoverability + 172 | Mistake + 173 | Checkpoint + 174 | Time Line + 175 | Which Page the first + 176 | Modified after Checkpoint + 177 | WAL + 178 | Archive + 179 | Inconsistent Backup + 180 | PITRSecurity + 181 | PostgreSQL + 182 | Connection Limit + 183 | Auth Method + 184 | (Trust, + 185 | Password, + 186 | Ident, + 187 | LDAP…) + 188 | PG_HBA + 189 | Listene + 190 | Which + 191 | Address + 192 | Roles + 193 | GRANT + 194 | REVOKEScalability + 195 |  Hardware + 196 |  Software + 197 | ProjectTypeMethodStorage + 198 | PlproxyOLTPDistributedCan Shared-nothing + 199 | GridSQLDWDistributedCan Shared-nothing + 200 | GreenPlumDWDistributedShared-nothing + 201 | Aster DataDWDistributedShared-nothing + 202 | Postgres-XCOLTPDistributedCan Shared-nothing + 203 | Pgpool-IIDWDistributedCan Shared-nothing + 204 | Sequoia/Contin + 205 | uentOLTPDistributedCan Shared-nothing + 206 | PGMemcacheOLTPDistributedCachePerformance + 207 |  SAIO Optimizer + 208 |  wulczer.org + 209 |  Virtual Index + 210 |  Prefetch + 211 |  Cache State Persistent + 212 |  Tablespace Based IO Cost Value + 213 |  Async IO + 214 |  Partial Index + 215 |  Parallel restoreHigh-AvailabilityHigh-AvailabilityArchive Case + 216 | Product SAN + 217 | FingerFingerFingerFingerFinger + 218 | 
DB1DB2DB3DB4DB5 + 219 | ② + 220 | Compress Transmit + 221 | 。。。。 + 222 | FingerFinger + 223 | DBxDBy + 224 | ③ + 225 | ① + 226 | Product SAN + 227 | Cloud Storage(s) + 228 | DNS + 229 | DB1 + 230 | WAL + 231 | DB2 + 232 | WAL + 233 | 。。。。 + 234 | DBx + 235 | WAL + 236 | Dby + 237 | WAL + 238 | Coordinate + 239 | DB + 240 | DB1 + 241 | WAL + 242 | DB2 + 243 | WAL + 244 | 。。。。 + 245 | DBx + 246 | WAL + 247 | Dby + 248 | WALHA & DR Case + 249 | StreamRep + 250 | pg_xlog + 251 | HOTStandby_A + 252 | pg_xlog + 253 | pg_xlog + 254 | Primary_A + 255 | Standby_A + 256 | Storage Cloud + 257 | pg_xlog + 258 | Primary_B + 259 | pg_xlog + 260 | Primary_C + 261 | Storage Cloud + 262 | WAN + 263 | Compr + 264 | ess + 265 | Transm + 266 | it + 267 | pg_xlog + 268 | Standby_B + 269 | pg_xlog + 270 | Standby_CShard-everything HA Case + 271 | RHCS + 272 | Primary + 273 | Standby + 274 | FailOver + 275 | Intervent + 276 | UP + 277 | Stream Replication + 278 | SAN 2 + 279 | SAN 1 + 280 | WAL Backup + 281 | xlog + 282 | Datafile + 283 | Backup + 284 | Used to PITR + 285 | Datafile + 286 | DatafileThanks + 287 |  Thanks all people contribute to PostgreSQL. 
+ 288 |  + 289 | Digoal.Zhou + 290 |  + 291 |  + 292 | Blog + 293 | http://blog.163.com/digoal@126 294 | (1 row) 295 | 296 | /* API */ 297 | SELECT pdf_title('/tmp/pgintro.pdf'); 298 | pdf_title 299 | ------------------------- 300 | PostgreSQL Introduction 301 | (1 row) 302 | 303 | SELECT pdf_author('/tmp/pgintro.pdf'); 304 | pdf_author 305 | ------------ 306 | 周正中 307 | (1 row) 308 | 309 | SELECT pdf_num_pages('/tmp/pgintro.pdf'); 310 | pdf_num_pages 311 | --------------- 312 | 24 313 | (1 row) 314 | 315 | SELECT pdf_page('/tmp/pgintro.pdf', 1); 316 | pdf_page 317 | ------------------------------ 318 | Catalog + 319 |  PostgreSQL Origin + 320 |  Layout + 321 |  Features + 322 |  Enterprise Class Attribute+ 323 |  Case 324 | (1 row) 325 | 326 | SELECT pdf_creator('/tmp/pgintro.pdf'); 327 | pdf_creator 328 | ------------------------------------ 329 | Microsoft® Office PowerPoint® 2007 330 | (1 row) 331 | 332 | SELECT pdf_keywords('/tmp/pgintro.pdf'); 333 | pdf_keywords 334 | -------------- 335 | 336 | (1 row) 337 | 338 | SELECT pdf_metadata('/tmp/pgintro.pdf'); 339 | pdf_metadata 340 | -------------- 341 | 342 | (1 row) 343 | 344 | SELECT pdf_version('/tmp/pgintro.pdf'); 345 | pdf_version 346 | ------------- 347 | PDF-1.5 348 | (1 row) 349 | 350 | SELECT pdf_subject('/tmp/pgintro.pdf'); 351 | pdf_subject 352 | ------------- 353 | 354 | (1 row) 355 | 356 | SELECT pdf_creation('/tmp/pgintro.pdf'); 357 | pdf_creation 358 | -------------------------- 359 | Wed Jul 20 11:13:37 2011 360 | (1 row) 361 | 362 | SELECT pdf_modification('/tmp/pgintro.pdf'); 363 | pdf_modification 364 | -------------------------- 365 | Wed Jul 20 11:13:37 2011 366 | (1 row) 367 | 368 | /* bytea -> pdf */ 369 | SELECT pg_read_binary_file('/tmp/pgintro.pdf')::pdf::text = '/tmp/pgintro.pdf'::pdf::text; 370 | ?column? 371 | ---------- 372 | t 373 | (1 row) 374 | 375 | /* pdf -> bytea */ 376 | SELECT '/tmp/pgintro.pdf'::pdf::bytea = pg_read_binary_file('/tmp/pgintro.pdf'); 377 | ?column? 
378 | ---------- 379 | t 380 | (1 row) 381 | 382 | /* FTS */ 383 | SELECT '/tmp/pgintro.pdf'::pdf::text @@ to_tsquery('postgres'); 384 | ?column? 385 | ---------- 386 | t 387 | (1 row) 388 | 389 | SELECT '/tmp/pgintro.pdf'::pdf::text @@ to_tsquery('oracle'); 390 | ?column? 391 | ---------- 392 | f 393 | (1 row) 394 | 395 | /* Old functions mentioned in the blog post. Making sure they continue to work */ 396 | SELECT pdf_read_file('/tmp/pgintro.pdf') = '/tmp/pgintro.pdf'::pdf::text; 397 | ?column? 398 | ---------- 399 | t 400 | (1 row) 401 | 402 | select pdf_read_bytes(pg_read_binary_file('/tmp/pgintro.pdf')) = '/tmp/pgintro.pdf'::pdf::text; 403 | ?column? 404 | ---------- 405 | t 406 | (1 row) 407 | 408 | /* bigger files: >8KB */ 409 | CREATE TABLE pdfs(i serial primary key, d pdf); 410 | INSERT INTO pdfs(d) VALUES('/tmp/pgintro.pdf'); 411 | INSERT INTO pdfs(d) VALUES('/tmp/big.pdf'); 412 | SELECT length(d::text) FROM pdfs 413 | length 414 | --------- 415 | 4182 416 | 6042170 417 | (2 rows) 418 | 419 | -------------------------------------------------------------------------------- /test/pgintro.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Florents-Tselai/pgpdf/01e89bcbec81229cfac1819c625ce9986c608720/test/pgintro.pdf -------------------------------------------------------------------------------- /test/sql/pgpdf.sql: -------------------------------------------------------------------------------- 1 | /* Errors */ 2 | SELECT 'notexists.pdf'::pdf; 3 | SELECT '/tmp/bad.pdf'::pdf; 4 | 5 | /* OK */ 6 | SELECT '/tmp/pgintro.pdf'::pdf; 7 | 8 | /* API */ 9 | SELECT pdf_title('/tmp/pgintro.pdf'); 10 | SELECT pdf_author('/tmp/pgintro.pdf'); 11 | SELECT pdf_num_pages('/tmp/pgintro.pdf'); 12 | SELECT pdf_page('/tmp/pgintro.pdf', 1); 13 | SELECT pdf_creator('/tmp/pgintro.pdf'); 14 | SELECT pdf_keywords('/tmp/pgintro.pdf'); 15 | SELECT pdf_metadata('/tmp/pgintro.pdf'); 16 | SELECT 
pdf_version('/tmp/pgintro.pdf'); 17 | SELECT pdf_subject('/tmp/pgintro.pdf'); 18 | SELECT pdf_creation('/tmp/pgintro.pdf'); 19 | SELECT pdf_modification('/tmp/pgintro.pdf'); 20 | 21 | /* bytea -> pdf */ 22 | SELECT pg_read_binary_file('/tmp/pgintro.pdf')::pdf::text = '/tmp/pgintro.pdf'::pdf::text; 23 | 24 | /* pdf -> bytea */ 25 | SELECT '/tmp/pgintro.pdf'::pdf::bytea = pg_read_binary_file('/tmp/pgintro.pdf'); 26 | 27 | /* FTS */ 28 | SELECT '/tmp/pgintro.pdf'::pdf::text @@ to_tsquery('postgres'); 29 | SELECT '/tmp/pgintro.pdf'::pdf::text @@ to_tsquery('oracle'); 30 | 31 | /* Old functions mentioned in the blog post. Making sure they continue to work */ 32 | SELECT pdf_read_file('/tmp/pgintro.pdf') = '/tmp/pgintro.pdf'::pdf::text; 33 | select pdf_read_bytes(pg_read_binary_file('/tmp/pgintro.pdf')) = '/tmp/pgintro.pdf'::pdf::text; 34 | 35 | /* bigger files: >8KB */ 36 | CREATE TABLE pdfs(i serial primary key, d pdf); 37 | INSERT INTO pdfs(d) VALUES('/tmp/pgintro.pdf'); 38 | INSERT INTO pdfs(d) VALUES('/tmp/big.pdf'); 39 | SELECT length(d::text) FROM pdfs --------------------------------------------------------------------------------