├── .dockerignore
├── .github
├── FUNDING.yml
└── workflows
│ ├── build.yml
│ └── release-pgxn.yml
├── .gitignore
├── Dockerfile
├── LICENSE
├── META.json
├── Makefile
├── README.md
├── pgpdf.c
├── pgpdf.control
├── sql
└── pgpdf--0.1.0.sql
└── test
├── expected
└── pgpdf.out
├── pgintro.pdf
└── sql
└── pgpdf.sql
/.dockerignore:
--------------------------------------------------------------------------------
1 | /.git/
2 | /dist/
3 | /results/
4 | /tmp_check/
5 | regression.*
6 | *.o
7 | *.so
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: [Florents-Tselai]
2 |
--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
1 | name: build  # CI: build, install and regression-test the extension
2 | on: [ push, pull_request ]  # runs on every push and every pull request
3 | jobs:
4 | ubuntu:
5 | runs-on: ${{ matrix.os }}  # runner image comes from the matrix entry
6 | strategy:
7 | fail-fast: false  # a failing matrix entry does not cancel the others
8 | matrix:
9 | include:  # each entry pairs a Postgres major version with the runner image used for it
10 | - postgres: 18
11 | os: ubuntu-24.04
12 | - postgres: 17
13 | os: ubuntu-24.04
14 | - postgres: 16
15 | os: ubuntu-24.04
16 | - postgres: 15
17 | os: ubuntu-22.04
18 | - postgres: 14
19 | os: ubuntu-22.04
20 |
21 | steps:
22 | - uses: actions/checkout@v4
23 |
24 | - run: |  # poppler headers and pkg-config for the build; wget for the Makefile's /tmp/big.pdf rule
25 | sudo apt update
26 | sudo apt install -y libpoppler-glib-dev pkg-config wget
27 |
28 | - uses: ankane/setup-postgres@v1
29 | with:
30 | postgres-version: ${{ matrix.postgres }}  # Postgres major version from the matrix entry
31 | dev-files: true  # NOTE(review): presumably installs the server headers/PGXS needed by the make steps - confirm in the action's docs
32 |
33 | - run: make
34 | - run: sudo make install
35 | - run: make installcheck  # regression tests; the Makefile makes the /tmp test PDFs prerequisites of this target
36 | - if: ${{ failure() }}  # only runs when an earlier step has failed
37 | run: cat regression.diffs  # print the regression diff file into the job log
38 |
39 |
--------------------------------------------------------------------------------
/.github/workflows/release-pgxn.yml:
--------------------------------------------------------------------------------
1 | name: Release  # publishes a tagged version to PGXN
2 | on:
3 | push:
4 | tags: [v*]  # triggered by pushing any tag whose name starts with "v"
5 | jobs:
6 | release:
7 | name: Release on PGXN
8 | runs-on: ubuntu-latest
9 | container: pgxn/pgxn-tools  # steps run inside this image; presumably it provides pgxn-bundle and pgxn-release - confirm
10 | env:  # PGXN credentials are read from repository secrets
11 | PGXN_USERNAME: ${{ secrets.PGXN_USERNAME }}
12 | PGXN_PASSWORD: ${{ secrets.PGXN_PASSWORD }}
13 | steps:
14 | - name: Check out the repo
15 | uses: actions/checkout@v4
16 | - name: Bundle the Release
17 | id: bundle  # step id; not referenced elsewhere in this workflow
18 | run: pgxn-bundle
19 | - name: Release on PGXN
20 | run: pgxn-release
21 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .deps/
2 | .idea/
3 | *.o
4 | *.so
5 | build/
6 | dist/
7 | *.egg-info/
8 | archive/
9 |
10 | /*.png
11 | /results/
12 | /pgdata/
13 | /docs/
14 | /.readthedocs.yaml
15 | /pgpdf.dylib
16 | /regression.diffs
17 | /regression.out
18 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # Builds an image that adds the pgpdf extension to the postgres:$PG_MAJOR base image.
2 | ARG PG_MAJOR=17
3 | FROM postgres:$PG_MAJOR
4 | # Declared again because an ARG defined before FROM is not visible to later instructions.
5 | ARG PG_MAJOR
6 |
7 | COPY . /tmp/pgpdf
8 |
9 | # Install the build dependencies, build and install pgpdf, ship LICENSE and
10 | # README.md under /usr/share/doc/pgpdf, then remove the source tree, the
11 | # build-only packages and the apt lists, all within a single layer.
12 | # wget is not installed: only the Makefile's test-fixture rule uses it, and
13 | # no tests are run here.
14 | RUN apt-get update && \
15 |     apt-mark hold locales && \
16 |     apt-get install -y --no-install-recommends libpoppler-glib-dev pkg-config build-essential postgresql-server-dev-$PG_MAJOR && \
17 |     cd /tmp/pgpdf && \
18 |     make clean && \
19 |     make install && \
20 |     mkdir -p /usr/share/doc/pgpdf && \
21 |     cp LICENSE README.md /usr/share/doc/pgpdf && \
22 |     rm -r /tmp/pgpdf && \
23 |     apt-get remove -y pkg-config build-essential postgresql-server-dev-$PG_MAJOR && \
24 |     apt-get autoremove -y && \
25 |     apt-mark unhold locales && \
26 |     rm -rf /var/lib/apt/lists/*
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 2, June 1991
3 |
4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
5 | <https://fsf.org/>
6 | Everyone is permitted to copy and distribute verbatim copies
7 | of this license document, but changing it is not allowed.
8 |
9 | Preamble
10 |
11 | The licenses for most software are designed to take away your
12 | freedom to share and change it. By contrast, the GNU General Public
13 | License is intended to guarantee your freedom to share and change free
14 | software--to make sure the software is free for all its users. This
15 | General Public License applies to most of the Free Software
16 | Foundation's software and to any other program whose authors commit to
17 | using it. (Some other Free Software Foundation software is covered by
18 | the GNU Lesser General Public License instead.) You can apply it to
19 | your programs, too.
20 |
21 | When we speak of free software, we are referring to freedom, not
22 | price. Our General Public Licenses are designed to make sure that you
23 | have the freedom to distribute copies of free software (and charge for
24 | this service if you wish), that you receive source code or can get it
25 | if you want it, that you can change the software or use pieces of it
26 | in new free programs; and that you know you can do these things.
27 |
28 | To protect your rights, we need to make restrictions that forbid
29 | anyone to deny you these rights or to ask you to surrender the rights.
30 | These restrictions translate to certain responsibilities for you if you
31 | distribute copies of the software, or if you modify it.
32 |
33 | For example, if you distribute copies of such a program, whether
34 | gratis or for a fee, you must give the recipients all the rights that
35 | you have. You must make sure that they, too, receive or can get the
36 | source code. And you must show them these terms so they know their
37 | rights.
38 |
39 | We protect your rights with two steps: (1) copyright the software, and
40 | (2) offer you this license which gives you legal permission to copy,
41 | distribute and/or modify the software.
42 |
43 | Also, for each author's protection and ours, we want to make certain
44 | that everyone understands that there is no warranty for this free
45 | software. If the software is modified by someone else and passed on, we
46 | want its recipients to know that what they have is not the original, so
47 | that any problems introduced by others will not reflect on the original
48 | authors' reputations.
49 |
50 | Finally, any free program is threatened constantly by software
51 | patents. We wish to avoid the danger that redistributors of a free
52 | program will individually obtain patent licenses, in effect making the
53 | program proprietary. To prevent this, we have made it clear that any
54 | patent must be licensed for everyone's free use or not licensed at all.
55 |
56 | The precise terms and conditions for copying, distribution and
57 | modification follow.
58 |
59 | GNU GENERAL PUBLIC LICENSE
60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
61 |
62 | 0. This License applies to any program or other work which contains
63 | a notice placed by the copyright holder saying it may be distributed
64 | under the terms of this General Public License. The "Program", below,
65 | refers to any such program or work, and a "work based on the Program"
66 | means either the Program or any derivative work under copyright law:
67 | that is to say, a work containing the Program or a portion of it,
68 | either verbatim or with modifications and/or translated into another
69 | language. (Hereinafter, translation is included without limitation in
70 | the term "modification".) Each licensee is addressed as "you".
71 |
72 | Activities other than copying, distribution and modification are not
73 | covered by this License; they are outside its scope. The act of
74 | running the Program is not restricted, and the output from the Program
75 | is covered only if its contents constitute a work based on the
76 | Program (independent of having been made by running the Program).
77 | Whether that is true depends on what the Program does.
78 |
79 | 1. You may copy and distribute verbatim copies of the Program's
80 | source code as you receive it, in any medium, provided that you
81 | conspicuously and appropriately publish on each copy an appropriate
82 | copyright notice and disclaimer of warranty; keep intact all the
83 | notices that refer to this License and to the absence of any warranty;
84 | and give any other recipients of the Program a copy of this License
85 | along with the Program.
86 |
87 | You may charge a fee for the physical act of transferring a copy, and
88 | you may at your option offer warranty protection in exchange for a fee.
89 |
90 | 2. You may modify your copy or copies of the Program or any portion
91 | of it, thus forming a work based on the Program, and copy and
92 | distribute such modifications or work under the terms of Section 1
93 | above, provided that you also meet all of these conditions:
94 |
95 | a) You must cause the modified files to carry prominent notices
96 | stating that you changed the files and the date of any change.
97 |
98 | b) You must cause any work that you distribute or publish, that in
99 | whole or in part contains or is derived from the Program or any
100 | part thereof, to be licensed as a whole at no charge to all third
101 | parties under the terms of this License.
102 |
103 | c) If the modified program normally reads commands interactively
104 | when run, you must cause it, when started running for such
105 | interactive use in the most ordinary way, to print or display an
106 | announcement including an appropriate copyright notice and a
107 | notice that there is no warranty (or else, saying that you provide
108 | a warranty) and that users may redistribute the program under
109 | these conditions, and telling the user how to view a copy of this
110 | License. (Exception: if the Program itself is interactive but
111 | does not normally print such an announcement, your work based on
112 | the Program is not required to print an announcement.)
113 |
114 | These requirements apply to the modified work as a whole. If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works. But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 |
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 |
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 |
134 | 3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 |
138 | a) Accompany it with the complete corresponding machine-readable
139 | source code, which must be distributed under the terms of Sections
140 | 1 and 2 above on a medium customarily used for software interchange; or,
141 |
142 | b) Accompany it with a written offer, valid for at least three
143 | years, to give any third party, for a charge no more than your
144 | cost of physically performing source distribution, a complete
145 | machine-readable copy of the corresponding source code, to be
146 | distributed under the terms of Sections 1 and 2 above on a medium
147 | customarily used for software interchange; or,
148 |
149 | c) Accompany it with the information you received as to the offer
150 | to distribute corresponding source code. (This alternative is
151 | allowed only for noncommercial distribution and only if you
152 | received the program in object code or executable form with such
153 | an offer, in accord with Subsection b above.)
154 |
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it. For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable. However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 |
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 |
172 | 4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License. Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 |
180 | 5. You are not required to accept this License, since you have not
181 | signed it. However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works. These actions are
183 | prohibited by law if you do not accept this License. Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 |
189 | 6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions. You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 |
197 | 7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License. If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all. For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 |
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 |
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices. Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 |
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 |
229 | 8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded. In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 |
237 | 9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time. Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 |
242 | Each version is given a distinguishing version number. If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation. If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 |
250 | 10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission. For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this. Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 |
258 | NO WARRANTY
259 |
260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 |
270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 |
280 | END OF TERMS AND CONDITIONS
281 |
282 | How to Apply These Terms to Your New Programs
283 |
284 | If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 |
288 | To do so, attach the following notices to the program. It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 |
293 | <one line to give the program's name and a brief idea of what it does.>
294 | Copyright (C) <year> <name of author>
295 |
296 | This program is free software; you can redistribute it and/or modify
297 | it under the terms of the GNU General Public License as published by
298 | the Free Software Foundation; either version 2 of the License, or
299 | (at your option) any later version.
300 |
301 | This program is distributed in the hope that it will be useful,
302 | but WITHOUT ANY WARRANTY; without even the implied warranty of
303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
304 | GNU General Public License for more details.
305 |
306 | You should have received a copy of the GNU General Public License along
307 | with this program; if not, see <https://www.gnu.org/licenses/>.
308 |
309 | Also add information on how to contact you by electronic and paper mail.
310 |
311 | If the program is interactive, make it output a short notice like this
312 | when it starts in an interactive mode:
313 |
314 | Gnomovision version 69, Copyright (C) year name of author
315 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
316 | This is free software, and you are welcome to redistribute it
317 | under certain conditions; type `show c' for details.
318 |
319 | The hypothetical commands `show w' and `show c' should show the appropriate
320 | parts of the General Public License. Of course, the commands you use may
321 | be called something other than `show w' and `show c'; they could even be
322 | mouse-clicks or menu items--whatever suits your program.
323 |
324 | You should also get your employer (if you work as a programmer) or your
325 | school, if any, to sign a "copyright disclaimer" for the program, if
326 | necessary. Here is a sample; alter the names:
327 |
328 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program
329 | `Gnomovision' (which makes passes at compilers) written by James Hacker.
330 |
331 | <signature of Moe Ghoul>, 1 April 1989
332 | Moe Ghoul, President of Vice
333 |
334 | This General Public License does not permit incorporating your program into
335 | proprietary programs. If your program is a subroutine library, you may
336 | consider it more useful to permit linking proprietary applications with the
337 | library. If this is what you want to do, use the GNU Lesser General
338 | Public License instead of this License.
339 |
--------------------------------------------------------------------------------
/META.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "pgpdf",
3 | "abstract": "PDF Type for Postgres",
4 | "version": "0.1.0",
5 | "maintainer": [
6 | "Florents Tselai "
7 | ],
8 | "license": "gpl_2",
9 | "generated_by": "Florents Tselai",
10 | "release_status": "stable",
11 | "prereqs": {
12 | "runtime": {
13 | "requires": {
14 | "PostgreSQL": "14.0.0"
15 | }
16 | }
17 | },
18 | "provides": {
19 | "pgpdf": {
20 | "file": "sql/pgpdf--0.1.0.sql",
21 | "docfile": "README.md",
22 | "version": "0.1.0",
23 | "abstract": "PDF Type for Postgres"
24 | }
25 | },
26 | "resources": {
27 | "bugtracker": {
28 | "web": "https://github.com/Florents-Tselai/pgpdf/issues"
29 | },
30 | "repository": {
31 | "url": "https://github.com/Florents-Tselai/pgpdf.git",
32 | "web": "https://github.com/Florents-Tselai/pgpdf",
33 | "type": "git"
34 | }
35 | },
36 | "meta-spec": {
37 | "version": "1.0.0",
38 | "url": "http://pgxn.org/meta/spec.txt"
39 | },
40 | "tags": [
41 | "pdf",
42 | "documents"
43 | ]
44 | }
45 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Build, test and packaging rules for the pgpdf PostgreSQL extension (uses PGXS).
2 |
3 | # External tools; both can be overridden on the command line.
4 | PG_CONFIG = pg_config
5 | PKG_CONFIG = pkg-config
6 |
7 | EXTENSION = pgpdf
8 | EXTVERSION = 0.1.0
9 | MODULE_big = $(EXTENSION)
10 |
11 | OBJS = pgpdf.o
12 |
13 | DATA = $(wildcard sql/*--*.sql)
14 |
15 | # poppler compile and link flags are taken from pkg-config.
16 | PG_CPPFLAGS = $(shell $(PKG_CONFIG) --cflags poppler poppler-glib)
17 | PG_LDFLAGS = $(shell $(PKG_CONFIG) --libs poppler poppler-glib)
18 | SHLIB_LINK =-lpoppler -lpoppler-glib
19 |
20 | # Every test/sql/<name>.sql file becomes a regression test called <name>.
21 | TESTS = $(wildcard test/sql/*.sql)
22 | REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS))
23 | REGRESS_OPTS = --inputdir=test --load-extension=$(EXTENSION)
24 |
25 | # The fixture rules below are the first rules in this file, so without an
26 | # explicit default goal a bare `make` would only create /tmp/pgintro.pdf
27 | # instead of building the extension.
28 | .DEFAULT_GOAL := all
29 |
30 | # Files that must exist before `installcheck` runs.
31 | TEST_FILES = /tmp/pgintro.pdf /tmp/bad.pdf /tmp/big.pdf
32 | /tmp/pgintro.pdf:
33 | 	cp test/pgintro.pdf $@
34 | /tmp/bad.pdf:
35 | 	echo 'not a pdf' > $@
36 | # Downloaded with wget, so this target needs network access.
37 | /tmp/big.pdf:
38 | 	wget https://www.postgresql.org/files/documentation/pdf/10/postgresql-10-A4.pdf -O $@
39 |
40 | installcheck: $(TEST_FILES)
41 |
42 | # `make clean` removes the fixtures as well.
43 | EXTRA_CLEAN = $(TEST_FILES)
44 |
45 | PGXS := $(shell $(PG_CONFIG) --pgxs)
46 | include $(PGXS)
47 |
48 | # Developer shortcut: clean rebuild, install, then run the regression tests.
49 | .PHONY: dev
50 | dev: clean all install installcheck
51 |
52 | .PHONY: dist
53 |
54 | # Archive the `main` branch as dist/$(EXTENSION)-$(EXTVERSION).zip.
55 | dist:
56 | 	mkdir -p dist
57 | 	git archive --format zip --prefix=$(EXTENSION)-$(EXTVERSION)/ --output dist/$(EXTENSION)-$(EXTVERSION).zip main
58 |
59 | # for Docker
60 | PG_MAJOR ?= 17
61 |
62 | .PHONY: docker-build
63 |
64 | # Build a local image tagged pg$(PG_MAJOR) and $(EXTVERSION)-pg$(PG_MAJOR).
65 | docker-build:
66 | 	docker build --pull --no-cache --build-arg PG_MAJOR=$(PG_MAJOR) -t florents/pgpdf:pg$(PG_MAJOR) -t florents/pgpdf:$(EXTVERSION)-pg$(PG_MAJOR) .
67 |
68 | .PHONY: docker-release
69 |
70 | # Build for linux/amd64 and linux/arm64 and push the same two tags.
71 | docker-release:
72 | 	docker buildx build --push --pull --no-cache --platform linux/amd64,linux/arm64 --build-arg PG_MAJOR=$(PG_MAJOR) -t florents/pgpdf:pg$(PG_MAJOR) -t florents/pgpdf:$(EXTVERSION)-pg$(PG_MAJOR) .
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # pgPDF: `pdf` type for Postgres
2 |
3 | [![build](https://github.com/Florents-Tselai/pgpdf/actions/workflows/build.yml/badge.svg)](https://github.com/Florents-Tselai/pgpdf/actions/workflows/build.yml)
4 | 
5 |
6 |
7 | This extension for PostgreSQL provides a `pdf` data type and assorted functions.
8 |
9 | You can create a `pdf` type, by casting either a `text` filepath or `bytea` column.
10 |
11 | ```tsql
12 | SELECT '/tmp/pgintro.pdf'::pdf;
13 | ```
14 |
15 | ```tsql
16 | pdf
17 | ----------------------------------------------------------------------------------
18 | PostgreSQL Introduction +
19 | Digoal.Zhou +
20 | 7/20/2011Catalog +
21 | PostgreSQL Origin
22 | ```
23 |
24 | If you don’t have the PDF file in your filesystem,
25 | but have already stored its content in a `bytea` column,
26 | you can just cast it to `pdf`.
27 |
28 | ```tsql
29 | SELECT pg_read_binary_file('/tmp/pgintro.pdf')::bytea::pdf;
30 | ```
31 |
32 | **Why?**:
33 | This allows you to work with PDFs in an ACID-compliant way.
34 | The usual alternative relies on external scripts or services which can easily
35 | make your data ingestion pipeline brittle and leave your raw data out-of-sync.
36 |
37 | The actual PDF parsing is done by [poppler](https://poppler.freedesktop.org).
38 |
39 | Also check out the blog posts:
40 | - [Full Text Search on PDFs With Postgres](https://tselai.com/full-text-search-pdf-postgres)
41 | - [pgpdf: pdf type for Postgres](https://tselai.com/pgpdf-pdf-type-postgres)
42 |
43 | ## Usage
44 |
45 | Download some PDFs.
46 |
47 | ```sh
48 | wget https://wiki.postgresql.org/images/e/ea/PostgreSQL_Introduction.pdf -O /tmp/pgintro.pdf
49 | wget https://pdfobject.com/pdf/sample.pdf -O /tmp/sample.pdf
50 | ```
51 |
52 | Create a table with a `pdf` column:
53 |
54 | ```tsql
55 | CREATE TABLE pdfs(name text primary key, doc pdf);
56 |
57 | INSERT INTO pdfs VALUES ('pgintro', '/tmp/pgintro.pdf');
58 | INSERT INTO pdfs VALUES ('sample', '/tmp/sample.pdf');
59 | ```
60 |
61 | Parsing and validation should happen automatically.
62 | The files will be read from the disk only once!
63 |
64 | > [!NOTE]
65 | > The filepath should be accessible by the `postgres` process / user!
66 | > That's different from the user running psql.
67 | > If you don't understand what this means, ask your DBA!
68 |
69 | ### Reading from URLs
70 |
71 | You can combine pgpdf with [pgsql-http](https://github.com/pramsey/pgsql-http)
72 | to quickly grab remote PDFs into Postgres,
73 | by fetching the remote content as `bytea` and then treating it as a PDF.
74 |
75 | ```tsql
76 | CREATE EXTENSION pgpdf;
77 | CREATE EXTENSION http;
78 |
79 | SELECT pdf_read_bytes(text_to_bytea(content))
80 | FROM http_get('https://wiki.postgresql.org/images/e/e3/Hooks_in_postgresql.pdf');
81 | ```
82 |
83 | ### String Functions and Operators
84 |
85 | Standard Postgres [String Functions and Operators](https://www.postgresql.org/docs/17/functions-string.html)
86 | should work as usual:
87 |
88 | ```tsql
89 | SELECT 'Below is the PDF we received ' || '/tmp/pgintro.pdf'::pdf;
90 | ```
91 |
92 | ```tsql
93 | SELECT upper('/tmp/pgintro.pdf'::pdf::text);
94 | ```
95 |
96 | ```tsql
97 | SELECT name
98 | FROM pdfs
99 | WHERE doc::text LIKE '%Postgres%';
100 | ```
101 |
102 | ### Full-Text Search (FTS)
103 |
104 | You can also perform full-text search (FTS), since you can work on a `pdf` file like normal text.
105 |
106 | ```tsql
107 | SELECT '/tmp/pgintro.pdf'::pdf::text @@ to_tsquery('postgres');
108 | ```
109 |
110 | ```tsql
111 | ?column?
112 | ----------
113 | t
114 | (1 row)
115 | ```
116 |
117 | ```tsql
118 | SELECT '/tmp/pgintro.pdf'::pdf::text @@ to_tsquery('oracle');
119 | ```
120 |
121 | ```tsql
122 | ?column?
123 | ----------
124 | f
125 | (1 row)
126 | ```
127 |
128 | ### Document similarity with `pg_trgm`
129 |
130 | You can use [pg_trgm](https://postgresql.org/docs/17/interactive/pgtrgm.html)
131 | to get the similarity between two documents:
132 |
133 | ```tsql
134 | CREATE EXTENSION pg_trgm;
135 |
136 | SELECT similarity('/tmp/pgintro.pdf'::pdf::text, '/tmp/sample.pdf'::pdf::text);
137 | ```
138 |
139 | ### Metadata
140 |
141 | The following functions are available:
142 |
143 | - `pdf_title(pdf) → text`
144 | - `pdf_author(pdf) → text`
145 | - `pdf_num_pages(pdf) → integer`
146 |
147 | Total number of pages in the document
148 | - `pdf_page(pdf, integer) → text`
149 |
150 | Get the i-th page as text
151 | - `pdf_creator(pdf) → text`
152 | - `pdf_keywords(pdf) → text`
153 | - `pdf_metadata(pdf) → text`
154 | - `pdf_version(pdf) → text`
155 | - `pdf_subject(pdf) → text`
156 | - `pdf_creation(pdf) → timestamp`
157 | - `pdf_modification(pdf) → timestamp`
158 |
159 | ```tsql
160 | SELECT pdf_title('/tmp/pgintro.pdf');
161 | ```
162 |
163 | ```tsql
164 | pdf_title
165 | -------------------------
166 | PostgreSQL Introduction
167 | (1 row)
168 | ```
169 |
170 | ```tsql
171 | SELECT pdf_author('/tmp/pgintro.pdf');
172 | ```
173 |
174 | ```tsql
175 | pdf_author
176 | ------------
177 | 周正中
178 | (1 row)
179 | ```
180 |
181 | Getting a subset of pages
182 |
183 | ```tsql
184 | SELECT pdf_num_pages('/tmp/pgintro.pdf');
185 | ```
186 |
187 | ```tsql
188 | pdf_num_pages
189 | ---------------
190 | 24
191 | (1 row)
192 | ```
193 |
194 | ```tsql
195 | SELECT pdf_page('/tmp/pgintro.pdf', 1);
196 | ```
197 |
198 | ```tsql
199 | pdf_page
200 | ------------------------------
201 | Catalog +
202 | PostgreSQL Origin +
203 | Layout +
204 | Features +
205 | Enterprise Class Attribute+
206 | Case
207 | (1 row)
208 | ```
209 |
210 | ```tsql
211 | SELECT pdf_subject('/tmp/pgintro.pdf');
212 | ```
213 |
214 | ```tsql
215 | pdf_subject
216 | -------------
217 |
218 | (1 row)
219 | ```
220 |
221 | ```tsql
222 | SELECT pdf_creation('/tmp/pgintro.pdf');
223 | ```
224 |
225 | ```tsql
226 | pdf_creation
227 | --------------------------
228 | Wed Jul 20 11:13:37 2011
229 | (1 row)
230 | ```
231 |
232 | ```tsql
233 | SELECT pdf_modification('/tmp/pgintro.pdf');
234 | ```
235 |
236 | ```tsql
237 | pdf_modification
238 | --------------------------
239 | Wed Jul 20 11:13:37 2011
240 | (1 row)
241 | ```
242 |
243 | ```tsql
244 | SELECT pdf_creator('/tmp/pgintro.pdf');
245 | ```
246 |
247 | ```tsql
248 | pdf_creator
249 | ------------------------------------
250 | Microsoft® Office PowerPoint® 2007
251 | (1 row)
252 | ```
253 |
254 | ```tsql
255 | SELECT pdf_metadata('/tmp/pgintro.pdf');
256 | ```
257 |
258 | ```tsql
259 | pdf_metadata
260 | --------------
261 |
262 | (1 row)
263 | ```
264 |
265 | ```tsql
266 | SELECT pdf_version('/tmp/pgintro.pdf');
267 | ```
268 |
269 | ```tsql
270 | pdf_version
271 | -------------
272 | PDF-1.5
273 | (1 row)
274 | ```
275 |
276 | ## Installation
277 |
278 | Install [poppler](https://poppler.freedesktop.org) dependencies
279 |
280 | **Linux (Debian/Ubuntu)**
281 | ```
282 | sudo apt install -y libpoppler-glib-dev pkg-config
283 | ```
284 |
285 | **macOS (Homebrew)**
286 |
287 | ```
288 | brew install poppler pkgconf
289 | ```
290 |
291 | ```
292 | cd /tmp
293 | git clone https://github.com/Florents-Tselai/pgpdf.git
294 | cd pgpdf
295 | make
296 | make install # may need sudo
297 | ```
298 |
299 | After the installation, in a session:
300 |
301 | ```tsql
302 | CREATE EXTENSION pgpdf;
303 | ```
304 |
305 | ### Docker
306 |
307 | Get the [Docker image](https://hub.docker.com/r/florents/pgpdf) with:
308 |
309 | ```sh
310 | docker pull florents/pgpdf:pg17
311 | ```
312 |
313 | This adds pgpdf to the [Postgres image](https://hub.docker.com/_/postgres) (replace `17` with your Postgres server version, and run it the same way).
314 |
315 | Run the image in a container.
316 |
317 | ```sh
318 | docker run --name pgpdf -p 5432:5432 -e POSTGRES_PASSWORD=pass florents/pgpdf:pg17
319 | ```
320 |
321 | Through another terminal, connect to the running server (container).
322 |
323 | ```sh
324 | PGPASSWORD=pass psql -h localhost -p 5432 -U postgres
325 | ```
326 |
327 | > [!WARNING]
328 | > Reading arbitrary binary data (PDF) into your database can pose security risks.
329 | > Only use this for files you trust.
330 |
331 |
--------------------------------------------------------------------------------
/pgpdf.c:
--------------------------------------------------------------------------------
1 | #include "postgres.h"
2 |
3 | #include "fmgr.h"
4 | #include "utils/builtins.h"
5 | #include "utils/jsonb.h"
6 | #include "utils/datetime.h"
7 | #include "utils/date.h"
8 |
9 | #include "poppler.h"
10 | #include
11 | #include
12 | #include
13 |
14 | #if PG_VERSION_NUM >= 160000
15 | #include
16 | #include "varatt.h"
17 |
18 | #endif
19 |
20 | #if PG_VERSION_NUM < 130000
21 | #define TYPALIGN_DOUBLE 'd'
22 | #define TYPALIGN_INT 'i'
23 | #endif
24 |
25 | PG_MODULE_MAGIC;
26 |
27 | typedef struct varlena pdftype;
28 |
29 | #define DatumGetPdfP(X) ((pdftype *) PG_DETOAST_DATUM(X))
30 | #define DatumGetPdfPP(X) ((pdftype *) PG_DETOAST_DATUM_PACKED(X))
31 | #define PdfPGetDatum(X) PointerGetDatum(X)
32 |
33 | #define PG_GETARG_PDF_P(n) DatumGetPdfP(PG_GETARG_DATUM(n))
34 | #define PG_GETARG_PDF_PP(n) DatumGetPdfPP(PG_GETARG_DATUM(n))
35 | #define PG_RETURN_PDF_P(x) PG_RETURN_POINTER(x)
36 |
/*
 * PG_GETARG_POPPLER_DOCUMENT(n)
 *
 * Fetch function argument n as a pdf value, detoast it and parse its payload
 * with poppler.  Evaluates (via a GNU statement expression) to a
 * PopplerDocument * that the caller owns and should release with
 * g_object_unref() once it is done with it.
 *
 * On a parse failure an ERROR is raised.  elog(ERROR) does not return, so the
 * GError message is first copied into palloc'd memory and the GError released;
 * otherwise the GLib allocation would leak on every failed parse.
 *
 * Locals carry a trailing underscore so they cannot shadow the caller's own
 * variables (callers conventionally name their result "doc").
 */
#define PG_GETARG_POPPLER_DOCUMENT(X) ({ \
	pdftype *pdf_arg_ = PG_GETARG_PDF_P(X); \
	GError *gerr_ = NULL; \
	GBytes *pdf_bytes_ = g_bytes_new(VARDATA(pdf_arg_), VARSIZE_ANY_EXHDR(pdf_arg_)); \
	PopplerDocument *doc_ = poppler_document_new_from_bytes(pdf_bytes_, NULL, &gerr_); \
	g_bytes_unref(pdf_bytes_); \
	if (!doc_) { \
		char *msg_ = pstrdup(gerr_ ? gerr_->message : "unknown error"); \
		g_clear_error(&gerr_); \
		elog(ERROR, "Error parsing PDF document: %s", msg_); \
	} \
	doc_; \
})
49 |
50 | PG_FUNCTION_INFO_V1(pdf_in);
51 |
52 | Datum
53 | pdf_in(PG_FUNCTION_ARGS)
54 | {
55 | Datum filename_t = CStringGetTextDatum(PG_GETARG_CSTRING(0));
56 | Datum pdf_bytes;
57 | int32 pdf_bytes_len;
58 |
59 | pdftype* result;
60 | GBytes* g_bytes = NULL;
61 | PopplerDocument* doc = NULL;
62 | GError* error = NULL;
63 |
64 | pdf_bytes = DirectFunctionCall1(pg_read_binary_file_all, filename_t);
65 | pdf_bytes_len = VARSIZE_ANY_EXHDR(pdf_bytes);
66 |
67 | result = (pdftype*)palloc(VARHDRSZ + pdf_bytes_len);
68 | SET_VARSIZE(result, VARHDRSZ + pdf_bytes_len);
69 |
70 | memcpy(VARDATA(result), VARDATA_ANY(pdf_bytes), pdf_bytes_len);
71 |
72 | g_bytes = g_bytes_new(VARDATA(result), pdf_bytes_len);
73 |
74 | doc = poppler_document_new_from_bytes(g_bytes, NULL, &error);
75 | g_bytes_unref(g_bytes);
76 |
77 | if (!doc)
78 | {
79 | elog(ERROR, "Error parsing PDF document: %s", error->message);
80 | pfree(result);
81 | g_clear_error(&error);
82 | PG_RETURN_NULL();
83 | }
84 |
85 | g_object_unref(doc);
86 |
87 | PG_RETURN_POINTER(result);
88 | }
89 |
90 | PG_FUNCTION_INFO_V1(pdf_out);
91 |
92 | Datum
93 | pdf_out(PG_FUNCTION_ARGS)
94 | {
95 | PopplerDocument* doc = PG_GETARG_POPPLER_DOCUMENT(0);
96 | StringInfo strinfo = makeStringInfo();
97 |
98 | int num_pages = poppler_document_get_n_pages(doc);
99 | for (int i = 0; i < num_pages; i++)
100 | {
101 | PopplerPage* page = poppler_document_get_page(doc, i);
102 | if (!page)
103 | {
104 | elog(WARNING, "Failed to get page %d\n", i);
105 | continue;
106 | }
107 |
108 | gchar* page_text = poppler_page_get_text(page);
109 | if (page_text)
110 | {
111 | appendStringInfo(strinfo, "%s", page_text);
112 | g_free(page_text);
113 | }
114 | else
115 | {
116 | elog(WARNING, "Failed to extract text from page %d\n", i);
117 | }
118 |
119 | g_object_unref(page);
120 | }
121 |
122 | PG_RETURN_CSTRING(strinfo->data);
123 | }
124 |
125 |
126 | PG_FUNCTION_INFO_V1(pdf_title);
127 | Datum
128 | pdf_title(PG_FUNCTION_ARGS)
129 | {
130 | PopplerDocument *doc = PG_GETARG_POPPLER_DOCUMENT(0);
131 | const char *title = poppler_document_get_title(doc);
132 |
133 | if (title == NULL)
134 | PG_RETURN_NULL();
135 |
136 | PG_RETURN_TEXT_P(cstring_to_text(title));
137 | }
138 |
139 | PG_FUNCTION_INFO_V1(pdf_author);
140 | Datum
141 | pdf_author(PG_FUNCTION_ARGS)
142 | {
143 | PopplerDocument *doc = PG_GETARG_POPPLER_DOCUMENT(0);
144 | const char *author = poppler_document_get_author(doc);
145 |
146 | if (author == NULL)
147 | PG_RETURN_NULL();
148 |
149 | PG_RETURN_TEXT_P(cstring_to_text(author));
150 | }
151 |
152 | PG_FUNCTION_INFO_V1(pdf_creator);
153 | Datum
154 | pdf_creator(PG_FUNCTION_ARGS)
155 | {
156 | PopplerDocument *doc = PG_GETARG_POPPLER_DOCUMENT(0);
157 | const char *creator = poppler_document_get_creator(doc);
158 |
159 | if (creator == NULL)
160 | PG_RETURN_NULL();
161 |
162 | PG_RETURN_TEXT_P(cstring_to_text(creator));
163 | }
164 |
165 | PG_FUNCTION_INFO_V1(pdf_keywords);
166 | Datum
167 | pdf_keywords(PG_FUNCTION_ARGS)
168 | {
169 | PopplerDocument *doc = PG_GETARG_POPPLER_DOCUMENT(0);
170 | const char *keywords = poppler_document_get_keywords(doc);
171 |
172 | if (keywords == NULL)
173 | PG_RETURN_NULL();
174 |
175 | PG_RETURN_TEXT_P(cstring_to_text(keywords));
176 | }
177 |
178 | PG_FUNCTION_INFO_V1(pdf_metadata);
179 | Datum
180 | pdf_metadata(PG_FUNCTION_ARGS)
181 | {
182 | PopplerDocument *doc = PG_GETARG_POPPLER_DOCUMENT(0);
183 | const char *metadata = poppler_document_get_metadata(doc);
184 |
185 | if (metadata == NULL)
186 | PG_RETURN_NULL();
187 |
188 | PG_RETURN_TEXT_P(cstring_to_text(metadata));
189 | }
190 |
191 | PG_FUNCTION_INFO_V1(pdf_version);
192 | Datum
193 | pdf_version(PG_FUNCTION_ARGS)
194 | {
195 | PopplerDocument *doc = PG_GETARG_POPPLER_DOCUMENT(0);
196 | const char *version = poppler_document_get_pdf_version_string(doc);
197 |
198 | if (version == NULL)
199 | PG_RETURN_NULL();
200 |
201 | PG_RETURN_TEXT_P(cstring_to_text(version));
202 | }
203 |
204 | PG_FUNCTION_INFO_V1(pdf_subject);
205 | Datum
206 | pdf_subject(PG_FUNCTION_ARGS)
207 | {
208 | PopplerDocument *doc = PG_GETARG_POPPLER_DOCUMENT(0);
209 | const char *subject = poppler_document_get_subject(doc);
210 |
211 | if (subject == NULL)
212 | PG_RETURN_NULL();
213 |
214 | PG_RETURN_TEXT_P(cstring_to_text(subject));
215 | }
216 |
217 | PG_FUNCTION_INFO_V1(pdf_num_pages);
218 |
219 | Datum
220 | pdf_num_pages(PG_FUNCTION_ARGS)
221 | {
222 | PopplerDocument* doc = PG_GETARG_POPPLER_DOCUMENT(0);
223 | PG_RETURN_INT32(poppler_document_get_n_pages(doc));
224 | }
225 |
226 |
227 | PG_FUNCTION_INFO_V1(pdf_page);
228 |
229 | Datum
230 | pdf_page(PG_FUNCTION_ARGS)
231 | {
232 | PopplerDocument* doc = PG_GETARG_POPPLER_DOCUMENT(0);
233 | int32 i = PG_GETARG_INT32(1);
234 | PopplerPage* page = poppler_document_get_page(doc, i);
235 | PG_RETURN_TEXT_P(cstring_to_text(poppler_page_get_text(page)));
236 | }
237 |
238 | PG_FUNCTION_INFO_V1(pdf_creation);
239 |
240 | Datum
241 | pdf_creation(PG_FUNCTION_ARGS)
242 | {
243 | PopplerDocument *doc = PG_GETARG_POPPLER_DOCUMENT(0);
244 | GDateTime *dt = poppler_document_get_creation_date_time(doc);
245 |
246 | if (dt == NULL)
247 | PG_RETURN_NULL();
248 |
249 | gint year = g_date_time_get_year(dt);
250 | gint month = g_date_time_get_month(dt);
251 | gint day = g_date_time_get_day_of_month(dt);
252 | gint hour = g_date_time_get_hour(dt);
253 | gint minute = g_date_time_get_minute(dt);
254 | gint second = g_date_time_get_second(dt);
255 |
256 | g_date_time_unref(dt);
257 |
258 | TimestampTz ts = DatumGetTimestamp(DirectFunctionCall6(
259 | make_timestamp,
260 | Int32GetDatum(year),
261 | Int32GetDatum(month),
262 | Int32GetDatum(day),
263 | Int32GetDatum(hour),
264 | Int32GetDatum(minute),
265 | Float8GetDatum((double)second)
266 | ));
267 |
268 | PG_RETURN_TIMESTAMPTZ(ts);
269 | }
270 |
271 | PG_FUNCTION_INFO_V1(pdf_modification);
272 |
273 | Datum
274 | pdf_modification(PG_FUNCTION_ARGS)
275 | {
276 | PopplerDocument *doc = PG_GETARG_POPPLER_DOCUMENT(0);
277 | GDateTime *dt = poppler_document_get_modification_date_time(doc);
278 |
279 | if (dt == NULL)
280 | PG_RETURN_NULL();
281 |
282 | gint year = g_date_time_get_year(dt);
283 | gint month = g_date_time_get_month(dt);
284 | gint day = g_date_time_get_day_of_month(dt);
285 | gint hour = g_date_time_get_hour(dt);
286 | gint minute = g_date_time_get_minute(dt);
287 | gint second = g_date_time_get_second(dt);
288 |
289 | g_date_time_unref(dt);
290 |
291 | TimestampTz ts = DatumGetTimestamp(DirectFunctionCall6(
292 | make_timestamp,
293 | Int32GetDatum(year),
294 | Int32GetDatum(month),
295 | Int32GetDatum(day),
296 | Int32GetDatum(hour),
297 | Int32GetDatum(minute),
298 | Float8GetDatum((double)second)
299 | ));
300 |
301 | PG_RETURN_TIMESTAMPTZ(ts);
302 | }
303 |
304 | PG_FUNCTION_INFO_V1(pdf_from_bytea);
305 |
306 | Datum
307 | pdf_from_bytea(PG_FUNCTION_ARGS)
308 | {
309 | bytea* input_bytes = PG_GETARG_BYTEA_P(0); // Get the binary PDF data
310 | int32 bytes_len = VARSIZE_ANY_EXHDR(input_bytes);
311 | pdftype* result;
312 |
313 | result = (pdftype*)palloc(VARHDRSZ + bytes_len);
314 | SET_VARSIZE(result, VARHDRSZ + bytes_len);
315 |
316 | memcpy(VARDATA(result), VARDATA_ANY(input_bytes), bytes_len);
317 |
318 | PG_RETURN_POINTER(result);
319 | }
320 |
321 |
322 | PG_FUNCTION_INFO_V1(pdf_to_bytea);
323 |
324 | Datum
325 | pdf_to_bytea(PG_FUNCTION_ARGS)
326 | {
327 | pdftype *pdf = PG_GETARG_PDF_P(0);
328 | bytea *result;
329 |
330 | int32 data_len = VARSIZE_ANY_EXHDR(pdf);
331 |
332 | result = (bytea *) palloc(VARHDRSZ + data_len);
333 | SET_VARSIZE(result, VARHDRSZ + data_len);
334 |
335 | memcpy(VARDATA(result), VARDATA(pdf), data_len);
336 |
337 | PG_RETURN_BYTEA_P(result);
338 | }
339 |
340 | PG_FUNCTION_INFO_V1(bytea_to_pdf);
341 |
342 |
343 | Datum
344 | bytea_to_pdf(PG_FUNCTION_ARGS)
345 | {
346 | bytea *bytes = PG_GETARG_BYTEA_PP(0);
347 | pdftype *result;
348 | GError *error = NULL;
349 |
350 | GBytes *pdf_data = g_bytes_new(VARDATA_ANY(bytes), VARSIZE_ANY_EXHDR(bytes));
351 |
352 | PopplerDocument *doc = poppler_document_new_from_bytes(pdf_data, NULL, &error);
353 | if (!doc) {
354 | g_bytes_unref(pdf_data);
355 | elog(ERROR, "Error parsing PDF document: %s", error->message);
356 | }
357 |
358 | result = (pdftype*) palloc(VARHDRSZ + VARSIZE_ANY_EXHDR(bytes));
359 | SET_VARSIZE(result, VARHDRSZ + VARSIZE_ANY_EXHDR(bytes));
360 |
361 | memcpy(VARDATA(result), g_bytes_get_data(pdf_data, NULL), VARSIZE_ANY_EXHDR(bytes));
362 |
363 | g_object_unref(doc);
364 | g_bytes_unref(pdf_data);
365 |
366 | PG_RETURN_POINTER(result);
367 | }
368 |
--------------------------------------------------------------------------------
/pgpdf.control:
--------------------------------------------------------------------------------
1 | default_version = '0.1.0'
2 | module_pathname = '$libdir/pgpdf'
3 | comment = 'pdf type'
4 | relocatable = true
--------------------------------------------------------------------------------
/sql/pgpdf--0.1.0.sql:
--------------------------------------------------------------------------------
-- Shell type, so the I/O functions below can reference "pdf" before the full
-- definition exists.
CREATE TYPE pdf;

-- Input function.  The text form of a pdf is a file name whose content is
-- read from the server's file system, so the result is not a pure function of
-- the argument and must not be IMMUTABLE.  STABLE is used because PostgreSQL
-- warns about VOLATILE type input functions.
CREATE FUNCTION pdf_in(cstring) RETURNS pdf
    STABLE
    STRICT
    LANGUAGE C
AS
'MODULE_PATHNAME';

-- Output function: renders the document as the text of its pages.  It depends
-- only on the stored bytes, hence IMMUTABLE.
CREATE FUNCTION pdf_out(pdf) RETURNS cstring
    IMMUTABLE
    STRICT
    LANGUAGE C
AS
'MODULE_PATHNAME';

-- Variable-length type; EXTENDED storage lets large documents be compressed
-- and moved out of line.
CREATE TYPE pdf
(
    INTERNALLENGTH = -1,
    INPUT = pdf_in,
    OUTPUT = pdf_out,
    STORAGE = extended
);

-- text <-> pdf conversions go through the I/O functions above.
CREATE CAST (pdf AS text) WITH INOUT AS ASSIGNMENT;
CREATE CAST (text AS pdf) WITH INOUT AS ASSIGNMENT;
27 |
28 |
29 | /* API */
30 |
-- Document title.
CREATE FUNCTION pdf_title(pdf) RETURNS text
    LANGUAGE C IMMUTABLE STRICT
AS 'MODULE_PATHNAME';

-- Document author.
CREATE FUNCTION pdf_author(pdf) RETURNS text
    LANGUAGE C IMMUTABLE STRICT
AS 'MODULE_PATHNAME';

-- Total number of pages in the document.
CREATE FUNCTION pdf_num_pages(pdf) RETURNS integer
    LANGUAGE C IMMUTABLE STRICT
AS 'MODULE_PATHNAME';

-- Text of the page at the given index.
CREATE FUNCTION pdf_page(pdf, integer) RETURNS text
    LANGUAGE C IMMUTABLE STRICT
AS 'MODULE_PATHNAME';

-- Creator string of the document.
CREATE FUNCTION pdf_creator(pdf) RETURNS text
    LANGUAGE C IMMUTABLE STRICT
AS 'MODULE_PATHNAME';

-- Document keywords.
CREATE FUNCTION pdf_keywords(pdf) RETURNS text
    LANGUAGE C IMMUTABLE STRICT
AS 'MODULE_PATHNAME';

-- Document metadata string.
CREATE FUNCTION pdf_metadata(pdf) RETURNS text
    LANGUAGE C IMMUTABLE STRICT
AS 'MODULE_PATHNAME';

-- PDF version string.
CREATE FUNCTION pdf_version(pdf) RETURNS text
    LANGUAGE C IMMUTABLE STRICT
AS 'MODULE_PATHNAME';

-- Document subject.
CREATE FUNCTION pdf_subject(pdf) RETURNS text
    LANGUAGE C IMMUTABLE STRICT
AS 'MODULE_PATHNAME';

-- Creation date/time of the document.
CREATE FUNCTION pdf_creation(pdf) RETURNS timestamp
    LANGUAGE C IMMUTABLE STRICT
AS 'MODULE_PATHNAME';

-- Last-modification date/time of the document.
CREATE FUNCTION pdf_modification(pdf) RETURNS timestamp
    LANGUAGE C IMMUTABLE STRICT
AS 'MODULE_PATHNAME';
107 |
108 |
-- bytea -> pdf conversion function.
CREATE FUNCTION bytea_to_pdf(bytea) RETURNS pdf
    IMMUTABLE STRICT LANGUAGE C
AS 'MODULE_PATHNAME';

-- pdf -> bytea conversion function.
CREATE FUNCTION pdf_to_bytea(pdf) RETURNS bytea
    IMMUTABLE STRICT LANGUAGE C
AS 'MODULE_PATHNAME';

-- Both directions are available in assignment context and as explicit casts.
CREATE CAST (bytea AS pdf) WITH FUNCTION bytea_to_pdf(bytea) AS ASSIGNMENT;
CREATE CAST (pdf AS bytea) WITH FUNCTION pdf_to_bytea(pdf) AS ASSIGNMENT;
126 |
127 | --------------------
128 |
-- Legacy helper: read the PDF file at the given server-side path and return
-- its text.  The result depends on the current content of the file, so the
-- function is VOLATILE; declaring it IMMUTABLE would allow the planner to
-- fold a call into a constant and keep serving stale text from cached plans.
CREATE FUNCTION pdf_read_file(text) RETURNS text
    LANGUAGE SQL
    VOLATILE
    STRICT
AS 'SELECT $1::pdf::text';


-- Legacy helper: extract the text of a PDF supplied as raw bytes.  The result
-- depends only on the argument, hence IMMUTABLE.
CREATE FUNCTION pdf_read_bytes(bytea) RETURNS text
    LANGUAGE SQL
    IMMUTABLE
    STRICT
AS 'SELECT $1::bytea::pdf::text';
141 |
142 |
143 |
--------------------------------------------------------------------------------
/test/expected/pgpdf.out:
--------------------------------------------------------------------------------
1 | /* Errors */
2 | SELECT 'notexists.pdf'::pdf;
3 | ERROR: could not open file "notexists.pdf" for reading: No such file or directory
4 | LINE 2: SELECT 'notexists.pdf'::pdf;
5 | ^
6 | SELECT '/tmp/bad.pdf'::pdf;
7 | ERROR: Error parsing PDF document: PDF document is damaged
8 | LINE 1: SELECT '/tmp/bad.pdf'::pdf;
9 | ^
10 | /* OK */
11 | SELECT '/tmp/pgintro.pdf'::pdf;
12 | pdf
13 | ----------------------------------------------------------------------------------
14 | PostgreSQL Introduction +
15 | Digoal.Zhou +
16 | 7/20/2011Catalog +
17 | PostgreSQL Origin +
18 | Layout +
19 | Features +
20 | Enterprise Class Attribute +
21 | CaseOrigin +
22 | Extract From Wiki +
23 | 1973 +
24 | Postgres95 +
25 | 1995 +
26 | POSTGRES 1985 +
27 | 1996 +
28 | OLTP +
29 | Michael Stonebraker +
30 | H-Store +
31 | C-Store +
32 | DW +
33 | DWPortion ContributersLogical Layout +
34 | InstanceCluster +
35 | DatabaseDatabase(s) +
36 | SchemaSchema(s) +
37 | Object +
38 | Field +
39 | Table(s) +
40 | Row(s) +
41 | Index(s) +
42 | Column +
43 | (s) +
44 | View(s) +
45 | Function(s) +
46 | Sequence(s) +
47 | Other(s)Process Introduction +
48 | Shared Memory Area +
49 | IPC +
50 | APP +
51 | Handshake +
52 | & +
53 | authentication +
54 | postmaster +
55 | fork +
56 | autovacuum launcher +
57 | autovacuum worker +
58 | Shared buffer +
59 | backend process +
60 | WAL buffer +
61 | WAL writer +
62 | XLOGs +
63 | Datafiles +
64 | Archiver +
65 | ARCH FILEs +
66 | bgwriterPotion Features +
67 | GEQO +
68 | WAL +
69 | Online +
70 | Backup +
71 | CBO +
72 | MVCC +
73 | PITR +
74 | Open Source +
75 | & Free +
76 | ACID +
77 | RDBMS +
78 | Stream +
79 | ReplicationPowerful Localization +
80 | Support +
81 | Supported Character Sets +
82 | http://www.postgresql.org/docs/9.1/static/multibyte.html +
83 | Support Database and Column level COLLATE +
84 | Example : CREATE TABLE test1 ( a text COLLATE "de_DE", b +
85 | text COLLATE "es_ES", ... );Powerful Platform Support +
86 | X86 +
87 | X86_64 +
88 | IA64 +
89 | PowerPC +
90 | PowerPC 64 +
91 | S/390 +
92 | S/390x +
93 | Sparc +
94 | Sparc 64 +
95 | Alpha +
96 | ARM +
97 | MIPS +
98 | MIPSEL +
99 | M68K +
100 | PA-RISC +
101 | Linux +
102 | Windows +
103 | FreeBSD +
104 | OpenBSD +
105 | NetBSD +
106 | Mac OS X +
107 | AIX +
108 | HP/UX +
109 | IRIX +
110 | Solaris +
111 | Tru64 Unix +
112 | UnixWareRich Extensions +
113 | adminpackisnpgrowlocks +
114 | auto_explainlopgstattuple +
115 | btree_ginltreeseg +
116 | btree_gistoid2namesepgsql +
117 | chkpasspageinspectspi +
118 | citextpasswordchecksslinfo +
119 | cubepg_buffercachestart-scripts +
120 | dblinkpg_freespacemaptablefunc +
121 | dict_intpg_standbytest_parser +
122 | dict_xsynpg_stat_statementstsearch2 +
123 | earthdistancepg_test_fsyncunaccent +
124 | fuzzystrmatchpg_trgmuuid-ossp +
125 | hstorepg_upgradevacuumlo +
126 | intaggpgbenchxml2 +
127 | intarraypgcryptoPotion Compare +
128 | 1. Language +
129 | SQL/Plsql +
130 | 2. Index +
131 | Global / Partition +
132 | 3. DDL Rollback +
133 | Cann’t rollback but can recovery from +
134 | Backup or Flash Recovery Area. +
135 | 4. Compress +
136 | Table Level +
137 | 5. Trigger +
138 | 6. Data Type +
139 | …… +
140 | 1. Language +
141 | SQL/Plpgsql/Pltcl/Plperl/Plpython… +
142 | 2. Index +
143 | Global(non-partition TABLE) +
144 | Partition +
145 | Partial Index +
146 | 3. DDL Rollback +
147 | Can rollback every ddl sql. +
148 | 4. Compress +
149 | Column Level(Limited) +
150 | 5. Trigger / Rule +
151 | 6. Data Type extention +
152 | IP / MAC / XML / UUID / … +
153 | ……LimitReliability +
154 | ACID +
155 | Atomicity +
156 | All Success or All Fail +
157 | Consistency +
158 | Only valid data will be written to the database +
159 | Example:check (age>=0) +
160 | Isolation +
161 | SERIALIZABLE | REPEATABLE READ | READ COMMITTED | +
162 | READ UNCOMMITTED +
163 | Durability +
164 | The ability of the DBMS to recover the committed transaction +
165 | updates against any kind of system failure (hardware or software).Recoverability+
166 | Requirement +
167 | Baseline Backup +
168 | Parameter +
169 | Open fsync,full_page_writes +
170 | Optional open synchronous_commit +
171 | Open WAL BackupRecoverability +
172 | Mistake +
173 | Checkpoint +
174 | Time Line +
175 | Which Page the first +
176 | Modified after Checkpoint +
177 | WAL +
178 | Archive +
179 | Inconsistent Backup +
180 | PITRSecurity +
181 | PostgreSQL +
182 | Connection Limit +
183 | Auth Method +
184 | (Trust, +
185 | Password, +
186 | Ident, +
187 | LDAP…) +
188 | PG_HBA +
189 | Listene +
190 | Which +
191 | Address +
192 | Roles +
193 | GRANT +
194 | REVOKEScalability +
195 | Hardware +
196 | Software +
197 | ProjectTypeMethodStorage +
198 | PlproxyOLTPDistributedCan Shared-nothing +
199 | GridSQLDWDistributedCan Shared-nothing +
200 | GreenPlumDWDistributedShared-nothing +
201 | Aster DataDWDistributedShared-nothing +
202 | Postgres-XCOLTPDistributedCan Shared-nothing +
203 | Pgpool-IIDWDistributedCan Shared-nothing +
204 | Sequoia/Contin +
205 | uentOLTPDistributedCan Shared-nothing +
206 | PGMemcacheOLTPDistributedCachePerformance +
207 | SAIO Optimizer +
208 | wulczer.org +
209 | Virtual Index +
210 | Prefetch +
211 | Cache State Persistent +
212 | Tablespace Based IO Cost Value +
213 | Async IO +
214 | Partial Index +
215 | Parallel restoreHigh-AvailabilityHigh-AvailabilityArchive Case +
216 | Product SAN +
217 | FingerFingerFingerFingerFinger +
218 | DB1DB2DB3DB4DB5 +
219 | ② +
220 | Compress Transmit +
221 | 。。。。 +
222 | FingerFinger +
223 | DBxDBy +
224 | ③ +
225 | ① +
226 | Product SAN +
227 | Cloud Storage(s) +
228 | DNS +
229 | DB1 +
230 | WAL +
231 | DB2 +
232 | WAL +
233 | 。。。。 +
234 | DBx +
235 | WAL +
236 | Dby +
237 | WAL +
238 | Coordinate +
239 | DB +
240 | DB1 +
241 | WAL +
242 | DB2 +
243 | WAL +
244 | 。。。。 +
245 | DBx +
246 | WAL +
247 | Dby +
248 | WALHA & DR Case +
249 | StreamRep +
250 | pg_xlog +
251 | HOTStandby_A +
252 | pg_xlog +
253 | pg_xlog +
254 | Primary_A +
255 | Standby_A +
256 | Storage Cloud +
257 | pg_xlog +
258 | Primary_B +
259 | pg_xlog +
260 | Primary_C +
261 | Storage Cloud +
262 | WAN +
263 | Compr +
264 | ess +
265 | Transm +
266 | it +
267 | pg_xlog +
268 | Standby_B +
269 | pg_xlog +
270 | Standby_CShard-everything HA Case +
271 | RHCS +
272 | Primary +
273 | Standby +
274 | FailOver +
275 | Intervent +
276 | UP +
277 | Stream Replication +
278 | SAN 2 +
279 | SAN 1 +
280 | WAL Backup +
281 | xlog +
282 | Datafile +
283 | Backup +
284 | Used to PITR +
285 | Datafile +
286 | DatafileThanks +
287 | Thanks all people contribute to PostgreSQL. +
288 | +
289 | Digoal.Zhou +
290 | +
291 | +
292 | Blog +
293 | http://blog.163.com/digoal@126
294 | (1 row)
295 |
296 | /* API */
297 | SELECT pdf_title('/tmp/pgintro.pdf');
298 | pdf_title
299 | -------------------------
300 | PostgreSQL Introduction
301 | (1 row)
302 |
303 | SELECT pdf_author('/tmp/pgintro.pdf');
304 | pdf_author
305 | ------------
306 | 周正中
307 | (1 row)
308 |
309 | SELECT pdf_num_pages('/tmp/pgintro.pdf');
310 | pdf_num_pages
311 | ---------------
312 | 24
313 | (1 row)
314 |
315 | SELECT pdf_page('/tmp/pgintro.pdf', 1);
316 | pdf_page
317 | ------------------------------
318 | Catalog +
319 | PostgreSQL Origin +
320 | Layout +
321 | Features +
322 | Enterprise Class Attribute+
323 | Case
324 | (1 row)
325 |
326 | SELECT pdf_creator('/tmp/pgintro.pdf');
327 | pdf_creator
328 | ------------------------------------
329 | Microsoft® Office PowerPoint® 2007
330 | (1 row)
331 |
332 | SELECT pdf_keywords('/tmp/pgintro.pdf');
333 | pdf_keywords
334 | --------------
335 |
336 | (1 row)
337 |
338 | SELECT pdf_metadata('/tmp/pgintro.pdf');
339 | pdf_metadata
340 | --------------
341 |
342 | (1 row)
343 |
344 | SELECT pdf_version('/tmp/pgintro.pdf');
345 | pdf_version
346 | -------------
347 | PDF-1.5
348 | (1 row)
349 |
350 | SELECT pdf_subject('/tmp/pgintro.pdf');
351 | pdf_subject
352 | -------------
353 |
354 | (1 row)
355 |
356 | SELECT pdf_creation('/tmp/pgintro.pdf');
357 | pdf_creation
358 | --------------------------
359 | Wed Jul 20 11:13:37 2011
360 | (1 row)
361 |
362 | SELECT pdf_modification('/tmp/pgintro.pdf');
363 | pdf_modification
364 | --------------------------
365 | Wed Jul 20 11:13:37 2011
366 | (1 row)
367 |
368 | /* bytea -> pdf */
369 | SELECT pg_read_binary_file('/tmp/pgintro.pdf')::pdf::text = '/tmp/pgintro.pdf'::pdf::text;
370 | ?column?
371 | ----------
372 | t
373 | (1 row)
374 |
375 | /* pdf -> bytea */
376 | SELECT '/tmp/pgintro.pdf'::pdf::bytea = pg_read_binary_file('/tmp/pgintro.pdf');
377 | ?column?
378 | ----------
379 | t
380 | (1 row)
381 |
382 | /* FTS */
383 | SELECT '/tmp/pgintro.pdf'::pdf::text @@ to_tsquery('postgres');
384 | ?column?
385 | ----------
386 | t
387 | (1 row)
388 |
389 | SELECT '/tmp/pgintro.pdf'::pdf::text @@ to_tsquery('oracle');
390 | ?column?
391 | ----------
392 | f
393 | (1 row)
394 |
395 | /* Old functions mentioned in the blog post. Making sure they continue to work */
396 | SELECT pdf_read_file('/tmp/pgintro.pdf') = '/tmp/pgintro.pdf'::pdf::text;
397 | ?column?
398 | ----------
399 | t
400 | (1 row)
401 |
402 | select pdf_read_bytes(pg_read_binary_file('/tmp/pgintro.pdf')) = '/tmp/pgintro.pdf'::pdf::text;
403 | ?column?
404 | ----------
405 | t
406 | (1 row)
407 |
408 | /* bigger files: >8KB */
409 | CREATE TABLE pdfs(i serial primary key, d pdf);
410 | INSERT INTO pdfs(d) VALUES('/tmp/pgintro.pdf');
411 | INSERT INTO pdfs(d) VALUES('/tmp/big.pdf');
412 | SELECT length(d::text) FROM pdfs
413 | length
414 | ---------
415 | 4182
416 | 6042170
417 | (2 rows)
418 |
419 |
--------------------------------------------------------------------------------
/test/pgintro.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Florents-Tselai/pgpdf/01e89bcbec81229cfac1819c625ce9986c608720/test/pgintro.pdf
--------------------------------------------------------------------------------
/test/sql/pgpdf.sql:
--------------------------------------------------------------------------------
1 | /* Errors */
2 | SELECT 'notexists.pdf'::pdf;
3 | SELECT '/tmp/bad.pdf'::pdf;
4 |
5 | /* OK */
6 | SELECT '/tmp/pgintro.pdf'::pdf;
7 |
8 | /* API */
9 | SELECT pdf_title('/tmp/pgintro.pdf');
10 | SELECT pdf_author('/tmp/pgintro.pdf');
11 | SELECT pdf_num_pages('/tmp/pgintro.pdf');
12 | SELECT pdf_page('/tmp/pgintro.pdf', 1);
13 | SELECT pdf_creator('/tmp/pgintro.pdf');
14 | SELECT pdf_keywords('/tmp/pgintro.pdf');
15 | SELECT pdf_metadata('/tmp/pgintro.pdf');
16 | SELECT pdf_version('/tmp/pgintro.pdf');
17 | SELECT pdf_subject('/tmp/pgintro.pdf');
18 | SELECT pdf_creation('/tmp/pgintro.pdf');
19 | SELECT pdf_modification('/tmp/pgintro.pdf');
20 |
21 | /* bytea -> pdf */
22 | SELECT pg_read_binary_file('/tmp/pgintro.pdf')::pdf::text = '/tmp/pgintro.pdf'::pdf::text;
23 |
24 | /* pdf -> bytea */
25 | SELECT '/tmp/pgintro.pdf'::pdf::bytea = pg_read_binary_file('/tmp/pgintro.pdf');
26 |
27 | /* FTS */
28 | SELECT '/tmp/pgintro.pdf'::pdf::text @@ to_tsquery('postgres');
29 | SELECT '/tmp/pgintro.pdf'::pdf::text @@ to_tsquery('oracle');
30 |
31 | /* Old functions mentioned in the blog post. Making sure they continue to work */
32 | SELECT pdf_read_file('/tmp/pgintro.pdf') = '/tmp/pgintro.pdf'::pdf::text;
33 | select pdf_read_bytes(pg_read_binary_file('/tmp/pgintro.pdf')) = '/tmp/pgintro.pdf'::pdf::text;
34 |
35 | /* bigger files: >8KB */
36 | CREATE TABLE pdfs(i serial primary key, d pdf);
37 | INSERT INTO pdfs(d) VALUES('/tmp/pgintro.pdf');
38 | INSERT INTO pdfs(d) VALUES('/tmp/big.pdf');
39 | SELECT length(d::text) FROM pdfs
--------------------------------------------------------------------------------