├── ffi ├── .gitignore ├── Cargo.toml ├── src │ ├── types.rs │ ├── metadata.rs │ ├── stream.rs │ ├── errors.rs │ ├── lib.rs │ ├── config.rs │ └── extractor.rs ├── examples │ ├── basic_extraction.c │ ├── pdf_with_ocr.c │ ├── streaming_extraction.c │ └── README.md ├── cbindgen.toml └── build.rs ├── .prettierrc ├── tests ├── ffi │ ├── test_ffi_interface │ ├── Makefile │ └── test_ffi_interface.c ├── README.md └── go │ └── integration_test.go ├── copy.sh ├── go.mod ├── .gitignore ├── examples ├── basic │ └── main.go ├── streaming │ └── main.go └── config │ └── main.go ├── go.sum ├── cgo.go ├── .github └── workflows │ ├── scripts │ ├── build-local.sh │ └── collect-libs.sh │ ├── release.yml │ └── build.yml ├── check_native_libs.go ├── stream.go ├── metadata.go ├── README.md ├── errors.go ├── LICENSE ├── types.go ├── extractous.h └── cmd └── install └── main.go /ffi/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "tabWidth": 2 3 | } 4 | -------------------------------------------------------------------------------- /tests/ffi/test_ffi_interface: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rahulpoonia29/extractous-go/HEAD/tests/ffi/test_ffi_interface -------------------------------------------------------------------------------- /copy.sh: -------------------------------------------------------------------------------- 1 | cp /mnt/c/Users/Rahul\ Poonia/Downloads/extractous-ffi-linux_amd64.zip ~/dev/extractous-go/ 2 | 3 | unzip extractous-ffi-linux_amd64.zip -d extractous-ffi-linux_amd64 4 | 5 | cd extractous-ffi-linux_amd64 6 | 7 | cp ./lib ../../benchmark/native/linux_amd64/lib -r 8 | 
-------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/rahulpoonia29/extractous-go 2 | 3 | go 1.25.1 4 | 5 | require ( 6 | github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect 7 | github.com/rivo/uniseg v0.4.7 // indirect 8 | github.com/schollz/progressbar/v3 v3.18.0 // indirect 9 | golang.org/x/sys v0.37.0 // indirect 10 | golang.org/x/term v0.36.0 // indirect 11 | ) 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and headers 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | ffi/*.h 8 | native/* 9 | dist/ 10 | 11 | # Test binary, built with `go test -c` 12 | *.test 13 | 14 | # Dependency directories 15 | native/ 16 | 17 | # Go workspace file 18 | go.work 19 | 20 | # FFI build artifacts 21 | target/ 22 | *.a 23 | *.o 24 | 25 | # OS-specific files 26 | .DS_Store 27 | Thumbs.db 28 | 29 | # IDE files 30 | .vscode/ 31 | .idea/ 32 | 33 | # Logs 34 | *.log 35 | 36 | # Temporary files 37 | *.tmp 38 | *.swp 39 | -------------------------------------------------------------------------------- /ffi/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "extractous-ffi" 3 | version = "0.1.2" 4 | edition = "2024" 5 | rust-version = "1.90" 6 | authors = ["Rahul"] 7 | description = "C FFI bindings for extractous document extraction library" 8 | license = "Apache-2.0" 9 | repository = "https://github.com/rahulpoonia29/extractous-go" 10 | publish = false 11 | 12 | [lib] 13 | name = "extractous_ffi" 14 | crate-type = ["cdylib"] 15 | 16 | [dependencies] 17 | extractous = "0.3.0" 18 | libc = "0.2" 19 | 20 | [build-dependencies] 21 | cbindgen = "0.29" 22 | 23 | [profile.release] 24 | opt-level = 3 
25 | lto = true 26 | codegen-units = 1 27 | strip = false 28 | panic = "abort" 29 | 30 | [profile.dev] 31 | opt-level = 0 32 | -------------------------------------------------------------------------------- /examples/basic/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | 7 | "github.com/rahulpoonia29/extractous-go" 8 | ) 9 | 10 | func main() { 11 | // Create a new extractor 12 | extractor := extractous.New() 13 | if extractor == nil { 14 | log.Fatal("Failed to create extractor") 15 | } 16 | defer extractor.Close() 17 | 18 | // Extract text from a PDF file 19 | content, metadata, err := extractor.ExtractFileToString("sample.pdf") 20 | if err != nil { 21 | log.Fatalf("Extraction failed: %v", err) 22 | } 23 | 24 | // Print the extracted content 25 | fmt.Println("Extracted Content:") 26 | fmt.Println(content) 27 | 28 | // Print metadata 29 | fmt.Println("\nMetadata:") 30 | for key, values := range metadata { 31 | fmt.Printf("%s: %v\n", key, values) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ= 2 | github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw= 3 | github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= 4 | github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= 5 | github.com/schollz/progressbar/v3 v3.18.0 h1:uXdoHABRFmNIjUfte/Ex7WtuyVslrw2wVPQmCN62HpA= 6 | github.com/schollz/progressbar/v3 v3.18.0/go.mod h1:IsO3lpbaGuzh8zIMzgY3+J8l4C8GjO0Y9S69eFvNsec= 7 | golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ= 8 | golang.org/x/sys v0.37.0/go.mod 
h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= 9 | golang.org/x/term v0.36.0 h1:zMPR+aF8gfksFprF/Nc/rd1wRS1EI6nDBGyWAvDzx2Q= 10 | golang.org/x/term v0.36.0/go.mod h1:Qu394IJq6V6dCBRgwqshf3mPF85AqzYEzofzRdZkWss= 11 | -------------------------------------------------------------------------------- /examples/streaming/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "log" 7 | 8 | "github.com/rahulpoonia29/extractous-go" 9 | ) 10 | 11 | func main() { 12 | // Create a new extractor 13 | extractor := extractous.New() 14 | if extractor == nil { 15 | log.Fatal("Failed to create extractor") 16 | } 17 | defer extractor.Close() 18 | 19 | // Extract to a stream for large files 20 | reader, metadata, err := extractor.ExtractFile("large_document.pdf") 21 | if err != nil { 22 | log.Fatalf("Extraction failed: %v", err) 23 | } 24 | defer reader.Close() 25 | 26 | // Read and print in chunks 27 | buffer := make([]byte, 4096) 28 | for { 29 | n, err := reader.Read(buffer) 30 | if err != nil && err != io.EOF { 31 | log.Fatalf("Read failed: %v", err) 32 | } 33 | if n == 0 { 34 | break 35 | } 36 | fmt.Print(string(buffer[:n])) 37 | } 38 | 39 | // Print metadata 40 | fmt.Println("\nMetadata:") 41 | for key, values := range metadata { 42 | fmt.Printf("%s: %v\n", key, values) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /tests/ffi/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for FFI Tests 2 | 3 | CC = gcc 4 | CFLAGS = -Wall -Wextra -I../../include 5 | LDFLAGS = \ 6 | -L../../extractous-ffi/target/release \ 7 | -L../../native/$(shell uname -s | tr '[:upper:]' '[:lower:]')_$(shell uname -m | sed 's/x86_64/amd64/;s/aarch64/arm64/') \ 8 | -Wl,-rpath,../../extractous-ffi/target/release \ 9 | -Wl,-rpath,../../native/$(shell uname -s | tr '[:upper:]' '[:lower:]')_$(shell uname -m | sed 
's/x86_64/amd64/;s/aarch64/arm64/') 10 | LIBS = -lextractous_ffi -ldl -lm -lpthread 11 | 12 | TEST_BINS = test_ffi_interface 13 | 14 | .PHONY: all clean run 15 | 16 | all: $(TEST_BINS) 17 | 18 | test_ffi_interface: test_ffi_interface.c 19 | $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) $(LIBS) 20 | 21 | run: all 22 | @echo "Running FFI interface tests..." 23 | @./test_ffi_interface 24 | 25 | clean: 26 | rm -f $(TEST_BINS) 27 | rm -f *.o 28 | 29 | help: 30 | @echo "FFI Test Suite" 31 | @echo "" 32 | @echo "Targets:" 33 | @echo " all - Build all test binaries" 34 | @echo " run - Build and run all tests" 35 | @echo " clean - Remove test binaries" 36 | @echo " help - Show this help message" 37 | -------------------------------------------------------------------------------- /ffi/src/types.rs: -------------------------------------------------------------------------------- 1 | use std::os::raw::{c_char, c_int}; 2 | 3 | #[repr(C)] 4 | pub struct CExtractor { 5 | _private: [u8; 0], 6 | } 7 | #[repr(C)] 8 | pub struct CStreamReader { 9 | _private: [u8; 0], 10 | } 11 | #[repr(C)] 12 | pub struct CPdfParserConfig { 13 | _private: [u8; 0], 14 | } 15 | #[repr(C)] 16 | pub struct COfficeParserConfig { 17 | _private: [u8; 0], 18 | } 19 | #[repr(C)] 20 | pub struct CTesseractOcrConfig { 21 | _private: [u8; 0], 22 | } 23 | 24 | #[repr(C)] 25 | pub struct CMetadata { 26 | /// Array of pointers to null-terminated key strings 27 | pub keys: *mut *mut c_char, 28 | /// Array of pointers to null-terminated value strings 29 | pub values: *mut *mut c_char, 30 | /// The number of key-value pairs in the arrays 31 | pub len: libc::size_t, 32 | } 33 | 34 | pub const CHARSET_UTF_8: c_int = 0; 35 | pub const CHARSET_US_ASCII: c_int = 1; 36 | pub const CHARSET_UTF_16BE: c_int = 3; 37 | 38 | pub const PDF_OCR_STRATEGY_NO_OCR: c_int = 0; 39 | pub const PDF_OCR_STRATEGY_OCR_ONLY: c_int = 1; 40 | pub const PDF_OCR_STRATEGY_OCR_AND_TEXT_EXTRACTION: c_int = 2; 41 | pub const PDF_OCR_STRATEGY_AUTO: c_int = 3; 
42 | -------------------------------------------------------------------------------- /examples/config/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | 7 | "github.com/rahulpoonia29/extractous-go" 8 | ) 9 | 10 | func main() { 11 | // Create PDF config with OCR 12 | pdfConfig := extractous.NewPdfConfig(). 13 | SetOcrStrategy(extractous.PdfOcrAuto). 14 | SetExtractAnnotationText(true) 15 | 16 | // Create OCR config 17 | ocrConfig := extractous.NewOcrConfig(). 18 | SetLanguage("eng"). 19 | SetTimeoutSeconds(60) 20 | 21 | // Create extractor with configurations 22 | extractor := extractous.New(). 23 | SetExtractStringMaxLength(50000). 24 | SetEncoding(extractous.CharSetUTF8). 25 | SetPdfConfig(pdfConfig). 26 | SetOcrConfig(ocrConfig) 27 | if extractor == nil { 28 | log.Fatal("Failed to create configured extractor") 29 | } 30 | defer extractor.Close() 31 | 32 | // Extract from a URL 33 | content, metadata, err := extractor.ExtractURLToString("https://example.com/sample.pdf") 34 | if err != nil { 35 | log.Fatalf("Extraction failed: %v", err) 36 | } 37 | 38 | fmt.Println("Extracted Content:") 39 | fmt.Println(content) 40 | 41 | fmt.Println("\nMetadata:") 42 | for key, values := range metadata { 43 | fmt.Printf("%s: %v\n", key, values) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /ffi/examples/basic_extraction.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Basic extraction example 3 | * 4 | * Demonstrates how to: 5 | * - Create an extractor 6 | * - Extract text from a file 7 | * - Access metadata 8 | * - Proper memory cleanup 9 | */ 10 | 11 | #include // Replace with the correct path to extractous.h 12 | #include 13 | #include 14 | 15 | int main() { 16 | CExtractor* ext = extractous_extractor_new(); 17 | if (!ext) { 18 | fprintf(stderr, "Failed to create extractor\n"); 19 
| return 1; 20 | } 21 | 22 | char* content; 23 | CMetadata* metadata; 24 | 25 | int result = extractous_extractor_extract_file_to_string( 26 | ext, "document.pdf", &content, &metadata 27 | ); 28 | 29 | if (result == ERR_OK) { 30 | printf("Content: %s\n", content); 31 | 32 | for (size_t i = 0; i < metadata->len; i++) { 33 | printf("%s: %s\n", metadata->keys[i], metadata->values[i]); 34 | } 35 | 36 | extractous_string_free(content); 37 | extractous_metadata_free(metadata); 38 | } else { 39 | char* msg = extractous_error_message(result); 40 | fprintf(stderr, "Error: %s\n", msg); 41 | extractous_string_free(msg); 42 | } 43 | 44 | extractous_extractor_free(ext); 45 | return 0; 46 | } 47 | -------------------------------------------------------------------------------- /cgo.go: -------------------------------------------------------------------------------- 1 | //go:build windows || darwin || linux 2 | // +build windows darwin linux 3 | 4 | //go:generate go run check_native_libs.go 5 | 6 | package extractous 7 | 8 | /* 9 | // Linux 10 | #cgo linux,amd64 CFLAGS: -I${SRCDIR} 11 | #cgo linux,amd64 LDFLAGS: -lextractous_ffi -ldl -lm -lpthread 12 | #cgo linux,arm64 CFLAGS: -I${SRCDIR} 13 | #cgo linux,arm64 LDFLAGS: -lextractous_ffi -ldl -lm -lpthread 14 | 15 | // macOS 16 | #cgo darwin,amd64 CFLAGS: -I${SRCDIR} 17 | #cgo darwin,amd64 LDFLAGS: -lextractous_ffi -ldl -lm -lpthread 18 | #cgo darwin,arm64 CFLAGS: -I${SRCDIR} 19 | #cgo darwin,arm64 LDFLAGS: -lextractous_ffi -ldl -lm -lpthread 20 | 21 | // Windows 22 | #cgo windows,amd64 CFLAGS: -I${SRCDIR} 23 | #cgo windows,amd64 LDFLAGS: -lextractous_ffi 24 | 25 | 26 | // Include the generated header 27 | #include "extractous.h" 28 | */ 29 | import "C" 30 | import ( 31 | "runtime" 32 | "unsafe" 33 | ) 34 | 35 | // init locks the OS thread for JNI compatibility and library initialization. 36 | // The constructor functions above run BEFORE this init() is called. 
37 | func init() { 38 | runtime.LockOSThread() 39 | } 40 | 41 | // Helper Functions for C Interop 42 | func cString(s string) *C.char { 43 | return C.CString(s) 44 | } 45 | 46 | func goString(cs *C.char) string { 47 | return C.GoString(cs) 48 | } 49 | 50 | func freeString(cs *C.char) { 51 | C.free(unsafe.Pointer(cs)) 52 | } 53 | -------------------------------------------------------------------------------- /.github/workflows/scripts/build-local.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # Local build script for extractous-ffi 5 | # Usage: ./scripts/build-local.sh [target] 6 | 7 | # Detect platform 8 | if [ -z "$1" ]; then 9 | case "$(uname -s)" in 10 | Linux*) TARGET="x86_64-unknown-linux-gnu"; LIB_EXT="so"; PLATFORM="linux_amd64";; 11 | Darwin*) 12 | if [ "$(uname -m)" = "arm64" ]; then 13 | TARGET="aarch64-apple-darwin"; PLATFORM="darwin_arm64" 14 | else 15 | TARGET="x86_64-apple-darwin"; PLATFORM="darwin_amd64" 16 | fi 17 | LIB_EXT="dylib" 18 | ;; 19 | MINGW*|MSYS*|CYGWIN*) TARGET="x86_64-pc-windows-msvc"; LIB_EXT="dll"; PLATFORM="windows_amd64";; 20 | *) echo "Unknown platform"; exit 1;; 21 | esac 22 | else 23 | TARGET=$1 24 | # Detect lib_ext and platform from target 25 | fi 26 | 27 | echo "Building for: $TARGET" 28 | echo "Platform: $PLATFORM" 29 | 30 | # Check for GraalVM 31 | if [ -z "$GRAALVM_HOME" ] && [ -z "$JAVA_HOME" ]; then 32 | echo "Error: GRAALVM_HOME or JAVA_HOME must be set" 33 | echo "Install GraalVM 23+ with native-image" 34 | exit 1 35 | fi 36 | 37 | # Build 38 | cd ffi 39 | cargo build --release --target "$TARGET" 40 | cd .. 41 | 42 | # Collect libraries 43 | ./scripts/collect-libs.sh "$PLATFORM" "$TARGET" "$LIB_EXT" 44 | 45 | echo "" 46 | echo "✓ Build complete!" 
47 | echo "Distribution: dist/$PLATFORM/" 48 | echo "" 49 | echo "To use in Go:" 50 | echo " export CGO_CFLAGS=\"-I$(pwd)/dist/$PLATFORM/include\"" 51 | echo " export CGO_LDFLAGS=\"-L$(pwd)/dist/$PLATFORM/lib -lextractous_ffi\"" 52 | echo " export LD_LIBRARY_PATH=\"$(pwd)/dist/$PLATFORM/lib\" # Linux" 53 | echo " export DYLD_LIBRARY_PATH=\"$(pwd)/dist/$PLATFORM/lib\" # macOS" 54 | -------------------------------------------------------------------------------- /ffi/examples/pdf_with_ocr.c: -------------------------------------------------------------------------------- 1 | /** 2 | * PDF with OCR example 3 | * 4 | * Demonstrates how to: 5 | * - Configure PDF parser for OCR 6 | * - Set OCR language and parameters 7 | * - Extract from scanned PDFs 8 | */ 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | int main() { 15 | // PDF config 16 | CPdfParserConfig* pdf = extractous_pdf_config_new(); 17 | if (!pdf) { 18 | fprintf(stderr, "Failed to create PDF config\n"); 19 | return 1; 20 | } 21 | pdf = extractous_pdf_config_set_ocr_strategy(pdf, PDF_OCR_STRATEGY_AUTO); 22 | pdf = extractous_pdf_config_set_extract_annotation_text(pdf, true); 23 | 24 | // OCR config 25 | CTesseractOcrConfig* ocr = extractous_ocr_config_new(); 26 | if (!ocr) { 27 | fprintf(stderr, "Failed to create OCR config\n"); 28 | extractous_pdf_config_free(pdf); 29 | return 1; 30 | } 31 | ocr = extractous_ocr_config_set_language(ocr, "eng"); 32 | ocr = extractous_ocr_config_set_density(ocr, 300); 33 | 34 | // Extractor 35 | CExtractor* ext = extractous_extractor_new(); 36 | if (!ext) { 37 | fprintf(stderr, "Failed to create extractor\n"); 38 | extractous_ocr_config_free(ocr); 39 | extractous_pdf_config_free(pdf); 40 | return 1; 41 | } 42 | ext = extractous_extractor_set_pdf_config(ext, pdf); // Consumes pdf 43 | ext = extractous_extractor_set_ocr_config(ext, ocr); // Consumes ocr 44 | 45 | // Extract 46 | char* content; 47 | CMetadata* metadata; 48 | int result = 
extractous_extractor_extract_file_to_string( 49 | ext, "document.pdf", &content, &metadata 50 | ); 51 | 52 | if (result == ERR_OK) { 53 | printf("Content: %s\n", content); 54 | extractous_string_free(content); 55 | extractous_metadata_free(metadata); 56 | } else { 57 | char* msg = extractous_error_message(result); 58 | fprintf(stderr, "Error: %s\n", msg); 59 | extractous_string_free(msg); 60 | } 61 | 62 | extractous_extractor_free(ext); 63 | return 0; 64 | } 65 | -------------------------------------------------------------------------------- /ffi/examples/streaming_extraction.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Streaming extraction example 3 | * 4 | * Demonstrates how to: 5 | * - Extract large files using streaming 6 | * - Process content in chunks 7 | * - Avoid loading entire file into memory 8 | */ 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | #define BUFFER_SIZE 4096 15 | 16 | int main(int argc, char** argv) { 17 | if (argc != 2) { 18 | fprintf(stderr, "Usage: %s \n", argv[0]); 19 | return 1; 20 | } 21 | 22 | const char* file_path = argv[1]; 23 | 24 | // Create extractor 25 | CExtractor* extractor = extractous_extractor_new(); 26 | if (!extractor) { 27 | fprintf(stderr, "Failed to create extractor\n"); 28 | return 1; 29 | } 30 | 31 | // Extract to stream 32 | CStreamReader* reader = NULL; 33 | CMetadata* metadata = NULL; 34 | 35 | printf("Streaming extraction from: %s\n", file_path); 36 | 37 | int err = extractous_extractor_extract_file( 38 | extractor, 39 | file_path, 40 | &reader, 41 | &metadata 42 | ); 43 | 44 | if (err != ERR_OK) { 45 | char* error_msg = extractous_error_message(err); 46 | fprintf(stderr, "Failed to start extraction (code %d): %s\n", err, error_msg); 47 | extractous_string_free(error_msg); 48 | extractous_extractor_free(extractor); 49 | return 1; 50 | } 51 | 52 | // Print metadata first 53 | printf("\n=== Metadata (%zu entries) ===\n", metadata->len); 54 | for (size_t i 
= 0; i < metadata->len; i++) { 55 | printf("%s: %s\n", metadata->keys[i], metadata->values[i]); 56 | } 57 | 58 | // Stream content in chunks 59 | printf("\n=== Content ===\n"); 60 | 61 | char buffer[BUFFER_SIZE]; 62 | size_t bytes_read; 63 | size_t total_bytes = 0; 64 | 65 | while (extractous_stream_read(reader, (uint8_t*)buffer, BUFFER_SIZE, &bytes_read) == ERR_OK 66 | && bytes_read > 0) { 67 | // Process chunk (here we just print it) 68 | fwrite(buffer, 1, bytes_read, stdout); 69 | total_bytes += bytes_read; 70 | } 71 | 72 | printf("\n\n=== Summary ===\n"); 73 | printf("Total bytes read: %zu\n", total_bytes); 74 | 75 | // Cleanup 76 | extractous_stream_free(reader); 77 | extractous_metadata_free(metadata); 78 | extractous_extractor_free(extractor); 79 | 80 | printf("Streaming extraction successful!\n"); 81 | return 0; 82 | } 83 | -------------------------------------------------------------------------------- /check_native_libs.go: -------------------------------------------------------------------------------- 1 | //go:build ignore 2 | 3 | package main 4 | 5 | import ( 6 | "fmt" 7 | "os" 8 | "path/filepath" 9 | "runtime" 10 | ) 11 | 12 | func main() { 13 | // This script is run by `go generate` to check if the native libraries 14 | // required for CGO exist. If they don't, it prints a helpful error 15 | // message to guide the user. 16 | 17 | _, currentFile, _, ok := runtime.Caller(0) 18 | if !ok { 19 | fmt.Fprintln(os.Stderr, "Error: Cannot determine file path. 
Please run the installer.") 20 | os.Exit(1) 21 | } 22 | 23 | projectRoot := filepath.Dir(currentFile) 24 | nativeDir := filepath.Join(projectRoot, "native") 25 | 26 | // Check if native directory exists 27 | if _, err := os.Stat(nativeDir); os.IsNotExist(err) { 28 | printError() 29 | os.Exit(1) 30 | } 31 | 32 | // Check for the platform-specific library directory 33 | platform := fmt.Sprintf("%s_%s", runtime.GOOS, runtime.GOARCH) 34 | libDir := filepath.Join(nativeDir, platform) 35 | 36 | if _, err := os.Stat(libDir); os.IsNotExist(err) { 37 | fmt.Fprintf(os.Stderr, "\n") 38 | fmt.Fprintf(os.Stderr, "Error: Native libraries not found for %s!\n", platform) 39 | fmt.Fprintf(os.Stderr, "Expected library directory: %s\n", libDir) 40 | fmt.Fprintf(os.Stderr, "\n") 41 | printError() 42 | os.Exit(1) 43 | } 44 | 45 | // Verify the actual library file exists 46 | var libFile string 47 | switch runtime.GOOS { 48 | case "windows": 49 | libFile = filepath.Join(libDir, "extractous_ffi.dll") 50 | case "darwin": 51 | libFile = filepath.Join(libDir, "libextractous_ffi.dylib") 52 | default: // linux 53 | libFile = filepath.Join(libDir, "libextractous_ffi.so") 54 | } 55 | 56 | if _, err := os.Stat(libFile); os.IsNotExist(err) { 57 | fmt.Fprintf(os.Stderr, "Error: Library file not found: %s\n", libFile) 58 | printError() 59 | os.Exit(1) 60 | } 61 | 62 | // Success - libraries are present 63 | fmt.Printf("✓ Native libraries verified for %s\n", platform) 64 | fmt.Printf(" Library: %s\n", libFile) 65 | } 66 | 67 | func printError() { 68 | fmt.Fprintln(os.Stderr, "Error: Native FFI libraries not found!") 69 | fmt.Fprintln(os.Stderr, "This project uses CGO and requires pre-compiled native libraries") 70 | fmt.Fprintln(os.Stderr, "that were not found in the 'native/' directory.") 71 | fmt.Fprintln(os.Stderr, "") 72 | fmt.Fprintln(os.Stderr, "To fix this, please run the installer command from your project root:") 73 | fmt.Fprintln(os.Stderr, "") 74 | fmt.Fprintln(os.Stderr, " go run 
github.com/rahulpoonia29/extractous-go/cmd/install@latest") 75 | fmt.Fprintln(os.Stderr, "") 76 | fmt.Fprintln(os.Stderr, "This will download the correct libraries for your platform.") 77 | fmt.Fprintln(os.Stderr, "After running the installer, try your build again.") 78 | } 79 | -------------------------------------------------------------------------------- /ffi/src/metadata.rs: -------------------------------------------------------------------------------- 1 | use crate::types::CMetadata; 2 | use std::collections::HashMap; 3 | use std::ffi::CString; 4 | use std::os::raw::c_char; 5 | use std::ptr; 6 | 7 | /// Convert a Rust HashMap to a C-compatible metadata structure. 8 | pub(crate) unsafe fn metadata_to_c(metadata: HashMap<String, Vec<String>>) -> *mut CMetadata { 9 | if metadata.is_empty() { 10 | return Box::into_raw(Box::new(CMetadata { 11 | keys: ptr::null_mut(), 12 | values: ptr::null_mut(), 13 | len: 0, 14 | })); 15 | } 16 | 17 | let capacity = metadata.len(); 18 | let mut keys: Vec<*mut c_char> = Vec::with_capacity(capacity); 19 | let mut values: Vec<*mut c_char> = Vec::with_capacity(capacity); 20 | 21 | for (key, value_vec) in metadata { 22 | // CString::new will return an error if the string contains `\0`. 23 | let c_key = match CString::new(key) { 24 | Ok(s) => s.into_raw(), 25 | Err(_) => continue, // Skip metadata with invalid keys. 26 | }; 27 | 28 | let joined_values = value_vec.join(", "); 29 | let c_value = match CString::new(joined_values) { 30 | Ok(s) => s.into_raw(), 31 | Err(_) => { 32 | // Clean up the already-allocated key if the value is invalid. 33 | let _ = unsafe { CString::from_raw(c_key) }; 34 | continue; 35 | } 36 | }; 37 | 38 | keys.push(c_key); 39 | values.push(c_value); 40 | } 41 | 42 | // Final length is derived from the vectors after they are populated. 43 | // Guarantees that the length matches the number of allocated pointers. 
44 | let final_len = keys.len(); 45 | assert_eq!(final_len, values.len()); 46 | 47 | if final_len == 0 { 48 | return Box::into_raw(Box::new(CMetadata { 49 | keys: ptr::null_mut(), 50 | values: ptr::null_mut(), 51 | len: 0, 52 | })); 53 | } 54 | 55 | keys.shrink_to_fit(); 56 | values.shrink_to_fit(); 57 | 58 | let keys_ptr = keys.as_mut_ptr(); 59 | let values_ptr = values.as_mut_ptr(); 60 | std::mem::forget(keys); 61 | std::mem::forget(values); 62 | 63 | Box::into_raw(Box::new(CMetadata { 64 | keys: keys_ptr, 65 | values: values_ptr, 66 | len: final_len, 67 | })) 68 | } 69 | 70 | /// Frees a metadata structure and all associated memory. 71 | #[unsafe(no_mangle)] 72 | pub unsafe extern "C" fn extractous_metadata_free(metadata: *mut CMetadata) { 73 | if metadata.is_null() { 74 | return; 75 | } 76 | 77 | // Take ownership of CMetadata struct. 78 | let m = unsafe { Box::from_raw(metadata) }; 79 | 80 | let keys_vec = unsafe { Vec::from_raw_parts(m.keys, m.len, m.len) }; 81 | let values_vec = unsafe { Vec::from_raw_parts(m.values, m.len, m.len) }; 82 | 83 | // Drop to free the memory for each CString. 84 | for key_ptr in keys_vec { 85 | let _ = unsafe { CString::from_raw(key_ptr) }; 86 | } 87 | 88 | for value_ptr in values_vec { 89 | let _ = unsafe { CString::from_raw(value_ptr) }; 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /ffi/cbindgen.toml: -------------------------------------------------------------------------------- 1 | header = """ 2 | /* 3 | * Extractous FFI - C Interface 4 | * 5 | * This header file provides a C-compatible interface to the Extractous 6 | * document extraction library. It is safe for use with Go via cgo or any 7 | * C-compatible FFI system. 8 | * 9 | * License: Apache-2.0 10 | * Repository: https://github.com/rahulpoonia29/extractous-go 11 | * 12 | * MEMORY MANAGEMENT: 13 | * All pointers returned by Extractous functions must be freed using the matching free function (extractous_string_free for strings, extractous_metadata_free for metadata, extractous_stream_free for stream readers, extractous_extractor_free for extractors). 
14 | * Failure to do so will result in memory leaks. 15 | * 16 | * 17 | * CGO USAGE: 18 | * // #cgo CFLAGS: -I${SRCDIR}/include 19 | * // #cgo LDFLAGS: -L${SRCDIR}/lib -lextractous_ffi 20 | * // #cgo linux LDFLAGS: -Wl,-rpath,$ORIGIN 21 | * // #cgo darwin LDFLAGS: -Wl,-rpath,@loader_path 22 | * // #include "extractous.h" 23 | * import "C" 24 | */ 25 | """ 26 | 27 | # Generate pure C header (no C++) 28 | language = "C" 29 | style = "both" # Both function declarations and typedef definitions 30 | cpp_compat = false # No C++ compatibility 31 | 32 | # Includes 33 | include_guard = "EXTRACTOUS_H" # Add header guard 34 | sys_includes = [] # No system includes needed 35 | includes = [] # No additional includes 36 | 37 | # Warnings 38 | documentation = true # Include Rust doc comments 39 | documentation_style = "c" # Use C-style /** */ comments 40 | 41 | [export] 42 | include = [] # Include all public items 43 | exclude = [] # No exclusions 44 | prefix = "" # No prefix for function names 45 | item_types = [ 46 | "globals", 47 | "enums", 48 | "structs", 49 | "unions", 50 | "typedefs", 51 | "opaque", 52 | "functions", 53 | "constants", 54 | ] 55 | 56 | [layout] 57 | packed = "false" # No packed structs (better portability) 58 | aligned_n = "0" # Natural alignment 59 | 60 | [fn] 61 | rename_args = "None" # Keep original arg names 62 | args = "auto" # Auto-detect by-value vs by-reference 63 | must_use = "auto" # Add [[nodiscard]] for important returns 64 | no_return = "noreturn" # Use noreturn attribute where applicable 65 | 66 | [struct] 67 | rename_fields = "None" # Keep original field names 68 | derive_constructor = false # No C++ constructors 69 | derive_eq = false # No operator overloads 70 | derive_neq = false 71 | derive_lt = false 72 | derive_lte = false 73 | derive_gt = false 74 | derive_gte = false 75 | 76 | [enum] 77 | rename_variants = "None" # Keep original variant names 78 | enum_class = false # C-style enums (not enum class) 79 | prefix_with_name = false # No 
enum name prefix on variants 80 | 81 | [const] 82 | allow_static_const = true # Allow static const declarations 83 | allow_constexpr = false # No constexpr (C++ feature) 84 | 85 | [macro_expansion] 86 | bitflags = false # Don't expand bitflags macros 87 | 88 | [parse] 89 | parse_deps = false # Don't parse dependencies 90 | include = [] 91 | exclude = [] 92 | clean = false # Don't remove items 93 | extra_bindings = [] 94 | 95 | [parse.expand] 96 | crates = [] 97 | all_features = false 98 | default_features = true 99 | features = [] 100 | -------------------------------------------------------------------------------- /ffi/build.rs: -------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::fs; 3 | use std::path::PathBuf; 4 | 5 | fn main() { 6 | // Skip during docs builds 7 | if env::var("DOCS_RS").is_ok() { 8 | return; 9 | } 10 | 11 | let manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap(); 12 | let target = env::var("TARGET").unwrap(); 13 | let profile = env::var("PROFILE").unwrap(); 14 | 15 | println!("cargo:warning=Building extractous-ffi for target: {}", target); 16 | println!("cargo:warning=Profile: {}", profile); 17 | 18 | // 1. Generate C header 19 | generate_header(&manifest_dir); 20 | 21 | // 2. Configure RPATH for runtime library discovery 22 | configure_rpath(&target); 23 | 24 | // 3. Ensure extractous dependency built libraries are discoverable 25 | setup_extractous_libs(&target, &profile); 26 | 27 | // 4. 
Configure rerun triggers 28 | configure_rerun_triggers(); 29 | } 30 | 31 | fn generate_header(crate_dir: &str) { 32 | let root_dir = PathBuf::from(crate_dir).parent().unwrap().to_path_buf(); 33 | let header_path = root_dir.join("extractous.h"); 34 | 35 | match cbindgen::Builder::new() 36 | .with_crate(crate_dir) 37 | .with_config( 38 | cbindgen::Config::from_file("cbindgen.toml") 39 | .unwrap_or_else(|_| cbindgen::Config::default()), 40 | ) 41 | .generate() 42 | { 43 | Ok(bindings) => { 44 | bindings.write_to_file(&header_path); 45 | println!("cargo:warning=Generated C header: {}", header_path.display()); 46 | } 47 | Err(e) => { 48 | println!("cargo:warning=Failed to generate header: {:?}", e); 49 | } 50 | } 51 | } 52 | 53 | fn configure_rpath(target: &str) { 54 | if target.contains("linux") { 55 | // Use $ORIGIN for relocatable libraries 56 | println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN"); 57 | println!("cargo:rustc-link-arg=-Wl,-z,origin"); 58 | println!("cargo:rustc-link-arg=-Wl,--disable-new-dtags"); 59 | println!("cargo:warning=Configured Linux RPATH with $ORIGIN"); 60 | } else if target.contains("darwin") || target.contains("macos") { 61 | // Use @loader_path for macOS 62 | println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path"); 63 | println!("cargo:rustc-link-arg=-Wl,-install_name,@rpath/libextractous_ffi.dylib"); 64 | println!("cargo:warning=Configured macOS RPATH with @loader_path"); 65 | } else if target.contains("windows") { 66 | println!("cargo:warning=Windows: Using default DLL search path"); 67 | } 68 | } 69 | 70 | fn setup_extractous_libs(_target: &str, _profile: &str) { 71 | // The extractous crate builds libtika_native via its build.rs 72 | // We need to ensure those libraries are found during linking 73 | 74 | let out_dir = env::var("OUT_DIR").unwrap(); 75 | let target_dir = PathBuf::from(&out_dir) 76 | .parent().unwrap() 77 | .parent().unwrap() 78 | .parent().unwrap() 79 | .to_path_buf(); 80 | 81 | // Search for extractous build output 82 
| let build_dir = target_dir.join("build"); 83 | 84 | if let Ok(entries) = fs::read_dir(&build_dir) { 85 | for entry in entries.flatten() { 86 | let path = entry.path(); 87 | if let Some(name) = path.file_name() { 88 | if name.to_str().unwrap().starts_with("extractous-") { 89 | let libs_dir = path.join("out").join("libs"); 90 | if libs_dir.exists() { 91 | println!("cargo:rustc-link-search={}", libs_dir.display()); 92 | println!("cargo:warning=Found extractous libs: {}", libs_dir.display()); 93 | } 94 | } 95 | } 96 | } 97 | } 98 | } 99 | 100 | fn configure_rerun_triggers() { 101 | println!("cargo:rerun-if-changed=src"); 102 | println!("cargo:rerun-if-changed=build.rs"); 103 | println!("cargo:rerun-if-changed=cbindgen.toml"); 104 | println!("cargo:rerun-if-changed=Cargo.toml"); 105 | } 106 | -------------------------------------------------------------------------------- /ffi/examples/README.md: -------------------------------------------------------------------------------- 1 | # 📦 Extractous FFI Examples 2 | 3 | This directory contains **C examples** demonstrating how to use the **Extractous FFI** library for text and metadata extraction from various document formats. 
4 | 5 | --- 6 | 7 | ## Examples 8 | 9 | | Example | Description | 10 | |----------|-------------| 11 | | **`basic.c`** | Simple file extraction with metadata | 12 | | **`streaming.c`** | Stream large files without loading into memory | 13 | | **`ocr.c`** | Extract scanned PDFs using OCR | 14 | 15 | --- 16 | 17 | ## Running Examples 18 | 19 | ```bash 20 | # Basic extraction — extracts text and metadata from any supported document format 21 | ./basic document.pdf 22 | 23 | # Streaming extraction — streams content from large files (>50MB) 24 | ./streaming large_document.pdf > output.txt 25 | 26 | # OCR extraction — extracts text from scanned PDFs using Tesseract OCR 27 | ./ocr scanned_document.pdf 28 | ```` 29 | 30 | --- 31 | 32 | ## Requirements 33 | 34 | **Tesseract OCR** must be installed for OCR examples to work. 35 | 36 | ### Ubuntu / Debian 37 | 38 | ```bash 39 | sudo apt install tesseract-ocr tesseract-ocr-eng 40 | ``` 41 | 42 | ### macOS 43 | 44 | ```bash 45 | brew install tesseract 46 | ``` 47 | 48 | ### Windows 49 | 50 | Download from the official repository: [https://github.com/UB-Mannheim/tesseract/wiki](https://github.com/UB-Mannheim/tesseract/wiki) 51 | 52 | --- 53 | 54 | ## Error Handling 55 | 56 | All examples demonstrate robust error handling: 57 | 58 | ```c 59 | int err = extractous_extractor_extract_file_to_string(...); 60 | if (err != ERR_OK) { 61 | char* msg = extractous_error_message(err); 62 | fprintf(stderr, "Error: %s\n", msg); 63 | extractous_string_free(msg); 64 | } 65 | ``` 66 | 67 | --- 68 | 69 | ## Memory Management 70 | 71 | Each example ensures proper cleanup of allocated resources: 72 | 73 | ```c 74 | // Extract 75 | extractous_extractor_extract_file_to_string(ex, path, &content, &meta); 76 | 77 | // Use content and metadata 78 | printf("%s\n", content); 79 | 80 | // Cleanup 81 | extractous_string_free(content); 82 | extractous_metadata_free(meta); 83 | extractous_extractor_free(ex); 84 | ``` 85 | 86 | --- 87 | 88 | ## Common Issues 89 
| 90 | ### Library Not Found 91 | 92 | If you see: 93 | 94 | ``` 95 | error while loading shared libraries 96 | ``` 97 | 98 | Set the library path manually: 99 | 100 | **Linux** 101 | 102 | ```bash 103 | export LD_LIBRARY_PATH=../target/release:$LD_LIBRARY_PATH 104 | ./basic_extraction document.pdf 105 | ``` 106 | 107 | **macOS** 108 | 109 | ```bash 110 | export DYLD_LIBRARY_PATH=../target/release:$DYLD_LIBRARY_PATH 111 | ./basic_extraction document.pdf 112 | ``` 113 | 114 | --- 115 | 116 | ### OCR Not Available 117 | 118 | If OCR examples fail with `ERR_OCR_FAILED`: 119 | 120 | 1. **Install Tesseract:** 121 | 122 | ```bash 123 | # Ubuntu/Debian 124 | sudo apt install tesseract-ocr tesseract-ocr-eng 125 | 126 | # macOS 127 | brew install tesseract 128 | ``` 129 | 130 | 2. **Verify installation:** 131 | 132 | ```bash 133 | tesseract --version 134 | ``` 135 | 136 | --- 137 | 138 | ## Rough skeleton 139 | 140 | ```c 141 | #include "../include/extractous.h" 142 | #include <stdio.h> 143 | 144 | int main(int argc, char** argv) { 145 | // 1. Create extractor 146 | CExtractor* ex = extractous_extractor_new(); 147 | 148 | // 2. Configure (optional) 149 | ex = extractous_extractor_set_xml_output(ex, false); 150 | 151 | // 3. Extract 152 | char* content = NULL; 153 | CMetadata* meta = NULL; 154 | int err = extractous_extractor_extract_file_to_string( 155 | ex, "file.pdf", &content, &meta 156 | ); 157 | 158 | // 4. Check error 159 | if (err != ERR_OK) { 160 | char* msg = extractous_error_message(err); 161 | fprintf(stderr, "Error: %s\n", msg); 162 | extractous_string_free(msg); 163 | extractous_extractor_free(ex); 164 | return 1; 165 | } 166 | 167 | // 5. Use results 168 | printf("%s\n", content); 169 | 170 | // 6. 
Cleanup 171 | extractous_string_free(content); 172 | extractous_metadata_free(meta); 173 | extractous_extractor_free(ex); 174 | 175 | return 0; 176 | } 177 | ``` 178 | -------------------------------------------------------------------------------- /ffi/src/stream.rs: -------------------------------------------------------------------------------- 1 | use crate::ecore::StreamReader as CoreStreamReader; 2 | use crate::errors::*; 3 | use crate::types::*; 4 | use std::io::Read; 5 | 6 | /// Reads data from a stream into a user-provided buffer. 7 | /// 8 | /// Returns the actual number of bytes read via the `bytes_read` output parameter. 9 | /// Reaching the end of the stream is indicated by `ERR_OK` and `*bytes_read == 0`. 10 | #[unsafe(no_mangle)] 11 | pub unsafe extern "C" fn extractous_stream_read( 12 | handle: *mut CStreamReader, 13 | buffer: *mut u8, 14 | buffer_size: libc::size_t, 15 | bytes_read: *mut libc::size_t, 16 | ) -> libc::c_int { 17 | if handle.is_null() || buffer.is_null() { 18 | return ERR_NULL_POINTER; 19 | } 20 | if !bytes_read.is_null() { 21 | unsafe { *bytes_read = 0 }; 22 | } 23 | if buffer_size == 0 { 24 | return ERR_OK; 25 | } 26 | 27 | let reader = unsafe { &mut *(handle as *mut CoreStreamReader) }; 28 | let buf_slice = unsafe { std::slice::from_raw_parts_mut(buffer, buffer_size) }; 29 | 30 | match reader.read(buf_slice) { 31 | Ok(n) => { 32 | if !bytes_read.is_null() { 33 | unsafe { *bytes_read = n }; 34 | } 35 | ERR_OK 36 | } 37 | Err(_) => ERR_IO_ERROR, 38 | } 39 | } 40 | 41 | /// Reads exactly `buffer_size` bytes from the stream. 42 | /// 43 | /// Function will continue reading until the buffer is full, or the end of 44 | /// the stream is reached, or an error occurs. 
45 | #[unsafe(no_mangle)] 46 | pub unsafe extern "C" fn extractous_stream_read_exact( 47 | handle: *mut CStreamReader, 48 | buffer: *mut u8, 49 | buffer_size: libc::size_t, 50 | bytes_read: *mut libc::size_t, 51 | ) -> libc::c_int { 52 | if handle.is_null() || buffer.is_null() || bytes_read.is_null() { 53 | return ERR_NULL_POINTER; 54 | } 55 | if buffer_size == 0 { 56 | return ERR_OK; 57 | } 58 | 59 | unsafe { *bytes_read = 0 }; 60 | 61 | let reader = unsafe { &mut *(handle as *mut CoreStreamReader) }; 62 | // slice representing the user-provided buffer 63 | let total_buf_slice = unsafe { std::slice::from_raw_parts_mut(buffer, buffer_size) }; 64 | 65 | let mut total_bytes_read = 0; 66 | while total_bytes_read < buffer_size { 67 | // In each loop, we try to read into the remaining part of the buffer 68 | let remaining_buf = &mut total_buf_slice[total_bytes_read..]; 69 | 70 | match reader.read(remaining_buf) { 71 | Ok(0) => { 72 | // `read` returned 0, which signifies the end of the stream 73 | // We break the loop and will return the total bytes we've read 74 | break; 75 | } 76 | Ok(n) => { 77 | total_bytes_read += n; 78 | } 79 | Err(e) if e.kind() == std::io::ErrorKind::Interrupted => { 80 | // The read was interrupted by a signal. This is recoverable so we just continue 81 | continue; 82 | } 83 | Err(_) => { 84 | // A non-recoverable I/O error occurred. 85 | return ERR_IO_ERROR; 86 | } 87 | } 88 | } 89 | 90 | unsafe { *bytes_read = total_bytes_read }; 91 | ERR_OK 92 | } 93 | 94 | /// Reads the remaining stream into a newly allocated buffer. 
95 | // #[must_use] 96 | #[unsafe(no_mangle)] 97 | pub unsafe extern "C" fn extractous_stream_read_all( 98 | handle: *mut CStreamReader, 99 | out_buffer: *mut *mut u8, 100 | out_size: *mut libc::size_t, 101 | ) -> libc::c_int { 102 | if handle.is_null() || out_buffer.is_null() || out_size.is_null() { 103 | return ERR_NULL_POINTER; 104 | } 105 | 106 | let reader = unsafe { &mut *(handle as *mut CoreStreamReader) }; 107 | let mut data_vec = Vec::new(); 108 | 109 | match reader.read_to_end(&mut data_vec) { 110 | Ok(_) => { 111 | data_vec.shrink_to_fit(); 112 | 113 | let size = data_vec.len(); 114 | let ptr = data_vec.as_mut_ptr(); 115 | std::mem::forget(data_vec); 116 | 117 | unsafe { *out_buffer = ptr }; 118 | unsafe { *out_size = size }; 119 | ERR_OK 120 | } 121 | Err(_) => ERR_IO_ERROR, 122 | } 123 | } 124 | 125 | /// Frees a buffer allocated by `extractous_stream_read_all`. 126 | #[unsafe(no_mangle)] 127 | pub unsafe extern "C" fn extractous_buffer_free(buffer: *mut u8, size: libc::size_t) { 128 | if buffer.is_null() || size == 0 { 129 | return; 130 | } 131 | let _ = unsafe { Vec::from_raw_parts(buffer, size, size) }; 132 | } 133 | 134 | /// Frees a stream reader and releases its resources. 135 | #[unsafe(no_mangle)] 136 | pub unsafe extern "C" fn extractous_stream_free(handle: *mut CStreamReader) { 137 | if !handle.is_null() { 138 | // Reconstruct the Box and let Rust's drop handler deallocate it. 
139 | let _ = unsafe { Box::from_raw(handle as *mut CoreStreamReader) }; 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /ffi/src/errors.rs: -------------------------------------------------------------------------------- 1 | use crate::ecore::Error; 2 | use std::cell::RefCell; 3 | use std::error::Error as StdError; 4 | use std::ffi::CString; 5 | use std::os::raw::{c_char, c_int}; 6 | use std::ptr; 7 | 8 | pub const ERR_OK: c_int = 0; 9 | pub const ERR_NULL_POINTER: c_int = -1; 10 | pub const ERR_INVALID_UTF8: c_int = -2; 11 | pub const ERR_INVALID_STRING: c_int = -3; 12 | pub const ERR_EXTRACTION_FAILED: c_int = -4; 13 | pub const ERR_IO_ERROR: c_int = -5; 14 | pub const ERR_INVALID_CONFIG: c_int = -6; 15 | pub const ERR_INVALID_ENUM: c_int = -7; 16 | pub const ERR_UNSUPPORTED_FORMAT: c_int = -8; 17 | pub const ERR_OUT_OF_MEMORY: c_int = -9; 18 | pub const ERR_OCR_FAILED: c_int = -10; 19 | 20 | pub(crate) fn extractous_error_to_code(err: &Error) -> c_int { 21 | match err { 22 | Error::IoError(_) => ERR_IO_ERROR, 23 | Error::Utf8Error(_) => ERR_INVALID_UTF8, 24 | 25 | // For unknown errors, inspect the message content 26 | Error::ParseError(msg) | Error::Unknown(msg) => { 27 | let lower_msg = msg.to_lowercase(); 28 | if lower_msg.contains("ocr") { 29 | ERR_OCR_FAILED 30 | } else if lower_msg.contains("unsupported") { 31 | ERR_UNSUPPORTED_FORMAT 32 | } else if lower_msg.contains("config") { 33 | ERR_INVALID_CONFIG 34 | } else { 35 | // Default to general extraction failure 36 | ERR_EXTRACTION_FAILED 37 | } 38 | } 39 | 40 | Error::JniError(jni_err) => { 41 | let error_string = jni_err.to_string(); 42 | let lower_error_string = error_string.to_lowercase(); 43 | 44 | if lower_error_string.contains("javaexception") { 45 | // This string appears when the error is due to a Java-side exception, 46 | // which is the case your `jnicallmethodlocal` handles. 
This is a strong 47 | // indicator of a failure within Tika's processing. 48 | ERR_EXTRACTION_FAILED 49 | } else if lower_error_string.contains("nomemory") { 50 | ERR_OUT_OF_MEMORY 51 | } else { 52 | ERR_EXTRACTION_FAILED 53 | } 54 | } 55 | 56 | Error::JniEnvCall(_) => ERR_EXTRACTION_FAILED, 57 | } 58 | } 59 | 60 | #[unsafe(no_mangle)] 61 | pub extern "C" fn extractous_error_message(code: c_int) -> *mut c_char { 62 | let msg = match code { 63 | ERR_OK => "Operation completed successfully", 64 | ERR_NULL_POINTER => "Null pointer provided as argument", 65 | ERR_INVALID_UTF8 => "Invalid UTF-8 string encoding", 66 | ERR_INVALID_STRING => "String conversion or allocation failed", 67 | ERR_EXTRACTION_FAILED => "Document extraction failed", 68 | ERR_IO_ERROR => "File system or network I/O error", 69 | ERR_INVALID_CONFIG => "Invalid configuration value", 70 | ERR_INVALID_ENUM => "Invalid enumeration value", 71 | ERR_UNSUPPORTED_FORMAT => "Unsupported file format", 72 | ERR_OUT_OF_MEMORY => "Memory allocation failed", 73 | ERR_OCR_FAILED => "OCR operation failed", 74 | _ => "Unknown error code", 75 | }; 76 | match CString::new(msg) { 77 | Ok(s) => s.into_raw(), 78 | Err(_) => ptr::null_mut(), 79 | } 80 | } 81 | 82 | thread_local! 
thread_local! {
    /// Stores the last detailed error that occurred on the current thread.
    static LAST_ERROR: RefCell<Option<Box<dyn StdError + Send>>> = RefCell::new(None);
}

/// Records `err` as the current thread's last error, replacing any previous one.
pub(crate) fn set_last_error(err: impl StdError + Send + 'static) {
    LAST_ERROR.with(|cell| {
        *cell.borrow_mut() = Some(Box::new(err));
    });
}

/// Retrieves a detailed debug report for the last error on this thread.
///
/// The report contains the error message, the chain of `source()` causes and
/// the error's `Debug` representation. Retrieving the report consumes the
/// stored error, so a second call returns null until a new error is recorded.
///
/// Returns a newly allocated C string produced with `CString::into_raw`, or
/// null when no error is stored.
#[unsafe(no_mangle)]
pub extern "C" fn extractous_error_get_last_debug() -> *mut c_char {
    let Some(err) = LAST_ERROR.with(|cell| cell.borrow_mut().take()) else {
        return ptr::null_mut();
    };

    let mut debug_output = format!("Error: {}", err);

    let mut source = err.source();
    if source.is_some() {
        debug_output.push_str("\n\nCaused by:");
    }
    let mut level = 0;
    while let Some(cause) = source {
        debug_output.push_str(&format!("\n {}: {}", level, cause));
        source = cause.source();
        level += 1;
    }
    debug_output.push_str(&format!("\n\nDebug Representation:\n{:?}", err));

    // The error has already been taken; an interior NUL would make
    // `CString::new` fail and lose the report, so replace such bytes first.
    let sanitized = debug_output.replace('\0', "\u{FFFD}");
    CString::new(sanitized).map_or(ptr::null_mut(), CString::into_raw)
}

/// Checks if debug information is available for the current thread.
///
/// Returns 1 when an error is stored, 0 otherwise.
#[unsafe(no_mangle)]
pub extern "C" fn extractous_error_has_debug() -> c_int {
    LAST_ERROR.with(|cell| if cell.borrow().is_some() { 1 } else { 0 })
}

/// Discards the error stored for the current thread, if any.
#[unsafe(no_mangle)]
pub extern "C" fn extractous_error_clear_last() {
    LAST_ERROR.with(|cell| {
        *cell.borrow_mut() = None;
    });
}
4 | 5 | ## Test Structure 6 | 7 | ``` 8 | tests/ 9 | ├── ffi/ # FFI layer tests (C interface) 10 | │ ├── test_ffi_interface.c 11 | │ └── Makefile 12 | ├── go/ # Go binding tests 13 | │ ├── bindings_test.go # Unit tests for Go API 14 | │ └── integration_test.go # Integration tests with actual files 15 | └── testdata/ # Test files (created at runtime) 16 | ``` 17 | 18 | ## Running Tests 19 | 20 | ### 1. FFI Layer Tests (C) 21 | 22 | First, ensure the Rust FFI library is built: 23 | 24 | ```bash 25 | cd ffi 26 | cargo build --release 27 | cd .. 28 | ``` 29 | 30 | Then run the FFI tests: 31 | 32 | ```bash 33 | cd tests/ffi 34 | make run 35 | ``` 36 | 37 | Or run individual test categories: 38 | 39 | ```bash 40 | make run # Run all tests 41 | make clean # Clean build artifacts 42 | ``` 43 | 44 | **What FFI tests validate:** 45 | - Extractor lifecycle (new, free, double-free safety) 46 | - Configuration functions (max length, encoding, XML output) 47 | - PDF/Office/OCR configuration 48 | - Error handling and null pointer safety 49 | - URL extraction 50 | - Memory management 51 | 52 | ### 2. 
Go Binding Tests 53 | 54 | Run all Go tests: 55 | 56 | ```bash 57 | cd tests/go 58 | go test -v 59 | ``` 60 | 61 | Run specific test files: 62 | 63 | ```bash 64 | go test -v -run TestExtractor # Run extractor tests 65 | go test -v -run TestPdfConfig # Run PDF config tests 66 | go test -v -run TestIntegration # Run integration tests 67 | ``` 68 | 69 | Run with race detection: 70 | 71 | ```bash 72 | go test -race -v 73 | ``` 74 | 75 | **What Go binding tests validate:** 76 | 77 | #### `bindings_test.go` - Unit Tests 78 | - Extractor lifecycle and nil-safety 79 | - Configuration methods (max length, encoding, XML output) 80 | - PDF/Office/OCR configuration 81 | - Builder pattern and method chaining 82 | - Error handling for nil extractors 83 | - Metadata API (Get, GetAll, Has, Keys) 84 | - CharSet constants 85 | 86 | #### `integration_test.go` - Integration Tests 87 | - Plain text file extraction 88 | - Byte array extraction (string and stream) 89 | - Configuration effects (max length, encoding, XML output) 90 | - Metadata extraction and parsing 91 | - Error handling (nonexistent files, empty files) 92 | - Concurrent extraction (multiple goroutines) 93 | - Multiple extractors on same file 94 | 95 | ## Test Data 96 | 97 | Integration tests create temporary test files in `tests/testdata/` directory. These files are: 98 | - Created at test runtime 99 | - Cleaned up after each test 100 | - Simple text files for validation 101 | 102 | ## Memory Management 103 | 104 | **Important:** All config objects (PdfConfig, OfficeConfig, OcrConfig) use Go finalizers for automatic cleanup. You should **NOT** call any `Free()` method manually in Go code - they don't exist in the public API. 105 | 106 | The FFI layer tests validate that the underlying C functions properly manage memory. 
107 | 108 | ## Prerequisites 109 | 110 | ### For FFI Tests: 111 | - GCC or compatible C compiler 112 | - libextractous_ffi.so (built from Rust FFI layer) 113 | - extractous.h header file 114 | 115 | ### For Go Tests: 116 | - Go 1.25.1 or later 117 | - CGo enabled 118 | - libextractous_ffi.so in library path or proper LD_LIBRARY_PATH 119 | 120 | ## Troubleshooting 121 | 122 | ### FFI Tests 123 | 124 | **Error: `libextractous_ffi.so: cannot open shared object file`** 125 | ```bash 126 | # Ensure the library is built and in the right location 127 | cd ffi && cargo build --release 128 | # Check native/ directory for the compiled library 129 | ls -la native/*/ 130 | ``` 131 | 132 | **Error: `extractous.h: No such file or directory`** 133 | ```bash 134 | # Regenerate the header with cbindgen 135 | cd ffi 136 | cbindgen --config cbindgen.toml --crate extractous-ffi --output ../include/extractous.h 137 | ``` 138 | 139 | ### Go Tests 140 | 141 | **Error: `undefined reference to extractous_*`** 142 | - Ensure the FFI library is built: `cd ffi && cargo build --release` 143 | - Check that CGo can find the library (see `cgo.go` in the repository root for paths) 144 | 145 | **Error: Package import issues** 146 | - Ensure you're running tests from the `tests/go/` directory 147 | - Module path should be `github.com/rahulpoonia29/extractous-go` (check `go.mod`) 148 | 149 | **Segmentation fault** 150 | - This usually indicates a problem at the FFI boundary 151 | - Run FFI tests first to validate the C interface 152 | - Check that all CGo calls handle nil pointers correctly 153 | 154 | ## Continuous Integration 155 | 156 | For CI pipelines, run tests in this order: 157 | 158 | ```bash 159 | # 1. Build FFI library 160 | cd ffi && cargo build --release && cd .. 161 | 162 | # 2. Run FFI tests 163 | cd tests/ffi && make run && cd ../.. 164 | 165 | # 3. Run Go tests 166 | cd tests/go && go test -v -race && cd ../.. 
167 | ``` 168 | 169 | ## Test Coverage 170 | 171 | To generate coverage reports for Go tests: 172 | 173 | ```bash 174 | cd tests/go 175 | go test -coverprofile=coverage.out 176 | go tool cover -html=coverage.out 177 | ``` 178 | 179 | ## Contributing 180 | 181 | When adding new features: 182 | 183 | 1. **Add FFI tests first** - Validate the C interface in `tests/ffi/test_ffi_interface.c` 184 | 2. **Add Go unit tests** - Test the Go wrapper in `tests/go/bindings_test.go` 185 | 3. **Add integration tests** - Test end-to-end functionality in `tests/go/integration_test.go` 186 | 187 | This ensures full validation from the C boundary up through the Go API. 188 | -------------------------------------------------------------------------------- /ffi/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! This crate provides a **C-compatible Foreign Function Interface (FFI)** for the 2 | //! Extractous library. Extractous is a fast and efficient solution for extracting 3 | //! content and metadata from various document formats including PDF, Word, Excel, and more. 4 | //! 5 | //! This FFI layer is meticulously designed for safety and performance, featuring: 6 | //! - Opaque pointers to prevent unsafe access to internal data structures. 7 | //! - A robust, thread-safe error handling mechanism with on-demand debug info. 8 | //! - A clear memory ownership model with explicit `_new` and `_free` functions. 9 | //! 10 | //! ## Quick Start 11 | //! 12 | //! ``` 13 | //! // 1. Create an extractor instance. 14 | //! CExtractor* extractor = extractous_extractor_new(); 15 | //! 16 | //! // 2. Configure the extractor. Setters modify the object in-place. 17 | //! // DO NOT re-assign the pointer. 18 | //! extractous_extractor_set_xml_output(extractor, true); 19 | //! 20 | //! // 3. Extract content and metadata from a file. 21 | //! char* content = NULL; 22 | //! CMetadata* metadata = NULL; 23 | //! 
int result = extractous_extractor_extract_file_to_string( 24 | //! extractor, "document.pdf", &content, &metadata 25 | //! ); 26 | //! 27 | //! // 4. Check for errors and handle them. 28 | //! if (result != ERR_OK) { 29 | //! // Handle the error (see Error Handling section). 30 | //! fprintf(stderr, "Extraction failed with code: %d\n", result); 31 | //! } else { 32 | //! // 5. Use the results. 33 | //! printf("Content: %s\n", content); 34 | //! } 35 | //! 36 | //! // 6. Clean up all allocated resources in reverse order. 37 | //! extractous_string_free(content); 38 | //! extractous_metadata_free(metadata); 39 | //! extractous_extractor_free(extractor); 40 | //! ``` 41 | //! 42 | //! ## Thread Safety 43 | //! 44 | //! - **Extractor Instances**: `CExtractor` and its associated config/stream objects are 45 | //! **NOT thread-safe**. Do not share a handle across threads. The recommended pattern is 46 | //! to create one `CExtractor` instance per thread that needs it. 47 | //! - **Error Handling**: The error reporting system **IS thread-safe**. Each thread stores 48 | //! its own last error information independently, preventing race conditions. You can safely 49 | //! call error-handling functions from any thread. 50 | //! 51 | //! # Advanced Error Handling 52 | //! 53 | //! This library uses a powerful two-tier error system for maximum performance and diagnostics. 54 | //! 55 | //! ### Tier 1: Fast Path (Error Codes) 56 | //! 57 | //! All FFI functions return an integer error code. `ERR_OK` (0) signifies success. This allows 58 | //! for a very fast check without any overhead. 59 | //! 60 | //! ### Tier 2: Slow Path (On-Demand Detailed Info) 61 | //! 62 | //! When an error occurs, you can request more information on demand. 63 | //! 64 | //! **1. Get the Error Category:** 65 | //! Use `extractous_error_category()` to get a stable, machine-readable string 66 | //! representing the *type* of error. This is perfect for building idiomatic Go error wrappers. 67 | //! 
The returned pointer is static and **must not be freed**. 68 | //! 69 | //! **2. Get a Simple Message:** 70 | //! Use `extractous_error_message()` to get a simple, human-readable description. 71 | //! The returned string **must be freed** with `extractous_string_free()`. 72 | //! 73 | //! **3. Get a Full Debug Report:** 74 | //! If `extractous_error_has_debug()` returns `1`, you can call `extractous_error_get_last_debug()` 75 | //! to get a detailed report, including the full error chain and a backtrace (if enabled with `RUST_BACKTRACE=1`). 76 | //! The returned string **must be freed**. 77 | //! 78 | //! ### Go Usage Pattern 79 | //! 80 | //! ``` 81 | //! // (Inside a function that calls the FFI) 82 | //! resultCode := C.some_extractous_function(...) 83 | //! if resultCode != C.ERR_OK { 84 | //! // Get stable category for idiomatic error wrapping. 85 | //! category := C.GoString(C.extractous_error_category(resultCode)) 86 | //! 87 | //! // Get the simple message for the error string. 88 | //! msgCStr := C.extractous_error_message(resultCode) 89 | //! defer C.extractous_string_free(msgCStr) 90 | //! message := C.GoString(msgCStr) 91 | //! 92 | //! var baseError error 93 | //! switch category { 94 | //! case "io_error": baseError = ErrIO 95 | //! default: baseError = ErrUnknown 96 | //! } 97 | //! 98 | //! // Optionally log the full debug info for developers. 99 | //! if C.extractous_error_has_debug() != 0 { 100 | //! debugCStr := C.extractous_error_get_last_debug() 101 | //! defer C.extractous_string_free(debugCStr) 102 | //! log.Printf("Full debug details: %s", C.GoString(debugCStr)) 103 | //! } 104 | //! 105 | //! return fmt.Errorf("%w: %s", baseError, message) 106 | //! } 107 | //! ``` 108 | #![warn(clippy::all)] 109 | #![allow(clippy::missing_safety_doc)] 110 | 111 | // Re-export the core library under a consistent, private alias. 112 | pub use extractous as ecore; 113 | 114 | // Module declarations. 
115 | mod config; 116 | mod errors; 117 | mod extractor; 118 | mod metadata; 119 | mod stream; 120 | mod types; 121 | 122 | // Publicly re-export all FFI-safe functions and types for C header generation. 123 | pub use config::*; 124 | pub use errors::*; 125 | pub use extractor::*; 126 | pub use metadata::*; 127 | pub use stream::*; 128 | pub use types::*; 129 | 130 | /// Returns the FFI wrapper version as a null-terminated UTF-8 string. 131 | /// The returned pointer is to a static string and must not be freed. 132 | #[unsafe(no_mangle)] 133 | pub extern "C" fn extractous_ffi_version() -> *const libc::c_char { 134 | // Use a static byte array with a null terminator for guaranteed memory safety. 135 | static VERSION: &[u8] = concat!(env!("CARGO_PKG_VERSION"), "\0").as_bytes(); 136 | VERSION.as_ptr() as *const libc::c_char 137 | } 138 | 139 | /// Returns the underlying Extractous core library version. 140 | /// The returned pointer is to a static string and must not be freed. 141 | #[unsafe(no_mangle)] 142 | pub extern "C" fn extractous_core_version() -> *const libc::c_char { 143 | static VERSION: &[u8] = b"0.3.0\0"; 144 | VERSION.as_ptr() as *const libc::c_char 145 | } 146 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*.*.*' # Trigger on semantic version tags (v1.0.0, v0.2.1, etc.) 
7 | workflow_dispatch: 8 | inputs: 9 | tag_name: 10 | description: 'Release tag name (e.g., v1.0.0)' 11 | required: true 12 | type: string 13 | 14 | # Prevent concurrent releases 15 | concurrency: 16 | group: release-${{ github.ref }} 17 | cancel-in-progress: false 18 | 19 | permissions: 20 | contents: write 21 | actions: read 22 | 23 | jobs: 24 | # Build artifacts for all platforms 25 | build: 26 | name: Build Release Artifacts 27 | uses: ./.github/workflows/build.yml 28 | secrets: inherit 29 | 30 | # Create GitHub Release with built artifacts 31 | create-release: 32 | name: Create GitHub Release 33 | runs-on: ubuntu-latest 34 | needs: [build] 35 | 36 | steps: 37 | - name: Checkout repository 38 | uses: actions/checkout@v4 39 | 40 | - name: Determine tag name 41 | id: tag 42 | shell: bash 43 | run: | 44 | if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then 45 | echo "tag_name=${{ github.event.inputs.tag_name }}" >> $GITHUB_OUTPUT 46 | else 47 | echo "tag_name=${{ github.ref_name }}" >> $GITHUB_OUTPUT 48 | fi 49 | 50 | - name: Download all artifacts 51 | uses: actions/download-artifact@v4 52 | with: 53 | path: artifacts 54 | 55 | - name: List downloaded artifacts 56 | shell: bash 57 | run: | 58 | echo "Downloaded artifacts:" 59 | ls -R artifacts 60 | 61 | - name: Package per-platform archives 62 | id: package 63 | shell: bash 64 | run: | 65 | set -e 66 | mkdir -p release 67 | 68 | echo "Creating per-platform release archives..." 69 | 70 | # Package each platform separately 71 | for platform_dir in artifacts/extractous-ffi-*; do 72 | if [ -d "$platform_dir" ]; then 73 | platform=$(basename "$platform_dir" | sed 's/extractous-ffi-//') 74 | 75 | echo "Packaging $platform..." 76 | 77 | # Determine archive extension based on platform 78 | if [[ "$platform" == windows_* ]]; then 79 | archive_name="extractous-ffi-${platform}.zip" 80 | (cd "$platform_dir" && zip -r "../../release/$archive_name" .) 
81 | else 82 | archive_name="extractous-ffi-${platform}.tar.gz" 83 | tar -czf "release/$archive_name" -C "$platform_dir" . 84 | fi 85 | 86 | # Generate checksum 87 | (cd release && sha256sum "$archive_name" > "${archive_name}.sha256") 88 | 89 | echo " ✓ Created $archive_name" 90 | fi 91 | done 92 | 93 | echo "" 94 | echo "Release assets:" 95 | ls -lh release/ 96 | 97 | - name: Generate release notes 98 | id: release-notes 99 | shell: bash 100 | run: | 101 | TAG_NAME="${{ steps.tag.outputs.tag_name }}" 102 | # The heredoc stays quoted so the Markdown backticks are not executed by the shell; the build date is injected by sed through the __BUILD_DATE__ placeholder. 103 | sed "s/__BUILD_DATE__/$(date -u +%Y-%m-%d)/" > release_notes.md <<'EOF' 104 | # Extractous Go FFI ${{ steps.tag.outputs.tag_name }} 105 | 106 | This release contains the native library binaries for extractous-go. 107 | 108 | ## Installation 109 | 110 | Use the installation command from your Go project: 111 | 112 | ```bash 113 | go run github.com/rahulpoonia29/extractous-go/cmd/install@latest 114 | ``` 115 | 116 | The installer will automatically download the correct platform libraries for you. 117 | 118 | ## Available Platforms 119 | 120 | - **linux_amd64**: Linux x86_64 121 | - **windows_amd64**: Windows x86_64 122 | - **darwin_arm64**: macOS Apple Silicon (M1/M2) 123 | 124 | ## Manual Installation 125 | 126 | If you prefer to install manually: 127 | 128 | 1. Download the archive for your platform 129 | 2. Extract it to `./native/{platform}/` in your project 130 | 3. Verify the checksum using the `.sha256` file 131 | 132 | ## Archive Contents 133 | 134 | Each archive contains: 135 | - `include/extractous.h` - C header file 136 | - `lib/` - Platform-specific shared libraries 137 | 138 | ## Build Information 139 | 140 | - Rust version: 1.90.0 141 | - Built with GraalVM Native Image 142 | - Build date: __BUILD_DATE__ 143 | 144 | --- 145 | 146 | **Note:** This is a draft release. Please review and edit these notes before publishing. 
147 | EOF 148 | 149 | cat release_notes.md 150 | 151 | - name: Create GitHub Release (Draft) 152 | uses: softprops/action-gh-release@v2 153 | with: 154 | tag_name: ${{ steps.tag.outputs.tag_name }} 155 | name: Release ${{ steps.tag.outputs.tag_name }} 156 | body_path: release_notes.md 157 | draft: true 158 | prerelease: ${{ contains(steps.tag.outputs.tag_name, '-') }} 159 | files: release/* 160 | fail_on_unmatched_files: true 161 | env: 162 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 163 | 164 | - name: Release summary 165 | shell: bash 166 | run: | 167 | TAG_NAME="${{ steps.tag.outputs.tag_name }}" 168 | 169 | echo "## 📦 Draft Release Created: ${TAG_NAME}" >> $GITHUB_STEP_SUMMARY 170 | echo "" >> $GITHUB_STEP_SUMMARY 171 | echo "The release has been created as a **draft**. Please review it before publishing:" >> $GITHUB_STEP_SUMMARY 172 | echo "" >> $GITHUB_STEP_SUMMARY 173 | echo "https://github.com/${{ github.repository }}/releases/tag/${TAG_NAME}" >> $GITHUB_STEP_SUMMARY 174 | echo "" >> $GITHUB_STEP_SUMMARY 175 | echo "### Release Assets" >> $GITHUB_STEP_SUMMARY 176 | echo "" >> $GITHUB_STEP_SUMMARY 177 | 178 | for file in release/*.{tar.gz,zip}; do 179 | if [ -f "$file" ]; then 180 | size=$(du -h "$file" | cut -f1) 181 | filename=$(basename "$file") 182 | echo "- \`$filename\` ($size)" >> $GITHUB_STEP_SUMMARY 183 | fi 184 | done 185 | -------------------------------------------------------------------------------- /.github/workflows/scripts/collect-libs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # Collect all native libraries for distribution 5 | # Usage: ./scripts/collect-libs.sh 6 | 7 | PLATFORM=$1 # e.g., linux_amd64 8 | TARGET=$2 # e.g., x86_64-unknown-linux-gnu 9 | LIB_EXT=$3 # e.g., so, dll, dylib 10 | 11 | echo "=== Collecting Libraries ===" 12 | echo "Platform: $PLATFORM | Target: $TARGET | Extension: $LIB_EXT" 13 | 14 | # Create distribution structure 15 | 
DIST_DIR="dist/$PLATFORM" 16 | mkdir -p "$DIST_DIR" 17 | # Determine OS-specific naming 18 | if [[ "$PLATFORM" == *"windows"* ]]; then 19 | OS="Windows" 20 | MAIN_LIB_PREFIX="" 21 | TIKA_LIB_PREFIX="" 22 | elif [[ "$PLATFORM" == *"darwin"* ]]; then 23 | OS="macOS" 24 | MAIN_LIB_PREFIX="lib" 25 | TIKA_LIB_PREFIX="lib" 26 | else 27 | OS="Linux" 28 | MAIN_LIB_PREFIX="lib" 29 | TIKA_LIB_PREFIX="lib" 30 | fi 31 | 32 | # 1. Find and copy main FFI library 33 | RELEASE_DIR="ffi/target/$TARGET/release" 34 | MAIN_LIB="$RELEASE_DIR/${MAIN_LIB_PREFIX}extractous_ffi.$LIB_EXT" 35 | 36 | if [ ! -f "$MAIN_LIB" ]; then 37 | echo "✗ Error: Main FFI library not found: $MAIN_LIB" 38 | exit 1 39 | fi 40 | 41 | echo "✓ Found main FFI library: $MAIN_LIB" 42 | cp "$MAIN_LIB" "$DIST_DIR" 43 | 44 | # 2. Find extractous build output directory 45 | # Look for the canonical libs directory created by extractous build.rs 46 | echo "" 47 | echo "Searching for extractous dependencies..." 48 | 49 | BUILD_BASE="$RELEASE_DIR/build" 50 | 51 | # Find all extractous-*/out/libs directories, sort by modification time (newest first); cut keeps every field after the timestamp (-f2-) so a path containing spaces is not truncated 52 | LIBS_DIR=$(find "$BUILD_BASE" -type d -path "*/extractous-*/out/libs" -printf "%T@ %p\n" 2>/dev/null | 53 | sort -rn | 54 | head -1 | 55 | cut -d' ' -f2-) 56 | 57 | # Fallback for macOS (no -printf support) 58 | if [ -z "$LIBS_DIR" ]; then 59 | # Find all matching directories 60 | FOUND_DIRS=$(find "$BUILD_BASE" -type d -path "*/extractous-*/out/libs" 2>/dev/null) 61 | 62 | if [ -n "$FOUND_DIRS" ]; then 63 | # Get the most recently modified directory 64 | LIBS_DIR=$(echo "$FOUND_DIRS" | while read -r dir; do 65 | echo "$(stat -f "%m" "$dir") $dir" 66 | done | sort -rn | head -1 | cut -d' ' -f2-) 67 | fi 68 | fi 69 | 70 | if [ -z "$LIBS_DIR" ]; then 71 | echo "✗ Error: Could not find extractous out/libs directory" 72 | echo "Searched in: $BUILD_BASE/extractous-*/out/libs" 73 | echo "" 74 | echo "Available build directories:" 75 | find "$BUILD_BASE" -type d -name
"extractous-*" 2>/dev/null || echo "None found" 76 | echo "" 77 | echo "Checking for out directories:" 78 | find "$BUILD_BASE" -type d -name "out" 2>/dev/null || echo "None found" 79 | exit 1 80 | fi 81 | 82 | echo "✓ Found libs directory: $LIBS_DIR" 83 | 84 | # 3. Verify libtika_native exists 85 | # Try both with and without prefix for Windows compatibility 86 | TIKA_LIB="$LIBS_DIR/${TIKA_LIB_PREFIX}tika_native.$LIB_EXT" 87 | if [ ! -f "$TIKA_LIB" ] && [ "$OS" = "Windows" ]; then 88 | # Try with lib prefix on Windows as fallback 89 | TIKA_LIB_ALT="$LIBS_DIR/libtika_native.$LIB_EXT" 90 | if [ -f "$TIKA_LIB_ALT" ]; then 91 | TIKA_LIB="$TIKA_LIB_ALT" 92 | fi 93 | fi 94 | 95 | if [ ! -f "$TIKA_LIB" ]; then 96 | echo "✗ Error: tika_native.$LIB_EXT not found in $LIBS_DIR" 97 | echo "Directory contents:" 98 | ls -lh "$LIBS_DIR" || echo "Directory not accessible" 99 | 100 | # Show all DLL/SO/DYLIB files to help debug 101 | echo "" 102 | echo "All native libraries found:" 103 | find "$LIBS_DIR" -name "*.$LIB_EXT" -o -name "*.dll" -o -name "*.so" -o -name "*.dylib" 2>/dev/null || echo "None found" 104 | exit 1 105 | fi 106 | 107 | echo "✓ Found libtika_native: $TIKA_LIB" 108 | 109 | # 4. Copy ALL libraries from out/libs/ 110 | # These are all required dependencies bundled by GraalVM 111 | echo "" 112 | echo "Copying all native dependencies..." 113 | cp "$LIBS_DIR"/*."$LIB_EXT" "$DIST_DIR" || { 114 | echo "✗ Error: Failed to copy libraries" 115 | exit 1 116 | } 117 | 118 | # 5. 
Patch libextractous_ffi on macOS to use @loader_path instead of the absolute build path to libtika_native 119 | # https://github.com/rahulpoonia29/extractous-go/issues/5 120 | if [ "$OS" = "macOS" ]; then 121 | echo "" 122 | echo "Verify XCode tools" 123 | # XCode tools are present on github macOS runners by default, but verify anyway 124 | which otool || { echo "✗ otool not found"; exit 1; } 125 | which install_name_tool || { echo "✗ install_name_tool not found"; exit 1; } 126 | otool -L "$DIST_DIR/libextractous_ffi.dylib" || { echo "✗ otool test failed"; exit 1; } 127 | 128 | echo "Patching libextractous_ffi.dylib to use @loader_path for tika" 129 | OLD_PATH=$(otool -L "$DIST_DIR/libextractous_ffi.dylib" | grep libtika_native.dylib | awk '{print $1}') 130 | echo " Old tika_native path: $OLD_PATH" 131 | [ -n "$OLD_PATH" ] || { echo "✗ libtika_native.dylib reference not found in libextractous_ffi.dylib"; exit 1; } # an empty OLD_PATH means grep matched nothing, so there is no install name to rewrite 132 | # Replace with @loader_path (directory of the main library) 133 | install_name_tool -change "$OLD_PATH" "@loader_path/libtika_native.dylib" "$DIST_DIR/libextractous_ffi.dylib" 134 | 135 | echo "Debug: New Path" 136 | echo "!! Should be @loader_path/libtika_native.dylib" 137 | otool -L "$DIST_DIR/libextractous_ffi.dylib" | sed 's/^/ /' 138 | fi 139 | 140 | # Count copied libraries 141 | LIB_COUNT=$(find "$DIST_DIR" -name "*.$LIB_EXT" | wc -l) 142 | echo "✓ Copied $LIB_COUNT libraries" 143 | 144 | # 6. Display distribution contents 145 | echo "" 146 | echo "=== Distribution Contents ===" 147 | echo "Libraries ($LIB_COUNT total):" 148 | ls -lh "$DIST_DIR" | tail -n +2 149 | 150 | if [ -d "$DIST_DIR/include" ] && [ -n "$(ls -A "$DIST_DIR/include" 2>/dev/null)" ]; then 151 | echo "" 152 | echo "Headers:" 153 | ls -lh "$DIST_DIR/include/" 154 | fi 155 | 156 | # 7. Verify dependencies (platform-specific) 157 | echo "" 158 | echo "=== Dependency Verification ===" 159 | 160 | if [ "$OS" = "Linux" ]; then 161 | echo "Checking RPATH configuration..." 
162 | for lib in "$DIST_DIR"*.$LIB_EXT; do 163 | LIB_NAME=$(basename "$lib") 164 | echo " $LIB_NAME:" 165 | readelf -d "$lib" | grep -E "RPATH|RUNPATH" | sed 's/^/ /' || echo " No RPATH set" 166 | done 167 | 168 | echo "" 169 | echo "Checking main FFI dependencies..." 170 | ldd "$DIST_DIR/${MAIN_LIB_PREFIX}extractous_ffi.$LIB_EXT" || true 171 | 172 | elif [ "$OS" = "macOS" ]; then 173 | echo "Checking install names..." 174 | for lib in "$DIST_DIR"*.$LIB_EXT; do 175 | LIB_NAME=$(basename "$lib") 176 | echo " $LIB_NAME:" 177 | otool -L "$lib" | grep -v "$LIB_NAME:" | sed 's/^/ /' 178 | done 179 | 180 | elif [ "$OS" = "Windows" ]; then 181 | echo "Windows DLL validation..." 182 | file "$DIST_DIR"/*.dll 2>/dev/null || echo " DLLs present" 183 | fi 184 | 185 | # 8. Calculate total size 186 | echo "" 187 | echo "=== Distribution Size ===" 188 | echo "Total library size: $(du -sh "$DIST_DIR" | cut -f1)" 189 | echo "Distribution complete: $DIST_DIR" 190 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build Cross-Platform 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | paths: 7 | - "ffi/**" 8 | - ".github/workflows/build.yml" 9 | - ".github/workflows/scripts/**" 10 | pull_request: 11 | branches: [main] 12 | paths: 13 | - "ffi/**" 14 | - ".github/workflows/build.yml" 15 | - ".github/workflows/scripts/**" 16 | workflow_dispatch: 17 | workflow_call: 18 | outputs: 19 | run_id: 20 | description: "The run ID of this workflow" 21 | value: ${{ github.run_id }} 22 | 23 | concurrency: 24 | group: ${{ github.workflow }}-${{ github.ref }} 25 | cancel-in-progress: true 26 | 27 | env: 28 | RUST_VERSION: "1.90.0" 29 | CARGO_TERM_COLOR: always 30 | 31 | jobs: 32 | build: 33 | name: Build ${{ matrix.platform }} 34 | runs-on: ${{ matrix.os }} 35 | strategy: 36 | fail-fast: false 37 | matrix: 38 | include: 39 | - os: ubuntu-latest 40 | 
platform: linux_amd64 41 | target: x86_64-unknown-linux-gnu 42 | lib_ext: so 43 | 44 | - os: windows-latest 45 | platform: windows_amd64 46 | target: x86_64-pc-windows-msvc 47 | lib_ext: dll 48 | 49 | - os: macos-latest 50 | platform: darwin_arm64 51 | target: aarch64-apple-darwin 52 | lib_ext: dylib 53 | 54 | steps: 55 | - name: Checkout 56 | uses: actions/checkout@v4 57 | 58 | # Platform Setup 59 | - name: Setup MSVC (Windows) 60 | if: runner.os == 'Windows' 61 | uses: ilammy/msvc-dev-cmd@v1 62 | with: 63 | arch: x64 64 | 65 | - name: Install build dependencies (Linux) 66 | if: runner.os == 'Linux' 67 | run: | 68 | sudo apt-get update 69 | sudo apt-get install -y build-essential pkg-config libssl-dev tree 70 | 71 | # GraalVM Setup - Platform specific 72 | # https://github.com/oracle/graal/issues/4921 73 | - name: Setup GraalVM (macOS) 74 | if: runner.os == 'macOS' 75 | uses: graalvm/setup-graalvm@v1 76 | with: 77 | java-version: "23" 78 | distribution: "liberica" # Use Liberica NIK for macOS for AWT metadata support 79 | github-token: ${{ secrets.GITHUB_TOKEN }} 80 | native-image-job-reports: "false" 81 | set-java-home: "true" 82 | 83 | - name: Setup GraalVM (Linux/Windows) 84 | if: runner.os != 'macOS' 85 | uses: graalvm/setup-graalvm@v1 86 | with: 87 | java-version: "23" 88 | distribution: "graalvm-community" # Keep GraalVM CE for Linux/Windows 89 | github-token: ${{ secrets.GITHUB_TOKEN }} 90 | native-image-job-reports: "false" 91 | set-java-home: "true" 92 | 93 | - name: Verify GraalVM 94 | shell: bash 95 | run: | 96 | if [ "${{ runner.os }}" = "Windows" ]; then 97 | # On Windows, convert path and use .cmd extension 98 | GRAALVM_HOME_UNIX=$(cygpath "$GRAALVM_HOME") 99 | JAVA_HOME_UNIX=$(cygpath "$JAVA_HOME") 100 | echo "GRAALVM_HOME (Windows): $GRAALVM_HOME" 101 | echo "GRAALVM_HOME (Unix): $GRAALVM_HOME_UNIX" 102 | echo "JAVA_HOME (Windows): $JAVA_HOME" 103 | echo "JAVA_HOME (Unix): $JAVA_HOME_UNIX" 104 | 105 | # Call native-image with Windows path 106 | 
"$GRAALVM_HOME/bin/native-image.cmd" --version 107 | else 108 | # Unix systems work normally 109 | echo "GRAALVM_HOME: $GRAALVM_HOME" 110 | echo "JAVA_HOME: $JAVA_HOME" 111 | java --version 112 | native-image --version 113 | fi 114 | 115 | # Rust Setup 116 | - name: Setup Rust 117 | uses: dtolnay/rust-toolchain@stable 118 | with: 119 | toolchain: ${{ env.RUST_VERSION }} 120 | targets: ${{ matrix.target }} 121 | 122 | # Aggressive Caching Strategy 123 | - name: Cache Rust dependencies 124 | uses: Swatinem/rust-cache@v2 125 | with: 126 | prefix-key: "v2-rust" 127 | shared-key: ${{ matrix.target }} 128 | workspaces: "./ffi" 129 | cache-on-failure: true 130 | save-if: ${{ github.ref == 'refs/heads/main' }} 131 | 132 | - name: Cache GraalVM Native Image 133 | id: cache-graalvm 134 | uses: actions/cache@v4 135 | with: 136 | path: | 137 | ffi/target/${{ matrix.target }}/release/build/extractous-*/out 138 | key: graalvm-v5-${{ runner.os }}-${{ matrix.target }}-${{ hashFiles('ffi/Cargo.toml') }} 139 | restore-keys: | 140 | graalvm-v5-${{ runner.os }}-${{ matrix.target }}- 141 | 142 | - name: Build FFI library (Unix) 143 | if: runner.os != 'Windows' 144 | working-directory: ./ffi 145 | shell: bash 146 | run: | 147 | cargo build --release --target ${{ matrix.target }} 148 | env: 149 | RUST_BACKTRACE: 1 150 | 151 | # Build on Windows using PowerShell to avoid PATH issues with link.exe 152 | - name: Build FFI library (Windows) 153 | if: runner.os == 'Windows' 154 | working-directory: ./ffi 155 | shell: pwsh 156 | run: | 157 | cargo build --release --target ${{ matrix.target }} 158 | env: 159 | RUST_BACKTRACE: 1 160 | 161 | # Collect Dependencies 162 | - name: Collect libraries 163 | shell: bash 164 | run: | 165 | chmod +x .github/workflows/scripts/collect-libs.sh 166 | .github/workflows/scripts/collect-libs.sh ${{ matrix.platform }} ${{ matrix.target }} ${{ matrix.lib_ext }} 167 | 168 | # Strip Debug Symbols 169 | - name: Optimize libraries (Unix) 170 | if: runner.os != 
'Windows' 171 | shell: bash 172 | run: | 173 | if [ "${{ runner.os }}" = "Linux" ]; then 174 | find dist/${{ matrix.platform }} -name "*.${{ matrix.lib_ext }}" -exec strip --strip-debug {} \; 175 | elif [ "${{ runner.os }}" = "macOS" ]; then 176 | find dist/${{ matrix.platform }} -name "*.${{ matrix.lib_ext }}" -exec strip -x {} \; 177 | fi 178 | echo "Optimized size: $(du -sh dist/${{ matrix.platform }}/lib/ | cut -f1)" 179 | 180 | # Upload Artifacts 181 | - name: Upload artifacts 182 | uses: actions/upload-artifact@v4 183 | with: 184 | name: extractous-ffi-${{ matrix.platform }} 185 | path: dist/${{ matrix.platform }} 186 | retention-days: 7 187 | compression-level: 9 188 | if-no-files-found: error 189 | -------------------------------------------------------------------------------- /stream.go: -------------------------------------------------------------------------------- 1 | package extractous 2 | 3 | /* 4 | #cgo CFLAGS: -I${SRCDIR}/../include 5 | 6 | #include 7 | #include 8 | */ 9 | import "C" 10 | import ( 11 | "io" 12 | "runtime" 13 | "unsafe" 14 | ) 15 | 16 | // StreamReader implements io.Reader for streaming document content. 17 | // 18 | // StreamReader provides efficient streaming access to extracted document content, 19 | // allowing you to process large documents without loading everything into memory. 20 | // It implements the standard io.Reader interface and can be used with any Go code 21 | // that works with readers. 22 | // 23 | // # Interface Compliance 24 | // 25 | // StreamReader implements: 26 | // - io.Reader: Read(p []byte) (n int, err error) 27 | // - io.Closer: Close() error 28 | // 29 | // This means it can be used with: 30 | // - io.Copy, io.ReadAll, io.ReadFull 31 | // - bufio.NewReader, bufio.NewScanner 32 | // - io.TeeReader, io.LimitReader 33 | // - Any function accepting io.Reader or io.ReadCloser 34 | // 35 | // # Resource Management 36 | // 37 | // StreamReaders must be closed when done to free underlying resources. 
While they 38 | // use finalizers for automatic cleanup, calling Close() explicitly is strongly 39 | // recommended: 40 | // 41 | // reader, metadata, err := extractor.ExtractFile("document.pdf") 42 | // if err != nil { 43 | // log.Fatal(err) 44 | // } 45 | // defer reader.Close() // Always close 46 | // 47 | // # Usage Patterns 48 | // 49 | // Copy to stdout: 50 | // 51 | // reader, _, _ := extractor.ExtractFile("document.pdf") 52 | // defer reader.Close() 53 | // io.Copy(os.Stdout, reader) 54 | // 55 | // Read in chunks: 56 | // 57 | // reader, _, _ := extractor.ExtractFile("document.pdf") 58 | // defer reader.Close() 59 | // buf := make([]byte, 4096) 60 | // for { 61 | // n, err := reader.Read(buf) 62 | // if err == io.EOF { 63 | // break 64 | // } 65 | // if err != nil { 66 | // log.Fatal(err) 67 | // } 68 | // process(buf[:n]) 69 | // } 70 | // 71 | // Use with bufio.Scanner: 72 | // 73 | // reader, _, _ := extractor.ExtractFile("document.pdf") 74 | // defer reader.Close() 75 | // scanner := bufio.NewScanner(reader) 76 | // for scanner.Scan() { 77 | // line := scanner.Text() 78 | // fmt.Println(line) 79 | // } 80 | // 81 | // Read all at once (for moderate-sized documents): 82 | // 83 | // reader, _, _ := extractor.ExtractFile("document.pdf") 84 | // defer reader.Close() 85 | // content, err := io.ReadAll(reader) 86 | // if err != nil { 87 | // log.Fatal(err) 88 | // } 89 | // fmt.Println(string(content)) 90 | // 91 | // # Performance Considerations 92 | // 93 | // StreamReader is buffered at the FFI layer, so you don't need to wrap it with 94 | // bufio.Reader for basic read operations. However, bufio.Scanner can still be 95 | // useful for line-oriented processing. 
96 | // 97 | // Typical buffer sizes: 98 | // - Small reads (< 512 bytes): May have overhead, prefer larger reads 99 | // - Medium reads (4KB - 64KB): Optimal for most use cases 100 | // - Large reads (> 1MB): Generally no advantage over medium reads 101 | // 102 | // # Thread Safety 103 | // 104 | // StreamReader is NOT safe for concurrent use. Do not call Read() from multiple 105 | // goroutines simultaneously. 106 | type StreamReader struct { 107 | ptr *C.struct_CStreamReader // FFI stream pointer 108 | closed bool // Whether Close() has been called 109 | } 110 | 111 | // newStreamReader creates a StreamReader from a C pointer. 112 | // 113 | // This is an internal function used to wrap FFI stream pointers. It sets up 114 | // a finalizer for automatic resource cleanup. 115 | // 116 | // Returns nil if the C pointer is nil. 117 | // 118 | // Internal use only. 119 | func newStreamReader(ptr *C.struct_CStreamReader) *StreamReader { 120 | if ptr == nil { 121 | return nil 122 | } 123 | 124 | reader := &StreamReader{ptr: ptr} 125 | runtime.SetFinalizer(reader, (*StreamReader).Close) 126 | return reader 127 | } 128 | 129 | // Read reads up to len(p) bytes into p. 130 | // 131 | // This implements the io.Reader interface. It reads extracted content from the 132 | // document into the provided byte slice. 133 | // 134 | // Parameters: 135 | // - p: Byte slice to read into (must not be nil or empty for meaningful reads) 136 | // 137 | // Returns: 138 | // - n: Number of bytes read (0 <= n <= len(p)) 139 | // - err: Error if read failed, or io.EOF when stream is exhausted 140 | // 141 | // # Behavior 142 | // 143 | // Read may return fewer bytes than requested (0 < n < len(p)) without error. 144 | // This is normal io.Reader behavior and does not indicate an error or EOF. 145 | // 146 | // Read returns io.EOF when no more data is available. After receiving io.EOF, 147 | // all subsequent calls will return (0, io.EOF). 
148 | // 149 | // If the reader is nil or has been closed, Read returns (0, io.EOF). 150 | // 151 | // # Example 152 | // 153 | // reader, _, err := extractor.ExtractFile("document.pdf") 154 | // if err != nil { 155 | // log.Fatal(err) 156 | // } 157 | // defer reader.Close() 158 | // 159 | // buf := make([]byte, 4096) 160 | // for { 161 | // n, err := reader.Read(buf) 162 | // if n > 0 { 163 | // // Process buf[:n] 164 | // fmt.Print(string(buf[:n])) 165 | // } 166 | // if err == io.EOF { 167 | // break 168 | // } 169 | // if err != nil { 170 | // log.Fatal(err) 171 | // } 172 | // } 173 | // 174 | // # Error Handling 175 | // 176 | // Errors other than io.EOF indicate actual read failures and should be handled: 177 | // 178 | // n, err := reader.Read(buf) 179 | // if err != nil && err != io.EOF { 180 | // log.Printf("Read error: %v", err) 181 | // return 182 | // } 183 | func (r *StreamReader) Read(p []byte) (n int, err error) { 184 | if r == nil || r.closed || r.ptr == nil { 185 | return 0, io.EOF 186 | } 187 | 188 | if len(p) == 0 { 189 | return 0, nil 190 | } 191 | 192 | var bytesRead C.size_t 193 | code := C.extractous_stream_read( 194 | r.ptr, 195 | (*C.uint8_t)(unsafe.Pointer(&p[0])), 196 | C.size_t(len(p)), 197 | &bytesRead, 198 | ) 199 | runtime.KeepAlive(r) // r must stay reachable until the C call returns, otherwise its finalizer (Close) could free r.ptr mid-read 200 | if code != errOK { 201 | return 0, newError(code) 202 | } 203 | 204 | if bytesRead == 0 { 205 | return 0, io.EOF 206 | } 207 | 208 | return int(bytesRead), nil 209 | } 210 | 211 | // Close closes the stream and releases underlying resources. 212 | // 213 | // This implements the io.Closer interface. After calling Close, the StreamReader 214 | // should not be used. Subsequent calls to Read will return (0, io.EOF). 215 | // 216 | // Calling Close multiple times is safe - subsequent calls are no-ops and return 217 | // nil. 
218 | // 219 | // While StreamReaders use finalizers for automatic cleanup, calling Close 220 | // explicitly is strongly recommended for deterministic resource management, 221 | // especially when processing many documents. 222 | // 223 | // Returns: 224 | // - Always returns nil (implements io.Closer), including when called on a nil *StreamReader 225 | // 226 | // Example: 227 | // 228 | // reader, _, err := extractor.ExtractFile("document.pdf") 229 | // if err != nil { 230 | // log.Fatal(err) 231 | // } 232 | // defer reader.Close() // Ensure cleanup 233 | // 234 | // // Use reader... 235 | // io.Copy(os.Stdout, reader) 236 | // 237 | // // Explicit close (defer will call again, which is safe) 238 | // reader.Close() 239 | // 240 | // # Resource Management Best Practices 241 | // 242 | // Always use defer: 243 | // 244 | // reader, _, err := extractor.ExtractFile("doc.pdf") 245 | // if err != nil { 246 | // return err 247 | // } 248 | // defer reader.Close() // Cleanup even if function panics 249 | // 250 | // For long-running processes, close explicitly in loops: 251 | // 252 | // for _, file := range files { 253 | // reader, _, err := extractor.ExtractFile(file) 254 | // if err != nil { 255 | // log.Printf("Error: %v", err) 256 | // continue 257 | // } 258 | // 259 | // processStream(reader) 260 | // reader.Close() // Don't wait for defer in loop 261 | // } 262 | func (r *StreamReader) Close() error { 263 | if r == nil || r.closed || r.ptr == nil { // a nil receiver is possible: newStreamReader returns nil for a nil C pointer 264 | return nil 265 | } 266 | 267 | C.extractous_stream_free(r.ptr) 268 | r.ptr = nil 269 | r.closed = true 270 | return nil 271 | } 272 | -------------------------------------------------------------------------------- /ffi/src/config.rs: -------------------------------------------------------------------------------- 1 | use crate::ecore::{ 2 | OfficeParserConfig as CoreOfficeConfig, PdfOcrStrategy, PdfParserConfig as CorePdfConfig, 3 | TesseractOcrConfig as CoreOcrConfig, 4 | }; 5 | use crate::types::*; 6 | use std::ffi::CStr; 7 | use std::os::raw::c_char; 8 | use
std::ptr; 9 | 10 | /// Macro to safely update a config instance behind a raw pointer. 11 | macro_rules! update_config { 12 | ($handle:expr, $T:ty, |$config_val:ident| $body:block) => { 13 | if $handle.is_null() { 14 | return; 15 | } 16 | unsafe { 17 | let config_ptr = $handle as *mut $T; 18 | let old_config = ptr::read(config_ptr); 19 | let new_config = { 20 | let $config_val = old_config; 21 | $body 22 | }; 23 | ptr::write(config_ptr, new_config); 24 | } 25 | }; 26 | } 27 | 28 | /// Creates a new PDF parser configuration with default settings. 29 | /// The returned handle must be freed with `extractous_pdf_config_free()` 30 | /// unless passed to an extractor, which will take ownership. 31 | // #[must_use] 32 | #[unsafe(no_mangle)] 33 | pub extern "C" fn extractous_pdf_config_new() -> *mut CPdfParserConfig { 34 | let config = Box::new(CorePdfConfig::new()); 35 | Box::into_raw(config) as *mut CPdfParserConfig 36 | } 37 | 38 | /// Frees the memory associated with a PDF parser configuration. 39 | /// Do not call this if the config has been attached to an extractor. 40 | #[unsafe(no_mangle)] 41 | pub unsafe extern "C" fn extractous_pdf_config_free(handle: *mut CPdfParserConfig) { 42 | if !handle.is_null() { 43 | drop(unsafe { Box::from_raw(handle as *mut CorePdfConfig) }); 44 | } 45 | } 46 | 47 | /// Sets the OCR strategy for PDF parsing. Modifies the config in-place. 48 | #[unsafe(no_mangle)] 49 | pub unsafe extern "C" fn extractous_pdf_config_set_ocr_strategy( 50 | handle: *mut CPdfParserConfig, 51 | strategy: libc::c_int, 52 | ) { 53 | let ocr_strategy = match strategy { 54 | PDF_OCR_STRATEGY_NO_OCR => PdfOcrStrategy::NO_OCR, 55 | PDF_OCR_STRATEGY_OCR_ONLY => PdfOcrStrategy::OCR_ONLY, 56 | PDF_OCR_STRATEGY_OCR_AND_TEXT_EXTRACTION => PdfOcrStrategy::OCR_AND_TEXT_EXTRACTION, 57 | PDF_OCR_STRATEGY_AUTO => PdfOcrStrategy::AUTO, 58 | _ => return, // Invalid strategy, do nothing. 
59 | }; 60 | update_config!(handle, CorePdfConfig, |config| { 61 | config.set_ocr_strategy(ocr_strategy) 62 | }); 63 | } 64 | 65 | /// Enables or disables extraction of inline images. Modifies the config in-place. 66 | #[unsafe(no_mangle)] 67 | pub unsafe extern "C" fn extractous_pdf_config_set_extract_inline_images( 68 | handle: *mut CPdfParserConfig, 69 | value: bool, 70 | ) { 71 | update_config!(handle, CorePdfConfig, |config| { 72 | config.set_extract_inline_images(value) 73 | }); 74 | } 75 | 76 | /// If enabled, only unique inline images (by digest) will be extracted. 77 | #[unsafe(no_mangle)] 78 | pub unsafe extern "C" fn extractous_pdf_config_set_extract_unique_inline_images_only( 79 | handle: *mut CPdfParserConfig, 80 | value: bool, 81 | ) { 82 | update_config!(handle, CorePdfConfig, |config| { 83 | config.set_extract_unique_inline_images_only(value) 84 | }); 85 | } 86 | 87 | /// Enables or disables extraction of text from marked content sections. 88 | #[unsafe(no_mangle)] 89 | pub unsafe extern "C" fn extractous_pdf_config_set_extract_marked_content( 90 | handle: *mut CPdfParserConfig, 91 | value: bool, 92 | ) { 93 | update_config!(handle, CorePdfConfig, |config| { 94 | config.set_extract_marked_content(value) 95 | }); 96 | } 97 | 98 | /// Enables or disables extraction of text from annotations. 99 | #[unsafe(no_mangle)] 100 | pub unsafe extern "C" fn extractous_pdf_config_set_extract_annotation_text( 101 | handle: *mut CPdfParserConfig, 102 | value: bool, 103 | ) { 104 | update_config!(handle, CorePdfConfig, |config| { 105 | config.set_extract_annotation_text(value) 106 | }); 107 | } 108 | 109 | /// Creates a new Office parser configuration with default settings. 
110 | // #[must_use] 111 | #[unsafe(no_mangle)] 112 | pub extern "C" fn extractous_office_config_new() -> *mut COfficeParserConfig { 113 | let config = Box::new(CoreOfficeConfig::new()); 114 | Box::into_raw(config) as *mut COfficeParserConfig 115 | } 116 | 117 | /// Frees the memory associated with an Office parser configuration. 118 | #[unsafe(no_mangle)] 119 | pub unsafe extern "C" fn extractous_office_config_free(handle: *mut COfficeParserConfig) { 120 | if !handle.is_null() { 121 | drop(unsafe { Box::from_raw(handle as *mut CoreOfficeConfig) }); 122 | } 123 | } 124 | 125 | /// Enables or disables macro extraction. Modifies the config in-place. 126 | #[unsafe(no_mangle)] 127 | pub unsafe extern "C" fn extractous_office_config_set_extract_macros( 128 | handle: *mut COfficeParserConfig, 129 | value: bool, 130 | ) { 131 | update_config!(handle, CoreOfficeConfig, |config| { 132 | config.set_extract_macros(value) 133 | }); 134 | } 135 | 136 | /// Enables or disables inclusion of deleted content (track changes). 137 | #[unsafe(no_mangle)] 138 | pub unsafe extern "C" fn extractous_office_config_set_include_deleted_content( 139 | handle: *mut COfficeParserConfig, 140 | value: bool, 141 | ) { 142 | update_config!(handle, CoreOfficeConfig, |config| { 143 | config.set_include_deleted_content(value) 144 | }); 145 | } 146 | 147 | /// Enables or disables inclusion of moved-from content (track changes). 148 | #[unsafe(no_mangle)] 149 | pub unsafe extern "C" fn extractous_office_config_set_include_move_from_content( 150 | handle: *mut COfficeParserConfig, 151 | value: bool, 152 | ) { 153 | update_config!(handle, CoreOfficeConfig, |config| { 154 | config.set_include_move_from_content(value) 155 | }); 156 | } 157 | 158 | /// Enables or disables inclusion of content from shapes. 
159 | #[unsafe(no_mangle)] 160 | pub unsafe extern "C" fn extractous_office_config_set_include_shape_based_content( 161 | handle: *mut COfficeParserConfig, 162 | value: bool, 163 | ) { 164 | update_config!(handle, CoreOfficeConfig, |config| { 165 | config.set_include_shape_based_content(value) 166 | }); 167 | } 168 | 169 | /// Creates a new Tesseract OCR configuration with default settings. 170 | // #[must_use] 171 | #[unsafe(no_mangle)] 172 | pub extern "C" fn extractous_ocr_config_new() -> *mut CTesseractOcrConfig { 173 | let config = Box::new(CoreOcrConfig::new()); 174 | Box::into_raw(config) as *mut CTesseractOcrConfig 175 | } 176 | 177 | /// Frees the memory associated with a Tesseract OCR configuration. 178 | #[unsafe(no_mangle)] 179 | pub unsafe extern "C" fn extractous_ocr_config_free(handle: *mut CTesseractOcrConfig) { 180 | if !handle.is_null() { 181 | drop(unsafe { Box::from_raw(handle as *mut CoreOcrConfig) }); 182 | } 183 | } 184 | 185 | /// Sets the OCR language. Modifies the config in-place. 186 | #[unsafe(no_mangle)] 187 | pub unsafe extern "C" fn extractous_ocr_config_set_language( 188 | handle: *mut CTesseractOcrConfig, 189 | language: *const c_char, 190 | ) { 191 | if language.is_null() { 192 | return; 193 | } 194 | let lang_str = match unsafe { CStr::from_ptr(language).to_str() } { 195 | Ok(s) => s, 196 | Err(_) => return, // Invalid UTF-8, do nothing. 197 | }; 198 | update_config!(handle, CoreOcrConfig, |config| { 199 | config.set_language(lang_str) 200 | }); 201 | } 202 | 203 | /// Sets the DPI for OCR processing. Modifies the config in-place. 204 | #[unsafe(no_mangle)] 205 | pub unsafe extern "C" fn extractous_ocr_config_set_density( 206 | handle: *mut CTesseractOcrConfig, 207 | density: i32, 208 | ) { 209 | update_config!(handle, CoreOcrConfig, |config| { 210 | config.set_density(density) 211 | }); 212 | } 213 | 214 | /// Sets the bit depth for OCR processing. 
215 | #[unsafe(no_mangle)] 216 | pub unsafe extern "C" fn extractous_ocr_config_set_depth( 217 | handle: *mut CTesseractOcrConfig, 218 | depth: i32, 219 | ) { 220 | update_config!(handle, CoreOcrConfig, |config| { config.set_depth(depth) }); 221 | } 222 | 223 | /// Enables or disables image preprocessing for OCR. 224 | #[unsafe(no_mangle)] 225 | pub unsafe extern "C" fn extractous_ocr_config_set_enable_image_preprocessing( 226 | handle: *mut CTesseractOcrConfig, 227 | value: bool, 228 | ) { 229 | update_config!(handle, CoreOcrConfig, |config| { 230 | config.set_enable_image_preprocessing(value) 231 | }); 232 | } 233 | 234 | /// Sets the timeout for the Tesseract process in seconds. 235 | #[unsafe(no_mangle)] 236 | pub unsafe extern "C" fn extractous_ocr_config_set_timeout_seconds( 237 | handle: *mut CTesseractOcrConfig, 238 | seconds: i32, 239 | ) { 240 | update_config!(handle, CoreOcrConfig, |config| { 241 | config.set_timeout_seconds(seconds) 242 | }); 243 | } 244 | -------------------------------------------------------------------------------- /metadata.go: -------------------------------------------------------------------------------- 1 | package extractous 2 | 3 | /* 4 | #include 5 | #include 6 | */ 7 | import "C" 8 | import ( 9 | "runtime" 10 | "strings" 11 | "unsafe" 12 | ) 13 | 14 | // Metadata represents document metadata as key-value pairs. 15 | // 16 | // Metadata contains information about a document such as author, title, creation 17 | // date, modification date, and other document properties. Each metadata field can 18 | // have multiple values, so values are stored as string slices. 19 | // 20 | // # Common Metadata Fields 21 | // 22 | // Different document formats provide different metadata fields. 
Common fields 23 | // include: 24 | // 25 | // - "title" - Document title 26 | // - "author" - Document author(s) 27 | // - "creator" - Application that created the document 28 | // - "producer" - Application that produced the PDF (for PDFs) 29 | // - "subject" - Document subject/description 30 | // - "keywords" - Document keywords 31 | // - "created" - Creation date/time 32 | // - "modified" - Last modification date/time 33 | // - "Content-Type" - MIME type of the document 34 | // - "dc:title" - Dublin Core title (some formats) 35 | // - "dc:creator" - Dublin Core creator (some formats) 36 | // 37 | // # Multi-valued Fields 38 | // 39 | // Some fields can have multiple values, particularly "author" and "keywords": 40 | // 41 | // metadata := Metadata{ 42 | // "author": []string{"Alice", "Bob"}, 43 | // "keywords": []string{"report", "quarterly", "finance"}, 44 | // } 45 | // 46 | // # Case Sensitivity 47 | // 48 | // Metadata keys are case-sensitive. Some formats use lowercase keys ("author"), 49 | // others use mixed case ("Author" or "dc:creator"). Always check the actual keys 50 | // returned from extraction. 
51 | // 52 | // # Usage Examples 53 | // 54 | // Basic access: 55 | // 56 | // content, metadata, err := extractor.ExtractFileToString("document.pdf") 57 | // if err != nil { 58 | // log.Fatal(err) 59 | // } 60 | // 61 | // // Get single value 62 | // title := metadata.Get("title") 63 | // author := metadata.Get("author") 64 | // 65 | // // Get all values 66 | // allAuthors := metadata.GetAll("author") 67 | // for _, author := range allAuthors { 68 | // fmt.Println("Author:", author) 69 | // } 70 | // 71 | // // Check existence 72 | // if metadata.Has("keywords") { 73 | // keywords := metadata.GetAll("keywords") 74 | // fmt.Println("Keywords:", keywords) 75 | // } 76 | // 77 | // Iterate all metadata: 78 | // 79 | // for _, key := range metadata.Keys() { 80 | // values := metadata.GetAll(key) 81 | // fmt.Printf("%s: %v\n", key, values) 82 | // } 83 | // 84 | // # Empty Metadata 85 | // 86 | // If a document has no metadata, an empty Metadata map is returned (not nil). 87 | // Always safe to call methods on Metadata even when empty. 88 | type Metadata map[string][]string 89 | 90 | // metadataWrapper wraps C metadata for proper cleanup. 91 | // 92 | // This is an internal type used to manage the lifecycle of C metadata pointers. 93 | // It ensures that C resources are freed when the Go garbage collector determines 94 | // they are no longer needed. 95 | // 96 | // Internal use only. 97 | type metadataWrapper struct { 98 | ptr *C.struct_CMetadata 99 | } 100 | 101 | // newMetadata converts C metadata to Go and sets up cleanup. 102 | // 103 | // This function: 104 | // 1. Converts C metadata structure to Go map 105 | // 2. Sets up a finalizer to free C resources 106 | // 3. Splits comma-separated values into slices 107 | // 4. Trims whitespace from values 108 | // 109 | // Returns an empty Metadata map if the C pointer is nil. 110 | // 111 | // Internal use only. 
112 | func newMetadata(cMeta *C.struct_CMetadata) Metadata { 113 | if cMeta == nil { 114 | return make(Metadata) 115 | } 116 | 117 | // Create wrapper for proper cleanup 118 | wrapper := &metadataWrapper{ptr: cMeta} 119 | runtime.SetFinalizer(wrapper, (*metadataWrapper).free) 120 | 121 | // Convert to Go map 122 | result := make(Metadata, int(cMeta.len)) 123 | 124 | if cMeta.len == 0 { 125 | return result 126 | } 127 | 128 | // Convert C arrays to Go slices 129 | keys := unsafe.Slice(cMeta.keys, cMeta.len) 130 | values := unsafe.Slice(cMeta.values, cMeta.len) 131 | 132 | for i := 0; i < int(cMeta.len); i++ { 133 | key := C.GoString(keys[i]) 134 | value := C.GoString(values[i]) 135 | 136 | // Values are comma-separated in C, split them into individual values 137 | valueSlice := strings.Split(value, ",") 138 | // Trim whitespace from each value 139 | for j := range valueSlice { 140 | valueSlice[j] = strings.TrimSpace(valueSlice[j]) 141 | } 142 | result[key] = valueSlice 143 | } 144 | 145 | return result 146 | } 147 | 148 | // free releases C metadata resources. 149 | // 150 | // This is called automatically by the garbage collector via the finalizer. 151 | // Application code should not call this directly. 152 | // 153 | // Internal use only. 154 | func (m *metadataWrapper) free() { 155 | if m.ptr != nil { 156 | C.extractous_metadata_free(m.ptr) 157 | m.ptr = nil 158 | } 159 | } 160 | 161 | // Get returns the first value for a metadata key. 162 | // 163 | // If the key exists and has one or more values, the first value is returned. 164 | // If the key doesn't exist or has no values, an empty string is returned. 165 | // 166 | // This is the most convenient method for metadata fields that typically have 167 | // a single value (like "title", "author", "created"). 
168 | // 169 | // Parameters: 170 | // - key: Metadata field name (case-sensitive) 171 | // 172 | // Returns: 173 | // - First value as a string, or "" if not found 174 | // 175 | // Example: 176 | // 177 | // title := metadata.Get("title") 178 | // if title == "" { 179 | // fmt.Println("No title") 180 | // } else { 181 | // fmt.Println("Title:", title) 182 | // } 183 | // 184 | // // For potentially multi-valued fields, Get returns first value 185 | // firstAuthor := metadata.Get("author") 186 | func (m Metadata) Get(key string) string { 187 | if vals, ok := m[key]; ok && len(vals) > 0 { 188 | return vals[0] 189 | } 190 | return "" 191 | } 192 | 193 | // GetAll returns all values for a metadata key. 194 | // 195 | // Some metadata fields can have multiple values (particularly "author" and 196 | // "keywords"). This method returns all values as a slice. 197 | // 198 | // If the key doesn't exist, nil is returned (not an empty slice). 199 | // 200 | // Parameters: 201 | // - key: Metadata field name (case-sensitive) 202 | // 203 | // Returns: 204 | // - Slice of all values, or nil if key not found 205 | // 206 | // Example: 207 | // 208 | // // Get all authors 209 | // authors := metadata.GetAll("author") 210 | // if authors != nil { 211 | // for _, author := range authors { 212 | // fmt.Println("Author:", author) 213 | // } 214 | // } 215 | // 216 | // // Get all keywords 217 | // keywords := metadata.GetAll("keywords") 218 | // if keywords != nil { 219 | // fmt.Println("Keywords:", strings.Join(keywords, ", ")) 220 | // } 221 | // 222 | // // Check for nil vs empty 223 | // if metadata.GetAll("nonexistent") == nil { 224 | // fmt.Println("Key not found") 225 | // } 226 | func (m Metadata) GetAll(key string) []string { 227 | return m[key] 228 | } 229 | 230 | // Has checks if a metadata key exists. 231 | // 232 | // Returns true if the key exists in the metadata, even if it has empty values. 233 | // Returns false if the key doesn't exist. 
234 | // 235 | // This is useful for distinguishing between a missing key and a key with an 236 | // empty value. 237 | // 238 | // Parameters: 239 | // - key: Metadata field name (case-sensitive) 240 | // 241 | // Returns: 242 | // - true if key exists, false otherwise 243 | // 244 | // Example: 245 | // 246 | // if metadata.Has("author") { 247 | // author := metadata.Get("author") 248 | // if author == "" { 249 | // fmt.Println("Author field exists but is empty") 250 | // } else { 251 | // fmt.Println("Author:", author) 252 | // } 253 | // } else { 254 | // fmt.Println("No author field") 255 | // } 256 | // 257 | // // Conditional processing 258 | // if metadata.Has("keywords") { 259 | // processKeywords(metadata.GetAll("keywords")) 260 | // } 261 | func (m Metadata) Has(key string) bool { 262 | _, ok := m[key] 263 | return ok 264 | } 265 | 266 | // Keys returns all metadata keys. 267 | // 268 | // Returns a slice of all keys present in the metadata. The order is not 269 | // guaranteed and may vary between calls. 270 | // 271 | // This is useful for iterating over all metadata fields without knowing the 272 | // specific keys in advance. 273 | // 274 | // Returns: 275 | // - Slice of all metadata keys (may be empty but never nil) 276 | // 277 | // Example: 278 | // 279 | // // Print all metadata 280 | // for _, key := range metadata.Keys() { 281 | // values := metadata.GetAll(key) 282 | // fmt.Printf("%s: %v\n", key, values) 283 | // } 284 | // 285 | // // Count metadata fields 286 | // fmt.Printf("Found %d metadata fields\n", len(metadata.Keys())) 287 | // 288 | // // Filter specific fields 289 | // for _, key := range metadata.Keys() { 290 | // if strings.HasPrefix(key, "dc:") { 291 | // // Process Dublin Core fields 292 | // fmt.Printf("%s = %s\n", key, metadata.Get(key)) 293 | // } 294 | // } 295 | // 296 | // Note: The returned slice is a new allocation and can be modified without 297 | // affecting the Metadata. 
298 | func (m Metadata) Keys() []string { 299 | keys := make([]string, 0, len(m)) 300 | for k := range m { 301 | keys = append(keys, k) 302 | } 303 | return keys 304 | } 305 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Extractous Go 2 | 3 | Go bindings for [Extractous](https://github.com/yobix-ai/extractous) - fast, high-performance, rust-powered document extraction built on Apache Tika and Tesseract OCR. 4 | 5 | [![Go Reference](https://pkg.go.dev/badge/github.com/rahulpoonia29/extractous-go.svg)](https://pkg.go.dev/github.com/rahulpoonia29/extractous-go) 6 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 7 | [![Build Status](https://github.com/rahulpoonia29/extractous-go/actions/workflows/build.yml/badge.svg?branch=main)](https://github.com/rahulpoonia29/extractous-go/actions/workflows/build.yml) 8 | 9 | --- 10 | 11 | ## Features 12 | 13 | - **High Performance**: Built on Rust for maximum throughput and minimal memory overhead 14 | - **60+ File Formats**: PDF, Office documents (DOCX, XLSX, PPTX), HTML, XML, and more 15 | - **OCR Support**: Extract text from scanned documents and images using Tesseract 16 | - **Streaming API**: Process large files with minimal memory usage 17 | 18 | --- 19 | 20 | ## Installation 21 | 22 | ### Step 1: Install the Go Package 23 | 24 | ```bash 25 | go get github.com/rahulpoonia29/extractous-go 26 | ``` 27 | 28 | ### Step 2: Download Native Libraries 29 | 30 | ```bash 31 | # Download libraries for your current platform 32 | go run github.com/rahulpoonia29/extractous-go/cmd/install@latest 33 | 34 | # Download for a specific platform 35 | go run github.com/rahulpoonia29/extractous-go/cmd/install@latest --platform linux-amd64 36 | 37 | # Download for all platforms (useful for cross-compilation) 38 | go run 
github.com/rahulpoonia29/extractous-go/cmd/install@latest --all 39 | ``` 40 | 41 | This creates a `native/` directory with libraries for your specific platform. 42 | 43 | --- 44 | 45 | ## Quick Start 46 | 47 | ### Basic Text Extraction 48 | 49 | ```go 50 | package main 51 | 52 | import ( 53 | "fmt" 54 | "log" 55 | 56 | "github.com/rahulpoonia29/extractous-go" 57 | ) 58 | 59 | func main() { 60 | // Create a new extractor 61 | extractor := extractous.New() 62 | if extractor == nil { 63 | log.Fatal("Failed to create extractor") 64 | } 65 | defer extractor.Close() 66 | 67 | // Extract text and metadata from file 68 | content, metadata, err := extractor.ExtractFileToString("document.pdf") 69 | if err != nil { 70 | log.Fatalf("Extraction failed: %v", err) 71 | } 72 | 73 | // Results 74 | fmt.Println("Content:", content) 75 | fmt.Println("Metadata:", metadata) 76 | } 77 | ``` 78 | 79 | ### Streaming Large Files 80 | 81 | For memory efficient processing of large documents: 82 | 83 | ```go 84 | package main 85 | 86 | import ( 87 | "fmt" 88 | "io" 89 | "log" 90 | 91 | "github.com/rahulpoonia29/extractous-go" 92 | ) 93 | 94 | func main() { 95 | extractor := extractous.New() 96 | if extractor == nil { 97 | log.Fatal("Failed to create extractor") 98 | } 99 | defer extractor.Close() 100 | 101 | // Get a streaming reader for the document 102 | reader, metadata, err := extractor.ExtractFile("large_document.pdf") 103 | if err != nil { 104 | log.Fatal(err) 105 | } 106 | defer reader.Close() 107 | 108 | // Process the document in chunks 109 | buffer := make([]byte, 8192) 110 | for { 111 | n, err := reader.Read(buffer) 112 | if err == io.EOF { 113 | break 114 | } 115 | if err != nil { 116 | log.Fatal(err) 117 | } 118 | 119 | // Process buffer[:n] 120 | fmt.Printf("Read %d bytes\n", n) 121 | } 122 | } 123 | ``` 124 | 125 | ### Configuration 126 | 127 | ```go 128 | package main 129 | 130 | import ( 131 | "log" 132 | 133 | "github.com/rahulpoonia29/extractous-go" 134 | ) 135 | 136 | func 
main() { 137 | // Configure PDF extraction with OCR 138 | pdfConfig := extractous.NewPdfConfig() 139 | pdfConfig.SetOcrStrategy(extractous.PdfOcrAuto) 140 | pdfConfig.SetExtractInlineImages(true) 141 | pdfConfig.SetExtractAnnotationText(true) 142 | 143 | // Configure OCR settings 144 | ocrConfig := extractous.NewTesseractOcrConfig() 145 | ocrConfig.SetLanguage("eng") 146 | ocrConfig.SetDensity(300) 147 | 148 | // Apply configurations to extractor 149 | extractor := extractous.New() 150 | extractor.SetPdfConfig(pdfConfig) 151 | extractor.SetTesseractOcrConfig(ocrConfig) 152 | extractor.SetXmlOutput(true) // Enable structured output 153 | 154 | defer extractor.Close() 155 | 156 | content, _, err := extractor.ExtractFileToString("scanned_document.pdf") 157 | if err != nil { 158 | log.Fatal(err) 159 | } 160 | 161 | log.Println(content) 162 | } 163 | ``` 164 | 165 | --- 166 | 167 | ## Building 168 | 169 | The library uses CGO to interface with native libraries. Below are platform-specific build instructions. 
170 | 171 | ### Prerequisite 172 | 173 | - CGO enabled 174 | - Native libraries 175 | - Platform-specific C compiler 176 | 177 | ### Linux and macOS 178 | 179 | ```bash 180 | # Set up environment 181 | export CGO_ENABLED=1 182 | export CC=gcc 183 | export CXX=g++ 184 | 185 | # Set library path for the build 186 | export CGO_LDFLAGS="-L$(pwd)/native/$(go env GOOS)_$(go env GOARCH) -lextractous_ffi" 187 | 188 | # Build the application 189 | go build -o myapp main.go 190 | 191 | # Set the library path for runtime before executing 192 | export LD_LIBRARY_PATH="$(pwd)/native/$(go env GOOS)_$(go env GOARCH):$LD_LIBRARY_PATH" # For Linux 193 | export DYLD_LIBRARY_PATH="$(pwd)/native/$(go env GOOS)_$(go env GOARCH):$DYLD_LIBRARY_PATH" # For macOS 194 | 195 | ./myapp 196 | 197 | ``` 198 | 199 | ### Windows (PowerShell) 200 | 201 | ```powershell 202 | # Set up environment 203 | $env:CGO_ENABLED = "1" 204 | $env:CC = "gcc" 205 | $env:CXX = "g++" 206 | 207 | # Set library path for the build 208 | $env:CGO_LDFLAGS = "-L$pwd\native\windows_amd64 -lextractous_ffi" # Only x86-64 is supported 209 | 210 | # Build the application 211 | go build -o myapp.exe main.go 212 | 213 | # Add the DLL to the system's path 214 | $env:Path = "$pwd\native\windows_amd64;" + $env:Path 215 | .\myapp.exe 216 | ``` 217 | 218 | --- 219 | 220 | ## Error Handling 221 | 222 | ### Basic Error Handling 223 | 224 | ```go 225 | content, metadata, err := extractor.ExtractFileToString("document.pdf") 226 | if err != nil { 227 | // Check error type 228 | if errors.Is(err, extractous.ErrIO) { 229 | log.Println("File I/O error") 230 | } else if errors.Is(err, extractous.ErrExtraction) { 231 | log.Println("Document extraction failed") 232 | } 233 | 234 | log.Fatal(err) 235 | } 236 | ``` 237 | 238 | ### Error Handling with Debug Info 239 | 240 | ```go 241 | content, metadata, err := extractor.ExtractFileToString("document.pdf") 242 | if err != nil { 243 | // Get structured error information 244 | var extractErr 
*extractous.ExtractError 245 | if errors.As(err, &extractErr) { 246 | fmt.Printf("Error code: %d\n", extractErr.Code) 247 | fmt.Printf("Message: %s\n", extractErr.Message) 248 | 249 | // Optionally get detailed debug information 250 | // (includes full error chain and backtrace if available) 251 | if debug := extractErr.Debug(); debug != "" { 252 | fmt.Printf("Debug info:\n%s\n", debug) 253 | } 254 | } 255 | } 256 | ``` 257 | --- 258 | 259 | ## Performance 260 | 261 | | Operation | Throughput (MB/s) | Memory (MB) | Accuracy (%) | 262 | | ------------------ | ----------------- | ----------- | ------------ | 263 | | String Extraction | 36.70 | 15.78 | 86.95 | 264 | | Stream Extraction | 14.16 | 21.83 | 87.74 | 265 | | Reference (Go PDF) | 79.38 | 44.67 | 82.02 | 266 | 267 | --- 268 | 269 | ## Supported Formats 270 | 271 | Extractous Go supports PDF, Microsoft Office, OpenDocument, HTML/XML, plain text, images (with OCR) and more. 272 | 273 | For the full list of supported formats, see [Apache Tika Supported Formats](https://tika.apache.org/2.0.0/formats.html). 274 | 275 | --- 276 | 277 | ## Requirements 278 | 279 | ### Runtime Requirements 280 | 281 | - Go 1.25.1 or later (the minimum set by the `go` directive in `go.mod`) 282 | - CGO enabled (`CGO_ENABLED=1`) 283 | - Platform-specific native libraries (provided by installer) 284 | - **Tesseract OCR**: Required only for OCR functionality on images and scanned PDFs 285 | - Ubuntu/Debian: `sudo apt-get install tesseract-ocr` 286 | - macOS: `brew install tesseract` 287 | - Windows: Download from [Tesseract at UB Mannheim](https://github.com/UB-Mannheim/tesseract/wiki) 288 | 289 | --- 290 | 291 | ## Distribution 292 | 293 | When distributing applications built with extractous-go: 294 | 295 | 1. **Bundle Native Libraries**: Include the platform-specific `.so`, `.dylib`, or `.dll` files with your application. 296 | 297 | 2. 
**Set Library Search Path**: 298 | - **Linux**: Set `LD_LIBRARY_PATH` or install to `/usr/local/lib` 299 | - **macOS**: Set `DYLD_LIBRARY_PATH` or use `@rpath` 300 | - **Windows**: Place DLL in the same directory as the executable or in `System32` 301 | 302 | 3. **Cross-Platform Builds**: Download libraries for all target platforms using: 303 | ```bash 304 | go run github.com/rahulpoonia29/extractous-go/cmd/install@latest --all 305 | ``` 306 | 307 | --- 308 | 309 | ## Acknowledgments 310 | 311 | - [Extractous](https://github.com/yobix-ai/extractous) - The underlying Rust library 312 | - [Apache Tika](https://tika.apache.org/) - Document extraction engine 313 | - [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) - OCR engine 314 | -------------------------------------------------------------------------------- /errors.go: -------------------------------------------------------------------------------- 1 | package extractous 2 | 3 | /* 4 | #include 5 | #include 6 | */ 7 | import "C" 8 | import ( 9 | "errors" 10 | "fmt" 11 | ) 12 | 13 | // Sentinel errors that can be checked with errors.Is. 14 | // 15 | // These errors represent common failure modes in document extraction. They can 16 | // be used with errors.Is() and errors.As() for error handling and classification. 
17 | // 18 | // # Error Handling Pattern 19 | // 20 | // content, _, err := extractor.ExtractFileToString("document.pdf") 21 | // if err != nil { 22 | // if errors.Is(err, extractous.ErrIO) { 23 | // // File not found or not readable 24 | // log.Printf("File error: %v", err) 25 | // } else if errors.Is(err, extractous.ErrExtraction) { 26 | // // Document format issue or corruption 27 | // log.Printf("Extraction error: %v", err) 28 | // } else { 29 | // // Other error 30 | // log.Printf("Unknown error: %v", err) 31 | // } 32 | // return 33 | // } 34 | // 35 | // # Error Unwrapping 36 | // 37 | // All errors returned by this package can be unwrapped to get the sentinel error: 38 | // 39 | // var extractErr *extractous.ExtractError 40 | // if errors.As(err, &extractErr) { 41 | // fmt.Printf("Error code: %d\n", extractErr.Code) 42 | // fmt.Printf("Message: %s\n", extractErr.Message) 43 | // fmt.Printf("Type: %v\n", errors.Unwrap(extractErr)) 44 | // } 45 | var ( 46 | // ErrNullPointer indicates a null pointer was encountered internally. 47 | // 48 | // This error typically indicates a programming error or corrupted internal 49 | // state. It should not occur during normal operation. 50 | // 51 | // Common causes: 52 | // - Using an extractor after calling Close() 53 | // - Passing a nil configuration to a setter 54 | // - Internal FFI layer issues 55 | // 56 | // Example: 57 | // 58 | // extractor := extractous.New() 59 | // extractor.Close() 60 | // _, _, err := extractor.ExtractFileToString("doc.pdf") 61 | // if errors.Is(err, extractous.ErrNullPointer) { 62 | // // Extractor was already closed 63 | // } 64 | ErrNullPointer = errors.New("null pointer provided") 65 | 66 | // ErrInvalidUTF8 indicates a string contains invalid UTF-8 sequences. 
67 | // 68 | // This can occur when: 69 | // - Extracting documents with corrupt character encoding 70 | // - Document claims to be UTF-8 but contains invalid sequences 71 | // - Character set mismatch between document and configuration 72 | // 73 | // Example handling: 74 | // 75 | // if errors.Is(err, extractous.ErrInvalidUTF8) { 76 | // // Try with a different encoding 77 | // extractor = extractor.SetEncoding(extractous.CharSetUSASCII) 78 | // content, _, err = extractor.ExtractFileToString(path) 79 | // } 80 | ErrInvalidUTF8 = errors.New("invalid UTF-8 string") 81 | 82 | // ErrInvalidString indicates string conversion or processing failed. 83 | // 84 | // This error occurs when: 85 | // - String parameters cannot be converted properly 86 | // - Extracted text contains characters that cannot be represented 87 | // - Internal string buffer operations fail 88 | // 89 | // This is less common than ErrInvalidUTF8 and typically indicates a more 90 | // fundamental issue with the document or extraction process. 91 | ErrInvalidString = errors.New("string conversion failed") 92 | 93 | // ErrExtraction indicates document extraction failed. 94 | // 95 | // This is the most common error and can have many causes: 96 | // - Unsupported document format 97 | // - Corrupted or malformed document 98 | // - Document is encrypted or password-protected 99 | // - Document uses unsupported features 100 | // - OCR processing failed 101 | // 102 | // The ExtractError.Message field usually contains specific details about 103 | // what went wrong. 
104 | // 105 | // Example handling: 106 | // 107 | // if errors.Is(err, extractous.ErrExtraction) { 108 | // var extractErr *extractous.ExtractError 109 | // if errors.As(err, &extractErr) { 110 | // log.Printf("Extraction failed: %s", extractErr.Message) 111 | // // Try alternative extraction method or skip document 112 | // } 113 | // } 114 | ErrExtraction = errors.New("extraction failed") 115 | 116 | // ErrIO indicates an I/O operation failed. 117 | // 118 | // Common causes: 119 | // - File not found 120 | // - Permission denied 121 | // - Disk read/write error 122 | // - Network error (for URL extraction) 123 | // - Out of disk space 124 | // 125 | // Example handling: 126 | // 127 | // if errors.Is(err, extractous.ErrIO) { 128 | // // The error chain holds only ErrIO, not an *fs.PathError, so 129 | // // os.IsNotExist and os.IsPermission cannot classify it; use the 130 | // // message reported by the FFI layer instead. 131 | // var extractErr *extractous.ExtractError 132 | // if errors.As(err, &extractErr) { 133 | // log.Printf("I/O error: %s", extractErr.Message) 134 | // } 135 | // } 136 | ErrIO = errors.New("IO error") 137 | 138 | // ErrInvalidConfig indicates the provided configuration is invalid. 139 | // 140 | // This error occurs when: 141 | // - Configuration parameter is out of valid range 142 | // - Conflicting configuration options 143 | // - Required configuration is missing 144 | // 145 | // Example: 146 | // 147 | // config := extractous.NewOcrConfig().SetDensity(-100) // Invalid DPI 148 | // extractor := extractous.New().SetOcrConfig(config) 149 | // // May return ErrInvalidConfig when used 150 | ErrInvalidConfig = errors.New("invalid configuration") 151 | 152 | // ErrInvalidEnum indicates an invalid enum value was provided. 153 | // 154 | // This typically indicates a programming error where an enum constant 155 | // is used incorrectly or has an unexpected value. 
156 | // 157 | // Example: 158 | // 159 | // strategy := extractous.PdfOcrStrategy(999) // Invalid value 160 | // config := extractous.NewPdfConfig().SetOcrStrategy(strategy) 161 | // // May return ErrInvalidEnum 162 | ErrInvalidEnum = errors.New("invalid enum value") 163 | ) 164 | 165 | // ExtractError wraps detailed extraction error information. 166 | // 167 | // ExtractError provides structured error information including an error code, 168 | // a human-readable message, and a sentinel error for classification. It 169 | // implements the error interface and supports error unwrapping for use with 170 | // errors.Is() and errors.As(). 171 | // 172 | // # Fields 173 | // 174 | // - Code: Numeric error code from the FFI layer (negative values) 175 | // - Message: Detailed error message from the underlying extraction library 176 | // - Err: Wrapped sentinel error (one of ErrNullPointer, ErrIO, etc.) 177 | // 178 | // # Usage 179 | // 180 | // Use errors.Is() to check error types: 181 | // 182 | // if errors.Is(err, extractous.ErrExtraction) { 183 | // // Handle extraction error 184 | // } 185 | // 186 | // Use errors.As() to access error details: 187 | // 188 | // var extractErr *extractous.ExtractError 189 | // if errors.As(err, &extractErr) { 190 | // fmt.Printf("Error code: %d\n", extractErr.Code) 191 | // fmt.Printf("Message: %s\n", extractErr.Message) 192 | // } 193 | // 194 | // # Example 195 | // 196 | // content, _, err := extractor.ExtractFileToString("corrupt.pdf") 197 | // if err != nil { 198 | // var extractErr *extractous.ExtractError 199 | // if errors.As(err, &extractErr) { 200 | // switch { 201 | // case extractErr.Code == -4: 202 | // log.Printf("Extraction failed: %s", extractErr.Message) 203 | // case extractErr.Code == -5: 204 | // log.Printf("I/O error: %s", extractErr.Message) 205 | // default: 206 | // log.Printf("Error %d: %s", extractErr.Code, extractErr.Message) 207 | // } 208 | // } 209 | // } 210 | // errors.go 211 | 212 | // ExtractError 
wraps detailed extraction error information. 213 | type ExtractError struct { 214 | Code int // Numeric error code from FFI layer 215 | Message string // User-facing error message 216 | Err error // Wrapped sentinel error for errors.Is() 217 | } 218 | 219 | // newError creates an ExtractError from a C error code. 220 | // Debug details are NOT fetched here to avoid memory overhead. 221 | // Users must explicitly call Debug() to get detailed information. 222 | func newError(code C.int) error { 223 | if code == errOK { 224 | return nil 225 | } 226 | 227 | var sentinelErr error 228 | switch int(code) { 229 | case errNullPointer: 230 | sentinelErr = ErrNullPointer 231 | case errInvalidUTF8: 232 | sentinelErr = ErrInvalidUTF8 233 | case errInvalidString: 234 | sentinelErr = ErrInvalidString 235 | case errExtractionFailed: 236 | sentinelErr = ErrExtraction 237 | case errIOError: 238 | sentinelErr = ErrIO 239 | case errInvalidConfig: 240 | sentinelErr = ErrInvalidConfig 241 | case errInvalidEnum: 242 | sentinelErr = ErrInvalidEnum 243 | default: 244 | sentinelErr = fmt.Errorf("unknown error code: %d", code) 245 | } 246 | 247 | // Get user-facing error message (fast, small string) 248 | cMsg := C.extractous_error_message(code) 249 | var msg string 250 | if cMsg != nil { 251 | msg = goString(cMsg) 252 | C.extractous_string_free(cMsg) 253 | } 254 | 255 | return &ExtractError{ 256 | Code: int(code), 257 | Message: msg, 258 | Err: sentinelErr, 259 | } 260 | } 261 | 262 | // Error implements the error interface. 263 | // Returns user-friendly error message. 
264 | func (e *ExtractError) Error() string { 265 | if e.Message != "" { 266 | return fmt.Sprintf("extractous error (code %d): %s", e.Code, e.Message) 267 | } 268 | return fmt.Sprintf("extractous error (code %d)", e.Code) 269 | } 270 | 271 | // Unwrap returns the underlying sentinel error for errors.Is() support 272 | func (e *ExtractError) Unwrap() error { 273 | return e.Err 274 | } 275 | 276 | // Debug retrieves detailed debug information for the last error 277 | // that occurred on the current thread. 278 | // 279 | // This function is EXPENSIVE - it formats the full error chain with 280 | // backtrace (if RUST_BACKTRACE=1). Only call it when you actually 281 | // need detailed debugging information. 282 | // 283 | // **Important**: This clears the stored error. Subsequent calls to 284 | // Debug() will return empty string unless a new error occurs. 285 | // 286 | // Example: 287 | // 288 | // _, _, err := extractor.ExtractFileToString("corrupt.pdf") 289 | // if err != nil { 290 | // var extractErr *extractous.ExtractError 291 | // if errors.As(err, &extractErr) { 292 | // // Show user-facing error 293 | // fmt.Printf("Error: %s\n", extractErr.Error()) 294 | // 295 | // // Optionally get debug details (for developers only) 296 | // if debug := extractErr.Debug(); debug != "" { 297 | // log.Printf("DEBUG:\n%s", debug) 298 | // } 299 | // } 300 | // } 301 | func (e *ExtractError) Debug() string { 302 | cDebug := C.extractous_error_get_last_debug() 303 | if cDebug == nil { 304 | return "" 305 | } 306 | defer C.extractous_string_free(cDebug) 307 | return goString(cDebug) 308 | } 309 | 310 | // HasDebug checks if debug information is available for the last error 311 | // on the current thread without retrieving it. 312 | // 313 | // This is useful to avoid the overhead of Debug() when no error is stored. 
314 | func (e *ExtractError) HasDebug() bool { 315 | return C.extractous_error_has_debug() != 0 316 | } 317 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /types.go: -------------------------------------------------------------------------------- 1 | package extractous 2 | 3 | // CharSet represents character encoding for text extraction. 4 | // 5 | // Character sets determine how bytes are interpreted as text characters. Most 6 | // modern documents use UTF-8, which supports all Unicode characters. Other 7 | // encodings are provided for legacy compatibility. 8 | // 9 | // # Supported Encodings 10 | // 11 | // - CharSetUTF8: Unicode UTF-8 (default, recommended for all modern uses) 12 | // - CharSetUSASCII: US-ASCII (7-bit, legacy systems only) 13 | // - CharSetUTF16BE: UTF-16 Big Endian (some legacy systems) 14 | // 15 | // # When to Use Different Encodings 16 | // 17 | // UTF-8 (default): Use for all modern applications. It's the universal standard 18 | // and supports all languages and symbols. 19 | // 20 | // US-ASCII: Only for legacy systems that cannot handle Unicode. This encoding 21 | // only supports basic English characters (a-z, A-Z, 0-9, and basic punctuation). 
22 | // Any characters outside this range will be lost or corrupted. 23 | // 24 | // UTF-16BE: Rare. Only needed for specific legacy systems that require this 25 | // encoding. Modern systems should use UTF-8. 26 | // 27 | // Example: 28 | // 29 | // // Default UTF-8 (recommended) 30 | // extractor := extractous.New() 31 | // 32 | // // Explicitly set UTF-8 33 | // extractor := extractous.New(). 34 | // SetEncoding(extractous.CharSetUTF8) 35 | // 36 | // // Legacy ASCII (not recommended) 37 | // extractor := extractous.New(). 38 | // SetEncoding(extractous.CharSetUSASCII) 39 | type CharSet int 40 | 41 | const ( 42 | // CharSetUTF8 is UTF-8 encoding (default, recommended). 43 | // 44 | // UTF-8 is the universal standard character encoding that supports all 45 | // Unicode characters, including: 46 | // - All human languages (Latin, Cyrillic, Arabic, CJK, etc.) 47 | // - Emojis and symbols 48 | // - Mathematical notation 49 | // - Technical symbols 50 | // 51 | // UTF-8 is backward compatible with ASCII, space-efficient, and the 52 | // default encoding for modern systems. 53 | // 54 | // Use this for all new applications unless you have a specific requirement 55 | // for another encoding. 56 | CharSetUTF8 CharSet = 0 57 | 58 | // CharSetUSASCII is US-ASCII encoding (7-bit, legacy only). 59 | // 60 | // US-ASCII supports only 128 characters: 61 | // - Uppercase letters: A-Z 62 | // - Lowercase letters: a-z 63 | // - Digits: 0-9 64 | // - Basic punctuation and symbols 65 | // 66 | // Characters outside this range (accented letters, non-Latin scripts, etc.) 67 | // will be lost or converted to placeholders. 68 | // 69 | // Only use this encoding if: 70 | // - You need compatibility with very old systems 71 | // - Your documents contain only basic English text 72 | // - You have explicit requirements for ASCII-only output 73 | // 74 | // Warning: This encoding cannot represent most international text. 
75 | CharSetUSASCII CharSet = 1 76 | 77 | // CharSetUTF16BE is UTF-16 Big Endian encoding (legacy). 78 | // 79 | // UTF-16 uses 16-bit code units and can represent all Unicode characters. 80 | // The "Big Endian" variant stores the most significant byte first. 81 | // 82 | // UTF-16 is less space-efficient than UTF-8 for most text. UTF-16 in general 83 | // is mainly encountered in: 84 | // - Windows internal APIs (these use UTF-16LE, not UTF-16BE) 85 | // - Java internal string representation 86 | // - Some legacy systems 87 | // 88 | // Modern systems should use UTF-8 instead. Only use UTF-16BE if you have 89 | // explicit requirements for this encoding. 90 | CharSetUTF16BE CharSet = 2 91 | ) 92 | 93 | // String returns the human-readable name of the character set. 94 | // 95 | // This is useful for logging, debugging, and displaying the current encoding 96 | // configuration to users. 97 | // 98 | // Example: 99 | // 100 | // charset := extractous.CharSetUTF8 101 | // fmt.Println(charset.String()) // Output: "UTF-8" 102 | func (c CharSet) String() string { 103 | switch c { 104 | case CharSetUTF8: 105 | return "UTF-8" 106 | case CharSetUSASCII: 107 | return "US-ASCII" 108 | case CharSetUTF16BE: 109 | return "UTF-16BE" 110 | default: 111 | return "Unknown" 112 | } 113 | } 114 | 115 | // PdfOcrStrategy defines how OCR is applied to PDF documents. 116 | // 117 | // PDF documents can contain different types of text content: 118 | // - Embedded text: Text that is directly stored in the PDF (selectable text) 119 | // - Image-based text: Text visible only as pixels in images (not selectable) 120 | // 121 | // The OCR strategy determines how the extractor handles these different types 122 | // of content.
123 | // 124 | // # Strategy Comparison 125 | // 126 | // PdfOcrNoOcr (fastest): 127 | // - Only extracts embedded text 128 | // - No OCR processing 129 | // - Fast and efficient 130 | // - Good for: PDFs with selectable text, e-books, digital documents 131 | // - Bad for: Scanned documents, photos of text 132 | // 133 | // PdfOcrAuto (recommended): 134 | // - Automatically detects pages without embedded text 135 | // - Performs OCR only on those pages 136 | // - Balanced performance and accuracy 137 | // - Good for: Mixed documents, unknown document types 138 | // - The smart default for most use cases 139 | // 140 | // PdfOcrOcrOnly (specialized): 141 | // - Only performs OCR, ignores embedded text 142 | // - Useful when embedded text is corrupt or incorrect 143 | // - Slow, processes every page with OCR 144 | // - Good for: PDFs with broken text layers 145 | // - Bad for: General purpose extraction 146 | // 147 | // PdfOcrOcrAndTextExtraction (comprehensive): 148 | // - Extracts both embedded text AND performs OCR 149 | // - Most comprehensive but slowest 150 | // - May produce duplicate content 151 | // - Good for: Maximum content extraction, forensic analysis 152 | // - Bad for: Production systems (very slow) 153 | // 154 | // # Performance Implications 155 | // 156 | // OCR is computationally expensive: 157 | // - PdfOcrNoOcr: ~100-1000x faster than OCR strategies 158 | // - PdfOcrAuto: Variable (depends on document content) 159 | // - PdfOcrOcrOnly: Slow, processes every page with OCR 160 | // - PdfOcrOcrAndTextExtraction: Slowest, combines OCR with embedded-text extraction 161 | // 162 | // Example: 163 | // 164 | // // Digital PDF with embedded text (fast) 165 | // pdfConfig := extractous.NewPdfConfig(). 166 | // SetOcrStrategy(extractous.PdfOcrNoOcr) 167 | // 168 | // // Scanned document (auto OCR when needed) 169 | // pdfConfig := extractous.NewPdfConfig().
170 | // SetOcrStrategy(extractous.PdfOcrAuto) 171 | // 172 | // // Force OCR on all pages (slow but comprehensive) 173 | // pdfConfig := extractous.NewPdfConfig(). 174 | // SetOcrStrategy(extractous.PdfOcrOcrAndTextExtraction) 175 | type PdfOcrStrategy int 176 | 177 | const ( 178 | // PdfOcrNoOcr extracts only embedded text, no OCR processing (fastest). 179 | // 180 | // This strategy only extracts selectable text that is directly embedded in 181 | // the PDF. It does NOT perform OCR on images or scanned pages. 182 | // 183 | // Use when: 184 | // - You know the PDFs contain selectable text 185 | // - Performance is critical 186 | // - Processing digital documents (not scans) 187 | // 188 | // This is the fastest strategy, typically 100-1000x faster than OCR-based 189 | // strategies. 190 | // 191 | // Example: 192 | // 193 | // // Fast extraction for digital PDFs 194 | // config := extractous.NewPdfConfig(). 195 | // SetOcrStrategy(extractous.PdfOcrNoOcr) 196 | // 197 | // Note: Scanned documents and images will produce little or no text with 198 | // this strategy. 199 | PdfOcrNoOcr PdfOcrStrategy = 0 200 | 201 | // PdfOcrOcrOnly performs OCR on all pages, ignores embedded text. 202 | // 203 | // This strategy applies OCR to all pages regardless of whether they contain 204 | // embedded text. It's useful when the embedded text is corrupt, incorrect, 205 | // or lower quality than what OCR would produce. 206 | // 207 | // Use when: 208 | // - The PDF has a broken or incorrect text layer 209 | // - You want consistent OCR output across all pages 210 | // - You need to extract from pure image PDFs 211 | // 212 | // Warning: This is very slow as it performs OCR on every page, even pages 213 | // that already have good embedded text. 214 | // 215 | // Example: 216 | // 217 | // // Force OCR for PDFs with broken text layers 218 | // config := extractous.NewPdfConfig(). 
219 | // SetOcrStrategy(extractous.PdfOcrOcrOnly) 220 | // 221 | // Note: Requires Tesseract OCR to be installed on the system. 222 | PdfOcrOcrOnly PdfOcrStrategy = 1 223 | 224 | // PdfOcrOcrAndTextExtraction extracts both embedded text AND performs OCR. 225 | // 226 | // This strategy is the most comprehensive, extracting both: 227 | // 1. Embedded text from the PDF text layer 228 | // 2. Text via OCR from images and visual content 229 | // 230 | // This can produce duplicate content if the same text appears both as 231 | // embedded text and in images. 232 | // 233 | // Use when: 234 | // - Maximum content extraction is required 235 | // - You need both text layers for comparison 236 | // - Forensic analysis or complete document preservation 237 | // 238 | // Warning: This is the slowest strategy, combining all extraction methods. 239 | // It may also produce duplicate or redundant content. 240 | // 241 | // Example: 242 | // 243 | // // Maximum extraction for forensic analysis 244 | // config := extractous.NewPdfConfig(). 245 | // SetOcrStrategy(extractous.PdfOcrOcrAndTextExtraction) 246 | // 247 | // Note: Best for offline processing where completeness matters more than 248 | // speed or deduplication. 249 | PdfOcrOcrAndTextExtraction PdfOcrStrategy = 2 250 | 251 | // PdfOcrAuto automatically decides based on page content (recommended). 252 | // 253 | // This strategy intelligently detects whether each page has embedded text: 254 | // - Pages WITH embedded text: Extract directly (fast) 255 | // - Pages WITHOUT embedded text: Apply OCR (slower) 256 | // 257 | // This provides the best balance of performance and completeness for 258 | // documents of unknown type or mixed content. 259 | // 260 | // Use when: 261 | // - Document type is unknown 262 | // - Handling mixed documents (some pages scanned, some digital) 263 | // - You want good default behavior 264 | // 265 | // This is the recommended strategy for general-purpose PDF extraction. 
266 | // 267 | // Example: 268 | // 269 | // // Smart extraction that adapts to content 270 | // config := extractous.NewPdfConfig(). 271 | // SetOcrStrategy(extractous.PdfOcrAuto) 272 | // 273 | // Performance: Variable depending on content. Pages with embedded text are 274 | // processed quickly; only scanned pages incur OCR overhead. 275 | PdfOcrAuto PdfOcrStrategy = 3 276 | ) 277 | 278 | // String returns the human-readable name of the OCR strategy. 279 | // 280 | // This is useful for logging, debugging, and displaying the current OCR 281 | // configuration to users. 282 | // 283 | // Example: 284 | // 285 | // strategy := extractous.PdfOcrAuto 286 | // fmt.Println(strategy.String()) // Output: "Auto" 287 | func (s PdfOcrStrategy) String() string { 288 | switch s { 289 | case PdfOcrNoOcr: 290 | return "NoOCR" 291 | case PdfOcrOcrOnly: 292 | return "OCROnly" 293 | case PdfOcrOcrAndTextExtraction: 294 | return "OCRAndTextExtraction" 295 | case PdfOcrAuto: 296 | return "Auto" 297 | default: 298 | return "Unknown" 299 | } 300 | } 301 | 302 | // Error codes from the FFI layer. 303 | // 304 | // These constants map to error codes returned by the underlying C FFI library. 305 | // They are used internally to construct Go error values. Application code should 306 | // use the exported error variables (ErrNullPointer, ErrIO, etc.) instead of 307 | // checking these raw codes. 308 | // 309 | // Internal use only. 
310 | const ( 311 | errOK = 0 // No error 312 | errNullPointer = -1 // Null pointer passed to FFI 313 | errInvalidUTF8 = -2 // String is not valid UTF-8 314 | errInvalidString = -3 // String parameter is invalid 315 | errExtractionFailed = -4 // Document extraction failed 316 | errIOError = -5 // File I/O error 317 | errInvalidConfig = -6 // Configuration is invalid 318 | errInvalidEnum = -7 // Enum value is invalid 319 | ) 320 | -------------------------------------------------------------------------------- /ffi/src/extractor.rs: -------------------------------------------------------------------------------- 1 | use crate::ecore::{CharSet, Extractor as CoreExtractor}; 2 | use crate::errors::*; 3 | use crate::metadata::metadata_to_c; 4 | use crate::types::*; 5 | use std::ffi::{CStr, CString}; 6 | use std::os::raw::c_char; 7 | use std::ptr; 8 | 9 | /// Creates a new `Extractor` with a default configuration. 10 | /// The returned handle must be freed with `extractous_extractor_free`. 11 | // #[must_use] 12 | #[unsafe(no_mangle)] 13 | pub extern "C" fn extractous_extractor_new() -> *mut CExtractor { 14 | let extractor = Box::new(CoreExtractor::new()); 15 | Box::into_raw(extractor) as *mut CExtractor 16 | } 17 | 18 | /// Frees the memory associated with an `Extractor` handle. 19 | #[unsafe(no_mangle)] 20 | pub unsafe extern "C" fn extractous_extractor_free(handle: *mut CExtractor) { 21 | if !handle.is_null() { 22 | unsafe { 23 | drop(Box::from_raw(handle as *mut CoreExtractor)); 24 | } 25 | } 26 | } 27 | 28 | /// A macro to safely update an Extractor instance behind a raw pointer. 29 | macro_rules! 
update_extractor { 30 | ($handle:expr, |$extractor_val:ident| $body:block) => { 31 | if $handle.is_null() { 32 | return; 33 | } 34 | unsafe { 35 | let extractor_ptr = $handle as *mut CoreExtractor; 36 | let old_extractor = ptr::read(extractor_ptr); 37 | let new_extractor = { 38 | let $extractor_val = old_extractor; 39 | $body 40 | }; 41 | ptr::write(extractor_ptr, new_extractor); 42 | } 43 | }; 44 | } 45 | 46 | /// Sets the maximum length for extracted string content. 47 | #[unsafe(no_mangle)] 48 | pub unsafe extern "C" fn extractous_extractor_set_extract_string_max_length_mut( 49 | handle: *mut CExtractor, 50 | max_length: libc::c_int, 51 | ) { 52 | update_extractor!(handle, |extractor| { 53 | extractor.set_extract_string_max_length(max_length as i32) 54 | }); 55 | } 56 | 57 | /// Sets the character encoding for the extracted text. An unrecognised `encoding` value leaves the extractor unchanged. 58 | #[unsafe(no_mangle)] 59 | pub unsafe extern "C" fn extractous_extractor_set_encoding_mut( 60 | handle: *mut CExtractor, 61 | encoding: libc::c_int, 62 | ) { 63 | let charset = match encoding { // resolved BEFORE the macro: returning from inside `update_extractor!` would drop the extractor moved out by `ptr::read` and leave `handle` dangling 64 | CHARSET_UTF_8 => CharSet::UTF_8, 65 | CHARSET_US_ASCII => CharSet::US_ASCII, 66 | CHARSET_UTF_16BE => CharSet::UTF_16BE, 67 | _ => return, // unknown encoding: nothing has been moved out yet, so this early return is safe 68 | }; 69 | update_extractor!(handle, |extractor| { 70 | extractor.set_encoding(charset) 71 | }); 72 | } 73 | 74 | /// Sets the configuration for the PDF parser. 75 | #[unsafe(no_mangle)] 76 | pub unsafe extern "C" fn extractous_extractor_set_pdf_config_mut( 77 | handle: *mut CExtractor, 78 | config: *const CPdfParserConfig, 79 | ) { 80 | if config.is_null() { 81 | return; 82 | } 83 | update_extractor!(handle, |extractor| { 84 | let pdf_config = &*(config as *const crate::ecore::PdfParserConfig); 85 | extractor.set_pdf_config(pdf_config.clone()) 86 | }); 87 | } 88 | 89 | /// Sets the configuration for the Office document parser.
90 | #[unsafe(no_mangle)] 91 | pub unsafe extern "C" fn extractous_extractor_set_office_config_mut( 92 | handle: *mut CExtractor, 93 | config: *const COfficeParserConfig, 94 | ) { 95 | if config.is_null() { 96 | return; 97 | } 98 | update_extractor!(handle, |extractor| { 99 | let office_config = &*(config as *const crate::ecore::OfficeParserConfig); 100 | extractor.set_office_config(office_config.clone()) 101 | }); 102 | } 103 | 104 | /// Sets the configuration for Tesseract OCR. 105 | #[unsafe(no_mangle)] 106 | pub unsafe extern "C" fn extractous_extractor_set_ocr_config_mut( 107 | handle: *mut CExtractor, 108 | config: *const CTesseractOcrConfig, 109 | ) { 110 | if config.is_null() { 111 | return; 112 | } 113 | update_extractor!(handle, |extractor| { 114 | let ocr_config = &*(config as *const crate::ecore::TesseractOcrConfig); 115 | extractor.set_ocr_config(ocr_config.clone()) 116 | }); 117 | } 118 | 119 | /// Sets whether to output structured XML instead of plain text. 120 | #[unsafe(no_mangle)] 121 | pub unsafe extern "C" fn extractous_extractor_set_xml_output_mut( 122 | handle: *mut CExtractor, 123 | xml_output: bool, 124 | ) { 125 | update_extractor!(handle, |extractor| { extractor.set_xml_output(xml_output) }); 126 | } 127 | 128 | // Macro to handle the common extraction logic and error wrapping. 129 | macro_rules! perform_extraction { 130 | ( 131 | $handle:expr, 132 | $out_ptr1:expr, 133 | $out_ptr2:expr, 134 | $extractor_call:expr, 135 | $success_handler:expr 136 | ) => {{ 137 | if $handle.is_null() || $out_ptr1.is_null() || $out_ptr2.is_null() { 138 | return ERR_NULL_POINTER; 139 | } 140 | 141 | // Safely get a shared reference to the extractor. 
142 | let extractor = unsafe { &*($handle as *const CoreExtractor) }; 143 | 144 | match $extractor_call(extractor) { 145 | Ok((res1, res2)) => { 146 | $success_handler($out_ptr1, $out_ptr2, res1, res2); 147 | ERR_OK 148 | } 149 | Err(e) => { 150 | let code = extractous_error_to_code(&e); 151 | set_last_error(e); 152 | code 153 | } 154 | } 155 | }}; 156 | } 157 | 158 | /// Extracts content and metadata from a local file path into a string. 159 | /// 160 | /// Output strings must be freed with `extractous_string_free`. 161 | /// Output metadata must be freed with `extractous_metadata_free`. 162 | #[unsafe(no_mangle)] 163 | pub unsafe extern "C" fn extractous_extractor_extract_file_to_string( 164 | handle: *mut CExtractor, 165 | path: *const c_char, 166 | out_content: *mut *mut c_char, 167 | out_metadata: *mut *mut CMetadata, 168 | ) -> libc::c_int { 169 | if path.is_null() { 170 | return ERR_NULL_POINTER; 171 | } 172 | let path_str = match unsafe { CStr::from_ptr(path).to_str() } { 173 | Ok(s) => s, 174 | Err(_) => return ERR_INVALID_UTF8, 175 | }; 176 | 177 | perform_extraction!( 178 | handle, 179 | out_content, 180 | out_metadata, 181 | |extractor: &CoreExtractor| extractor.extract_file_to_string(path_str), 182 | |out_c: *mut *mut c_char, out_m: *mut *mut CMetadata, content, metadata| { 183 | unsafe { 184 | *out_c = CString::new(content).map_or(ptr::null_mut(), |s| s.into_raw()); 185 | *out_m = metadata_to_c(metadata); 186 | } 187 | } 188 | ) 189 | } 190 | 191 | /// Extracts content and metadata from a local file path into a stream. 
192 | #[unsafe(no_mangle)] 193 | pub unsafe extern "C" fn extractous_extractor_extract_file( 194 | handle: *mut CExtractor, 195 | path: *const c_char, 196 | out_reader: *mut *mut CStreamReader, 197 | out_metadata: *mut *mut CMetadata, 198 | ) -> libc::c_int { 199 | if path.is_null() { 200 | return ERR_NULL_POINTER; 201 | } 202 | let path_str = match unsafe { CStr::from_ptr(path).to_str() } { 203 | Ok(s) => s, 204 | Err(_) => return ERR_INVALID_UTF8, 205 | }; 206 | 207 | perform_extraction!( 208 | handle, 209 | out_reader, 210 | out_metadata, 211 | |extractor: &CoreExtractor| extractor.extract_file(path_str), 212 | |out_r: *mut *mut CStreamReader, out_m: *mut *mut CMetadata, reader, metadata| { 213 | unsafe { 214 | *out_r = Box::into_raw(Box::new(reader)) as *mut CStreamReader; 215 | *out_m = metadata_to_c(metadata); 216 | } 217 | } 218 | ) 219 | } 220 | 221 | /// Extracts content and metadata from a byte slice into a string. 222 | #[unsafe(no_mangle)] 223 | pub unsafe extern "C" fn extractous_extractor_extract_bytes_to_string( 224 | handle: *mut CExtractor, 225 | data: *const u8, 226 | data_len: libc::size_t, 227 | out_content: *mut *mut c_char, 228 | out_metadata: *mut *mut CMetadata, 229 | ) -> libc::c_int { 230 | if data.is_null() { 231 | return ERR_NULL_POINTER; 232 | } 233 | let bytes = unsafe { std::slice::from_raw_parts(data, data_len) }; 234 | 235 | perform_extraction!( 236 | handle, 237 | out_content, 238 | out_metadata, 239 | |extractor: &CoreExtractor| extractor.extract_bytes_to_string(bytes), 240 | |out_c: *mut *mut c_char, out_m: *mut *mut CMetadata, content, metadata| { 241 | unsafe { 242 | *out_c = CString::new(content).map_or(ptr::null_mut(), |s| s.into_raw()); 243 | *out_m = metadata_to_c(metadata); 244 | } 245 | } 246 | ) 247 | } 248 | 249 | /// Extracts content and metadata from a byte slice into a stream. 
250 | #[unsafe(no_mangle)] 251 | pub unsafe extern "C" fn extractous_extractor_extract_bytes( 252 | handle: *mut CExtractor, 253 | data: *const u8, 254 | data_len: libc::size_t, 255 | out_reader: *mut *mut CStreamReader, 256 | out_metadata: *mut *mut CMetadata, 257 | ) -> libc::c_int { 258 | if data.is_null() { 259 | return ERR_NULL_POINTER; 260 | } 261 | let bytes = unsafe { std::slice::from_raw_parts(data, data_len) }; 262 | 263 | perform_extraction!( 264 | handle, 265 | out_reader, 266 | out_metadata, 267 | |extractor: &CoreExtractor| extractor.extract_bytes(bytes), 268 | |out_r: *mut *mut CStreamReader, out_m: *mut *mut CMetadata, reader, metadata| { 269 | unsafe { 270 | *out_r = Box::into_raw(Box::new(reader)) as *mut CStreamReader; 271 | *out_m = metadata_to_c(metadata); 272 | } 273 | } 274 | ) 275 | } 276 | 277 | /// Extracts content and metadata from a URL into a string. 278 | #[unsafe(no_mangle)] 279 | pub unsafe extern "C" fn extractous_extractor_extract_url_to_string( 280 | handle: *mut CExtractor, 281 | url: *const c_char, 282 | out_content: *mut *mut c_char, 283 | out_metadata: *mut *mut CMetadata, 284 | ) -> libc::c_int { 285 | if url.is_null() { 286 | return ERR_NULL_POINTER; 287 | } 288 | let url_str = match unsafe { CStr::from_ptr(url).to_str() } { 289 | Ok(s) => s, 290 | Err(_) => return ERR_INVALID_UTF8, 291 | }; 292 | 293 | perform_extraction!( 294 | handle, 295 | out_content, 296 | out_metadata, 297 | |extractor: &CoreExtractor| extractor.extract_url_to_string(url_str), 298 | |out_c: *mut *mut c_char, out_m: *mut *mut CMetadata, content, metadata| { 299 | unsafe { 300 | *out_c = CString::new(content).map_or(ptr::null_mut(), |s| s.into_raw()); 301 | *out_m = metadata_to_c(metadata); 302 | } 303 | } 304 | ) 305 | } 306 | 307 | /// Extracts content and metadata from a URL into a stream. 
308 | #[unsafe(no_mangle)] 309 | pub unsafe extern "C" fn extractous_extractor_extract_url( 310 | handle: *mut CExtractor, 311 | url: *const c_char, 312 | out_reader: *mut *mut CStreamReader, 313 | out_metadata: *mut *mut CMetadata, 314 | ) -> libc::c_int { 315 | if url.is_null() { 316 | return ERR_NULL_POINTER; 317 | } 318 | let url_str = match unsafe { CStr::from_ptr(url).to_str() } { 319 | Ok(s) => s, 320 | Err(_) => return ERR_INVALID_UTF8, 321 | }; 322 | 323 | perform_extraction!( 324 | handle, 325 | out_reader, 326 | out_metadata, 327 | |extractor: &CoreExtractor| extractor.extract_url(url_str), 328 | |out_r: *mut *mut CStreamReader, out_m: *mut *mut CMetadata, reader, metadata| { 329 | unsafe { 330 | *out_r = Box::into_raw(Box::new(reader)) as *mut CStreamReader; 331 | *out_m = metadata_to_c(metadata); 332 | } 333 | } 334 | ) 335 | } 336 | 337 | /// Frees a C-style string that was allocated by this library. 338 | #[unsafe(no_mangle)] 339 | pub unsafe extern "C" fn extractous_string_free(s: *mut c_char) { 340 | if !s.is_null() { 341 | drop(unsafe { CString::from_raw(s) }); 342 | } 343 | } 344 | -------------------------------------------------------------------------------- /tests/go/integration_test.go: -------------------------------------------------------------------------------- 1 | package extractous_test 2 | 3 | import ( 4 | "os" 5 | "path/filepath" 6 | "strings" 7 | "testing" 8 | 9 | extractous "github.com/rahulpoonia29/extractous-go" 10 | ) 11 | 12 | // ============================================================================ 13 | // Test Setup 14 | // ============================================================================ 15 | 16 | const ( 17 | testDataDir = "../testdata" 18 | ) 19 | 20 | func setupTestDir(t *testing.T) string { 21 | dir := filepath.Join(testDataDir) 22 | if err := os.MkdirAll(dir, 0755); err != nil { 23 | t.Fatalf("Failed to create test data directory: %v", err) 24 | } 25 | return dir 26 | } 27 | 28 | func createTestFile(t 
*testing.T, filename, content string) string { 29 | dir := setupTestDir(t) 30 | filePath := filepath.Join(dir, filename) 31 | 32 | if err := os.WriteFile(filePath, []byte(content), 0644); err != nil { 33 | t.Fatalf("Failed to create test file: %v", err) 34 | } 35 | 36 | return filePath 37 | } 38 | 39 | // ============================================================================ 40 | // Text File Tests 41 | // ============================================================================ 42 | 43 | func TestIntegration_ExtractPlainText(t *testing.T) { 44 | content := "Hello, World!\nThis is a test file." 45 | filePath := createTestFile(t, "test.txt", content) 46 | defer os.Remove(filePath) 47 | 48 | extractor := extractous.New() 49 | if extractor == nil { 50 | t.Fatal("Failed to create extractor") 51 | } 52 | defer extractor.Close() 53 | 54 | extracted, metadata, err := extractor.ExtractFileToString(filePath) 55 | if err != nil { 56 | t.Fatalf("ExtractFileToString failed: %v", err) 57 | } 58 | 59 | if !strings.Contains(extracted, "Hello, World!") { 60 | t.Errorf("Expected content not found in extracted text: %s", extracted) 61 | } 62 | 63 | if !strings.Contains(extracted, "This is a test file") { 64 | t.Errorf("Expected content not found in extracted text: %s", extracted) 65 | } 66 | 67 | if metadata == nil { 68 | t.Error("Expected non-nil metadata") 69 | } 70 | 71 | // Check for common metadata 72 | if contentType := metadata.Get("Content-Type"); contentType == "" { 73 | t.Log("Warning: Content-Type not found in metadata") 74 | } 75 | } 76 | 77 | func TestIntegration_ExtractBytes(t *testing.T) { 78 | content := "Test content for bytes extraction" 79 | 80 | extractor := extractous.New() 81 | if extractor == nil { 82 | t.Fatal("Failed to create extractor") 83 | } 84 | defer extractor.Close() 85 | 86 | extracted, metadata, err := extractor.ExtractBytesToString([]byte(content)) 87 | if err != nil { 88 | t.Fatalf("ExtractBytesToString failed: %v", err) 89 | } 90 | 91 | 
if !strings.Contains(extracted, content) { 92 | t.Errorf("Expected content not found. Got: %s", extracted) 93 | } 94 | 95 | if metadata == nil { 96 | t.Error("Expected non-nil metadata") 97 | } 98 | } 99 | 100 | func TestIntegration_ExtractBytesStream(t *testing.T) { 101 | content := "Test content for streaming bytes extraction" 102 | 103 | extractor := extractous.New() 104 | if extractor == nil { 105 | t.Fatal("Failed to create extractor") 106 | } 107 | defer extractor.Close() 108 | 109 | stream, metadata, err := extractor.ExtractBytes([]byte(content)) 110 | if err != nil { 111 | t.Fatalf("ExtractBytes failed: %v", err) 112 | } 113 | 114 | if stream == nil { 115 | t.Fatal("Expected non-nil stream") 116 | } 117 | 118 | if metadata == nil { 119 | t.Error("Expected non-nil metadata") 120 | } 121 | 122 | // Read content from stream 123 | extractedBytes := make([]byte, 1024) 124 | n, _ := stream.Read(extractedBytes) 125 | 126 | extracted := string(extractedBytes[:n]) 127 | if !strings.Contains(extracted, content) { 128 | t.Errorf("Expected content not found in stream. 
Got: %s", extracted) 129 | } 130 | } 131 | 132 | // ============================================================================ 133 | // Configuration Tests 134 | // ============================================================================ 135 | 136 | func TestIntegration_MaxLengthConfiguration(t *testing.T) { 137 | // Create a long text file 138 | longContent := strings.Repeat("A", 10000) 139 | filePath := createTestFile(t, "long_test.txt", longContent) 140 | defer os.Remove(filePath) 141 | 142 | // Extract with small max length 143 | extractor := extractous.New().SetExtractStringMaxLength(100) 144 | if extractor == nil { 145 | t.Fatal("Failed to create extractor") 146 | } 147 | defer extractor.Close() 148 | 149 | extracted, _, err := extractor.ExtractFileToString(filePath) 150 | if err != nil { 151 | t.Fatalf("ExtractFileToString failed: %v", err) 152 | } 153 | 154 | // Extracted content should be truncated (or close to max length) 155 | if len(extracted) > 200 { // Some overhead is allowed 156 | t.Logf("Warning: Extracted %d chars, expected ~100", len(extracted)) 157 | } 158 | } 159 | 160 | func TestIntegration_EncodingConfiguration(t *testing.T) { 161 | content := "UTF-8 content: こんにちは" 162 | filePath := createTestFile(t, "utf8_test.txt", content) 163 | defer os.Remove(filePath) 164 | 165 | extractor := extractous.New().SetEncoding(extractous.CharSetUTF8) 166 | if extractor == nil { 167 | t.Fatal("Failed to create extractor") 168 | } 169 | defer extractor.Close() 170 | 171 | extracted, _, err := extractor.ExtractFileToString(filePath) 172 | if err != nil { 173 | t.Fatalf("ExtractFileToString failed: %v", err) 174 | } 175 | 176 | if !strings.Contains(extracted, "UTF-8") { 177 | t.Logf("Extracted content: %s", extracted) 178 | } 179 | } 180 | 181 | func TestIntegration_XmlOutputConfiguration(t *testing.T) { 182 | content := "Test content for XML output" 183 | filePath := createTestFile(t, "xml_test.txt", content) 184 | defer os.Remove(filePath) 185 | 186 | // 
Test with XML output enabled 187 | extractor := extractous.New().SetXmlOutput(true) 188 | if extractor == nil { 189 | t.Fatal("Failed to create extractor") 190 | } 191 | defer extractor.Close() 192 | 193 | extracted, _, err := extractor.ExtractFileToString(filePath) 194 | if err != nil { 195 | t.Fatalf("ExtractFileToString failed: %v", err) 196 | } 197 | 198 | // XML output should contain XML tags 199 | if !strings.Contains(extracted, "<") { 200 | t.Logf("Warning: XML output doesn't seem to contain XML tags: %s", extracted[:min(100, len(extracted))]) 201 | } 202 | 203 | // Test with XML output disabled 204 | extractor2 := extractous.New().SetXmlOutput(false) 205 | if extractor2 == nil { 206 | t.Fatal("Failed to create extractor") 207 | } 208 | defer extractor2.Close() 209 | 210 | extracted2, _, err := extractor2.ExtractFileToString(filePath) 211 | if err != nil { 212 | t.Fatalf("ExtractFileToString failed: %v", err) 213 | } 214 | 215 | if strings.Contains(extracted2, "Test content") { 216 | t.Log("Plain text extraction successful") 217 | } 218 | } 219 | 220 | // ============================================================================ 221 | // Metadata Tests 222 | // ============================================================================ 223 | 224 | func TestIntegration_MetadataExtraction(t *testing.T) { 225 | content := "Test content" 226 | filePath := createTestFile(t, "metadata_test.txt", content) 227 | defer os.Remove(filePath) 228 | 229 | extractor := extractous.New() 230 | if extractor == nil { 231 | t.Fatal("Failed to create extractor") 232 | } 233 | defer extractor.Close() 234 | 235 | _, metadata, err := extractor.ExtractFileToString(filePath) 236 | if err != nil { 237 | t.Fatalf("ExtractFileToString failed: %v", err) 238 | } 239 | 240 | if metadata == nil { 241 | t.Fatal("Expected non-nil metadata") 242 | } 243 | 244 | // Test metadata methods 245 | if !metadata.Has("Content-Type") { 246 | t.Log("Warning: Content-Type not found in metadata") 247 | 
} 248 | 249 | keys := metadata.Keys() 250 | if len(keys) == 0 { 251 | t.Error("Expected some metadata keys") 252 | } 253 | 254 | t.Logf("Metadata keys: %v", keys) 255 | 256 | // Test Get method 257 | for _, key := range keys { 258 | value := metadata.Get(key) 259 | if value == "" { 260 | t.Errorf("Get returned empty string for existing key: %s", key) 261 | } 262 | t.Logf("%s: %s", key, value) 263 | } 264 | 265 | // Test GetAll method 266 | for _, key := range keys { 267 | values := metadata.GetAll(key) 268 | if len(values) == 0 { 269 | t.Errorf("GetAll returned nil/empty for existing key: %s", key) 270 | } 271 | } 272 | } 273 | 274 | func TestIntegration_MetadataWithMultipleValues(t *testing.T) { 275 | // Some metadata fields can have multiple values (comma-separated) 276 | content := "Test content" 277 | filePath := createTestFile(t, "multi_meta_test.txt", content) 278 | defer os.Remove(filePath) 279 | 280 | extractor := extractous.New() 281 | if extractor == nil { 282 | t.Fatal("Failed to create extractor") 283 | } 284 | defer extractor.Close() 285 | 286 | _, metadata, err := extractor.ExtractFileToString(filePath) 287 | if err != nil { 288 | t.Fatalf("ExtractFileToString failed: %v", err) 289 | } 290 | 291 | // Check if any metadata has multiple values 292 | for _, key := range metadata.Keys() { 293 | values := metadata.GetAll(key) 294 | if len(values) > 1 { 295 | t.Logf("Key '%s' has multiple values: %v", key, values) 296 | } 297 | } 298 | } 299 | 300 | // ============================================================================ 301 | // Error Handling Tests 302 | // ============================================================================ 303 | 304 | func TestIntegration_NonexistentFile(t *testing.T) { 305 | extractor := extractous.New() 306 | if extractor == nil { 307 | t.Fatal("Failed to create extractor") 308 | } 309 | defer extractor.Close() 310 | 311 | _, _, err := extractor.ExtractFileToString("/nonexistent/file.txt") 312 | if err == nil { 313 | 
t.Error("Expected error for nonexistent file") 314 | } 315 | } 316 | 317 | func TestIntegration_EmptyFile(t *testing.T) { 318 | filePath := createTestFile(t, "empty.txt", "") 319 | defer os.Remove(filePath) 320 | 321 | extractor := extractous.New() 322 | if extractor == nil { 323 | t.Fatal("Failed to create extractor") 324 | } 325 | defer extractor.Close() 326 | 327 | extracted, metadata, err := extractor.ExtractFileToString(filePath) 328 | if err != nil { 329 | t.Fatalf("ExtractFileToString failed: %v", err) 330 | } 331 | 332 | if extracted != "" { 333 | t.Logf("Note: Empty file produced content: %s", extracted) 334 | } 335 | 336 | if metadata == nil { 337 | t.Error("Expected non-nil metadata even for empty file") 338 | } 339 | } 340 | 341 | // ============================================================================ 342 | // Concurrency Tests 343 | // ============================================================================ 344 | 345 | func TestIntegration_ConcurrentExtraction(t *testing.T) { 346 | content := "Concurrent test content" 347 | filePath := createTestFile(t, "concurrent_test.txt", content) 348 | defer os.Remove(filePath) 349 | 350 | const numGoroutines = 10 351 | errors := make(chan error, numGoroutines) 352 | 353 | for i := range numGoroutines { 354 | go func(id int) { 355 | extractor := extractous.New() 356 | if extractor == nil { 357 | errors <- nil // Signal completion even on nil 358 | return 359 | } 360 | defer extractor.Close() 361 | 362 | extracted, _, err := extractor.ExtractFileToString(filePath) 363 | if err != nil { 364 | errors <- err 365 | return 366 | } 367 | 368 | if !strings.Contains(extracted, content) { 369 | errors <- nil 370 | return 371 | } 372 | 373 | errors <- nil // Success 374 | }(i) 375 | } 376 | 377 | // Wait for all goroutines 378 | for i := 0; i < numGoroutines; i++ { 379 | err := <-errors 380 | if err != nil { 381 | t.Errorf("Goroutine failed: %v", err) 382 | } 383 | } 384 | } 385 | 386 | func 
TestIntegration_MultipleExtractorsSameFile(t *testing.T) { 387 | content := "Multiple extractors test" 388 | filePath := createTestFile(t, "multi_ext_test.txt", content) 389 | defer os.Remove(filePath) 390 | 391 | extractors := make([]*extractous.Extractor, 5) 392 | for i := range extractors { 393 | extractors[i] = extractous.New() 394 | if extractors[i] == nil { 395 | t.Fatal("Failed to create extractor") 396 | } 397 | defer extractors[i].Close() 398 | } 399 | 400 | // All extractors extract the same file 401 | for i, ext := range extractors { 402 | extracted, _, err := ext.ExtractFileToString(filePath) 403 | if err != nil { 404 | t.Errorf("Extractor %d failed: %v", i, err) 405 | } 406 | if !strings.Contains(extracted, content) { 407 | t.Errorf("Extractor %d didn't extract correct content", i) 408 | } 409 | } 410 | } 411 | 412 | // ============================================================================ 413 | // Helper Functions 414 | // ============================================================================ 415 | 416 | func min(a, b int) int { 417 | if a < b { 418 | return a 419 | } 420 | return b 421 | } 422 | -------------------------------------------------------------------------------- /extractous.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Extractous FFI - C Interface 3 | * 4 | * This header file provides a C-compatible interface to the Extractous 5 | * document extraction library. It is safe for use with Go via cgo or any 6 | * C-compatible FFI system. 7 | * 8 | * License: Apache-2.0 9 | * Repository: https://github.com/rahulpoonia29/extractous-go 10 | * 11 | * MEMORY MANAGEMENT: 12 | * Strings allocated by Extractous functions must be freed with extractous_string_free; 13 | * metadata, stream readers and buffers have their own free functions (extractous_metadata_free, extractous_stream_free, extractous_buffer_free). Failure to do so will result in memory leaks.
14 | * 15 | * 16 | * CGO USAGE: 17 | * // #cgo CFLAGS: -I${SRCDIR}/include 18 | * // #cgo LDFLAGS: -L${SRCDIR}/lib -lextractous_ffi 19 | * // #cgo linux LDFLAGS: -Wl,-rpath,$ORIGIN 20 | * // #cgo darwin LDFLAGS: -Wl,-rpath,@loader_path 21 | * // #include "extractous.h" 22 | * import "C" 23 | */ 24 | 25 | 26 | #ifndef EXTRACTOUS_H 27 | #define EXTRACTOUS_H 28 | 29 | #include <stdarg.h> 30 | #include <stdbool.h> 31 | #include <stdint.h> 32 | #include <stdlib.h> 33 | 34 | #define ERR_OK 0 35 | 36 | #define ERR_NULL_POINTER -1 37 | 38 | #define ERR_INVALID_UTF8 -2 39 | 40 | #define ERR_INVALID_STRING -3 41 | 42 | #define ERR_EXTRACTION_FAILED -4 43 | 44 | #define ERR_IO_ERROR -5 45 | 46 | #define ERR_INVALID_CONFIG -6 47 | 48 | #define ERR_INVALID_ENUM -7 49 | 50 | #define ERR_UNSUPPORTED_FORMAT -8 51 | 52 | #define ERR_OUT_OF_MEMORY -9 53 | 54 | #define ERR_OCR_FAILED -10 55 | 56 | #define CHARSET_UTF_8 0 57 | 58 | #define CHARSET_US_ASCII 1 59 | 60 | #define CHARSET_UTF_16BE 3 61 | 62 | #define PDF_OCR_STRATEGY_NO_OCR 0 63 | 64 | #define PDF_OCR_STRATEGY_OCR_ONLY 1 65 | 66 | #define PDF_OCR_STRATEGY_OCR_AND_TEXT_EXTRACTION 2 67 | 68 | #define PDF_OCR_STRATEGY_AUTO 3 69 | 70 | typedef struct CPdfParserConfig { 71 | uint8_t _private[0]; 72 | } CPdfParserConfig; 73 | 74 | typedef struct COfficeParserConfig { 75 | uint8_t _private[0]; 76 | } COfficeParserConfig; 77 | 78 | typedef struct CTesseractOcrConfig { 79 | uint8_t _private[0]; 80 | } CTesseractOcrConfig; 81 | 82 | typedef struct CExtractor { 83 | uint8_t _private[0]; 84 | } CExtractor; 85 | 86 | typedef struct CMetadata { 87 | /* 88 | Array of pointers to null-terminated key strings 89 | */ 90 | char **keys; 91 | /* 92 | Array of pointers to null-terminated value strings 93 | */ 94 | char **values; 95 | /* 96 | The number of key-value pairs in the arrays 97 | */ 98 | size_t len; 99 | } CMetadata; 100 | 101 | typedef struct CStreamReader { 102 | uint8_t _private[0]; 103 | } CStreamReader; 104 | 105 | /* 106 | Returns the FFI wrapper version as a
null-terminated UTF-8 string. 107 | The returned pointer is to a static string and must not be freed. 108 | */ 109 | const char *extractous_ffi_version(void); 110 | 111 | /* 112 | Returns the underlying Extractous core library version. 113 | The returned pointer is to a static string and must not be freed. 114 | */ 115 | const char *extractous_core_version(void); 116 | 117 | /* 118 | Creates a new PDF parser configuration with default settings. 119 | The returned handle must be freed with `extractous_pdf_config_free()` 120 | unless passed to an extractor, which will take ownership. 121 | */ 122 | struct CPdfParserConfig *extractous_pdf_config_new(void); 123 | 124 | /* 125 | Frees the memory associated with a PDF parser configuration. 126 | Do not call this if the config has been attached to an extractor. 127 | */ 128 | void extractous_pdf_config_free(struct CPdfParserConfig *handle); 129 | 130 | /* 131 | Sets the OCR strategy for PDF parsing. Modifies the config in-place. 132 | */ 133 | void extractous_pdf_config_set_ocr_strategy(struct CPdfParserConfig *handle, int strategy); 134 | 135 | /* 136 | Enables or disables extraction of inline images. Modifies the config in-place. 137 | */ 138 | void extractous_pdf_config_set_extract_inline_images(struct CPdfParserConfig *handle, bool value); 139 | 140 | /* 141 | If enabled, only unique inline images (by digest) will be extracted. 142 | */ 143 | void extractous_pdf_config_set_extract_unique_inline_images_only(struct CPdfParserConfig *handle, 144 | bool value); 145 | 146 | /* 147 | Enables or disables extraction of text from marked content sections. 148 | */ 149 | void extractous_pdf_config_set_extract_marked_content(struct CPdfParserConfig *handle, bool value); 150 | 151 | /* 152 | Enables or disables extraction of text from annotations. 
153 | */ 154 | void extractous_pdf_config_set_extract_annotation_text(struct CPdfParserConfig *handle, bool value); 155 | 156 | /* 157 | Creates a new Office parser configuration with default settings. 158 | */ 159 | struct COfficeParserConfig *extractous_office_config_new(void); 160 | 161 | /* 162 | Frees the memory associated with an Office parser configuration. 163 | */ 164 | void extractous_office_config_free(struct COfficeParserConfig *handle); 165 | 166 | /* 167 | Enables or disables macro extraction. Modifies the config in-place. 168 | */ 169 | void extractous_office_config_set_extract_macros(struct COfficeParserConfig *handle, bool value); 170 | 171 | /* 172 | Enables or disables inclusion of deleted content (track changes). 173 | */ 174 | void extractous_office_config_set_include_deleted_content(struct COfficeParserConfig *handle, 175 | bool value); 176 | 177 | /* 178 | Enables or disables inclusion of moved-from content (track changes). 179 | */ 180 | void extractous_office_config_set_include_move_from_content(struct COfficeParserConfig *handle, 181 | bool value); 182 | 183 | /* 184 | Enables or disables inclusion of content from shapes. 185 | */ 186 | void extractous_office_config_set_include_shape_based_content(struct COfficeParserConfig *handle, 187 | bool value); 188 | 189 | /* 190 | Creates a new Tesseract OCR configuration with default settings. 191 | */ 192 | struct CTesseractOcrConfig *extractous_ocr_config_new(void); 193 | 194 | /* 195 | Frees the memory associated with a Tesseract OCR configuration. 196 | */ 197 | void extractous_ocr_config_free(struct CTesseractOcrConfig *handle); 198 | 199 | /* 200 | Sets the OCR language. Modifies the config in-place. 201 | */ 202 | void extractous_ocr_config_set_language(struct CTesseractOcrConfig *handle, const char *language); 203 | 204 | /* 205 | Sets the DPI for OCR processing. Modifies the config in-place. 
206 | */ 207 | void extractous_ocr_config_set_density(struct CTesseractOcrConfig *handle, int32_t density); 208 | 209 | /* 210 | Sets the bit depth for OCR processing. 211 | */ 212 | void extractous_ocr_config_set_depth(struct CTesseractOcrConfig *handle, int32_t depth); 213 | 214 | /* 215 | Enables or disables image preprocessing for OCR. 216 | */ 217 | void extractous_ocr_config_set_enable_image_preprocessing(struct CTesseractOcrConfig *handle, 218 | bool value); 219 | 220 | /* 221 | Sets the timeout for the Tesseract process in seconds. 222 | */ 223 | void extractous_ocr_config_set_timeout_seconds(struct CTesseractOcrConfig *handle, int32_t seconds); 224 | 225 | char *extractous_error_message(int code); 226 | 227 | /* 228 | Retrieves a detailed debug report for the last error on this thread, 229 | including the full error chain and a backtrace if RUST_BACKTRACE=1. 230 | */ 231 | char *extractous_error_get_last_debug(void); 232 | 233 | /* 234 | Checks if debug information is available for the current thread 235 | */ 236 | int extractous_error_has_debug(void); 237 | 238 | void extractous_error_clear_last(void); 239 | 240 | /* 241 | Creates a new `Extractor` with a default configuration. 242 | The returned handle must be freed with `extractous_extractor_free`. 243 | */ 244 | struct CExtractor *extractous_extractor_new(void); 245 | 246 | /* 247 | Frees the memory associated with an `Extractor` handle. 248 | */ 249 | void extractous_extractor_free(struct CExtractor *handle); 250 | 251 | /* 252 | Sets the maximum length for extracted string content. 253 | */ 254 | void extractous_extractor_set_extract_string_max_length_mut(struct CExtractor *handle, 255 | int max_length); 256 | 257 | /* 258 | Sets the character encoding for the extracted text. 259 | */ 260 | void extractous_extractor_set_encoding_mut(struct CExtractor *handle, int encoding); 261 | 262 | /* 263 | Sets the configuration for the PDF parser.
264 | */ 265 | void extractous_extractor_set_pdf_config_mut(struct CExtractor *handle, 266 | const struct CPdfParserConfig *config); 267 | 268 | /* 269 | Sets the configuration for the Office document parser. 270 | */ 271 | void extractous_extractor_set_office_config_mut(struct CExtractor *handle, 272 | const struct COfficeParserConfig *config); 273 | 274 | /* 275 | Sets the configuration for Tesseract OCR. 276 | */ 277 | void extractous_extractor_set_ocr_config_mut(struct CExtractor *handle, 278 | const struct CTesseractOcrConfig *config); 279 | 280 | /* 281 | Sets whether to output structured XML instead of plain text. 282 | */ 283 | void extractous_extractor_set_xml_output_mut(struct CExtractor *handle, bool xml_output); 284 | 285 | /* 286 | Extracts content and metadata from a local file path into a string. 287 | 288 | Output strings must be freed with `extractous_string_free`. 289 | Output metadata must be freed with `extractous_metadata_free`. 290 | */ 291 | int extractous_extractor_extract_file_to_string(struct CExtractor *handle, 292 | const char *path, 293 | char **out_content, 294 | struct CMetadata **out_metadata); 295 | 296 | /* 297 | Extracts content and metadata from a local file path into a stream. 298 | */ 299 | int extractous_extractor_extract_file(struct CExtractor *handle, 300 | const char *path, 301 | struct CStreamReader **out_reader, 302 | struct CMetadata **out_metadata); 303 | 304 | /* 305 | Extracts content and metadata from a byte slice into a string. 306 | */ 307 | int extractous_extractor_extract_bytes_to_string(struct CExtractor *handle, 308 | const uint8_t *data, 309 | size_t data_len, 310 | char **out_content, 311 | struct CMetadata **out_metadata); 312 | 313 | /* 314 | Extracts content and metadata from a byte slice into a stream. 
315 | */ 316 | int extractous_extractor_extract_bytes(struct CExtractor *handle, 317 | const uint8_t *data, 318 | size_t data_len, 319 | struct CStreamReader **out_reader, 320 | struct CMetadata **out_metadata); 321 | 322 | /* 323 | Extracts content and metadata from a URL into a string. 324 | */ 325 | int extractous_extractor_extract_url_to_string(struct CExtractor *handle, 326 | const char *url, 327 | char **out_content, 328 | struct CMetadata **out_metadata); 329 | 330 | /* 331 | Extracts content and metadata from a URL into a stream. 332 | */ 333 | int extractous_extractor_extract_url(struct CExtractor *handle, 334 | const char *url, 335 | struct CStreamReader **out_reader, 336 | struct CMetadata **out_metadata); 337 | 338 | /* 339 | Frees a C-style string that was allocated by this library. 340 | */ 341 | void extractous_string_free(char *s); 342 | 343 | /* 344 | Frees a metadata structure and all associated memory. 345 | */ 346 | void extractous_metadata_free(struct CMetadata *metadata); 347 | 348 | /* 349 | Reads data from a stream into a user-provided buffer. 350 | 351 | Returns the actual number of bytes read via the `bytes_read` output parameter. 352 | Reaching the end of the stream is indicated by `ERR_OK` and `*bytes_read == 0`. 353 | */ 354 | int extractous_stream_read(struct CStreamReader *handle, 355 | uint8_t *buffer, 356 | size_t buffer_size, 357 | size_t *bytes_read); 358 | 359 | /* 360 | Reads exactly `buffer_size` bytes from the stream. 361 | 362 | Function will continue reading until the buffer is full, or the end of 363 | the stream is reached, or an error occurs. 364 | */ 365 | int extractous_stream_read_exact(struct CStreamReader *handle, 366 | uint8_t *buffer, 367 | size_t buffer_size, 368 | size_t *bytes_read); 369 | 370 | /* 371 | Reads the remaining stream into a newly allocated buffer. 
372 | */ 373 | int extractous_stream_read_all(struct CStreamReader *handle, 374 | uint8_t **out_buffer, 375 | size_t *out_size); 376 | 377 | /* 378 | Frees a buffer allocated by `extractous_stream_read_all`. 379 | */ 380 | void extractous_buffer_free(uint8_t *buffer, size_t size); 381 | 382 | /* 383 | Frees a stream reader and releases its resources. 384 | */ 385 | void extractous_stream_free(struct CStreamReader *handle); 386 | 387 | #endif /* EXTRACTOUS_H */ 388 | -------------------------------------------------------------------------------- /cmd/install/main.go: -------------------------------------------------------------------------------- 1 | // go run github.com/rahulpoonia29/extractous-go/cmd/install@latest 2 | package main 3 | 4 | import ( 5 | "archive/tar" 6 | "archive/zip" 7 | "compress/gzip" 8 | "context" 9 | "encoding/json" 10 | "flag" 11 | "fmt" 12 | "io" 13 | "log" 14 | "math" 15 | "net/http" 16 | "os" 17 | "path/filepath" 18 | "runtime" 19 | "sort" 20 | "strconv" 21 | "strings" 22 | "time" 23 | 24 | "github.com/schollz/progressbar/v3" 25 | ) 26 | 27 | const ( 28 | repoOwner = "rahulpoonia29" 29 | repoName = "extractous-go" 30 | nativeDir = "native" 31 | ) 32 | 33 | type platformList []string 34 | 35 | func (p *platformList) String() string { 36 | return strings.Join(*p, ", ") 37 | } 38 | 39 | func (p *platformList) Set(value string) error { 40 | *p = append(*p, value) 41 | return nil 42 | } 43 | 44 | var ( 45 | verbose bool 46 | client = http.DefaultClient 47 | ) 48 | 49 | func main() { 50 | var platforms platformList 51 | var listPlatforms, downloadAll bool 52 | 53 | flag.Var(&platforms, "platform", "Specify a platform to download (e.g., linux_amd64). 
Can be used multiple times.") 54 | flag.BoolVar(&listPlatforms, "list", false, "List available platforms from the latest release and exit.") 55 | flag.BoolVar(&downloadAll, "all", false, "Download all available platforms from the latest release.") 56 | flag.BoolVar(&verbose, "v", false, "Verbose logging") 57 | flag.Parse() 58 | 59 | // use logging for errors and info (timestamps) 60 | log.SetFlags(0) // keep messages clean 61 | infof("Fetching Extractous FFI release information from GitHub...") 62 | 63 | availablePlatforms, err := getAvailablePlatforms() 64 | if err != nil { 65 | fatalf("Error retrieving available platforms: %v", err) 66 | } 67 | 68 | if listPlatforms { 69 | printAvailablePlatforms(availablePlatforms) 70 | return 71 | } 72 | 73 | platformsToDownload := determinePlatformsToDownload(platforms, downloadAll, availablePlatforms) 74 | if len(platformsToDownload) == 0 { 75 | infof("No platforms selected for download.") 76 | infof("Available platforms (run with --list to view):") 77 | printAvailablePlatforms(availablePlatforms) 78 | infof("To install for this machine run without flags, or pass --platform for the platform you want.") 79 | return 80 | } 81 | 82 | infof("Platforms selected for installation: %s", strings.Join(platformsToDownload, ", ")) 83 | 84 | for _, platform := range platformsToDownload { 85 | archiveURL, ok := availablePlatforms[platform] 86 | if !ok { 87 | log.Printf("Warning: Platform '%s' not found in latest release. 
Skipping.", platform) 88 | continue 89 | } 90 | 91 | infof("Downloading release for platform: %s", platform) 92 | 93 | archivePath, err := downloadFileWithRetries(archiveURL, 3) 94 | if err != nil { 95 | fatalf("Failed to download asset for %s: %v", platform, err) 96 | } 97 | // ensure cleanup of downloaded archive 98 | defer os.Remove(archivePath) 99 | 100 | archiveFormat := "tar.gz" 101 | if strings.HasSuffix(archiveURL, ".zip") { 102 | archiveFormat = "zip" 103 | } 104 | 105 | if err := extractArchive(archivePath, nativeDir, platform, archiveFormat); err != nil { 106 | // attempt cleanup of partial extraction 107 | destPath := filepath.Join(nativeDir, platform) 108 | _ = os.RemoveAll(destPath) 109 | fatalf("Failed to extract archive for %s: %v", platform, err) 110 | } 111 | infof("Libraries for %s extracted to ./%s/%s", platform, nativeDir, platform) 112 | } 113 | 114 | infof("Installation completed successfully.") 115 | } 116 | 117 | func infof(format string, args ...interface{}) { 118 | fmt.Printf(format+"\n", args...) 119 | } 120 | 121 | func fatalf(format string, args ...interface{}) { 122 | log.Fatalf(format, args...) 
123 | } 124 | 125 | func printAvailablePlatforms(platforms map[string]string) { 126 | if len(platforms) == 0 { 127 | fmt.Println(" (no platforms found)") 128 | return 129 | } 130 | names := make([]string, 0, len(platforms)) 131 | for n := range platforms { 132 | names = append(names, n) 133 | } 134 | sort.Strings(names) 135 | for _, name := range names { 136 | fmt.Printf(" - %s\n", name) 137 | } 138 | } 139 | 140 | func determinePlatformsToDownload(platforms platformList, downloadAll bool, availablePlatforms map[string]string) []string { 141 | if downloadAll { 142 | keys := make([]string, 0, len(availablePlatforms)) 143 | for k := range availablePlatforms { 144 | keys = append(keys, k) 145 | } 146 | sort.Strings(keys) 147 | return keys 148 | } 149 | 150 | if len(platforms) > 0 { 151 | return platforms 152 | } 153 | 154 | currentPlatform, _ := getPlatformAndFormat() 155 | if _, ok := availablePlatforms[currentPlatform]; ok { 156 | return []string{currentPlatform} 157 | } 158 | 159 | // not found for current platform 160 | return []string{} 161 | } 162 | 163 | func getAvailablePlatforms() (map[string]string, error) { 164 | apiURL := fmt.Sprintf("https://api.github.com/repos/%s/%s/releases/latest", repoOwner, repoName) 165 | 166 | var resp *http.Response 167 | var err error 168 | 169 | // simple retry here too 170 | for attempt := 0; attempt < 3; attempt++ { 171 | ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) 172 | defer cancel() 173 | req, _ := http.NewRequestWithContext(ctx, http.MethodGet, apiURL, nil) 174 | req.Header.Set("Accept", "application/vnd.github.v3+json") 175 | resp, err = client.Do(req) 176 | if err == nil { 177 | break 178 | } 179 | wait := time.Duration(math.Pow(2, float64(attempt))) * time.Second 180 | time.Sleep(wait) 181 | } 182 | if err != nil { 183 | return nil, fmt.Errorf("failed to query GitHub API: %w", err) 184 | } 185 | defer resp.Body.Close() 186 | 187 | if resp.StatusCode == http.StatusForbidden { 188 | remaining 
:= resp.Header.Get("X-RateLimit-Remaining") 189 | reset := resp.Header.Get("X-RateLimit-Reset") 190 | if remaining == "0" && reset != "" { 191 | // parse reset as unix timestamp 192 | if ts, err := strconv.ParseInt(reset, 10, 64); err == nil { 193 | resetTime := time.Unix(ts, 0).Local() 194 | // Round to nearest minute for nicer display 195 | duration := max(time.Until(resetTime), 0) 196 | humanWait := fmt.Sprintf("about %d min", int(duration.Minutes()+0.5)) 197 | 198 | return nil, fmt.Errorf( 199 | "GitHub API rate limit exceeded.\nLimit resets at: %s (%s from now)\nTip: set a personal access token to increase your limit", 200 | resetTime.Format("Mon 2 15:04 MST"), 201 | humanWait, 202 | ) 203 | } 204 | } 205 | return nil, fmt.Errorf("access forbidden from GitHub API: %s", resp.Status) 206 | } 207 | 208 | if resp.StatusCode != http.StatusOK { 209 | return nil, fmt.Errorf("unexpected status from GitHub API: %s", resp.Status) 210 | } 211 | 212 | var releaseInfo struct { 213 | Assets []struct { 214 | Name string `json:"name"` 215 | DownloadURL string `json:"browser_download_url"` 216 | } `json:"assets"` 217 | } 218 | 219 | if err := json.NewDecoder(resp.Body).Decode(&releaseInfo); err != nil { 220 | return nil, fmt.Errorf("failed to decode GitHub release info: %w", err) 221 | } 222 | 223 | platforms := make(map[string]string) 224 | for _, asset := range releaseInfo.Assets { 225 | if !strings.HasPrefix(asset.Name, "extractous-ffi-") { 226 | continue 227 | } 228 | // skip checksum assets like .sha256 229 | if strings.HasSuffix(asset.Name, ".sha256") || strings.HasSuffix(asset.Name, ".sha256.txt") { 230 | if verbose { 231 | log.Printf("Skipping checksum asset: %s", asset.Name) 232 | } 233 | continue 234 | } 235 | after := strings.TrimPrefix(asset.Name, "extractous-ffi-") 236 | name := strings.TrimSuffix(after, ".zip") 237 | name = strings.TrimSuffix(name, ".tar.gz") 238 | name = strings.TrimSuffix(name, ".tgz") 239 | platforms[name] = asset.DownloadURL 240 | } 241 | 242 
| if len(platforms) == 0 { 243 | return nil, fmt.Errorf("no compatible FFI assets found in the latest release") 244 | } 245 | 246 | return platforms, nil 247 | }

// getPlatformAndFormat maps the running OS and architecture to the release
// asset platform key ("<goos>_<goarch>") and the archive format used for that
// OS: tar.gz for Linux and macOS, zip for Windows.
//
// Any other OS is reported through fatalf; the empty return after it only
// matters if fatalf returns (presumably it exits the process; confirm against
// its definition).
func getPlatformAndFormat() (platform, format string) {
	goos := runtime.GOOS
	goarch := runtime.GOARCH

	switch goos {
	case "linux", "darwin":
		return goos + "_" + goarch, "tar.gz"
	case "windows":
		return goos + "_" + goarch, "zip"
	default:
		fatalf("Unsupported operating system: %s", goos)
		return "", ""
	}
}

// downloadFileWithRetries downloads url to a temporary file, retrying up to
// attempts times with a quadratic backoff (4s, 9s, 16s, ...) between tries.
// It returns the path of the downloaded file, or the last error wrapped with
// the number of attempts made. A non-positive attempts value is treated as 1.
func downloadFileWithRetries(url string, attempts int) (string, error) {
	// Always try at least once; otherwise the loop is skipped and the final
	// error would wrap a nil cause.
	attempts = max(attempts, 1)

	var lastErr error
	for i := 1; i <= attempts; i++ {
		if i > 1 {
			backoff := time.Duration(i*i) * time.Second
			if verbose {
				log.Printf("Retrying in %s...", backoff)
			}
			time.Sleep(backoff)
		}

		path, err := downloadFile(url)
		if err == nil {
			return path, nil
		}
		lastErr = err
		if verbose {
			log.Printf("Attempt %d/%d failed: %v", i, attempts, err)
		}
	}
	return "", fmt.Errorf("download failed after %d attempts: %w", attempts, lastErr)
}

// downloadFile fetches url into a new temporary file while rendering a
// progress bar, and returns the temporary file's path. The whole transfer is
// bounded by a five-minute timeout. On any failure the partially written
// temporary file is removed so retries do not accumulate stale downloads.
func downloadFile(url string) (path string, err error) {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
	defer cancel()

	// A malformed URL yields a nil request; passing that to client.Do would
	// panic, so the error must be checked here.
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return "", fmt.Errorf("building download request: %w", err)
	}

	resp, err := client.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("bad status: %s", resp.Status)
	}

	tmpFile, err := os.CreateTemp("", "extractous-*.download")
	if err != nil {
		return "", err
	}
	defer func() {
		// A failed Close can mean buffered data never reached disk, so it
		// counts as a download failure.
		if cerr := tmpFile.Close(); err == nil {
			err = cerr
		}
		if err != nil {
			// Best-effort cleanup; the original error is the one to report.
			_ = os.Remove(tmpFile.Name())
			path = ""
		}
	}()

	bar := progressbar.NewOptions64(
		resp.ContentLength,
		progressbar.OptionSetWidth(30),
		progressbar.OptionShowBytes(true),
		progressbar.OptionSetDescription("Downloading"),
		progressbar.OptionShowCount(),
		progressbar.OptionShowElapsedTimeOnFinish(),
		progressbar.OptionSetTheme(progressbar.Theme{
			Saucer:        "=",
			SaucerHead:    ">",
			SaucerPadding: " ",
			BarStart:      "[",
			BarEnd:        "]",
		}),
	)
	if _, err = io.Copy(io.MultiWriter(tmpFile, bar), resp.Body); err != nil {
		return "", err
	}
	// Terminate the progress bar line.
	fmt.Fprintln(os.Stderr)

	return tmpFile.Name(), nil
}

// extractArchive unpacks the archive at src into dest/platform, creating that
// directory if needed. format selects the unpacker: "zip", or "tar.gz"/"tgz"
// for gzip-compressed tarballs. Any other format is an error.
func extractArchive(src, dest, platform, format string) error {
	destPath := filepath.Join(dest, platform)
	if err := os.MkdirAll(destPath, 0o755); err != nil {
		return err
	}

	switch format {
	case "zip":
		return unzip(src, destPath)
	case "tar.gz", "tgz":
		return untar(src, destPath)
	default:
		return fmt.Errorf("unsupported archive format: %s", format)
	}
}

// safeJoin joins an archive entry name onto dest and returns the absolute
// result, rejecting any name that would resolve outside dest (zip-slip / path
// traversal). Backslashes in name are treated as path separators so that
// Windows-style entry names are checked on every platform.
func safeJoin(dest, name string) (string, error) {
	absDest, err := filepath.Abs(dest)
	if err != nil {
		return "", err
	}

	sep := string(os.PathSeparator)
	cleanName := filepath.Clean(strings.ReplaceAll(name, "\\", sep))
	absJoined, err := filepath.Abs(filepath.Join(absDest, cleanName))
	if err != nil {
		return "", err
	}

	// A filesystem root already ends in a separator; appending another would
	// make the prefix check below reject every entry.
	prefix := absDest
	if !strings.HasSuffix(prefix, sep) {
		prefix += sep
	}

	// Allow the entry to be exactly the dest dir or anything inside it.
	if absJoined == absDest || strings.HasPrefix(absJoined, prefix) {
		return absJoined, nil
	}
	return "", fmt.Errorf("illegal file path outside destination: %s", name)
}

// writeExtractedFile streams src into targetPath, creating parent directories
// as needed and truncating any existing file. The Close error is reported
// because it may be the only sign that buffered data failed to reach disk.
func writeExtractedFile(targetPath string, perm os.FileMode, src io.Reader) (err error) {
	if err = os.MkdirAll(filepath.Dir(targetPath), 0o755); err != nil {
		return err
	}

	out, err := os.OpenFile(targetPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, perm)
	if err != nil {
		return err
	}
	defer func() {
		if cerr := out.Close(); err == nil {
			err = cerr
		}
	}()

	_, err = io.Copy(out, src)
	return err
}

// unzipEntry writes the contents of a single regular zip entry to targetPath
// using the entry's permission bits.
func unzipEntry(f *zip.File, targetPath string) error {
	rc, err := f.Open()
	if err != nil {
		return err
	}
	defer rc.Close()

	return writeExtractedFile(targetPath, f.Mode().Perm(), rc)
}

// unzip extracts the zip archive at src into dest. Every entry path is
// validated with safeJoin. Directories are created with 0o755 and regular
// files keep their archived permission bits. Symlinks and other non-regular
// entries are skipped, as they are in untar.
func unzip(src, dest string) error {
	r, err := zip.OpenReader(src)
	if err != nil {
		return err
	}
	defer r.Close()

	for _, f := range r.File {
		// Zip entries use forward slashes; convert for the local filesystem.
		targetPath, err := safeJoin(dest, filepath.FromSlash(f.Name))
		if err != nil {
			return err
		}

		mode := f.Mode()
		switch {
		case mode.IsDir():
			// Do not reuse the archived directory mode: a directory without
			// owner write/execute bits would block extraction of its children.
			if err := os.MkdirAll(targetPath, 0o755); err != nil {
				return err
			}
		case mode.IsRegular():
			if err := unzipEntry(f, targetPath); err != nil {
				return err
			}
		default:
			if verbose {
				log.Printf("Skipping non-regular zip entry: %s", f.Name)
			}
		}
	}
	return nil
}

// untar extracts the gzip-compressed tar archive at src into dest. Every entry
// path is validated with safeJoin. Directories and regular files are
// extracted; symlinks, hard links and all other entry types are skipped.
func untar(src, dest string) error {
	file, err := os.Open(src)
	if err != nil {
		return err
	}
	defer file.Close()

	gzr, err := gzip.NewReader(file)
	if err != nil {
		return err
	}
	defer gzr.Close()

	tr := tar.NewReader(gzr)

	for {
		header, err := tr.Next()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}

		name := header.Name
		if name == "" {
			continue
		}
		targetPath, err := safeJoin(dest, name)
		if err != nil {
			return err
		}

		switch header.Typeflag {
		case tar.TypeDir:
			if err := os.MkdirAll(targetPath, 0o755); err != nil {
				return err
			}
		case tar.TypeReg:
			// tar.Reader reports legacy TypeRegA entries as TypeReg, so a
			// single case covers all regular files.
			perm := header.FileInfo().Mode().Perm()
			if err := writeExtractedFile(targetPath, perm, tr); err != nil {
				return err
			}
		case tar.TypeSymlink, tar.TypeLink:
			// Skip links for safety: they could point outside dest.
			if verbose {
				log.Printf("Skipping symlink: %s", header.Name)
			}
		default:
			if verbose {
				log.Printf("Skipping unknown tar entry type %c for %s", header.Typeflag, header.Name)
			}
		}
	}
}

--------------------------------------------------------------------------------
/tests/ffi/test_ffi_interface.c:
--------------------------------------------------------------------------------
/**
 * FFI Layer Tests for Extractous
 *
 * These tests verify that the C FFI interface works correctly
 * and all functions are properly exposed from the Rust library.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include "../../include/extractous.h"

// Test result tracking
static int tests_run = 0;
static int tests_passed = 0;
static int tests_failed = 0;

// Color codes for output
#define COLOR_GREEN "\x1b[32m"
#define COLOR_RED "\x1b[31m"
#define COLOR_YELLOW "\x1b[33m"
#define COLOR_RESET "\x1b[0m"

// Test macros
//
// TEST(name) declares test_<name> and a run_test_<name> wrapper. The wrapper
// decides pass/fail by checking whether the failure counter moved while the
// test body ran, so a failed test is never also reported as OK.
#define TEST(name) \
    void test_##name(void); \
    void run_test_##name(void) { \
        int failed_before = tests_failed; \
        tests_run++; \
        printf("[ RUN ] %s\n", #name); \
        test_##name(); \
        if (tests_failed == failed_before) { \
            tests_passed++; \
            printf(COLOR_GREEN "[ OK ] %s\n" COLOR_RESET, #name); \
        } \
    } \
    void test_##name(void)

// Each assertion records a failure and returns from the enclosing test on
// mismatch. The do/while wrapper makes every macro a single statement.
#define ASSERT_NOT_NULL(ptr, msg) \
    do { \
        if ((ptr) == NULL) { \
            printf(COLOR_RED "[FAILED] %s: %s is NULL\n" COLOR_RESET, __func__, msg); \
            tests_failed++; \
            return; \
        } \
    } while (0)

#define ASSERT_NULL(ptr, msg) \
    do { \
        if ((ptr) != NULL) { \
            printf(COLOR_RED "[FAILED] %s: %s is not NULL\n" COLOR_RESET, __func__, msg); \
            tests_failed++; \
            return; \
        } \
    } while (0)

#define ASSERT_EQ(expected, actual, msg) \
    do { \
        if ((expected) != (actual)) { \
            printf(COLOR_RED "[FAILED] %s: %s - expected %d, got %d\n" COLOR_RESET, \
                   __func__, msg, (int)(expected), (int)(actual)); \
            tests_failed++; \
            return; \
        } \
    } while (0)

#define ASSERT_TRUE(condition, msg) \
    do { \
        if (!(condition)) { \
            printf(COLOR_RED "[FAILED] %s: %s\n" COLOR_RESET, __func__, msg); \
            tests_failed++; \
            return; \
        } \
    } while (0)

// ============================================================================
// Test: Extractor Lifecycle
// ============================================================================

TEST(extractor_new) {
    struct CExtractor *extractor = extractous_extractor_new();
    ASSERT_NOT_NULL(extractor, "extractor");
    extractous_extractor_free(extractor);
}

TEST(extractor_free_null) {
    // Should not crash
    extractous_extractor_free(NULL);
}

TEST(extractor_double_free) {
    struct CExtractor *extractor = extractous_extractor_new();
    ASSERT_NOT_NULL(extractor, "extractor");
    extractous_extractor_free(extractor);
    // The second free is deliberately NOT performed: freeing the same pointer
    // twice is undefined behavior. This test only checks that a normal
    // new/free cycle leaves the suite running.
}

// ============================================================================
// Test: Configuration Functions
// ============================================================================

TEST(extractor_set_max_length) {
    struct CExtractor *extractor = extractous_extractor_new();
    ASSERT_NOT_NULL(extractor, "extractor");

    struct CExtractor *new_extractor = extractous_extractor_set_extract_string_max_length(
        extractor, 10000
    );
    ASSERT_NOT_NULL(new_extractor, "new_extractor");

    extractous_extractor_free(new_extractor);
}

TEST(extractor_set_encoding) {
    struct CExtractor *extractor = extractous_extractor_new();
    ASSERT_NOT_NULL(extractor, "extractor");

    struct CExtractor *new_extractor = extractous_extractor_set_encoding(
        extractor, CHARSET_UTF_8
    );
    ASSERT_NOT_NULL(new_extractor, "new_extractor with UTF-8");

    extractous_extractor_free(new_extractor);
}

TEST(extractor_set_invalid_encoding) {
    struct CExtractor *extractor = extractous_extractor_new();
    ASSERT_NOT_NULL(extractor, "extractor");

    struct CExtractor *new_extractor = extractous_extractor_set_encoding(
        extractor, 999 // Invalid encoding
    );
    ASSERT_NULL(new_extractor, "new_extractor with invalid encoding");

    // Original extractor was consumed, don't free
}

TEST(extractor_set_xml_output) {
    struct CExtractor *extractor = extractous_extractor_new();
    ASSERT_NOT_NULL(extractor, "extractor");

    struct CExtractor *new_extractor = extractous_extractor_set_xml_output(
        extractor, true
    );
    ASSERT_NOT_NULL(new_extractor, "new_extractor with XML enabled");

    extractous_extractor_free(new_extractor);
}

TEST(extractor_chained_configuration) {
    struct CExtractor *e1 = extractous_extractor_new();
    ASSERT_NOT_NULL(e1, "e1");

    struct CExtractor *e2 = extractous_extractor_set_extract_string_max_length(e1, 5000);
    ASSERT_NOT_NULL(e2, "e2");

    struct CExtractor *e3 = extractous_extractor_set_encoding(e2, CHARSET_UTF_8);
    ASSERT_NOT_NULL(e3, "e3");

    struct CExtractor *e4 = extractous_extractor_set_xml_output(e3, false);
    ASSERT_NOT_NULL(e4, "e4");

    extractous_extractor_free(e4);
}

// ============================================================================
// Test: PDF Configuration
// ============================================================================

TEST(pdf_config_new) {
    struct CPdfParserConfig *config = extractous_pdf_config_new();
    ASSERT_NOT_NULL(config, "pdf_config");
    extractous_pdf_config_free(config);
}

TEST(pdf_config_set_ocr_strategy) {
    struct CPdfParserConfig *c1 = extractous_pdf_config_new();
    ASSERT_NOT_NULL(c1, "c1");

    struct CPdfParserConfig *c2 = extractous_pdf_config_set_ocr_strategy(
        c1, PDF_OCR_AUTO
    );
    ASSERT_NOT_NULL(c2, "c2");

    extractous_pdf_config_free(c2);
}

TEST(pdf_config_set_extract_inline_images) {
    struct CPdfParserConfig *c1 = extractous_pdf_config_new();
    ASSERT_NOT_NULL(c1, "c1");

    struct CPdfParserConfig *c2 = extractous_pdf_config_set_extract_inline_images(c1, true);
    ASSERT_NOT_NULL(c2, "c2");

    extractous_pdf_config_free(c2);
}

TEST(extractor_set_pdf_config) {
    struct CExtractor *extractor = extractous_extractor_new();
    ASSERT_NOT_NULL(extractor, "extractor");

    struct CPdfParserConfig *pdf_config = extractous_pdf_config_new();
    ASSERT_NOT_NULL(pdf_config, "pdf_config");

    struct CExtractor *new_extractor = extractous_extractor_set_pdf_config(
        extractor, pdf_config
    );
    ASSERT_NOT_NULL(new_extractor, "new_extractor");

    extractous_pdf_config_free(pdf_config);
    extractous_extractor_free(new_extractor);
}

// ============================================================================
// Test: Office Configuration
// ============================================================================

TEST(office_config_new) {
    struct COfficeParserConfig *config = extractous_office_config_new();
    ASSERT_NOT_NULL(config, "office_config");
    extractous_office_config_free(config);
}

TEST(office_config_set_extract_macros) {
    struct COfficeParserConfig *c1 = extractous_office_config_new();
    ASSERT_NOT_NULL(c1, "c1");

    struct COfficeParserConfig *c2 = extractous_office_config_set_extract_macros(c1, true);
    ASSERT_NOT_NULL(c2, "c2");

    extractous_office_config_free(c2);
}

// ============================================================================
// Test: OCR Configuration
// ============================================================================

TEST(ocr_config_new) {
    struct CTesseractOcrConfig *config = extractous_ocr_config_new();
    ASSERT_NOT_NULL(config, "ocr_config");
    extractous_ocr_config_free(config);
}

TEST(ocr_config_set_language) {
    struct CTesseractOcrConfig *c1 = extractous_ocr_config_new();
    ASSERT_NOT_NULL(c1, "c1");

    struct CTesseractOcrConfig *c2 = extractous_ocr_config_set_language(c1, "eng");
    ASSERT_NOT_NULL(c2, "c2");

    extractous_ocr_config_free(c2);
}

// ============================================================================
// Test: Error Handling
// ============================================================================

TEST(error_message) {
    char *msg = extractous_error_message(ERR_OK);
    ASSERT_NOT_NULL(msg, "error message for ERR_OK");
    ASSERT_TRUE(strlen(msg) > 0, "error message not empty");
    extractous_string_free(msg);

    msg = extractous_error_message(ERR_NULL_POINTER);
    ASSERT_NOT_NULL(msg, "error message for ERR_NULL_POINTER");
    extractous_string_free(msg);

    msg = extractous_error_message(ERR_EXTRACTION_FAILED);
    ASSERT_NOT_NULL(msg, "error message for ERR_EXTRACTION_FAILED");
    extractous_string_free(msg);
}

TEST(extract_with_null_extractor) {
    char *content = NULL;
    struct CMetadata *metadata = NULL;

    int result = extractous_extractor_extract_file_to_string(
        NULL, "test.txt", &content, &metadata
    );

    ASSERT_EQ(ERR_NULL_POINTER, result, "error code");
}

TEST(extract_with_null_path) {
    struct CExtractor *extractor = extractous_extractor_new();
    ASSERT_NOT_NULL(extractor, "extractor");

    char *content = NULL;
    struct CMetadata *metadata = NULL;

    int result = extractous_extractor_extract_file_to_string(
        extractor, NULL, &content, &metadata
    );

    ASSERT_EQ(ERR_NULL_POINTER, result, "error code");
    extractous_extractor_free(extractor);
}

TEST(extract_with_null_output) {
    struct CExtractor *extractor = extractous_extractor_new();
    ASSERT_NOT_NULL(extractor, "extractor");

    int result = extractous_extractor_extract_file_to_string(
        extractor, "test.txt", NULL, NULL
    );

    ASSERT_EQ(ERR_NULL_POINTER, result, "error code");
    extractous_extractor_free(extractor);
}

// ============================================================================
// Test: String Memory Management
// ============================================================================

TEST(string_free_null) {
    // Should not crash
    extractous_string_free(NULL);
}

// ============================================================================
// Test: Metadata Functions
// ============================================================================

TEST(metadata_free_null) {
    // Should not crash
    extractous_metadata_free(NULL);
}

// ============================================================================
// Test: URL Extraction Functions (if they exist)
// ============================================================================

TEST(url_extraction_null_checks) {
    struct CExtractor *extractor = extractous_extractor_new();
    ASSERT_NOT_NULL(extractor, "extractor");

    char *content = NULL;
    struct CMetadata *metadata = NULL;

    // NULL URL
    int result = extractous_extractor_extract_url_to_string(
        extractor, NULL, &content, &metadata
    );
    ASSERT_EQ(ERR_NULL_POINTER, result, "null URL error code");

    // NULL outputs
    result = extractous_extractor_extract_url_to_string(
        extractor, "http://example.com", NULL, NULL
    );
    ASSERT_EQ(ERR_NULL_POINTER, result, "null outputs error code");

    extractous_extractor_free(extractor);
}

// ============================================================================
// Test Runner
// ============================================================================

void run_all_tests(void) {
    printf("\n");
    printf("========================================\n");
    printf(" FFI Layer Tests for Extractous\n");
    printf("========================================\n\n");

    // Lifecycle tests
    printf(COLOR_YELLOW "--- Extractor Lifecycle ---\n" COLOR_RESET);
    run_test_extractor_new();
    run_test_extractor_free_null();
    run_test_extractor_double_free();

    // Configuration tests
    printf(COLOR_YELLOW "\n--- Configuration Functions ---\n" COLOR_RESET);
    run_test_extractor_set_max_length();
    run_test_extractor_set_encoding();
    run_test_extractor_set_invalid_encoding();
    run_test_extractor_set_xml_output();
    run_test_extractor_chained_configuration();

    // PDF config tests
    printf(COLOR_YELLOW "\n--- PDF Configuration ---\n" COLOR_RESET);
    run_test_pdf_config_new();
    run_test_pdf_config_set_ocr_strategy();
    run_test_pdf_config_set_extract_inline_images();
    run_test_extractor_set_pdf_config();

    // Office config tests
    printf(COLOR_YELLOW "\n--- Office Configuration ---\n" COLOR_RESET);
    run_test_office_config_new();
    run_test_office_config_set_extract_macros();

    // OCR config tests
    printf(COLOR_YELLOW "\n--- OCR Configuration ---\n" COLOR_RESET);
    run_test_ocr_config_new();
    run_test_ocr_config_set_language();

    // Error handling tests
    printf(COLOR_YELLOW "\n--- Error Handling ---\n" COLOR_RESET);
    run_test_error_message();
    run_test_extract_with_null_extractor();
    run_test_extract_with_null_path();
    run_test_extract_with_null_output();

    // Memory management tests
    printf(COLOR_YELLOW "\n--- Memory Management ---\n" COLOR_RESET);
    run_test_string_free_null();
    run_test_metadata_free_null();

    // URL extraction tests
    printf(COLOR_YELLOW "\n--- URL Extraction ---\n" COLOR_RESET);
    run_test_url_extraction_null_checks();

    // Summary
    printf("\n");
    printf("========================================\n");
    printf(" Test Summary\n");
    printf("========================================\n");
    printf("Total: %d\n", tests_run);
    printf(COLOR_GREEN "Passed: %d\n" COLOR_RESET, tests_passed);

    if (tests_failed > 0) {
        printf(COLOR_RED "Failed: %d\n" COLOR_RESET, tests_failed);
    } else {
        printf("Failed: 0\n");
    }

    printf("========================================\n\n");
}

int main(void) {
    run_all_tests();
    return tests_failed > 0 ? 1 : 0;
}

--------------------------------------------------------------------------------