) {
47 | #$gonetime = time();
48 | #$lapse = $gonetime - $donetime;
49 | #if ($lapse == 20){
50 | #$donetime = $gonetime;
51 | #$count++;
52 | #print STATUS "SEENTWENTYSECONDS $count TIMES\n";
53 | #}
54 | #$mtime = (stat(STATUS))[9];
55 | #$mtime = (stat($STATUSFILE))[9];
56 | #$nowtime = time();
57 | #$difftime = $nowtime - $mtime;
58 | #print STDERR "MTIME: $mtime\n";
59 | #if ($difftime =~ /20/){
60 | #$count++;
61 | #print STATUS "SEENTWENTYSECONDS $count TIMES\n";
62 | #$mtime = ();
63 | #}
64 | }
65 | close(README) or die "Couldn't close: $!\n";
66 | close(STATUS);
67 |
--------------------------------------------------------------------------------
/tools/PhilosTEI.FoLiAtoTEI.pl:
--------------------------------------------------------------------------------
1 | # Follow up from PhilosTEI.hOCRtoTEI.pl
2 | # by Martin Reynaert
3 | # TiCC - Tilburg University - The Netherlands
4 | # 2014 - Licensed under GPLv3
5 |
6 | ##Usage: perl PhilosTEI.FoLiAtoTEI.pl <inputdir>#<teidir> <ext>#<exttei> <template> <prefix>
7 |
8 | ##Requires: Perl modules XML::Twig::XPath, Sort::Naturally, File::Find and POSIX. Further requires the basic TEI XML template Simple.xml
9 |
10 | ## Copyright Martin Reynaert 2016
11 | ## MRE 2014-07-16
12 | ## Written in CLARIN-NL project @PhilosTEI
13 |
14 | ##Actual command lines used:
15 | ## This script is part of the @PhilosTEI workflow and gets initialised by TICCLops.pl, the script which steers the whole workflow.
16 |
17 | ##This script converts FoLiA XML files for e.g. a book into a single electronic book in TEI XML format.
18 |
19 |
20 | ##Which directories do we work on? $ARGV[0] is expected as '<inputdir>#<teidir>'.
21 | if ($ARGV[0] =~ /\#/){
22 | ($dir, $teidir) = split '#', $ARGV[0];
23 | }
24 | ##Extensions: $ARGV[1] is expected as '<ext>#<exttei>'; $ext selects the input files.
25 | if ($ARGV[1] =~ /\#/){
26 | ($ext, $exttei) = split '#', $ARGV[1];
27 | }
28 | ##NOTE(review): without a '#' in these arguments $dir/$teidir/$ext/$exttei stay undefined.
29 | $template = $ARGV[2];
30 | $prefix = $ARGV[3];
31 | ##$template is the XML file parsed further down; $prefix becomes part of the output file name.
32 | use File::Find;
33 | use POSIX qw/strftime/;
34 | use Sort::Naturally;
35 |
36 | ##We set the binmode to UTF-8 for STDOUT and STDERR--BEGIN
37 | binmode(STDOUT, ":utf8");
38 | binmode(STDERR, ":utf8");
39 | #binmode($template, ":utf8");
40 | ##We set the binmode to UTF-8 for STDOUT and STDERR--END
41 | ##Walk $dir: every plain file goes into @documents, every directory into @dirs.
42 | find( sub{
43 | -f $_ and push @documents, $File::Find::name;
44 | -d $_ and push @dirs, $File::Find::name;
45 | }, $dir );
46 | print LOG "DOCNAMESREAD: $imdidir >> @documents\n" if ($mode =~ /Z/); ##NOTE(review): LOG is never opened and $mode/$imdidir are never set in the visible script
47 | foreach $docname (@documents){
48 | if ($docname =~ /$ext$/){
49 | push(@coldocs, $docname);
50 | }
51 | }
52 | ##@coldocs now holds the documents whose name ends in $ext ($ext is interpolated as a regex); sort them naturally.
53 | @coldocs = nsort @coldocs;
54 |
55 |
56 | ##Single output file: $teidir and $prefix are concatenated as-is, so $teidir needs its own trailing path separator.
57 | #$teishadow = $doc;
58 | #$teishadow =~ s/$ext/$exttei/;
59 | $teishadow = $teidir . $prefix . '.ticcl.tei.xml';
60 | ##NOTE(review): $doc is not assigned yet at this point, so only $teishadow carries information here.
61 | print STDERR "TEIDOC: $doc\t$teishadow\n";
62 | print STDOUT "TEIDOC: $doc\t$teishadow\n";
63 | ##Open the output file; abort when that fails, since all TEI output would otherwise be silently lost.
64 | open (SHADOW, '>', $teishadow) or die "Couldn't open $teishadow: $!\n";
65 | binmode(SHADOW, ":utf8");
66 |
67 | use XML::Twig::XPath;
68 | ##We use the XML::Twig::XPath Perl module in this mode--END
69 |
70 | ##We write the number of documents to be processed to STDERR
71 | $nrdocs = $#coldocs + 1;
72 | print STDERR "NR DOCS: $nrdocs\n";
73 |
74 | ##The counter starts at 0 and is incremented before each document, so numbering starts at '1'
75 | $document_number = 0;
76 | $follow = -1; ##not referenced again in the visible code
77 | ##We next process each document listed in the array in turn
78 | foreach $doc (@coldocs) {
79 | binmode($doc, ":utf8"); ##NOTE(review): $doc is a file name taken from @coldocs, not a handle opened in the visible code -- presumably without effect; confirm
80 | ##That is: if the particular document listed has the extension specified on the command-line
81 | if ($doc =~ /$ext$/){
82 |
83 | ##Used to have file data info here!!
84 |
85 | ##Numbering the documents is incremented by '1' for each document to be processed
86 | $document_number++;
87 | ##We write info about the document being processed to the log file
88 | ##(no logging statement follows in the current code)
89 | my $t = XML::Twig::XPath->new(
90 | twig_roots => {
91 | #"//p/t[@lass]" => \&getfromfolia, ##Contains a typo anyway!! So here we also want to capture the id of the //p and carry it along!!
92 | "//p" => \&getfromfolia1,
93 | "//p/t" => \&getfromfolia,
94 | ##
95 | ##
96 | ##Ninus-, en cussfchen Mnuiusrs en cis-EIN- unanwend-
97 | ##
98 | ##negus en cussfchen Mnuiusrs en cissen aanwend
99 |
100 | #"//div//p/span" => \&getfromhtml2,
101 | #"//body//p" => \&getfromalto5,
102 | #"//body//head" =>\&getfromalto1,
103 | #"//body//note" =>\&getfromalto2,
104 | #"//body//l" =>\&getfromalto3,
105 | #"//body//speaker" =>\&getfromalto4,
106 | },
107 | );
108 | ##Parsing the document fires the two handlers below for the matching elements.
109 | $t->parsefile( $doc );
110 | ##The named subs below sit inside the loop body but, being named subs, are defined only once; they pass their results on through the globals @PATTS and @TEXT.
111 | sub getfromfolia1 ##Handler for "//p": stores the element's xml:id attribute in @PATTS
112 | { my( $t, $item1)= @_;
113 | {
114 | ##$elt->parent('p[@conref != ""]')
115 | #$ref = $item->{'..'}->{'att'}->{'xml:id'}; # get the reference
116 | #$ref = $item->{'parent'}->{'att'}->{'xml:id'}; # get the reference
117 | $ref = $item1->{'att'}->{'xml:id'}; # get the reference
118 | #print STDOUT "REF1: $ref ITEM: $item1\n";
119 | push @PATTS, $ref;
120 | }}
121 | sub getfromfolia ##Handler for "//p/t": stores the element's text in @TEXT when its class attribute matches /TICCL/i
122 | { my( $t, $item)= @_;
123 | {
124 | $class = $item->{'att'}->{'class'}; # get the class
125 | #print STDOUT "REF: $ref ITEM: $item CLASS: $class T: $t\n";
126 | if ($class =~ /TICCL/i){ ##MRE: 'Ticcl' is capitalised differently further up... So now case-insensitive??
127 | $txt = $item->text;
128 | $step++;
129 | #push @TEXT, 'par@#@';
130 | push @TEXT, $txt;
131 | #print STDOUT "TEXT: @TEXT STEPS: $step\n";
132 | }}}
133 | ##NOTE(review): @PATTS gets one entry per //p, @TEXT only one per matching //p/t; filltei2 pairs them by position, which presumably assumes exactly one matching t per p -- confirm
134 | #sub getfromfolia2
135 | # { my( $t, $item2)= @_;
136 | # {
137 | # $txt = $item2->text;
138 | # $step2++;
139 | # push @TEXT, 'line@#@' . $txt;
140 | # print STDOUT "TEXT2: @TEXT STEPS: $step >> $step2\n";
141 | # }}
142 |
143 | #sub getfromalto5
144 | # { my( $t, $item5)= @_;
145 | # {
146 | # $txt = $item5->text;
147 | # push @TEXT, 'par@#@' . $txt;
148 | # }}
149 |
150 | #foreach $par (@TEXT){
151 | #print STDOUT "PAR: $par\n\n";
152 | #}
153 | }}
154 | my $t = XML::Twig::XPath->new(PrettyPrint => 'indented', ##Second twig: used to parse $template below
155 | twig_roots => {
156 | #"//TEI" => \&filltei,
157 | "//TEI//text/body/p" => \&filltei2, ##only this element is handed to a handler
158 | #"//FoLiA//div" => \&fillfolia1,
159 | },
160 | twig_print_outside_roots => \*SHADOW, ##everything outside the root above is printed to the SHADOW output handle
161 | );
162 |
163 | ##Parse the template given as $ARGV[2]; by this point @TEXT and @PATTS have been filled by the document loop above.
164 |
165 | $t->parsefile( $template );
166 |
167 | #sub fillfolia2
168 | # { my( $t, $fill2)= @_;
169 | # {
170 | # $fill2->set_att("xml:id" => "$shortnamedoc");
171 | # $fill2->print(\*SHADOW);
172 | # }}
173 |
174 | #sub fillfolia3
175 | # { my( $t, $fill3)= @_;
176 | # {
177 | # $txtname = $shortnamedoc . '.text';
178 | # $fill3->set_att("xml:id" => "$txtname");
179 | # }}
180 |
181 | sub filltei2 ##Twig handler for "//TEI//text/body/p": emits one new 'p' element per entry in @TEXT
182 | { my( $t, $fill)= @_; ##$t: the twig; $fill: the matched element, which is never printed here
183 | {
184 | #$divnr++;
185 | #$divname = $shortnamedoc . '.div' . $divnr;
186 | #$fill->set_att("xml:id" => "$divname");
187 | $parid = 0 unless (defined $parid); ##index into @PATTS, advanced in step with @TEXT
188 | ##Something like: for each collected CONTENT, create a new 'p' element and paste it in...
189 | foreach $par (@TEXT){
190 | print STDERR "PAR1: $par LC: $linecollect\n";
191 | #$parid++;
192 | chomp $par;
193 | $new_elt = XML::Twig::XPath::Elt->new('p');
194 | $new_elt->set_text( $par );
195 | $new_elt->set_att("xml:id" => $PATTS[$parid]) if (defined $PATTS[$parid] and length $PATTS[$parid]); ##an empty xml:id is not a valid ID, so leave it out when no id was recorded
196 | $new_elt->print; ##no explicit filehandle given. NOTE(review): confirm this output ends up in SHADOW
197 | #print STDOUT "PARFILL: $par\n\n";
198 | $parid++;
199 | }
200 | #$t->purge; # frees the memory
201 | }
202 | $parid = 0; ##reset the index and both collections once they have been written out
203 | @TEXT = ();
204 | @PATTS = ();
205 | # }} ###MRE: move/divide(?) to collate all pages into 1 TEI??
206 | }
207 | #}}
208 | close(SHADOW) or die "Couldn't close $teishadow: $!\n"; ##buffered write errors on the TEI output only surface at close
209 |
210 |
211 |
--------------------------------------------------------------------------------
/tools/Simple.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
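5 | <TEI xmlns="http://www.tei-c.org/ns/1.0">
6 | <teiHeader>
7 | <fileDesc>
8 | <titleStmt>
9 | <title>Title</title>
10 | </titleStmt>
11 | <publicationStmt>
12 | <p>Publication Information</p>
13 | </publicationStmt>
14 | <sourceDesc>
15 | <p>Information about the source</p>
16 | </sourceDesc>
17 | </fileDesc>
18 | </teiHeader>
19 | <text>
20 | <body>
21 | <p>Some text here.</p>
22 | </body>
23 | </text>
24 | </TEI>
25 |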
--------------------------------------------------------------------------------
/tools/config.hocr:
--------------------------------------------------------------------------------
1 | tessedit_create_hocr T
2 |
--------------------------------------------------------------------------------