();
13 |
14 | public static Searcher searcherForPath(final Path path) throws IOException {
15 | synchronized (path2searchers) {
16 | Searcher result = path2searchers.get(path);
17 | if(result != null)
18 | return result;
19 |
20 | final Word2VecModel word2VecModel = Word2VecModel.fromBinFile(path.toFile());
21 | result = word2VecModel.forSearch();
22 | path2searchers.put(path, result);
23 | return result;
24 | }
25 | }
26 | }
27 |
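A brief usage sketch of the caching above (the enclosing class is cut off in this fragment, so the WordVectorCache name and the model path below are assumptions): repeated calls with the same path share one Searcher, so the expensive Word2VecModel.fromBinFile load happens only once per path.

    Path model = Paths.get("/models/word2vec.bin");          // hypothetical location
    Searcher first = WordVectorCache.searcherForPath(model);
    Searcher second = WordVectorCache.searcherForPath(model);
    // first == second: the second call is answered from path2searchers without touching disk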
--------------------------------------------------------------------------------
/core/src/main/java/org/allenai/scienceparse/pdfapi/PDFDoc.java:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse.pdfapi;
2 |
3 | import lombok.Builder;
4 | import lombok.Data;
5 | import lombok.experimental.Wither;
6 |
7 | import java.util.ArrayList;
8 | import java.util.List;
9 |
10 | @Data
11 | @Builder
12 | public class PDFDoc {
13 | /**
14 |    * Index into the lines of the first page that is one past the last
15 |    * line of the document's header (the title, authors, etc.).
16 | *
17 | * This is < 0 if we can't find an appropriate header/main cut.
18 | */
19 |   @Wither public final List<PDFPage> pages;
20 | public final PDFMetadata meta;
21 |
22 | public PDFDoc withoutSuperscripts() {
23 |     final List<PDFPage> newPages = new ArrayList<>(pages.size());
24 | for(PDFPage page : pages)
25 | newPages.add(page.withoutSuperscripts());
26 | return this.withPages(newPages);
27 | }
28 | }
29 |
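A short sketch (construction of pages and meta elided) of how the Lombok @Builder and @Wither annotations combine here: withoutSuperscripts() produces a fresh PDFDoc via withPages() and leaves the original untouched.

    PDFDoc doc = PDFDoc.builder()
        .pages(pages)    // List<PDFPage> assembled by the extractor
        .meta(meta)      // PDFMetadata
        .build();
    PDFDoc cleaned = doc.withoutSuperscripts();   // new instance; doc itself is unchanged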
--------------------------------------------------------------------------------
/core/src/main/java/org/allenai/scienceparse/pdfapi/PDFFontMetrics.java:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse.pdfapi;
2 |
3 | import lombok.Data;
4 | import lombok.val;
5 |
6 | import java.util.concurrent.ConcurrentHashMap;
7 |
8 | @Data
9 | public class PDFFontMetrics {
10 |   private static final ConcurrentHashMap<String, PDFFontMetrics> canonical
11 | = new ConcurrentHashMap<>();
12 | /**
13 | * The special value for when the underlying font didn't have
14 | * an extractable family name.
15 | */
16 | public static String UNKNWON_FONT_FAMILY = "*UNKNOWN*";
17 | public final String name;
18 | public final float ptSize;
19 | public final float spaceWidth;
20 |
21 | /**
22 | * Ensures one font object per unique font name
23 | *
24 |    * @param name the font family name
25 |    * @param ptSize the font's point size
26 |    * @param spaceWidth the width of the space character in this font
27 |    * @return the canonical PDFFontMetrics instance for the given name
28 | */
29 | public static PDFFontMetrics of(String name, float ptSize, float spaceWidth) {
30 | val fontMetrics = new PDFFontMetrics(name, ptSize, spaceWidth);
31 | val curValue = canonical.putIfAbsent(name, fontMetrics);
32 | return curValue != null ? curValue : fontMetrics;
33 | }
34 |
35 | public String stringRepresentation() {
36 | return String.format("%s-%f", name, ptSize);
37 | }
38 | }
39 |
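A small sketch of the canonicalization that of() implements: the map is keyed on the font name only, so the first metrics registered for a name win and later calls return that same instance.

    PDFFontMetrics a = PDFFontMetrics.of("Times-Roman", 10.0f, 2.5f);
    PDFFontMetrics b = PDFFontMetrics.of("Times-Roman", 12.0f, 3.0f);
    // a == b: the second ptSize/spaceWidth are ignored because "Times-Roman" is already canonical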
--------------------------------------------------------------------------------
/core/src/main/java/org/allenai/scienceparse/pdfapi/PDFLine.java:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse.pdfapi;
2 |
3 | import com.gs.collections.api.list.primitive.FloatList;
4 | import com.gs.collections.impl.list.mutable.primitive.FloatArrayList;
5 | import lombok.Builder;
6 | import lombok.Data;
7 | import lombok.experimental.Wither;
8 | import lombok.val;
9 |
10 | import java.util.ArrayList;
11 | import java.util.List;
12 | import java.util.regex.Pattern;
13 | import java.util.stream.Collectors;
14 | import java.util.stream.DoubleStream;
15 |
16 | /**
17 | * Immutable value class representing a single contiguous line of a PDF. A contiguous line means
18 |  * a sequence of tokens/glyphs which are intended to be read sequentially. For instance, a two-column
19 | * paper might have two lines at the same y-position.
20 | */
21 | @Builder
22 | @Data
23 | public class PDFLine {
24 |   @Wither public final List<PDFToken> tokens;
25 |
26 | private DoubleStream projectCoord(int dim) {
27 | return tokens.stream().mapToDouble(t -> t.bounds.get(dim));
28 | }
29 |
30 | /**
31 | * (0,0) origin bounds [x0,y0, x1, y1] for the entire line.
32 |    * Should not be called on a line with no tokens.
33 | */
34 | public FloatList bounds() {
35 | float x0 = (float) projectCoord(0).min().getAsDouble();
36 | float y0 = (float) projectCoord(1).min().getAsDouble();
37 | float x1 = (float) projectCoord(2).max().getAsDouble();
38 | float y1 = (float) projectCoord(3).max().getAsDouble();
39 | return FloatArrayList.newListWith(x0, y0, x1, y1);
40 | }
41 |
42 | public float height() {
43 | val bs = bounds();
44 | return bs.get(3) - bs.get(1);
45 | }
46 |
47 | public String lineText() {
48 | return tokens.stream().map(PDFToken::getToken).collect(Collectors.joining(" "));
49 | }
50 |
51 | public double avgFontSize() {
52 | return tokens.stream().mapToDouble(t -> t.getFontMetrics().getPtSize()).average().orElse(0.0);
53 | }
54 |
55 | public PDFLine withoutSuperscripts() {
56 |     final List<PDFToken> newTokens = new ArrayList<>(tokens.size());
57 | for(PDFToken token : tokens) {
58 | final String newTokenText = token.token.replaceAll("⍐[^⍗]*⍗", "");
59 | if(!newTokenText.isEmpty())
60 | newTokens.add(token.withToken(newTokenText));
61 | }
62 | return this.withTokens(newTokens);
63 | }
64 | }
65 |
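A sketch of the superscript convention used by withoutSuperscripts() (token values illustrative): superscript text is assumed to arrive wrapped in ⍐…⍗ markers, which are stripped, and tokens that become empty are dropped from the line.

    PDFToken marked = PDFToken.builder()
        .token("temperature⍐3⍗")    // the "3" was flagged as a superscript
        .fontMetrics(PDFFontMetrics.of("Times-Roman", 10.0f, 2.5f))
        .bounds(FloatArrayList.newListWith(0f, 0f, 50f, 12f))
        .build();
    PDFLine line = PDFLine.builder().tokens(Collections.singletonList(marked)).build();
    line.withoutSuperscripts().lineText();    // "temperature"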
--------------------------------------------------------------------------------
/core/src/main/java/org/allenai/scienceparse/pdfapi/PDFMetadata.java:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse.pdfapi;
2 |
3 | import com.fasterxml.jackson.databind.ObjectMapper;
4 | import com.fasterxml.jackson.databind.ObjectWriter;
5 | import lombok.Builder;
6 | import lombok.Data;
7 | import lombok.SneakyThrows;
8 | import lombok.val;
9 |
10 | import java.io.FileInputStream;
11 | import java.io.InputStream;
12 | import java.util.Date;
13 | import java.util.List;
14 |
15 | /**
16 | * Immutable class representing information obtained from scanning for PDF
17 |  * meta-data. Many PDF creation programs (like pdflatex) will actually output
18 |  * information like these fields, which substantially aids downstream extraction.
19 | */
20 | @Builder
21 | @Data
22 | public class PDFMetadata {
23 | public final String title;
24 |   public final List<String> authors;
25 |   public final List<String> keywords;
26 | public final Date createDate;
27 | public final Date lastModifiedDate;
28 | public final String creator;
29 |
30 |   // HACK(aria42) For external testing purposes
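  // Usage: each argument is a path to a PDF file. The extracted PDFMetadata is printed as JSON
  // (pretty-printed when only one file is given); with multiple files, each record is prefixed
  // by its filename.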
31 | @SneakyThrows
32 | public static void main(String[] args) {
33 | val extractor = new PDFExtractor();
34 | ObjectWriter ow = new ObjectMapper().writer();
35 | if (args.length <= 1)
36 | ow = ow.withDefaultPrettyPrinter();
37 | for (final String arg : args) {
38 | String prefix = "";
39 | if (args.length > 1)
40 | prefix = arg + "\t";
41 | try (InputStream pdfInputStream = new FileInputStream(arg)) {
42 | try {
43 | PDFMetadata meta = extractor.extractFromInputStream(pdfInputStream).getMeta();
44 | String json = ow.writeValueAsString(meta);
45 | System.out.println(prefix + json);
46 | } catch (final Exception e) {
47 | System.out.println(prefix + "ERROR: " + e);
48 | }
49 | }
50 | }
51 | }
52 | }
53 |
--------------------------------------------------------------------------------
/core/src/main/java/org/allenai/scienceparse/pdfapi/PDFPage.java:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse.pdfapi;
2 |
3 | import lombok.Builder;
4 | import lombok.Data;
5 | import lombok.experimental.Wither;
6 |
7 | import java.util.ArrayList;
8 | import java.util.List;
9 |
10 | @Builder
11 | @Data
12 | public class PDFPage {
13 |   @Wither public final List<PDFLine> lines;
14 | public final int pageNumber;
15 | public final int pageWidth;
16 | public final int pageHeight;
17 |
18 | public PDFPage withoutSuperscripts() {
19 |     final List<PDFLine> newLines = new ArrayList<>(lines.size());
20 | for(PDFLine line : lines) {
21 | final PDFLine newLine = line.withoutSuperscripts();
22 | if(!newLine.tokens.isEmpty())
23 | newLines.add(newLine);
24 | }
25 | return this.withLines(newLines);
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/core/src/main/java/org/allenai/scienceparse/pdfapi/PDFToken.java:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse.pdfapi;
2 |
3 | import com.gs.collections.api.list.primitive.FloatList;
4 | import lombok.Builder;
5 | import lombok.Value;
6 | import lombok.experimental.Wither;
7 |
8 | @Builder
9 | @Value
10 | public class PDFToken {
11 | @Wither public final String token;
12 | public final PDFFontMetrics fontMetrics;
13 | /**
14 |    * List of floats [x0, y0, x1, y1] where (0, 0) is the upper left corner
15 | */
16 | public final FloatList bounds;
17 | }
18 |
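Because bounds is stored as [x0, y0, x1, y1] with the origin at the upper left, a token's width and height fall out directly; a tiny sketch with illustrative values:

    PDFToken token = PDFToken.builder()
        .token("Example")
        .fontMetrics(PDFFontMetrics.of("Times-Roman", 10.0f, 2.5f))
        .bounds(FloatArrayList.newListWith(72.0f, 100.0f, 110.0f, 112.0f))
        .build();
    float width  = token.getBounds().get(2) - token.getBounds().get(0);   // x1 - x0 = 38
    float height = token.getBounds().get(3) - token.getBounds().get(1);   // y1 - y0 = 12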
--------------------------------------------------------------------------------
/core/src/main/java/org/allenai/scienceparse/pdfapi/PdfDocExtractionResult.java:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse.pdfapi;
2 |
3 | import lombok.Builder;
4 | import lombok.Data;
5 |
6 | @Builder
7 | @Data
8 | public class PdfDocExtractionResult {
9 | public final PDFDoc document;
10 | public final boolean highPrecision;
11 | }
12 |
--------------------------------------------------------------------------------
/core/src/main/resources/golddata/isaac/bib-years.tsv:
--------------------------------------------------------------------------------
1 | 15aa277b1054cdcdf7fc018e3a3abe2df7a1691b 2002 2008 2008 2007 2005 2004 1988 1998 2006 2005 2005 2008 2008
2 | ddf1ea128fbc14a203c3b3d44d0135bb4dc33ffe 2009 1996 2005 2005 1985 2000 1997 2005 2005 2004 1970 2001 2004 2009 1998 2000 1998 1964 2004 2003 1973 1991 2000 2007 1993 2004 1966 2002
3 | 97a6613da7eeb0ac4f70ae2e3b793364c794e6dc 1992 1997 2002 2000 2005 2006 2006 2001 1986 2003 1999 1999 1998 2004 2003 1999 1998 2006 0 2003
4 | 13050adb7aa8aaf2e1c38f2b2c7e3d070358d261 2000 1992 2005 2003 1996 1995 2007 2009 2009 1974 1989 1978 2010 2003 2007 2008
5 | d3ba6b48f62e2fe1802efb46c3799362572eeb1d 1960 1968 2001 1995 1998 2002 2000 2001 2005 1996 1991 1991 1988 2003 2002 2002 1995 2003 2002 1977 1963 2000 2000 1975 2000 1999 2001 1998 2000 1967 2003 2002 1999 2005
6 | fd906edf5833d0d506a220097ced14a03ff40a73 2009 2004 1999 2002 2009 1994 2003 1989 1997 1999 1977 2007
7 | 45705e7cf3337e2469b5dbcaa31579c28bde89d1 2014 2007 1950 2014 2011 2013 2015 2010 2006 2010 2012 2014 2013 2014 2014 2013 2014 1981 2012 2009 2005 2012 2014 2004 2012
8 | 41b3a5272d0c8f98b73e7275481cd917802ad8b7 2007 2009 2009 2007 2007 2012 2013 2009 2012 2006 2008 2005 2009 2012 1998 1985
9 | b5e6da04c35a586609a46bbbd7b1ad031a658b08 2004 2009 2008 2008 2010 1981 1984 1995 2001 1989 1970 2007 1937 2009 2007 2008 1969 2007 1999 2004 2010 2005 2005 2007 2007 2009 1970 1970 1970 2003 2009 1971 2010 2005
10 | 0eb7343cdd90265282bd261c0e48cf2fd73ea465 1976 2000 1999 2004 2001 1994 1995 2002 2003 1998 2005 2003 2001 1999 2003 2003 2000 2004 2004 2001 2001 2001 2003 2004 2002 2004 2002 1998 1986 2001
11 | 030cadedef2370bd296af07fc3324c6bb8409ba5 1997 1979 1993 1998 1994 1974 1994 1998 1961 1994 1997 1991 1995 2000 1984 1995
12 | 31368c6398a34b489f78708039177d858b171d13 2001 1990 2007 2007 2008 2007 2010 2010 2006 1998 2001 2002 2007 2007 2009 2009 2009 2007 2008 2002 2005 2009 2009 2006 2010 2003 2006 2007 2010 1992 1945 2002 2000
13 | 6ee7d70f2dbfc0d45fbf20485f82a9ed7e175725 2009 2012 2014 2011 0 2007 2012 2013 2013 2013 2011 2011 2011 2012 2009 2014 2013 1989 1998 2014 2013 2014 2014 2014 2014 2014 2014 2013 2006 2004 2001 2013 1999 2014 2013 2008
14 | 5268d3d7f15ffa9c6a904a138b2b2794263c856e 2002 2007 2005 1985 2002 2002 2002 2005 2001 1982 1997 2005 2002 0 1998 1983 1985 1975
15 | 3521f22e34fef8a53d55df180a76df5a7a4e7f87 2004 2000 2005 2006 1997 1997 2007 2002 2006 2007 2005 2006 2006 2004 2005 1998 1990 2004 2006 2006 2007
16 | bcae858633935c727739a73447b50b40b7c52794 2011 2010 2011 2011 2009 2010 2004 1977 2012 2006 2005 2012 2013 2005 2012 2012 2013 2013 2008
17 | 6f9167ddb392a43a7e36a2df1feefa184d82763e 2007 2007 2010 2010 0 2013 2011 0 2015 2012 2012 2014 2011 2009 2006 2010 2007 2010 2002 2013 2003 2009 2012 2013 2013 1979 2009 2001 2015 2015 2013 1953 2007 2011 1998 0 0 2011 2011 2014 2010 2008 2014
18 | 8ecd49c474b701f69d962f8337490c7f342266c3 2010 2011 2008 2010 2011 2010
19 | bdb1b4128730838eb2fed83829f46a9077eca9f7 2010 2011 2010 2012 1955 2011 2014 2007 2001 2001 2000 1969 1997 1950 1973 1988 1992 2008 2012 2012 2013 2013 2014 2009
20 | a98488969aed4d6add1115ce18c19c89b4826a92 1990 2009 2009 2003 2004 2009 2008 2011 2001 2012 2006 2009 2005 1997 1999 1996 1995 1998 2004 2008 2011 2011 2012 1999 2011 2009 2009 2004 2007 2001 1978 1979 1970
21 | 2942516df2695e73365e78a51e8bbe9ea1397f8f 2001 2003 2009 2007 2005 2008 2010 1994 2003 2000 2009 2004 1999 1980 2004 1991 1998 2008 2008 1997 1994 2003 1980 1995 1996 2005
22 | 9ba994ddf01d2431c6c8de129b3a5a7797e5a5e6 2010 2013 2000 2013 2014 2006 0 0 2015 2013 2010 2013 2008 2006 0 2012 1998 0 0 2004 2004 0 2006 2012 1986 1995
23 | a14feb1f5d1f35815eca17c91365a728a27ade94 2004 2001 2008 2003 2003 2010 2010 1967 2003 2004 2008 2005 2005 2003 2005 2003 2012 2010 2012 2008
24 | 2e939ed3bb378ea966bf9f710fc1138f4e16ef38 2004 2009 2011 2001 2012 2014 2001 2002 1981 1973 2006 2012 2013 1990 1996 2009 2013 2001 2003 1998 2010 2008 2013 2014 2000 2011 2004 1998 2000 2012 2009 1996
25 | 2a2b8a525eae19087cc0248a45a8e17de44b021f 2000 2010 2008 2011 2010 2008 2008 2011 2010 2011 2012 2012 1995 2002 2005 2010 2007 2009 0 2009 2007 2009 1976 2003
26 | 765f6ca92d5c228847c2ceb37b756ecf980c95a4 1995 1991 2008 2010 2006 2006 1978 1980 1991 1996 2003 2007 1995
27 | 10a1e6233fce78a5c6bd3a40cca3e9298da55abe 1989 1992 1997 1998 2000 2002 2002 2003 1992 1999 2003 1999 1990 1977 2001 2000 1976 1985 1990 1999 1998 2001 2000 1993 2000 2000 2003 2001 1998 0 2001 1990 1997 2002 1998 2001 2003 1995 2001 1989 1997 2001 1992 1999 2002 2002 1992 1999 2002 1996 1997 1999 1990 1981 2003
28 | 1ad4974c4d79b00c890bd2dd1562600bd9c7e2bd 1998 1999 1995 2005 2005 1992 2005 2004 1960 1999 2000 1997 1998 2004 1993 2000 2001 1998 2004 1992 2005 1977 2003 1992 1990 2000 2005 2007 2003 2003 1980 2001 1994 1993 2005 2003 1999 2001 2002 2006 1980 2002 1989 1987 2006 2002 1995 1969 1979 2001
29 | 45099df43a2692bb4eea8ec12b331bb827403d57 1987 1991 1987 1992 1994 1989 1992 1993 1993 1990 1991 1991
30 | 9e59d03fbb534832ced523250ee429f41893ab39 2006 2007 2006 2006 1985 2006 2004 2004 2003 1976 1998 1994 1993 2007 2005
31 | 2774393ecb042926ba7fa6957841853ffff0396d 1997 1998 1996 1997 1994 1999 1997 1997 1997 1990 1988 1987 1995 1992 1997 1997 1997 1983 1997 1995
32 | aeb717fbb9aac3501236bce498cbf8b98f5d8926 0 0 0 0 0 0 0 0 0 0 0 0 1952 0 1979 0 0 0 0 0 1977
33 | 5f17cac51538fc860379e2a4887586757be6182e 2005 2003 2007 2007 2007 2002 2006 2005 2007 2008 2008 2008 2008 2010 2002 2002 1997 2003 2007 1989
34 | 520633c68777988873f5aa011df45a5289c04217 1978 1993 1996 1996 2003 1993 1977 2011 1986 2009 0 0 2008 1977 2008 1981 2011 2012 0
35 | 01ce0903206717ac40f9a26ce9478bdeff5c1262 2006 2008 2010 2010 2011 2010 2006 2007
36 | 2dd3138ebdaad92ace967fe7d2842937a5c729e4 2005
37 | 05d1cee851e2d900ab78b3542237af4db84590fd 1989 1991 1979 1992 1994 1995 1989
38 | d9f69149832fa4ceb0ab3b0285863ae2c8977d2e 1995 2009 1997 2009 2004 1991 1994 1988 2008 2002 2008 2006 2002 2007 2006 2009 1978
39 | 50fb3680934ae424e4c73c18e9928e9741171d5b 1998 2006 2006 1998 2000 2003 2003 1996 2006 2006 2004 2006 1990 1986 2002 1999 2000 2000 1998 1997 2001 2004 1998 2005 1998
40 | 246cd6090088e84f7b7ce85f94ad1969ad860118 0 2006 2005 2006 2003 2005 2002 1941 2002 2000 2006
41 | 152119fe816def89f825ac5b56e85b16530d0bbe 2009 1997 2011 2013 2013 2011 2001 1935 1999 2004 2007 1988 2000 1999 1949 2012
42 | 4699e2c09244f3496b1c202925618ccf732a617d 2002 2004 1991 1994 2009 1998 1992 2007 1990 1997 2006 0 2005 2005 2008 2008 2008 2008 1999 2000 2005 2009 2009 2010 2006 2006 2004 2003 1996 2001 1991 1979 1980 1994
43 | 7e2677bb7c4b1c1bcb7bd89c79cfc17a9d955f48 1985 1973 1999 1990 1994 1980 1838 1845 1980 2005 1972 1973 1994 1976 1953 1972 1975 1978 1978 1980 1995 2000 1991 2004 2003 2004 2006 1995 1996 2006 1997 2007 1979 2008 2014 2010 2011 1974 1992 1997 2005
44 | 4a59d7877aba1aa4f5a94758030f99989b2989dc 1996 2000 1996 2000 2012 2011 2011 2009 2004 2007 2005 2007 2009 2009 2005 2003 2010 2010 2010 2010 2010 2010 2009 1989 2002 2009 1996 2012 2012 1975 2012 2000
45 | 198dcf518298e0afe39d005da5320cfe840480c1 2013 1995 1982 1991 2013 1990 2012 2009 2011 2004 2008 2013 1997 2009 1971 2012 2008 2011 1973 1997 2013 1974 2010 2006 2003 1987
46 | 1e1ea21612a781634ae21271372b77cd19d09212 1999 2003 1990 2007 1971 2008 2005 2008 2013 2003 1993 1993
47 | d7e464a1e466fb04e72a961c24d10ecf65c20890 1999 1999 1997 1952 2004 1997 2000 1995 1996 2002 2009 2000 2003 2002 2009 2007 2007 1997 2007 2001 1998 2006 1995 1965 2005 2002 1998 1996 1973 1997 2008 2001 1976 2010 1984 2008 2002 2003 1993 2006 2006 2008 2006 1992 2001 2005 1998 1999 2001 2003 2007 2005 1999 2008 2000 1975 2001 2008 2007 2005 2005 1999 1993 1997 1989 1994 1999 2007 1988 1985 1994 1991 2009 2009 1998 2003 2008 2002 1968 2007 1981 2007 1997 1997 1983 2002 2006 1994 1996 1997 1991 2001 2007 1948 1998 1991 1966 2000 1989 1969 1997 2000 2001 2002 2004 2007 2008 2004 1965
48 | b58e020f6c9c834ba83f07322adc75f3756e087f 2013 2013 2007 2000 2013 2013
49 | 5c7eebc8ba8fe8df00f54496ab743ede61314419 1763 1979 1952 1990 2005 1975 2007 2005 2006 1984 1993 1974 1983 2004 2001 2003 2003 2003 2004 2006 2003 1946 1961 1965 1998 1996 1812 1974 1997 1976 2004 1998 2006 2004 2006 1983 1989 2002 2002 2004 1964 1978 1999 1996 2005 1997 1970
50 | 8ab8bb8aa41151b85ab367f1152ff17cc17b87d1 1974 1994 2007 2008 2009 1980 1985 1999 1972 1967 2004 1996 1999 2003 1930 1999 2005 2005 2005 2005 1971 1982 1973 1996 2008
51 | 03a7d6e56d70018b355f4b3ee0f8c8a240bf89a2 2006 2005 2005 2005 2001 1998 2000 2001 2000 2006 2002 2000 2006 1997 2003 1998 2006 2004 2007
52 | 8c1c76248e128c7e5789c7f0a9a89fbba17e4c11 2005 2014 2006 2007 2009 2006 2013 2011 2010 2005 1998 2006 2010 2012
53 | 3717dc0dba9fdb13d1459ed4edf7955dce2e06b3 1991 1998 1996 1996 1977 1995 1977 1997 1997 1993 1999 1998
54 |
--------------------------------------------------------------------------------
/core/src/main/resources/golddata/isaac/bibs-to-tsv.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import csv
4 | import argparse
5 |
6 | parser = argparse.ArgumentParser(description='Convert bibliography data into an empty TSV file for annotators to fill '
7 | 'in with citation mention data')
8 | # bibliographies.tsv is the file inside this very directory containing high quality human-annotated bibliography data
9 | # Note that bibliographies.tsv itself is generated from the scholar project:
10 | # scholar-project/pipeline/src/main/resources/ground-truths/bibliographies.json
11 | parser.add_argument('-i', metavar='INPUT.TSV', default='bibliographies.tsv', help='Filename for bibliography TSV data')
12 | parser.add_argument('-o', metavar='OUTPUT.TSV', default='mentions-blank.tsv', help='Filename for empty TSV')
13 | args = parser.parse_args()
14 |
15 | with open(args.i) as bibs, open(args.o, 'w') as mentions:
16 | bibs = csv.reader(bibs, delimiter='\t')
17 | mentions = csv.writer(mentions, delimiter='\t')
18 | mentions.writerow(["Paper ID", "URL", "Bib Entry", "Context", "Context Reference"])
19 | for paper in bibs:
20 | if paper:
21 | id, entries = paper[0], paper[1:]
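# Each entry was serialized by import_bib_gold.py as "title|year|venue|author1:author2:...",
# so split on '|' first and then split the author field on ':'.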
22 | for i, entry in enumerate(entries):
23 | title, year, venue, authors = entry.split('|')
24 | authors = authors.split(':')
25 | mentions.writerow([id,"https://www.semanticscholar.org/paper/{0}".format(id), "[{0}] {1}. {2}".format(i + 1, ', '.join(authors), year), "", ""])
26 |
--------------------------------------------------------------------------------
/core/src/main/resources/golddata/isaac/import_bib_gold.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # format scholar-project/pipeline/src/main/resources/ground-truths/bibliographies.json as valid JSON before running this
3 | # script inside that directory
4 |
5 | from jsonsempai import magic
6 | import bibliographies
7 |
8 | papers = bibliographies.bibs
9 |
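# Each reference is emitted as "title|year|venue|author1:author2:..."; bibs-to-tsv.py later
# splits entries on '|' and the author list on ':'.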
10 | def refToStr(ref): # edit as necessary to include only authors/years/venues/etc.
11 | return ref.title.text + "|" + str(ref.year) + "|" + ref.venue.name + "|" + ":".join([" ".join([a.firstName] + a.middleNames + [a.lastName]) for a in ref.authors])
12 |
13 | def paperToStr(paper):
14 | return "\t".join([paper.id] + [refToStr(ref) for ref in paper.refs])
15 |
16 | with open('bibliographies.tsv', 'w') as f:
17 | for paper in papers:
18 | f.write(paperToStr(paper).encode('utf-8') + "\n")
19 |
--------------------------------------------------------------------------------
/core/src/main/resources/golddata/isaac/tsv-to-gold.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import csv
4 | import re
5 | import argparse
6 |
7 | parser = argparse.ArgumentParser(description='After annotators fill in empty TSV file generated by bibs-to-tsv.py, '
8 | 'this converts the filled-in TSV data into the format used for other TSV '
9 | 'gold data')
10 | parser.add_argument('-i', metavar='INPUT.TSV', default='mentions-filled.tsv', help='Filename for filled-in TSV')
11 | parser.add_argument('-o', metavar='OUTPUT.TSV', default='mentions.tsv', help='Filename for final gold TSV')
12 | args = parser.parse_args()
13 |
14 | with open(args.i) as mentions, open(args.o, 'w') as gold_writer:
15 | mentions = csv.reader(mentions, delimiter='\t')
16 | next(mentions) # skip header row
17 | gold_writer = csv.writer(gold_writer, delimiter='\t')
18 | gold = {}
19 | for paper, _, _, context, mention in mentions:
20 | if paper not in gold:
21 | gold[paper] = []
22 | cleaned_context = re.sub(r'\s+', ' ', context.strip())
23 | if not cleaned_context:
24 | continue
25 | gold[paper].append("{0}|{1}".format(cleaned_context, re.sub(r'[()]', '', mention)))
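# Each gold row is: paper_id<TAB>context|mention<TAB>context|mention..., one row per paper
# that has at least one non-empty context.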
26 | for paper, bib_entries in gold.items():
27 | if len(bib_entries) > 0:
28 | gold_writer.writerow([paper] + bib_entries)
--------------------------------------------------------------------------------
/core/src/main/resources/opennlp/tools/tokenize/en-token.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/main/resources/opennlp/tools/tokenize/en-token.bin
--------------------------------------------------------------------------------
/core/src/main/resources/org/allenai/scienceparse/pipeline/highfreq.tsv:
--------------------------------------------------------------------------------
1 | a_simple 12720690
2 | a_dynamic 3753200
3 | dynamics_of 1532890
4 | determination_of 1777002
5 | investigation_of 1961935
6 | performance_of 8679010
7 | design_of 49726446
8 | estimation_of 8083152
9 | a_multi 10658088
10 | a_statistical 1095776
11 | an_efficient 50271243
12 | use_of 8632806
13 | an_optimal 1618386
14 | visualization_of 1070969
15 | the_effects 5530896
16 | a_fast 10222655
17 | a_probabilistic 1931745
18 | synthesis_of 2068747
19 | a_scalable 1899108
20 | on_line 3244472
21 | application_of 41340376
22 | control_of 2037770
23 | stability_of 1756670
24 | recognition_of 1007944
25 | simulation_of 4727016
26 | a_data 1264830
27 | an_adaptive 8786936
28 | toward_a 2473704
29 | a_general 5332851
30 | a_bayesian 1330236
31 | applications_of 2929491
32 | effects_of 27700900
33 | an_integrated 5452943
34 | a_web 1050192
35 | measuring_the 1450275
36 | a_neural 1214904
37 | the_evolution 1135497
38 | multi_agent 1027299
39 | assessing_the 1886130
40 | the_impact 17252433
41 | a_practical 2301740
42 | low_complexity 1198054
43 | a_class 1027026
44 | evaluation_of 59043315
45 | the_complexity 2251237
46 | the_role 26007145
47 | what_is 2925672
48 | a_parallel 3641565
49 | an_experimental 2116674
50 | real_time 50445213
51 | a_robust 3952640
52 | extraction_of 1018251
53 | a_high 3576455
54 | exploring_the 3538080
55 | advances_in 2604354
56 | implementation_of 10742238
57 | a_unified 3918522
58 | state_of 3048066
59 | a_comparison 19333800
60 | low_power 1194318
61 | a_self 1060920
62 | evolution_of 2981682
63 | review_of 4298535
64 | lower_bounds 1005609
65 | the_design 5208700
66 | a_non 1036504
67 | development_and 1553940
68 | fast_and 1592184
69 | data_mining 1549602
70 | building_a 1888656
71 | evaluating_the 2875506
72 | modeling_of 1830630
73 | model_based 4964960
74 | a_two 2270310
75 | a_framework 33836573
76 | research_on 4838940
77 | a_computational 1634932
78 | the_use 6342888
79 | how_to 21718515
80 | the_development 1297113
81 | the_application 1091897
82 | analysis_and 4345255
83 | using_a 1959879
84 | a_hierarchical 1344020
85 | prediction_of 2722464
86 | characterization_of 6124240
87 | development_of 39929760
88 | generation_of 1629189
89 | modeling_and 10579272
90 | a_hybrid 13076028
91 | analysis_of 134179470
92 | neural_network 1384011
93 | towards_an 3914460
94 | high_performance 3483712
95 | an_architecture 1631500
96 | fault_tolerant 1160492
97 | in_the 1368941
98 | an_approach 9797140
99 | a_fuzzy 1988224
100 | an_algorithm 4011406
101 | study_of 4196997
102 | book_review 1114371
103 | a_note 31770520
104 | introduction_to 21261555
105 | study_on 2350493
106 | understanding_the 1899040
107 | a_low 4074049
108 | developing_a 1389696
109 | a_case 4785264
110 | a_new 383773671
111 | assessment_of 2821824
112 | large_scale 3419310
113 | the_influence 2529252
114 | algorithms_for 3336032
115 | classification_of 4431254
116 | construction_of 2663066
117 | estimating_the 1313070
118 | modeling_the 2438241
119 | learning_to 3248856
120 | an_investigation 1475595
121 | three_dimensional 2384190
122 | effect_of 18819000
123 | a_flexible 1281834
124 | comments_on 3485950
125 | energy_efficient 4985232
126 | performance_analysis 18339839
127 | an_introduction 2103116
128 | an_evaluation 2941470
129 | impact_of 14113162
130 | role_of 1418820
131 | influence_of 3787980
132 | integration_of 4965832
133 | a_generic 1558388
134 | optimization_of 6624849
135 | a_formal 2443825
136 | an_improved 11404888
137 | detection_of 9207810
138 | a_distributed 4994100
139 | a_methodology 1508225
140 | proceedings_of 15661230
141 | performance_evaluation 10518768
142 | a_study 31082424
143 | a_model 16993140
144 | a_system 1834632
145 | model_checking 1130500
146 | on_the 2050795697
147 | special_issue 1762722
148 | identification_of 10424040
149 | a_novel 110788635
150 | towards_the 1676733
151 | machine_learning 1310518
152 | agent_based 1434524
153 | comparison_of 28484820
154 | a_method 8919118
155 | on_a 9368724
156 | object_oriented 1596160
157 | a_review 3881448
158 | using_the 2515350
159 | measurement_of 1337719
160 | an_overview 2984265
161 | from_the 2218287
162 | an_analysis 6661556
163 | the_effect 16498944
164 | an_empirical 7243839
165 | a_survey 15373729
166 | verification_of 1178220
167 | an_application 1608600
168 | an_effective 1223620
169 | design_and 52970592
170 | a_comparative 6703872
171 | improving_the 7066752
172 | a_generalized 1330711
173 | a_real 1238160
174 | overview_of 2647164
175 | towards_a 49618212
176 | an_overview_of 2566320
177 | a_review_of 2284447
178 | a_case_study 1537596
179 | on_the_use 1848079
180 | the_evolution_of 1003312
181 | a_new_approach 5122026
182 | a_study_of 10074025
183 | analysis_of_the 2585856
184 | a_comparative_study 2961265
185 | introduction_to_the 1544796
186 | a_study_on 4248972
187 | a_novel_approach 1431635
188 | performance_analysis_of 13852134
189 | an_application_of 1011780
190 | a_method_for 3571193
191 | development_of_a 5741686
192 | a_new_method 1279488
193 | an_introduction_to 2002362
194 | a_note_on 30272320
195 | the_complexity_of 2138344
196 | the_use_of 6072724
197 | a_model_for 2191425
198 | a_framework_for 26200476
199 | a_survey_of 6222890
200 | a_survey_on 1445036
201 | the_effect_of 16211580
202 | an_evaluation_of 2148589
203 | the_impact_of 16671875
204 | design_of_a 4213936
205 | the_effects_of 5397246
206 | the_design_of 1383060
207 | a_comparison_of 15086610
208 | an_analysis_of 5500050
209 | the_role_of 25140654
210 | on_the_complexity 1898598
211 | an_empirical_study 1371527
212 | an_approach_to 4261494
213 | a_model_of 1413822
214 | the_influence_of 2448115
215 | an_algorithm_for 2530008
216 | proceedings_of_the 12292616
217 | design_and_implementation 7314000
218 | performance_evaluation_of 8114553
219 | a_new_approach_to 1996090
220 | on_the_use_of 1809780
221 | on_the_complexity_of 1797614
222 | design_and_implementation_of 6962494
223 | a_comparative_study_of 1840080
224 | a_note_on_the 2921148
225 | design_and_implementation_of_a 1076582
226 |
--------------------------------------------------------------------------------
/core/src/main/scala/org/allenai/scienceparse/BibTraining.scala:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse
2 |
3 | import java.io.{ File, FileInputStream }
4 |
5 | import com.gs.collections.impl.set.mutable.UnifiedSet
6 | import org.allenai.common.{ Logging, Resource }
7 | import org.allenai.datastore.Datastores
8 | import org.allenai.scienceparse.Parser.ParseOpts
9 | import scopt.OptionParser
10 |
11 | import scala.collection.JavaConverters._
12 | import scala.io.Source
13 |
14 | object BibTraining extends App with Datastores with Logging {
15 | // The Files are all Option[File] defaulting to None. Properly, they should be set to the
16 | // defaults from the datastore, but if we do that here, they will download several gigabytes
17 | // of files during startup, even if they are unused later.
18 | case class Config(
19 | output: File = null,
20 | groundTruth: Option[File] = None,
21 | maxIterations: Int = 150,
22 | backgroundSampleDocs: Int = 4000,
23 | backgroundDirectory: Option[File] = None,
24 | gazetteerFile: Option[File] = None,
25 | trainFraction: Double = 0.9,
26 | minExpectedFeatureCount: Int = 1
27 | )
28 |
29 | val parser = new OptionParser[Config](this.getClass.getSimpleName) {
30 | head("Options that are not specified default to the settings that were used to make the production model.")
31 |
32 | opt[File]('o', "output") required () action { (o, c) =>
33 | c.copy(output = o)
34 | } text "The output file"
35 |
36 | opt[File]('t', "groundTruth") action { (t, c) =>
37 | c.copy(groundTruth = Some(t))
38 | } text "The ground truth directory"
39 |
40 | opt[Int]("maxIterations") action { (i, c) =>
41 | c.copy(maxIterations = i)
42 | } text "Maximum number of iterations during training"
43 |
44 | opt[Int]("backgroundSampleDocs") action { (d, c) =>
45 | c.copy(backgroundSampleDocs = d)
46 | } text "The number of documents to use to build the background language model"
47 |
48 | opt[File]("backgroundDirectory") action { (d, c) =>
49 | c.copy(backgroundDirectory = Some(d))
50 | } text "The directory in which the background documents are found"
51 |
52 | opt[File]('g', "gazetteerFile") action { (f, c) =>
53 | c.copy(gazetteerFile = Some(f))
54 | } text "The gazetteer file"
55 |
56 | opt[Double]("trainFraction") action { (f, c) =>
57 | c.copy(trainFraction = f)
58 | } text "The fraction of the ground truth to use for training"
59 |
60 | opt[Int]("minExpectedFeatureCount") action { (n, c) =>
61 | c.copy(minExpectedFeatureCount = n)
62 | } text "The minimum number of times we should see a feature before accepting it."
63 |
64 | help("help") text "Prints help text"
65 | }
66 |
67 | parser.parse(args, Config()).foreach { config =>
68 | val groundTruthDirectory =
69 | config.groundTruth.getOrElse(publicDirectory("productionBibGroundTruth", 2).toFile)
70 |
71 | val opts = new ParseOpts
72 | opts.modelFile = config.output.toString
73 | opts.iterations = config.maxIterations
74 | opts.threads = Runtime.getRuntime.availableProcessors() * 2
75 | opts.backgroundSamples = config.backgroundSampleDocs
76 |
77 | val backgroundDirectory =
78 | config.backgroundDirectory.getOrElse(publicDirectory("productionBackgroundDocs", 1).toFile)
79 | opts.backgroundDirectory = backgroundDirectory.toString
80 |
81 | val gazetteerFile = config.gazetteerFile.getOrElse(Parser.getDefaultGazetteer.toFile)
82 | opts.gazetteerFile = gazetteerFile.toString
83 |
84 | opts.trainFraction = config.trainFraction
85 | opts.minExpectedFeatureCount = config.minExpectedFeatureCount
86 |
87 | Parser.trainBibliographyCRF(groundTruthDirectory, opts)
88 |
89 | logger.info(s"New model at ${opts.modelFile}")
90 | }
91 | }
92 |
--------------------------------------------------------------------------------
/core/src/main/scala/org/allenai/scienceparse/CachedGrobidServer.scala:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse
2 |
3 | import java.io.{InputStream, ByteArrayInputStream, IOException}
4 | import java.net.{SocketTimeoutException, URL}
5 | import java.nio.file.{NoSuchFileException, Paths, Files}
6 | import java.util.zip.{GZIPOutputStream, GZIPInputStream}
7 |
8 | import org.allenai.common.{Logging, Resource}
9 | import org.allenai.datastore.Datastores
10 | import org.apache.commons.io.{FileUtils, IOUtils}
11 |
12 | import scala.util.control.NonFatal
13 | import scala.util.{Success, Failure, Try, Random}
14 | import scalaj.http.{Http, MultiPart, HttpResponse}
15 |
16 |
17 | class CachedGrobidServer(url: URL) extends Logging with Datastores {
18 | private val cacheDir = {
19 | val dirName = url.toString.replaceAll("[^\\w-.]+", "#")
20 | Files.createDirectories(CachedGrobidServer.cacheDir)
21 | val dir = CachedGrobidServer.cacheDir.resolve(dirName)
22 | if(!Files.exists(dir)) {
23 | // Warm the cache, so for most evaluations we don't need to have a running Grobid server at
24 | // all.
25 | val warmCacheDir = publicDirectory("GrobidServerCache", 2)
26 | FileUtils.copyDirectory(warmCacheDir.toFile, dir.toFile)
27 | }
28 | dir
29 | }
30 |
31 | private val random = new Random
32 | /** Gets a response from an HTTP server given a request. Retries if we think retrying might fix it. */
33 | private def withRetries[T](f: () => HttpResponse[T], retries: Int = 10): HttpResponse[T] = if (retries <= 0) {
34 | f()
35 | } else {
36 | val sleepTime = random.nextInt(1000) + 2500 // sleep between 2.5 and 3.5 seconds
37 | // If something goes wrong, we sleep a random amount of time, to make sure that we don't slam
38 | // the server, get timeouts, wait for exactly the same amount of time on all threads, and then
39 | // slam the server again.
40 |
41 | Try(f()) match {
42 | case Failure(e: SocketTimeoutException) =>
43 | logger.warn(s"$e while querying Grobid. $retries retries left.")
44 | Thread.sleep(sleepTime)
45 | withRetries(f, retries - 1)
46 |
47 | case Failure(e: IOException) =>
48 | logger.warn(s"Got IOException '${e.getMessage}' while querying Grobid. $retries retries left.")
49 | Thread.sleep(sleepTime)
50 | withRetries(f, retries - 1)
51 |
52 | case Success(response) if response.isServerError =>
53 | logger.warn(s"Got response code '${response.statusLine}' while querying Grobid. $retries retries left.")
54 | Thread.sleep(sleepTime)
55 | withRetries(f, retries - 1)
56 |
57 | case Failure(e) => throw e
58 |
59 | case Success(response) => response
60 | }
61 | }
62 |
63 |   // Note: This is not thread safe if two threads or processes ask for the same file at
64 | // the same time.
65 | def getExtractions(bytes: Array[Byte]): InputStream = {
66 | val paperId = Utilities.shaForBytes(bytes)
67 |
68 | val cacheFile = cacheDir.resolve(paperId + ".xml.gz")
69 | try {
70 | if (Files.size(cacheFile) == 0)
71 | throw new IOException(s"Paper $paperId is tombstoned")
72 | else
73 | new GZIPInputStream(Files.newInputStream(cacheFile))
74 | } catch {
75 | case _: NoSuchFileException =>
76 | logger.debug(s"Cache miss for $paperId")
77 | try {
78 | val response = withRetries { () =>
79 | val multipart = MultiPart("input", s"$paperId.pdf", "application/octet-stream", bytes)
80 | Http(url + "/processFulltextDocument").timeout(60000, 60000).postMulti(multipart).asBytes
81 | }
82 | val bais = new ByteArrayInputStream(response.body)
83 | Resource.using(new GZIPOutputStream(Files.newOutputStream(cacheFile))) { os =>
84 | IOUtils.copy(bais, os)
85 | }
86 | bais.reset()
87 | bais
88 | } catch {
89 | case NonFatal(e) =>
90 | logger.warn(s"Tombstoning $paperId because of the following error:", e)
91 | Files.deleteIfExists(cacheFile)
92 | Files.createFile(cacheFile)
93 | throw e
94 | }
95 | }
96 | }
97 | }
98 |
99 | object CachedGrobidServer {
100 | val cacheDir = Files.createDirectories(
101 | Paths.get(
102 | System.getProperty("java.io.tmpdir"),
103 | this.getClass.getSimpleName.stripSuffix("$")))
104 | }
105 |
--------------------------------------------------------------------------------
/core/src/main/scala/org/allenai/scienceparse/GazetteerFromPMC.scala:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse
2 |
3 | import java.util.concurrent.atomic.AtomicInteger
4 |
5 | import org.allenai.common.ParIterator._
6 | import scala.concurrent.ExecutionContext.Implicits.global
7 | import spray.json._
8 |
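// Builds a gazetteer from PMC-labeled papers: prints up to 1,000,000 entries to stdout, one
// JSON object per line, skipping papers that are missing a title, authors, or year.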
9 | object GazetteerFromPMC extends App {
10 | case class GazetteerEntry(id: String, title: String, authors: Seq[String], year: Int)
11 | import DefaultJsonProtocol._
12 | implicit val gazetteerEntryFormat = jsonFormat4(GazetteerEntry.apply)
13 |
14 | // We use the first 1k of this for testing, so let's drop 10k just to be sure.
15 | val labeledDataNotUsedForTesting = LabeledPapersFromPMC.get.drop(10000)
16 |
17 | val noneCount = new AtomicInteger()
18 |
19 | labeledDataNotUsedForTesting.parMap { lp =>
20 | val ld = lp.labels
21 | (ld.title, ld.authors, ld.year) match {
22 | case (Some(title), Some(authors), Some(year)) =>
23 | Some(GazetteerEntry(lp.paperId, title.replaceAll("\\s+", " "), authors.map(_.name), year))
24 | case _ =>
25 | noneCount.incrementAndGet()
26 | None
27 | }
28 | }.flatten.take(1000000).foreach { entry =>
29 | println(entry.toJson)
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/core/src/main/scala/org/allenai/scienceparse/GrobidParser.scala:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse
2 |
3 | import java.io.InputStream
4 | import java.nio.file.Path
5 | import java.util.Calendar
6 |
7 | import org.allenai.common.StringUtils.StringExtras
8 | import org.allenai.scienceparse.{ Section => SPSection }
9 | import org.jsoup.Jsoup
10 | import org.jsoup.nodes.{Document, Element, TextNode}
11 | import org.jsoup.parser.{Parser => JsoupParser}
12 |
13 | import scala.collection.JavaConverters._
14 |
15 | object GrobidParser {
16 | def addDot(x: String) = if (x.length == 1) s"$x." else x
17 |
18 | def author(e: Element): String = {
19 | val first = List(e.findText("persName>forename[type=first]"))
20 | val mids = e.select("persName>forename[type=middle]").asScala.map(_.text).toList
21 | val last = List(e.findText("persName>surname"))
22 | (first ++ mids ++ last).filter(!_.isEmpty).map(a => addDot(a.trimNonAlphabetic)).mkString(" ")
23 | }
24 |
25 | def extractTitle(doc: Element): String = {
26 | doc.findText("teiHeader>fileDesc>titleStmt>title").titleCase()
27 | }
28 |
29 | def toTitle(s: String) = {
30 | s.trimChars(",.").find(c => Character.isAlphabetic(c)) match {
31 | case None => ""
32 | case Some(_) => s
33 | }
34 | }
35 |
36 | def extractYear(str: String): Int = "\\d{4}".r.findFirstIn(str) match {
37 | case Some(y) => y.toInt
38 | case None => 0
39 | }
40 |
41 | def extractBibEntriesWithId(doc: Element) =
42 | for {
43 | bib <- doc.select("listBibl>biblStruct").asScala
44 | } yield {
45 | val title = toTitle(bib.findText("analytic>title[type=main]")) match {
46 | case "" => bib.findText("monogr>title")
47 | case s => s
48 | }
49 | val authors = bib.select("analytic>author").asScala.map(author).toList match {
50 | case List() => bib.select("monogr>author").asScala.map(author).toList
51 | case l => l
52 | }
53 | val venue = bib.findText("monogr>title")
54 | val yr = extractYear(bib.findAttributeValue("monogr>imprint>date[type=published]", "when"))
55 | new BibRecord(title, authors.asJava, venue, null, null, yr)
56 | }
57 |
58 | def ifNonEmpty(s: String) = if (s.nonEmpty) Some(s) else None
59 |
60 | case class Section(id: Option[String], header: Option[String], text: String)
61 |
62 | private def extractSectionInfo(div: Element) = {
63 | val bodyPlusHeaderText = div.text
64 |
65 | val head = div.select("head").asScala.headOption
66 | val (id, headerText, bodyTextOffset) = head match {
67 | case Some(h) =>
68 | val hText = h.text
69 | (
70 | ifNonEmpty(h.attr("n")),
71 | Some(hText),
72 | hText.size + bodyPlusHeaderText.drop(hText.size).takeWhile(_ <= ' ').size
73 | )
74 | case None =>
75 | (None, None, 0)
76 | }
77 | val section = Section(id = id, text = bodyPlusHeaderText.drop(bodyTextOffset), header = head.map(_.text))
78 | (div, bodyPlusHeaderText, bodyTextOffset, section)
79 | }
80 |
81 | def extractReferenceMentions(doc: Element, sectionInfo: Iterable[(Element, String, Int, Section)]): List[CitationRecord] = {
82 | val bibMentions =
83 | for {
84 |         ref <- doc.select("ref[type=bibr]").asScala
85 | ((div, fullText, offset, _), sectionNumber) <- sectionInfo.zipWithIndex.find {
86 | case ((div, fullText, offset, _), i) =>
87 | ref.parents.contains(div)
88 | }
89 | } yield {
90 | val id = ref.attr("target").dropWhile(_ == '#')
91 | val begin = ref.textOffset(div) - offset
92 | val end = begin + ref.text.length
93 | Parser.extractContext(0, fullText, begin, end)
94 | }
95 | bibMentions.toList
96 | }
97 |
98 | def parseGrobidXml(grobidExtraction: Path): ExtractedMetadata = {
99 | val doc = Jsoup.parse(grobidExtraction.toFile, "UTF-8")
100 | parseGrobidXml(doc)
101 | }
102 |
103 | def parseGrobidXml(is: InputStream, baseUrl: String): ExtractedMetadata = {
104 | val doc = Jsoup.parse(is, "UTF-8", baseUrl, JsoupParser.xmlParser())
105 | parseGrobidXml(doc)
106 | }
107 |
108 | private def parseGrobidXml(doc: Document): ExtractedMetadata = {
109 | val year = extractYear(doc.findAttributeValue("teiHeader>fileDesc>sourceDesc>biblStruct>monogr>imprint>date[type=published]", "when"))
110 | val calendar = Calendar.getInstance()
111 | calendar.set(Calendar.YEAR, year)
112 |
113 | val sectionInfo = doc.select("text>body>div").asScala.map(extractSectionInfo)
114 |
115 | val em = new ExtractedMetadata(extractTitle(doc), doc.select("teiHeader>fileDesc>sourceDesc>biblStruct>analytic>author").asScala.map(author).asJava, calendar.getTime)
116 | em.year = year
117 | em.references = extractBibEntriesWithId(doc).asJava
118 | em.referenceMentions = extractReferenceMentions(doc, sectionInfo).asJava
119 | em.abstractText = doc.select("teiHeader>profileDesc>abstract").asScala.headOption.map(_.text).getOrElse("")
120 |
121 | em.sections = sectionInfo.map { case (_, _, _, grobidSection) =>
122 | new SPSection(
123 | Seq(grobidSection.id, grobidSection.header).flatten.map(_.trim).mkString(" "),
124 | grobidSection.text)
125 | }.asJava
126 |
127 | em
128 | }
129 |
130 | implicit class JsoupElementsImplicits(e: Element) {
131 |
132 | def findText(path: String): String =
133 | e.select(path).asScala.headOption.map(_.text).getOrElse("")
134 |
135 | def findAttributeValue(path: String, attrName: String): String =
136 | e.select(path).asScala.headOption.map(_.attr(attrName)).getOrElse("")
137 |
138 |     // The number of text characters in the ancestor that precede the given element
139 | def textOffset(ancestor: Element): Int = {
140 | if (ancestor == e.parent) {
141 | val ancestorText = ancestor.text
142 | val elementText = e.text
143 | val index = ancestorText.indexOf(elementText)
144 | ancestorText.indexOf(elementText, index + 1) match {
145 | case -1 => // The common and easy case: Text only occurs once in the parent.
146 | index
147 | case _ => // Our text occurs multiple times in the parent. Bogus!
148 | // Count how many times it occurs previous to our element
149 |             def countOccurrencesIn(base: String) = {
150 | var count = 0
151 | var index = base.indexOf(elementText)
152 | while (index > 0) {
153 | count += 1
154 | index = base.indexOf(elementText, index + 1)
155 | }
156 | count
157 | }
158 | val precedingSiblingText =
159 | ancestor.childNodes.asScala.takeWhile(_ != e).map {
160 | case t: TextNode => t.getWholeText.trim()
161 | case e: Element => e.text
162 | case _ => ""
163 | }
164 |             val precedingCount = precedingSiblingText.map(countOccurrencesIn).sum
165 | // Now get the next occurrence of our text
166 | def nthIndexOf(base: String, n: Int) = {
167 | var i = 0
168 | var index = base.indexOf(elementText)
169 | while (i < n) {
170 | index = base.indexOf(elementText, index + 1)
171 | i += 1
172 | }
173 | index
174 | }
175 | nthIndexOf(ancestorText, precedingCount)
176 | }
177 | } else if (e.parent == null) {
178 | sys.error("Must specify an ancestor element to find text offset")
179 | } else {
180 | e.parent.textOffset(ancestor) + e.textOffset(e.parent)
181 | }
182 | }
183 | }
184 |
185 | implicit class StringImplicits2(val str: String) extends AnyVal with StringExtras {
186 | /** @return Given full name such as "Doe, John A.", returns the last name assuming
187 | * that it's the word before the comma.
188 | */
189 | def lastNameFromFull(): String = str.trim.takeWhile(_ != ',')
190 | }
191 | }
192 |
--------------------------------------------------------------------------------
/core/src/main/scala/org/allenai/scienceparse/InterleavingIterator.scala:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse
2 |
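/**
 * Round-robin interleaving of several iterators: next() draws one element from each
 * non-exhausted inner iterator in turn. For example, interleaving Iterator(1, 3, 5) with
 * Iterator(2, 4) yields 1, 2, 3, 4, 5.
 */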
3 | class InterleavingIterator[T](inners: Iterator[T]*) extends Iterator[T] {
4 | override def hasNext = inners.exists(_.hasNext)
5 |
6 | private var index = 0
7 | private def bumpIndex(): Unit = {
8 | index += 1
9 | index %= inners.size
10 | }
11 |
12 | while(!inners(index).hasNext)
13 | bumpIndex()
14 |
15 | private def moveToNextIndex(): Unit = {
16 | require(hasNext)
17 | bumpIndex()
18 | while(!inners(index).hasNext)
19 | bumpIndex()
20 | }
21 |
22 | override def next() = {
23 | require(inners(index).hasNext)
24 | val result = inners(index).next()
25 | if(hasNext)
26 | moveToNextIndex()
27 | result
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/core/src/main/scala/org/allenai/scienceparse/JsonProtocol.scala:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse
2 |
3 | import java.util.regex.Pattern
4 | import scala.collection.JavaConverters._
5 | import spray.json._
6 |
7 | object JsonProtocol extends DefaultJsonProtocol {
8 | import java.util.{ List => JavaList }
9 |
10 | private def expected(name: String) = throw DeserializationException(s"Expected $name")
11 |
12 | private def optional[T >: Null](obj: JsValue)(implicit format: JsonFormat[T]): T =
13 | obj.convertTo[Option[T]].orNull
14 |
15 | implicit def javaListJsonFormat[T: JsonFormat]: RootJsonFormat[JavaList[T]] =
16 | new RootJsonFormat[JavaList[T]] {
17 | override def write(list: JavaList[T]): JsValue =
18 | JsArray(list.asScala.map(_.toJson): _*)
19 |
20 | override def read(json: JsValue): JavaList[T] = json match {
21 | case JsArray(values) => values.map { value => value.convertTo[T] }.toList.asJava
22 | case _ => expected("List<>")
23 | }
24 | }
25 |
26 | implicit object PatternJsonFormat extends RootJsonFormat[Pattern] {
27 | override def write(pattern: Pattern): JsValue = JsString(pattern.pattern())
28 |
29 | override def read(json: JsValue): Pattern = json match {
30 | case JsString(p) => Pattern.compile(p)
31 | case _ => expected("Pattern")
32 | }
33 | }
34 |
35 | implicit object ExtractedMetadataSourceJsonFormat extends RootJsonFormat[ExtractedMetadata.Source] {
36 | override def write(source: ExtractedMetadata.Source): JsValue = {
37 | JsString(source.name())
38 | }
39 |
40 | override def read(json: JsValue): ExtractedMetadata.Source = {
41 | json match {
42 | case JsString(name) => ExtractedMetadata.Source.valueOf(name)
43 | case _ => expected("ExtractedMetadata.Source")
44 | }
45 | }
46 | }
47 |
48 | implicit object SectionJsonFormat extends RootJsonFormat[Section] {
49 | override def write(section: Section): JsValue = JsObject(
50 | "heading" -> Option(section.heading).toJson,
51 | "text" -> section.text.toJson
52 | )
53 |
54 | override def read(json: JsValue): Section = json.asJsObject.getFields("heading", "text") match {
55 | case Seq(heading, JsString(text)) =>
56 | new Section(
57 | optional[String](heading),
58 | text)
59 | case _ => expected("Section")
60 | }
61 | }
62 |
63 | implicit object BibRecordJsonFormat extends RootJsonFormat[BibRecord] {
64 | override def write(bibRecord: BibRecord) = JsObject(
65 | "title" -> Option(bibRecord.title).toJson,
66 | "author" -> bibRecord.author.toJson,
67 | "venue" -> Option(bibRecord.venue).toJson,
68 | "citeRegEx" -> Option(bibRecord.citeRegEx).toJson,
69 | "shortCiteRegEx" -> Option(bibRecord.shortCiteRegEx).toJson,
70 | "year" -> bibRecord.year.toJson
71 | )
72 |
73 | override def read(json: JsValue): BibRecord = json.asJsObject.getFields(
74 | "title",
75 | "author",
76 | "venue",
77 | "citeRegEx",
78 | "shortCiteRegEx",
79 | "year"
80 | ) match {
81 | case Seq(
82 | title,
83 | author,
84 | venue,
85 | citeRegEx,
86 | shortCiteRegEx,
87 | JsNumber(year)
88 | ) =>
89 | new BibRecord(
90 | optional[String](title),
91 | author.convertTo[JavaList[String]],
92 | optional[String](venue),
93 | optional[Pattern](citeRegEx),
94 | optional[Pattern](shortCiteRegEx),
95 | year.intValue()
96 | )
97 | case _ => expected("BibRecord")
98 | }
99 | }
100 |
101 | implicit object CitationRecordJsonFormat extends RootJsonFormat[CitationRecord] {
102 | override def write(cr: CitationRecord): JsValue = JsObject(
103 | "referenceID" -> cr.referenceID.toJson,
104 | "context" -> cr.context.toJson,
105 | "startOffset" -> cr.startOffset.toJson,
106 | "endOffset" -> cr.endOffset.toJson
107 | )
108 |
109 | override def read(json: JsValue): CitationRecord = json.asJsObject.getFields(
110 | "referenceID",
111 | "context",
112 | "startOffset",
113 | "endOffset"
114 | ) match {
115 | case Seq(
116 | JsNumber(referenceID),
117 | JsString(context),
118 | JsNumber(startOffset),
119 | JsNumber(endOffset)
120 | ) => new CitationRecord(referenceID.toInt, context, startOffset.toInt, endOffset.toInt)
121 | case _ => expected("CitationRecord")
122 | }
123 | }
124 |
125 | implicit object ExtractedMetadataJsonFormat extends RootJsonFormat[ExtractedMetadata] {
126 | override def write(em: ExtractedMetadata): JsValue = JsObject(
127 | "source" -> Option(em.source).toJson,
128 | "title" -> Option(em.title).toJson,
129 | "authors" -> em.authors.toJson,
130 | "emails" -> em.emails.toJson,
131 | "sections" -> Option(em.sections).toJson,
132 | "references" -> Option(em.references).toJson,
133 | "referenceMentions" -> Option(em.referenceMentions).toJson,
134 | "year" -> em.year.toJson,
135 | "abstractText" -> Option(em.abstractText).toJson,
136 | "creator" -> Option(em.creator).toJson
137 | )
138 |
139 | override def read(json: JsValue): ExtractedMetadata = json.asJsObject.getFields(
140 | "source",
141 | "title",
142 | "authors",
143 | "emails",
144 | "sections",
145 | "references",
146 | "referenceMentions",
147 | "year",
148 | "abstractText",
149 | "creator"
150 | ) match {
151 | case Seq(
152 | source,
153 | title,
154 | authors,
155 | emails,
156 | sections,
157 | references,
158 | referenceMentions,
159 | JsNumber(year),
160 | abstractText,
161 | creator
162 | ) =>
163 | val em = new ExtractedMetadata(
164 | optional[String](title),
165 | authors.convertTo[JavaList[String]],
166 | null)
167 | em.source = optional[ExtractedMetadata.Source](source)
168 | em.emails = emails.convertTo[JavaList[String]]
169 | em.sections = optional[JavaList[Section]](sections)
170 | em.references = optional[JavaList[BibRecord]](references)
171 | em.referenceMentions = optional[JavaList[CitationRecord]](referenceMentions)
172 | em.year = year.intValue()
173 | em.abstractText = optional[String](abstractText)
174 | em.creator = optional[String](creator)
175 | em
176 | case _ => expected("ExtractedMetadata")
177 | }
178 | }
179 |
180 | // Some formats for LabeledData
181 | implicit val authorFormat = jsonFormat3(LabeledData.Author)
182 | implicit val sectionFormat = jsonFormat2(LabeledData.Section)
183 | implicit val referenceFormat = jsonFormat7(LabeledData.Reference)
184 | implicit val rangeFormat = jsonFormat2(LabeledData.Range)
185 | implicit val mentionFormat = jsonFormat3(LabeledData.Mention)
186 | implicit val labeledDataFormat = jsonFormat9(LabeledData.apply)
187 | }
188 |
--------------------------------------------------------------------------------
/core/src/main/scala/org/allenai/scienceparse/PrintCRFInput.scala:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse
2 |
3 | import java.awt.Desktop
4 | import java.io.{ PrintWriter, File }
5 |
6 | import org.allenai.common.Resource
7 | import org.allenai.scienceparse.pdfapi.{ PDFFontMetrics, PDFExtractor }
8 | import org.apache.commons.lang3.StringEscapeUtils
9 | import scopt.OptionParser
10 | import scala.collection.JavaConverters._
11 |
12 | object PrintCRFInput extends App {
13 | case class Config(
14 | paperDir: Option[File] = None,
15 | paperId: String = null
16 | )
17 |
18 | val parser = new OptionParser[Config](this.getClass.getSimpleName) {
19 | opt[File]('d', "paperDir") action { (d, c) =>
20 | c.copy(paperDir = Some(d))
21 | } text "The directory that contains the papers"
22 |
23 | arg[String]("") required () action { (p, c) =>
24 | c.copy(paperId = p)
25 | } text "The ID of the paper whose CRF input you want to see"
26 | }
27 |
28 | parser.parse(args, Config()).foreach { config =>
29 | val paperSource = config.paperDir.map(new DirectoryPaperSource(_)).getOrElse {
30 | PaperSource.getDefault
31 | }
32 |
33 | val seq = Resource.using(paperSource.getPdf(config.paperId)) { is =>
34 | val ext = new PDFExtractor
35 | val doc = ext.extractFromInputStream(is)
36 | PDFToCRFInput.getSequence(doc).asScala
37 | }
38 |
39 | // make a font-to-color map
40 | def font2style(fm: PDFFontMetrics) = f"font${fm.hashCode()}%x"
41 | val fonts = seq.map(_.getPdfToken.fontMetrics).toSet.map(font2style)
42 | val colors = Stream.from(1).
43 | map { n => (n * 0.61803398875 * 360).round % 360 }.
44 |       map { hue => s"hsl($hue, 90%, 85%)" }
45 | val font2color = (fonts zip colors).toMap
46 |
47 | val tempFile = File.createTempFile(s"CRFInput-${config.paperId}.", ".html")
48 | tempFile.deleteOnExit()
49 | try {
50 | Resource.using(new PrintWriter(tempFile, "UTF-8")) { out =>
51 | out.println("")
52 | out.println("")
53 | out.println(s"CRF input for ${config.paperId}")
54 | out.println("")
60 | out.println("")
61 | out.println("")
62 | var line = 0
63 | var page = 0
64 | seq.foreach { token =>
65 | if (token.getPage != page) {
66 | out.println("
")
67 | line = 0
68 | page = token.getPage
69 | } else if (token.getLine != line) {
70 | out.println("
")
71 | line = token.getLine
72 | }
73 |
74 | val style = font2style(token.getPdfToken.fontMetrics)
75 | val escaped = StringEscapeUtils.escapeHtml4(token.getPdfToken.token)
76 | out.println(s"$escaped")
77 | }
78 | out.println("")
79 | out.println("")
80 | }
81 |
82 | Desktop.getDesktop.browse(tempFile.toURI)
83 | Thread.sleep(5000)
84 | } finally {
85 | tempFile.delete()
86 | }
87 | }
88 | }
89 |
--------------------------------------------------------------------------------
/core/src/main/scala/org/allenai/scienceparse/PrintFeaturizedCRFInput.scala:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse
2 |
3 | import java.io.{ DataInputStream, File }
4 | import java.nio.file.Files
5 | import java.util
6 |
7 | import com.gs.collections.api.map.primitive.ObjectDoubleMap
8 | import org.allenai.common.Resource
9 | import org.allenai.scienceparse.pdfapi.PDFExtractor
10 | import scopt.OptionParser
11 |
12 | import scala.collection.JavaConverters._
13 |
14 | object PrintFeaturizedCRFInput extends App {
15 | case class Config(
16 | paperDir: Option[File] = None,
17 | modelFile: Option[File] = None,
18 | paperId: String = null
19 | )
20 |
21 | val parser = new OptionParser[Config](this.getClass.getSimpleName) {
22 | opt[File]('d', "paperDir") action { (d, c) =>
23 | c.copy(paperDir = Some(d))
24 | } text "The directory that contains the papers"
25 |
26 | opt[File]('m', "model") action { (m, c) =>
27 | c.copy(modelFile = Some(m))
28 | } text "A model to load LM feature values from"
29 |
30 | arg[String]("<paperId>") required () action { (p, c) =>
31 | c.copy(paperId = p)
32 | } text "The ID of the paper whose CRF input you want to see"
33 | }
34 |
35 | parser.parse(args, Config()).foreach { config =>
36 | val paperSource = config.paperDir.map(new DirectoryPaperSource(_)).getOrElse {
37 | PaperSource.getDefault
38 | }
39 |
40 | val predExtractor = {
41 | val modelPath = config.modelFile.map(_.toPath).getOrElse(Parser.getDefaultProductionModel)
42 | Resource.using(new DataInputStream(Files.newInputStream(modelPath))) { dis =>
43 | Parser.loadModelComponents(dis).predExtractor
44 | }
45 | }
46 |
47 | val seq = Resource.using(paperSource.getPdf(config.paperId)) { is =>
48 | val ext = new PDFExtractor
49 | val doc = ext.extractFromInputStream(is)
50 | PDFToCRFInput.getSequence(doc)
51 | }
52 |
53 | val paddedSeq = PDFToCRFInput.padSequence(seq).asScala.toSeq
54 |
55 | val lines = stringsFromFeaturizedSeq(predExtractor.nodePredicates(paddedSeq.asJava))
56 |
57 | lines.asScala.foreach(println)
58 | }
59 |
60 | def stringsFromFeaturizedSeq(
61 | featurizedJava: util.List[ObjectDoubleMap[String]],
62 | prefix: String = ""
63 | ) = {
64 | // do a complicated dance to map from GS collections to Scala collections
65 | val featurized = featurizedJava.asScala.map { gsMap =>
66 | gsMap.keySet().asScala.map { key => key -> gsMap.get(key) }.toMap
67 | }.toSeq
68 |
69 | // token feature is special
70 | val tokenFeaturePrefix = "%t="
71 |
72 | // figure out binary features
73 | val feature2values = featurized.flatten.foldLeft(Map.empty[String, Set[Double]]) {
74 | case (acc, (key, value)) => acc.updated(key, acc.getOrElse(key, Set[Double]()) + value)
75 | }
76 | val binaryFeatures = feature2values.
77 | filter(_._2 subsetOf Set(0.0, 1.0)).
78 | keys.
79 | filterNot(_.startsWith(tokenFeaturePrefix)).
80 | toSet
81 |
82 | // figure out an order for non-binary features
83 | val orderedNonBinaryFeatures = featurized.
84 | flatMap(_.keys).
85 | filterNot(binaryFeatures).
86 | filterNot(_.startsWith(tokenFeaturePrefix)).
87 | groupBy(identity).
88 | mapValues(_.size).
89 | toSeq.sortBy { case (feature, count) => (-count, feature) }.
90 | map(_._1)
91 |
92 | // write header
93 | val header = (tokenFeaturePrefix +: orderedNonBinaryFeatures).mkString("\t")
94 |
95 | // write entries
96 | val body = featurized.zipWithIndex.map {
97 | case (features, index) =>
98 | (
99 | // token feature
100 | Seq(
101 | features.filter(_._1.startsWith(tokenFeaturePrefix)).map { case (key, value) => s"$key=$value" }.mkString("/")
102 | ) ++
103 |
104 | // non-binary features
105 | orderedNonBinaryFeatures.map { f => features.get(f).map(d => f"$d%.3f").getOrElse("") } ++
106 |
107 | // binary features
108 | (features.keySet & binaryFeatures).toSeq.sorted
109 | ).mkString("\t")
110 | }
111 |
112 | val result = header +: body
113 |
114 | if (prefix.isEmpty) {
115 | result.asJava
116 | } else {
117 | result.zipWithIndex.map { case (line, i) => f"$prefix\t$i%04d\t$line" }.asJava
118 | }
119 | }
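// Illustrative output shape (the feature names are only examples): the first line is a tab-separated
// header of "%t=" followed by the non-binary feature names; each following line contains the token
// feature(s), one column per non-binary feature value (blank if absent), and finally the names of
// the binary features that fired for that token.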
120 | }
121 |
--------------------------------------------------------------------------------
/core/src/main/scala/org/allenai/scienceparse/S2PaperSource.scala:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse
2 |
3 | import java.io.{BufferedInputStream, IOException}
4 | import java.net.SocketTimeoutException
5 | import java.nio.file.{Files, Path, StandardCopyOption}
6 |
7 | import org.allenai.common.Logging
8 |
9 | import scala.util.control.NonFatal
10 | import scala.util.{Failure, Random, Success, Try}
11 | import scalaj.http.{Http, HttpResponse}
12 |
13 | object S2PaperSource extends PaperSource with Logging {
14 |
15 | private val random = new Random
16 | /** Gets a response from an HTTP server given a request. Retries if we think retrying might fix it. */
17 | private def withRetries[T](f: () => HttpResponse[T], retries: Int = 10): T = if (retries <= 0) {
18 | val result = f()
19 | if(result.isSuccess)
20 | result.body
21 | else
22 | throw new IOException(s"Got error ${result.code} (${result.statusLine}) from S2 server")
23 | } else {
24 | val sleepTime = random.nextInt(1000) + 2500 // sleep between 2.5 and 3.5 seconds
25 | // If something goes wrong, we sleep a random amount of time, to make sure that we don't slam
26 | // the server, get timeouts, wait for exactly the same amount of time on all threads, and then
27 | // slam the server again.
28 |
29 | Try(f()) match {
30 | case Failure(e: SocketTimeoutException) =>
31 | logger.warn(s"$e while querying S2. $retries retries left.")
32 | Thread.sleep(sleepTime)
33 | withRetries(f, retries - 1)
34 |
35 | case Failure(e: IOException) =>
36 | logger.warn(s"Got IOException '${e.getMessage}' while querying S2. $retries retries left.")
37 | Thread.sleep(sleepTime)
38 | withRetries(f, retries - 1)
39 |
40 | case Success(response) if response.isServerError =>
41 | logger.warn(s"Got response code '${response.statusLine}' while querying S2. $retries retries left.")
42 | Thread.sleep(sleepTime)
43 | withRetries(f, retries - 1)
44 |
45 | case Failure(e) => throw e
46 |
47 | case Success(response) => response.body
48 | }
49 | }
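// Usage sketch (the URL is a placeholder): withRetries(() => Http("https://example.org/ping").asString)
// returns the response body, retrying timeouts, IOExceptions and 5xx responses up to ten times.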
50 |
51 | override def getPdf(paperId: String) = {
52 | val key = paperId.take(4) + "/" + paperId.drop(4) + ".pdf"
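// e.g. "2a774230b5328df3f8125da9b84a82d92b46a240" becomes "2a77/4230b5328df3f8125da9b84a82d92b46a240.pdf"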
53 |
54 | // We download to a temp file first. If we gave out an InputStream that comes directly from
55 | // S3, it would time out if the caller of this function reads the stream too slowly.
56 | val tempFile = withRetries { () =>
57 | Http(s"https://pdfs.semanticscholar.org/$key").timeout(30000, 30000).execute { is =>
58 | val result = Files.createTempFile(paperId + ".", ".paper.pdf")
59 | try {
60 | Files.copy(is, result, StandardCopyOption.REPLACE_EXISTING)
61 | result
62 | } catch {
63 | case NonFatal(e) =>
64 | Files.deleteIfExists(result)
65 | throw e
66 | }
67 | }
68 | }
69 | tempFile.toFile.deleteOnExit()
70 | new BufferedInputStream(Files.newInputStream(tempFile))
71 | }
72 | }
73 |
--------------------------------------------------------------------------------
/core/src/main/scala/org/allenai/scienceparse/StringUtils.scala:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse
2 |
3 | import org.allenai.common.{ StringUtils => CommonStringUtils }
4 |
5 | object StringUtils {
6 | import CommonStringUtils.StringImplicits
7 |
8 | def normalize(s: String) = s.normalize.replaceFancyUnicodeChars.removeUnprintable.replace('ı', 'i')
9 |
10 | def makeSingleLine(s: String) = s.replaceAll("\\n", "\\\\n").replaceAll("\\r", "\\\\r")
11 |
12 |
13 | /** Splits a name into first and last names */
14 | def splitName(name: String) = {
15 | val suffixes = Set("Jr.", "Sr.", "II", "III")
16 | val lastNamePrefixes = Set("van", "da", "von")
17 |
18 | val parts = name.split("\\s", -1)
19 |
20 | if(parts.length <= 1) {
21 | ("", name)
22 | } else {
23 | var lastNameIndex = parts.length - 1
24 | def skipToNonemptyPart() =
25 | while(lastNameIndex > 0 && parts(lastNameIndex).isEmpty)
26 | lastNameIndex -= 1
27 | def skipToRightAfterNonemptyPart() =
28 | while(lastNameIndex > 1 && parts(lastNameIndex - 1).isEmpty)
29 | lastNameIndex -= 1
30 |
31 | // move to the first non-empty part
32 | skipToNonemptyPart()
33 |
34 | // deal with suffixes
35 | if(lastNameIndex > 0 && suffixes.contains(parts(lastNameIndex)))
36 | lastNameIndex -= 1
37 | skipToNonemptyPart()
38 |
39 | // deal with last name prefixes
40 | skipToRightAfterNonemptyPart()
41 | if(lastNameIndex > 1 && lastNamePrefixes.contains(parts(lastNameIndex - 1)))
42 | lastNameIndex -= 1
43 | skipToRightAfterNonemptyPart()
44 |
45 | (parts.take(lastNameIndex).mkString(" "), parts.drop(lastNameIndex).mkString(" "))
46 | }
47 | }
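// Worked examples (illustrative):
//   splitName("Ludwig van Beethoven") == ("Ludwig", "van Beethoven")  // the prefix stays with the last name
//   splitName("John Smith Jr.")       == ("John", "Smith Jr.")        // the suffix stays attached
//   splitName("Cher")                 == ("", "Cher")                 // single tokens count as last names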
48 |
49 | def getFirstName(name: String) = splitName(name)._1
50 | def getLastName(name: String) = splitName(name)._2
51 | }
52 |
--------------------------------------------------------------------------------
/core/src/main/scala/org/allenai/scienceparse/Training.scala:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse
2 |
3 | import java.io.{ FileInputStream, File }
4 |
5 | import com.gs.collections.impl.set.mutable.UnifiedSet
6 | import org.allenai.common.{ Resource, Logging }
7 | import org.allenai.datastore.Datastores
8 | import org.allenai.scienceparse.Parser.ParseOpts
9 | import scopt.OptionParser
10 |
11 | import scala.io.Source
12 | import scala.collection.JavaConverters._
13 |
14 | object Training extends App with Datastores with Logging {
15 | // The Files are all Option[File] defaulting to None. Properly, they should be set to the
16 | // defaults from the datastore, but if we do that here, they will download several gigabytes
17 | // of files during startup, even if they are unused later.
18 | case class Config(
19 | output: File = null,
20 | maxHeaderWords: Int = Parser.MAXHEADERWORDS,
21 | maxIterations: Int = 1000,
22 | backgroundSampleDocs: Int = 40000,
23 | backgroundDirectory: Option[File] = None,
24 | gazetteerFile: Option[File] = None,
25 | trainFraction: Double = 0.9,
26 | minYear: Int = 2008,
27 | maxPaperCount: Int = 34000,
28 | excludeIdsFile: Option[File] = None,
29 | minExpectedFeatureCount: Int = 13,
30 | trainingData: Iterator[LabeledPaper] = LabeledPapersFromDBLP.get
31 | )
32 |
33 | val parser = new OptionParser[Config](this.getClass.getSimpleName) {
34 | head("Options that are not specified default to the settings that were used to make the production model.")
35 |
36 | opt[File]('o', "output") required () action { (o, c) =>
37 | c.copy(output = o)
38 | } text "The output file"
39 |
40 | opt[Int]("maxHeaderWords") action { (m, c) =>
41 | c.copy(maxHeaderWords = m)
42 | } text "Specifies the maximum number of words to use for the header if we don't have any other information about where the header ends"
43 |
44 | opt[Int]("maxIterations") action { (i, c) =>
45 | c.copy(maxIterations = i)
46 | } text "Maximum number of iterations during training"
47 |
48 | opt[Int]("backgroundSampleDocs") action { (d, c) =>
49 | c.copy(backgroundSampleDocs = d)
50 | } text "The number of documents to use to build the background language model"
51 |
52 | opt[File]("backgroundDirectory") action { (d, c) =>
53 | c.copy(backgroundDirectory = Some(d))
54 | } text "The directory in which the background documents are found"
55 |
56 | opt[File]('g', "gazetteerFile") action { (f, c) =>
57 | c.copy(gazetteerFile = Some(f))
58 | } text "The gazetteer file"
59 |
60 | opt[Double]("trainFraction") action { (f, c) =>
61 | c.copy(trainFraction = f)
62 | } text "The fraction of the ground truth to use for training"
63 |
64 | opt[Int]("minYear") action { (y, c) =>
65 | c.copy(minYear = y)
66 | } text "The earliest published year we're willing to consider"
67 |
68 | opt[Int]('c', "maxPaperCount") action { (p, c) =>
69 | c.copy(maxPaperCount = p)
70 | } text "The maximum number of labeled documents to consider"
71 |
72 | opt[File]("excludeIdsFile") action { (e, c) =>
73 | c.copy(excludeIdsFile = Some(e))
74 | } text "A file with paper IDs to exclude, one per line. We always exclude the papers from the evaluation set."
75 |
76 | opt[Int]("minExpectedFeatureCount") action { (n, c) =>
77 | c.copy(minExpectedFeatureCount = n)
78 | } text "The minimum number of times we should see a feature before accepting it."
79 |
80 | opt[Unit]("trainOnDBLP") action { (_, c) =>
81 | c.copy(trainingData = LabeledPapersFromDBLP.get)
82 | } text "Train with data from DBLP"
83 |
84 | opt[Unit]("trainOnPMC") action { (_, c) =>
85 | c.copy(trainingData = LabeledPapersFromPMC.getCleaned.drop(10000))
86 | // Drop 10000 because we test on those.
87 | } text "Train with data from PMC"
88 |
89 | opt[Unit]("trainOnBoth") action { (_, c) =>
90 | c.copy(trainingData = new InterleavingIterator(LabeledPapersFromPMC.getCleaned, LabeledPapersFromDBLP.get))
91 | } text "Train with data from DBLP and PMC"
92 |
93 | help("help") text "Prints help text"
94 | }
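// Example invocation (sketch; assumes the sbt subproject is named "core" and uses placeholder paths):
//   sbt 'core/runMain org.allenai.scienceparse.Training -o /tmp/model.dat --trainOnPMC --maxIterations 500'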
95 |
96 | parser.parse(args, Config()).foreach { config =>
97 | val opts = new ParseOpts
98 | opts.modelFile = config.output.toString
99 | opts.headerMax = config.maxHeaderWords
100 | opts.iterations = config.maxIterations
101 | opts.threads = Runtime.getRuntime.availableProcessors() * 2
102 | opts.backgroundSamples = config.backgroundSampleDocs
103 |
104 | val backgroundDirectory =
105 | config.backgroundDirectory.getOrElse(publicDirectory("productionBackgroundDocs", 1).toFile)
106 | opts.backgroundDirectory = backgroundDirectory.toString
107 |
108 | val gazetteerFile = config.gazetteerFile.getOrElse(Parser.getDefaultGazetteer.toFile)
109 | opts.gazetteerFile = gazetteerFile.toString
110 |
111 | opts.trainFraction = config.trainFraction
112 | opts.checkAuthors = true
113 | opts.minYear = config.minYear
114 | opts.documentCount = config.maxPaperCount
115 | opts.minExpectedFeatureCount = config.minExpectedFeatureCount
116 |
117 | val excludedIds = Evaluation.goldDocIds ++ config.excludeIdsFile.map { excludedIdsFile =>
118 | Resource.using(Source.fromFile(excludedIdsFile)) { source =>
119 | source.getLines().map(_.trim).toSet
120 | }
121 | }.getOrElse(Set.empty)
122 |
123 | val labeledData = config.trainingData.asJava
124 |
125 | Parser.trainParser(
126 | labeledData,
127 | opts,
128 | UnifiedSet.newSet(excludedIds.toIterable.asJava)
129 | )
130 |
131 | logger.info(s"New model at ${opts.modelFile}")
132 | }
133 | }
134 |
--------------------------------------------------------------------------------
/core/src/main/scala/org/allenai/scienceparse/Utilities.scala:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse
2 |
3 | import java.security.MessageDigest
4 |
5 | object Utilities {
6 | private val sha1HexLength = 40
7 | def toHex(bytes: Array[Byte]): String = {
8 | val sb = new scala.collection.mutable.StringBuilder(sha1HexLength)
9 | bytes.foreach { byte => sb.append(f"$byte%02x") }
10 | sb.toString
11 | }
12 |
13 | def shaForBytes(bytes: Array[Byte]): String = {
14 | val digest = MessageDigest.getInstance("SHA-1")
15 | digest.reset()
16 | digest.update(bytes)
17 | toHex(digest.digest())
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/core/src/main/scala/org/allenai/scienceparse/pipeline/Bucketizers.scala:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse.pipeline
2 |
3 | import scala.io.Source
4 |
5 | /** This contains a bunch of helper functions stolen from the pipeline code. We need it here to
6 | * anticipate how well the pipeline will work with the output from science-parse. */
7 | object Bucketizers {
8 | import Normalizers._
9 |
10 | /** This file contains 225 high-frequency n-grams from title prefixes.
11 | * High means the S2 * Dblp bucket size is > 1M. (Early Sept. 2015)
12 | * n is 2, 3, 4, 5.
13 | */
14 | val highFreqTitleNgramStream = this.getClass.getResourceAsStream("/org/allenai/scienceparse/pipeline/highfreq.tsv")
15 |
16 | val highFreqNameNgramStream = this.getClass.getResourceAsStream("/org/allenai/scienceparse/pipeline/highfreqNames.tsv")
17 |
18 | def loadHighFreqs(is: java.io.InputStream): Map[String, Int] =
19 | Source.fromInputStream(is).getLines.map { l =>
20 | val Array(t, f) = l.split("\t")
21 | t -> f.toInt
22 | }.toMap
23 |
24 | lazy val highFreqTitleNgrams = loadHighFreqs(highFreqTitleNgramStream)
25 |
26 | lazy val highFreqNameNgrams = loadHighFreqs(highFreqTitleNgramStream) // This looks like a typo, but I copied it this way from the pipeline.
27 |
28 | val defaultTitleCutoffThreshold = 1000000
29 |
30 | val defaultNameCutoffThreshold = 100000
31 |
32 | val concatChar = "_"
33 |
34 | def toBucket(words: Iterable[String]) = words.mkString(concatChar)
35 |
36 | def toBucket(s: String) = s.split(" ").mkString(concatChar)
37 |
38 | val defaultTitleNgramLength = 3
39 |
40 | val defaultNameNgramLength = 2
41 |
42 | val defaultAllowTruncated = true
43 |
44 | val defaultUpto = 1
45 |
46 | def cutoffFilter(b: String, cutoffOption: Option[Int], highFreqs: Map[String, Int]): Boolean =
47 | cutoffOption.isEmpty || !highFreqs.contains(b) || highFreqs(b) < cutoffOption.get
48 |
49 | /** Return the array of tokens for the given input.
50 | * Limit number of tokens to maxCount
51 | */
52 | def words(text: String, maxCount: Int = 40): Array[String] = {
53 | val words = alphaNumericNormalize(text).split(' ').filter(_.nonEmpty)
54 | words.take(maxCount)
55 | }
56 |
57 | /** Returns a list of ngrams.
58 | * If cutoff is specified, continue to add more words until the result has frequency
59 | * lower than the cutoff value.
60 | * If allowTruncated is set to true, accept ngrams that have length less than n.
61 | * For example, if the text is "local backbones" and n = 3, we will generate
62 | * the ngram "local_backbones".
63 | */
64 | def ngrams(
65 | text: String,
66 | n: Int,
67 | cutoffOption: Option[Int],
68 | allowTruncated: Boolean = defaultAllowTruncated,
69 | highFreqs: Map[String, Int] = highFreqTitleNgrams,
70 | upto: Int = defaultUpto
71 | ): Iterator[String] = ngramAux(words(text), n, cutoffOption, allowTruncated, highFreqs, upto)
72 |
73 | def tailNgrams(
74 | text: String,
75 | n: Int,
76 | cutoffOption: Option[Int],
77 | allowTruncated: Boolean = defaultAllowTruncated,
78 | highFreqs: Map[String, Int] = highFreqTitleNgrams,
79 | upto: Int = defaultUpto
80 | ) = ngramAux(words(text).reverse, n, cutoffOption, allowTruncated, highFreqs, upto)
81 |
82 | def ngramAux(
83 | chunks: Array[String],
84 | n: Int,
85 | cutoffOption: Option[Int],
86 | allowTruncated: Boolean,
87 | highFreqs: Map[String, Int],
88 | upto: Int
89 | ): Iterator[String] = {
90 | chunks.sliding(n)
91 | .filter(x => (allowTruncated && x.nonEmpty) || x.length == n)
92 | .map(x => toBucket(x.toIterable))
93 | .filter(cutoffFilter(_, cutoffOption, highFreqs))
94 | .take(upto)
95 | }
96 |
97 | def titleNgrams(title: String, upto: Int, allowTruncated: Boolean = defaultAllowTruncated) = {
98 | ngrams(
99 | title,
100 | n = defaultTitleNgramLength,
101 | cutoffOption = Some(defaultTitleCutoffThreshold),
102 | upto = upto,
103 | allowTruncated = allowTruncated
104 | )
105 | }
106 |
107 | def titleTailNgrams(title: String, upto: Int = 1, allowTruncated: Boolean = defaultAllowTruncated) = {
108 | tailNgrams(
109 | title,
110 | n = defaultTitleNgramLength,
111 | cutoffOption = Some(defaultTitleCutoffThreshold),
112 | upto = upto,
113 | allowTruncated = allowTruncated
114 | )
115 | }
116 |
117 | def nameNgrams(name: String) = ngrams(
118 | name,
119 | n = defaultNameNgramLength,
120 | allowTruncated = false,
121 | cutoffOption = Some(defaultNameCutoffThreshold),
122 | highFreqs = highFreqNameNgrams,
123 | upto = 3
124 | )
125 |
126 | /** This is used in V1. */
127 | def simple3TitlePrefix(text: String): List[String] =
128 | ngrams(text, n = 3, cutoffOption = None, allowTruncated = true, highFreqTitleNgrams, upto = 1).toList
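// Worked example (illustrative): simple3TitlePrefix("Sparse Information Extraction: Unsupervised
// Language Models to the Rescue") normalizes the title and keeps its first word trigram, yielding
// List("sparse_information_extraction"); a two-word title like "local backbones" falls back to the
// truncated bucket List("local_backbones").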
129 | }
130 |
--------------------------------------------------------------------------------
/core/src/main/scala/org/allenai/scienceparse/pipeline/Normalizers.scala:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse.pipeline
2 |
3 | import java.text.Normalizer
4 |
5 | /** This contains a bunch of helper functions stolen from the pipeline code. We need it here to
6 | * anticipate how well the pipeline will work with the output from science-parse. */
7 | object Normalizers {
8 | def removeDiacritics(s: String): String =
9 | "\\p{InCombiningDiacriticalMarks}+".r
10 | .replaceAllIn(Normalizer.normalize(s, Normalizer.Form.NFD), "")
11 |
12 | def removePunctuation(s: String): String =
13 | s.replaceAll("\\p{P}", " ")
14 |
15 | def removeNonAphanumeric(s: String): String =
16 | s.replaceAll("[^A-Za-z0-9]", " ")
17 |
18 | def implodeSpaces(s: String) = " +".r.replaceAllIn(s.trim, " ")
19 |
20 | def removeSpaces(s: String) = " +".r.replaceAllIn(s, "")
21 |
22 | def normalize(s: String): String =
23 | implodeSpaces(removePunctuation(removeDiacritics(s.toLowerCase)))
24 |
25 | def alphaNumericNormalize(s: String): String =
26 | implodeSpaces(removeNonAphanumeric(removeDiacritics(s.toLowerCase)))
27 |
28 | def alphaNumericNormalizeNoSpaces(s: String): String =
29 | removeSpaces(removeNonAphanumeric(removeDiacritics(s.toLowerCase)))
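// Illustrative: alphaNumericNormalize("Phrase generation: in distributional semantics!") yields
// "phrase generation in distributional semantics"; the NoSpaces variant drops the spaces as well.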
30 |
31 | def strictNormalize(s: String): String = s.toLowerCase.replaceAll("[^a-z]", "")
32 |
33 | def soundexWord(word: String): String = {
34 | val s = strictNormalize(word)
35 | if (s.isEmpty) return ""
36 | s.head + (s.substring(1)
37 | .replaceAll("[hw]", "")
38 | .replaceAll("[bfpv]", "1")
39 | .replaceAll("[cgjkqsxz]", "2")
40 | .replaceAll("[dt]", "3")
41 | .replaceAll("l", "4")
42 | .replaceAll("[mn]", "5")
43 | .replaceAll("r", "6")
44 | .replaceAll("(\\d)+", "$1")
45 | .replaceAll("[aeiouy]", "")
46 | + "000").take(3)
47 | }
48 |
49 | def soundex(s: String): String = s.split(" ").map(soundexWord).mkString(" ")
50 |
51 | def truncateWords(s: String): String = s.split(" ").map(strictNormalize(_).take(3)).mkString(" ")
52 | }
53 |
--------------------------------------------------------------------------------
/core/src/main/scala/org/allenai/scienceparse/pipeline/SimilarityMeasures.scala:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse.pipeline
2 |
3 | import org.allenai.scienceparse.LabeledData.Reference
4 | import org.allenai.scienceparse.StringUtils
5 |
6 | /** This contains a bunch of helper functions stolen from the pipeline code. We need it here to
7 | * anticipate how well the pipeline will work with the output from science-parse. */
8 | object SimilarityMeasures {
9 | def jaccardSim[T](s1: Set[T], s2: Set[T]): Double = {
10 | s1.intersect(s2).size.toDouble / s1.union(s2).size
11 | }
12 |
13 | def containmentJaccardSim[T](s1: Set[T], s2: Set[T]): Double = {
14 | s1.intersect(s2).size.toDouble / math.min(s1.size, s2.size)
15 | }
16 |
17 | def identical(left: String, right: String) =
18 | if (left == right) Some(1.0) else None
19 |
20 | def prePostfix(left: String, right: String, transform: Int => Double = x => x / (x + 0.5)) = {
21 | if (left.length > right.length && (left.startsWith(right) || left.endsWith(right))) {
22 | Some(transform(right.split(" ").length))
23 | } else {
24 | None
25 | }
26 | }
27 |
28 | def pickFromOptions[T](members: Option[T]*): Option[T] =
29 | members.toSeq.find(_.isDefined).getOrElse(None)
30 |
31 | def twoWayPrePostfix(left: String, right: String, transform: Int => Double = x => x / (x + 0.5)) =
32 | pickFromOptions(prePostfix(left, right, transform), prePostfix(right, left, transform))
33 |
34 | /** Smooth interpolation between containment Jaccard and plain Jaccard,
35 | * based on character n-grams.
36 | * Short strings must match exactly, but longer strings are considered a match
37 | * if one is a substring of the other.
38 | *
39 | * The final score is (J + F * JC) / (1 + F) in which
40 | * J is the plain Jaccard
41 | * JC is the containment Jaccard
42 | * F = s ** (m - l)
43 | * m is the minimum size of the two n-gram sets (roughly the length of the shorter string)
44 | * s, l are parameters
45 | *
46 | * @param left String to compare
47 | * @param right Other string to compare
48 | * @param ngramLength Longer values will give a larger penalty to single-character typos
49 | * @param s Determines how rapidly F rises with string length
50 | * @param l The string length (in characters) for which the two Jaccard scores have equal weights
51 | * @return
52 | */
53 | def characterNgramSimilarity(
54 | left: String,
55 | right: String,
56 | ngramLength: Int = 3,
57 | s: Double = 1.2,
58 | l: Int = 10
59 | ): Option[Double] = {
60 | if (left == right) {
61 | Some(1.0)
62 | } else {
63 | val ngramsLeft = left.sliding(ngramLength).toSet
64 | val ngramsRight = right.sliding(ngramLength).toSet
65 | val minSize = math.min(ngramsLeft.size, ngramsRight.size)
66 | val directSim = jaccardSim(ngramsLeft, ngramsRight)
67 | val containmentSim = containmentJaccardSim(ngramsLeft, ngramsRight)
68 | val containmentWeight = math.min(math.pow(s, minSize - l), 100000.0)
69 | Some((directSim + containmentWeight * containmentSim) / (1.0 + containmentWeight))
70 | }
71 | }
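// Qualitative behavior (illustrative): identical strings score 1.0; a long title compared against the
// same title with a short suffix appended still scores close to 1.0, because its containment Jaccard is
// 1.0 and the weight F = s^(m - l) grows with the size of the smaller n-gram set; short, differing
// strings are instead dominated by the stricter plain Jaccard.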
72 |
73 | def titleNgramSimilarity(
74 | left: TitleAuthors,
75 | right: TitleAuthors,
76 | s: Double = 1.2,
77 | l: Int = 10
78 | ): Option[Double] = {
79 | if (left == right) {
80 | Some(1.0)
81 | } else {
82 | val ngramsLeft = left.normalizedTitleNgrams
83 | val ngramsRight = right.normalizedTitleNgrams
84 | val minSize = math.min(ngramsLeft.size, ngramsRight.size)
85 | val directSim = jaccardSim(ngramsLeft, ngramsRight)
86 | val containmentSim = containmentJaccardSim(ngramsLeft, ngramsRight)
87 | val containmentWeight = math.min(math.pow(s, minSize - l), 100000.0)
88 | Some((directSim + containmentWeight * containmentSim) / (1.0 + containmentWeight))
89 | }
90 | }
91 | }
92 |
93 | case class AuthorNameMatch(first: String, last: String, full: String)
94 |
95 | case class TitleAuthors(title: String, names: Seq[AuthorNameMatch], year: Option[Int] = None) {
96 | def lastNames: Seq[String] = names.map(_.last)
97 |
98 | def fullNames: Seq[String] = names.map(_.full)
99 |
100 | // Note: There is a slight inversion of control here. This logic would be more properly contained within
101 | // BibEntryToPaperMatcher and TitleAuthorsMatchScheme, but is here for performance reasons.
102 | lazy val normalizedTitleNgrams: Set[String] = Normalizers.alphaNumericNormalize(title).sliding(3).toSet
103 | lazy val normalizedAuthors: Set[String] = names.map(x => Normalizers.alphaNumericNormalize(x.last)).toSet
104 | // Does not include empty strings.
105 | lazy val normalizedAuthorsAllNames: Set[String] = {
106 | val allNames = names.flatMap(name => Seq(name.first, name.last, name.full))
107 | val normalized = allNames.map(Normalizers.alphaNumericNormalize)
108 | normalized.filter(_.nonEmpty).toSet
109 | }
110 | }
111 |
112 | object TitleAuthors {
113 | def fromReference(ref: Reference) = TitleAuthors(
114 | ref.title.getOrElse(""),
115 | ref.authors.map { a =>
116 | val (first, last) = StringUtils.splitName(a)
117 | AuthorNameMatch(first, last, a)
118 | },
119 | ref.year
120 | )
121 | }
122 |
--------------------------------------------------------------------------------
/core/src/test/java/org/allenai/scienceparse/CRFBibRecordParserTest.java:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.util.List;
6 |
7 | import org.testng.annotations.Test;
8 |
9 | import com.gs.collections.api.tuple.Pair;
10 |
11 | import junit.framework.Assert;
12 | import lombok.val;
13 | import lombok.extern.slf4j.Slf4j;
14 |
15 | @Test
16 | @Slf4j
17 | public class CRFBibRecordParserTest {
18 |
19 | public void testReadData() throws IOException {
20 | File coraFile = new File(this.getClass().getResource("/coratest.txt").getFile());
21 | val labels = CRFBibRecordParser.labelFromCoraFile(coraFile);
22 | log.info(labels.toString());
23 | Assert.assertEquals(1, labels.size());
24 | boolean foundOne = false;
25 | boolean foundTwo = false;
26 | boolean foundThree = false;
27 | for(Pair<String, String> p : labels.get(0)) {
28 | if(p.getOne().equals("Formalising") && p.getTwo().equals("B_T"))
29 | foundOne = true;
30 | if(p.getOne().equals("formalism.") && p.getTwo().equals("E_T"))
31 | foundTwo = true;
32 | if(p.getOne().equals("1992.") && p.getTwo().equals("W_Y"))
33 | foundThree = true;
34 | }
35 | Assert.assertTrue(foundOne);
36 | Assert.assertTrue(foundTwo);
37 | Assert.assertTrue(foundThree);
38 |
39 | File umassFile = new File(this.getClass().getResource("/umasstest.txt").getFile());
40 | val labels2 = CRFBibRecordParser.labelFromUMassFile(umassFile);
41 | log.info(labels2.toString());
42 | Assert.assertEquals(1, labels2.size());
43 | foundOne = false;
44 | foundTwo = false;
45 | for(Pair<String, String> p : labels2.get(0)) {
46 | if(p.getOne().equals("E.") && p.getTwo().equals("B_A"))
47 | foundOne = true;
48 | if(p.getOne().equals("1979") && p.getTwo().equals("B_Y"))
49 | foundTwo = true;
50 | }
51 | Assert.assertTrue(foundOne);
52 | Assert.assertTrue(foundTwo);
53 |
54 | File kermitFile = new File(this.getClass().getResource("/kermittest.txt").getFile());
55 | val labels3 = CRFBibRecordParser.labelFromKermitFile(kermitFile);
56 | log.info(labels3.toString());
57 | Assert.assertEquals(2, labels3.size());
58 | foundOne = false;
59 | foundTwo = false;
60 | for(Pair<String, String> p : labels3.get(1)) {
61 | if(p.getOne().equals("Hinshaw,") && p.getTwo().equals("B_A"))
62 | foundOne = true;
63 | if(p.getOne().equals("Shock") && p.getTwo().equals("E_V"))
64 | foundTwo = true;
65 | }
66 | Assert.assertTrue(foundOne);
67 | Assert.assertTrue(foundTwo);
68 |
69 | }
70 |
71 | public void testCoraLabeling() throws Exception {
72 | String s = "<author> A. Cau </author> <title> Formalising Dijkstra's development strategy within Stark's formalism. </title> <booktitle> BCS-FACS Refinement Workshop, </booktitle> <date> 1992. </date>";
73 | int tokens = 2 + 21 - 8; //start/stop plus tokens in source minus eight tags.
74 | List<Pair<String, String>> labeledData = CRFBibRecordParser.getLabeledLineCora(s);
75 | Assert.assertEquals(tokens, labeledData.size());
76 | Assert.assertEquals("Cau", labeledData.get(2).getOne());
77 | Assert.assertEquals("E_A", labeledData.get(2).getTwo());
78 | Assert.assertEquals("Formalising", labeledData.get(3).getOne());
79 | Assert.assertEquals("B_T", labeledData.get(3).getTwo());
80 | Assert.assertEquals("development", labeledData.get(5).getOne());
81 | Assert.assertEquals("I_T", labeledData.get(5).getTwo());
82 | Assert.assertEquals("1992.", labeledData.get(13).getOne());
83 | Assert.assertEquals("W_Y", labeledData.get(13).getTwo());
84 | }
85 |
86 | }
87 |
--------------------------------------------------------------------------------
/core/src/test/java/org/allenai/scienceparse/CheckReferencesTest.java:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse;
2 |
3 | import junit.framework.Assert;
4 | import lombok.extern.slf4j.Slf4j;
5 | import org.allenai.datastore.Datastore;
6 | import org.testng.annotations.Test;
7 |
8 | import java.io.IOException;
9 | import java.util.Arrays;
10 |
11 | @Test
12 | @Slf4j
13 | public class CheckReferencesTest {
14 | public void smallTest() throws IOException {
15 | final String jsonFile =
16 | Datastore.apply().filePath("org.allenai.scienceparse", "gazetteer.json", 5).toString();
17 | CheckReferences cr = new CheckReferences(jsonFile);
18 | log.info("num hashes: " + cr.getHashSize());
19 | Assert.assertEquals(1557178, cr.getHashSize());
20 | Assert.assertTrue(cr.hasPaper(
21 | "Ecological Sampling of Gaze Shifts",
22 | Arrays.asList("Giuseppe Boccignone",
23 | "Mario Ferraro"), 2014, "KDD"));
24 | Assert.assertTrue(cr.hasPaper(
25 | "HIST: A Methodology for the Automatic Insertion of a Hierarchical Self Test",
26 | Arrays.asList("Oliver F. Haberl",
27 | "Thomas Kropf"), 1992, "KDD"));
28 | Assert.assertFalse(cr.hasPaper(
29 | "Fake paper titles: A case study in negative examples",
30 | Arrays.asList("Kevin Bache",
31 | "David Newman",
32 | "Padhraic Smyth"), 2013, "KDD"));
33 | Assert.assertFalse(cr.hasPaper(
34 | "Text-based measures of document diversity",
35 | Arrays.asList("Captain Bananas",
36 | "David Newman",
37 | "Padhraic Smyth"), 2013, "KDD"));
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/core/src/test/java/org/allenai/scienceparse/GazetteerFeaturesTest.java:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse;
2 |
3 | import java.util.Arrays;
4 | import java.util.List;
5 |
6 | import org.testng.annotations.Test;
7 |
8 | import junit.framework.Assert;
9 | import lombok.val;
10 | import lombok.extern.slf4j.Slf4j;
11 |
12 | @Test
13 | @Slf4j
14 | public class GazetteerFeaturesTest {
15 |
16 | public String filePathOfResource(String path) {
17 | return this.getClass().getResource(path).getFile();
18 | }
19 |
20 | public void testLength() {
21 | Assert.assertTrue(GazetteerFeatures.withinLength("this string is only six words."));
22 | Assert.assertFalse(GazetteerFeatures.withinLength("this string by contrast is eight words long."));
23 |
24 | }
25 |
26 | public void testGazetteers() throws Exception {
27 | GazetteerFeatures gf = new GazetteerFeatures(filePathOfResource("/gazetteer-test/"));
28 |
29 | int namesId = gf.gazetteerNumber("names.male.txt");
30 | int univId = gf.gazetteerNumber("education.university.small.txt");
31 | Assert.assertEquals(gf.size(), 2);
32 | Assert.assertEquals(3, gf.sizeOfSet(univId));
33 | Assert.assertEquals(5, gf.sizeOfSet(namesId));
34 | boolean [] abbeyInSet = gf.inSet("Abbey");
35 | Assert.assertEquals(2, abbeyInSet.length);
36 | Assert.assertFalse(abbeyInSet[univId]);
37 | Assert.assertTrue(abbeyInSet[namesId]);
38 | boolean [] beautyInSet = gf.inSet("marinello school of beauty");
39 | Assert.assertEquals(2, beautyInSet.length);
40 | Assert.assertTrue(beautyInSet[univId]);
41 | Assert.assertFalse(beautyInSet[namesId]);
42 | boolean [] wilkinsInSet = gf.inSet("d. wilkins school of windmill dunks");
43 | Assert.assertEquals(2, wilkinsInSet.length);
44 | Assert.assertFalse(wilkinsInSet[univId]);
45 | Assert.assertFalse(wilkinsInSet[namesId]);
46 | boolean [] apolloInSet = gf.inSet("Apollo College Phoenix Inc.");
47 | Assert.assertTrue(apolloInSet[univId]);
48 | }
49 |
50 | public void testGazetteerFeatures() throws Exception {
51 | List<String> elems = Arrays.asList("Abbey", "is", "at", "Apollo", "College", "Phoenix", "Inc.");
52 | ReferencesPredicateExtractor rpe = new ReferencesPredicateExtractor();
53 | GazetteerFeatures gf = new GazetteerFeatures(filePathOfResource("/gazetteer-test/"));
54 | val spns = gf.getSpans(elems);
55 | log.info(spns.toString());
56 | Assert.assertEquals(2, spns.size());
57 |
58 | rpe.setGf(gf);
59 | val preds = rpe.nodePredicates(elems);
60 | log.info(preds.toString());
61 | Assert.assertEquals(1.0, preds.get(0).get("%gaz_W_names.male.txt"));
62 | Assert.assertFalse(preds.get(2).containsKey("%gaz_B_education.university.small.txt"));
63 | Assert.assertEquals(1.0, preds.get(3).get("%gaz_B_education.university.small.txt"));
64 | Assert.assertEquals(1.0, preds.get(4).get("%gaz_I_education.university.small.txt"));
65 | Assert.assertEquals(1.0, preds.get(6).get("%gaz_E_education.university.small.txt"));
66 |
67 | }
68 | }
69 |
--------------------------------------------------------------------------------
/core/src/test/java/org/allenai/scienceparse/HeaderIntegrationTest.java:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse;
2 |
3 | import junit.framework.Assert;
4 | import lombok.extern.slf4j.Slf4j;
5 | import lombok.val;
6 | import org.apache.pdfbox.io.IOUtils;
7 | import org.testng.annotations.Test;
8 |
9 | import java.io.*;
10 | import java.net.URL;
11 | import java.nio.file.Files;
12 | import java.util.*;
13 | import java.util.stream.Collectors;
14 |
15 | @Slf4j
16 | public class HeaderIntegrationTest {
17 | private final static PaperSource paperSource = PaperSource.getDefault();
18 |
19 | static final int kSampledPapers = 100;
20 |
21 | public static class Result {
22 | int authorHits;
23 | int authorInvalid;
24 | boolean titleMatch;
25 | String title;
26 | int totalAuthors;
27 | boolean titleMissing;
28 | }
29 |
30 | public static HashSet<String> authorSet(Iterable<String> authors) {
31 | HashSet<String> result = new HashSet<>();
32 | for (String author : authors) {
33 | result.add(Parser.lastName(author));
34 | }
35 | return result;
36 | }
37 |
38 | public static Result testPaper(
39 | final Parser parser,
40 | final ParserGroundTruth pgt,
41 | final String paperId
42 | ) {
43 | ExtractedMetadata metadata;
44 |
45 | ParserGroundTruth.Paper paper = pgt.forKey(paperId.substring(4));
46 |
47 | try {
48 | metadata = parser.doParse(
49 | paperSource.getPdf(paperId),
50 | Parser.MAXHEADERWORDS);
51 | } catch (Exception e) {
52 | log.info("Failed to parse or extract from {}. Skipping.", paper.url);
53 | return null;
54 | }
55 |
56 | HashSet<String> golden = authorSet(Arrays.asList(paper.authors));
57 | HashSet<String> extracted = authorSet(metadata.authors);
58 |
59 | int hits = 0;
60 | int invalid = 0;
61 | for (String name : golden) {
62 | if (extracted.contains(name)) {
63 | hits += 1;
64 | }
65 | }
66 | for (String name : extracted) {
67 | if (!golden.contains(name)) {
68 | log.info("Bad author {}: {} ", name,
69 | String.join(",", golden.toArray(new String[]{}))
70 | );
71 | invalid += 1;
72 | }
73 | }
74 | Result res = new Result();
75 | res.totalAuthors = golden.size();
76 | res.authorHits = hits;
77 | res.authorInvalid = invalid;
78 | res.title = paper.title;
79 |
80 | if (metadata.title == null) {
81 | res.titleMatch = false;
82 | res.titleMissing = true;
83 | } else {
84 | res.titleMatch = Parser.processTitle(paper.title)
85 | .equals(Parser.processTitle(metadata.title));
86 | }
87 |
88 |
89 | if (res.authorInvalid > 0 || !res.titleMatch) {
90 | metadata.authors.sort((String a, String b) -> a.compareTo(b));
91 | Arrays.sort(paper.authors);
92 | log.info("Failed match for paper {}.", paperId);
93 | log.info("Titles: GOLD:\n{} OURS:\n{}", paper.title, metadata.title);
94 | for (int i = 0; i < Math.max(paper.authors.length, metadata.authors.size()); ++i) {
95 | String goldAuthor = null;
96 | String metaAuthor = null;
97 | if (i < paper.authors.length) { goldAuthor = paper.authors[i]; }
98 | if (i < metadata.authors.size()) { metaAuthor = metadata.authors.get(i); }
99 | log.info("Author: ({}) ({})", goldAuthor, metaAuthor);
100 | }
101 | }
102 |
103 | return res;
104 | }
105 |
106 | public void testAuthorAndTitleExtraction() throws Exception {
107 | ParserGroundTruth pgt = new ParserGroundTruth(
108 | Files.newInputStream(Parser.getDefaultGazetteer()));
109 |
110 | // TODO (build and train a classifier at test time).
111 | // Parser parser = trainParser(pgt);
112 | Parser parser = new Parser();
113 |
114 | ArrayList<ParserGroundTruth.Paper> sampledPapers = new ArrayList<>();
115 |
116 | for (int i = 0; i < pgt.papers.size(); i += pgt.papers.size() / kSampledPapers) {
117 | sampledPapers.add(pgt.papers.get(i));
118 | }
119 |
120 | long startTime = System.currentTimeMillis();
121 | ArrayList<Result> results = sampledPapers
122 | .stream()
123 | .parallel()
124 | .map(p -> testPaper(parser, pgt, p.id))
125 | .filter(f -> f != null)
126 | .collect(Collectors.toCollection(ArrayList::new));
127 |
128 | // Gahh I wish I had a dataframe library...
129 | int totalHits = 0, totalInvalid = 0, totalAuthors = 0, titleMatches = 0, titleMissing = 0;
130 | for (Result res : results) {
131 | totalHits += res.authorHits;
132 | totalInvalid += res.authorInvalid;
133 | totalAuthors += res.totalAuthors;
134 | if (res.titleMatch) {
135 | titleMatches += 1;
136 | }
137 | if (res.titleMissing) {
138 | titleMissing += 1;
139 | }
140 | }
141 |
142 | long finishTime = System.currentTimeMillis();
143 | double elapsed = (finishTime - startTime) / 1000.0;
144 | log.info("Testing complete. {} papers processed in {} seconds; {} papers/sec ",
145 | results.size(), elapsed, results.size() / elapsed);
146 |
147 | Assert.assertTrue(results.size() > 5);
148 |
149 | log.info("Authors: {} (Match: {} Invalid: {} Total {})",
150 | totalHits / (double)totalAuthors, totalHits, totalInvalid, totalAuthors);
151 | log.info("Titles: {} (Match: {} Missing: {} Total {})",
152 | titleMatches / (double)results.size(), titleMatches, titleMissing, results.size());
153 | }
154 | }
155 |
--------------------------------------------------------------------------------
/core/src/test/java/org/allenai/scienceparse/PDFPredicateExtractorTest.java:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse;
2 |
3 | import com.gs.collections.api.map.primitive.ObjectDoubleMap;
4 | import com.gs.collections.api.tuple.Pair;
5 | import lombok.extern.slf4j.Slf4j;
6 | import org.allenai.scienceparse.pdfapi.PDFDoc;
7 | import org.allenai.scienceparse.pdfapi.PDFExtractor;
8 | import org.testng.Assert;
9 | import org.testng.annotations.Test;
10 |
11 | import java.io.File;
12 | import java.io.FileInputStream;
13 | import java.io.IOException;
14 | import java.io.InputStream;
15 | import java.util.Arrays;
16 | import java.util.Iterator;
17 | import java.util.List;
18 |
19 | @Slf4j
20 | public class PDFPredicateExtractorTest {
21 |
22 | private void titleFontFeatureCheckForStream(InputStream pdfInputStream) throws IOException {
23 | String target = "How to make words with vectors: Phrase generation in distributional semantics";
24 | PDFDoc doc = new PDFExtractor().extractFromInputStream(pdfInputStream);
25 | List<PaperToken> pts = PDFToCRFInput.getSequence(doc);
26 | // Iterator<PaperToken> it = pts.iterator();
27 | // while(it.hasNext()) {
28 | // PaperToken pt = it.next();
29 | // log.info((pt.getPdfToken()==null)?"null":pt.getPdfToken().token + " f:" + pt.getPdfToken().fontMetrics.ptSize);
30 | // }
31 | Pair<Integer, Integer> pos = PDFToCRFInput.findString(PDFToCRFInput.asStringList(pts), target);
32 | PDFPredicateExtractor ppe = new PDFPredicateExtractor();
33 | List<ObjectDoubleMap<String>> preds = ppe.nodePredicates(pts);
34 | int[] idxes = new int[]{pos.getOne() - 1, pos.getOne(),
35 | pos.getTwo(), pos.getTwo() + 1, pos.getTwo() + 2};
36 | log.info("fonts for " + Arrays.toString(idxes));
37 | log.info(Arrays.toString(Arrays.stream(idxes).mapToDouble((int a) -> preds.get(a).get("%font")).toArray()));
38 | log.info("tokens for " + Arrays.toString(idxes));
39 | log.info(Arrays.toString(Arrays.stream(idxes).mapToObj((int a) -> pts.get(a).getPdfToken().token).toArray()));
40 |
41 |
42 | Assert.assertEquals(preds.get(pos.getOne()).get("%fcb"), 1.0);
43 | Assert.assertTrue(!preds.get(pos.getTwo() - 1).containsKey("%fcb"));
44 | log.info("Title font change features correct.");
45 | }
46 |
47 | @Test
48 | public void titleFontFeatureCheck() throws IOException {
49 | InputStream is = PDFPredicateExtractorTest.class.getResource("/P14-1059.pdf").openStream();
50 | titleFontFeatureCheckForStream(is);
51 | is.close();
52 | }
53 |
54 | public void titleFontForExplicitFilePath(String f) throws IOException {
55 | InputStream is = new FileInputStream(new File(f));
56 | titleFontFeatureCheckForStream(is);
57 | is.close();
58 | }
59 |
60 | @Test
61 | public void testCaseMasks() {
62 | String cap = "Exploring";
63 | List<String> ls = PDFPredicateExtractor.getCaseMasks(cap);
64 | Assert.assertEquals(ls.size(), 2);
65 | Assert.assertTrue(ls.contains("%Xxx"));
66 | Assert.assertTrue(ls.contains("%letters"));
67 |
68 | String nonSimple = "Dharmaratnå";
69 | ls = PDFPredicateExtractor.getCaseMasks(nonSimple);
70 | Assert.assertTrue(ls.contains("%hasNonAscii"));
71 | Assert.assertTrue(!ls.contains("%hasAt"));
72 |
73 | String email = "bob@joe.com";
74 | ls = PDFPredicateExtractor.getCaseMasks(email);
75 | Assert.assertTrue(ls.contains("%hasAt"));
76 | }
77 |
78 | public static void main(String [] args) throws Exception {
79 | (new PDFPredicateExtractorTest()).titleFontForExplicitFilePath("src\\test\\resources\\P14-1059.pdf");
80 | }
81 |
82 | }
83 |
--------------------------------------------------------------------------------
/core/src/test/java/org/allenai/scienceparse/PDFToCRFInputTest.java:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse;
2 |
3 | import com.gs.collections.api.tuple.Pair;
4 | import com.gs.collections.impl.tuple.Tuples;
5 | import lombok.extern.slf4j.Slf4j;
6 | import lombok.val;
7 | import org.allenai.scienceparse.pdfapi.PDFDoc;
8 | import org.allenai.scienceparse.pdfapi.PDFExtractor;
9 | import org.testng.Assert;
10 | import org.testng.annotations.Test;
11 | import scala.Option;
12 | import scala.Some;
13 |
14 | import java.io.IOException;
15 | import java.io.InputStream;
16 | import java.sql.Date;
17 | import java.util.Arrays;
18 | import java.util.List;
19 | import java.util.regex.Pattern;
20 | import java.util.stream.Collectors;
21 |
22 | @Test
23 | @Slf4j
24 | public class PDFToCRFInputTest {
25 | public String filePathOfResource(String path) {
26 | return this.getClass().getResource(path).getFile();
27 | }
28 |
29 | public void testGetPaperTokens() throws IOException {
30 | InputStream pdfInputStream = PDFToCRFInputTest.class.getResourceAsStream("/P14-1059.pdf");
31 | PDFDoc doc = new PDFExtractor().extractFromInputStream(pdfInputStream);
32 | List<PaperToken> pts = PDFToCRFInput.getSequence(doc);
33 | log.info("got " + pts.size() + " things.");
34 | assert (pts.size() > 50);
35 | }
36 |
37 | public void testFindString() throws IOException {
38 | String target = "How to make words with vectors: Phrase generation in distributional semantics";
39 | InputStream pdfInputStream = PDFToCRFInputTest.class.getResourceAsStream("/P14-1059.pdf");
40 | PDFDoc doc = new PDFExtractor().extractFromInputStream(pdfInputStream);
41 | List<PaperToken> pts = PDFToCRFInput.getSequence(doc);
42 | Pair<Integer, Integer> pos = PDFToCRFInput.findString(PDFToCRFInput.asStringList(pts), target);
43 | Pair<Integer, Integer> posNot = PDFToCRFInput.findString(PDFToCRFInput.asStringList(pts), "this string won't be found");
44 |
45 | Assert.assertTrue(pos != null);
46 | Assert.assertTrue(pos.getOne() > 0 && (pos.getTwo() - pos.getOne() == 11));
47 | log.info("found title at " + pos.getOne() + ", " + pos.getTwo());
48 | log.info("title is " + PDFToCRFInput.stringAt(pts, pos));
49 | Assert.assertTrue(posNot == null);
50 | }
51 |
52 | public void testLabelMetadata() throws IOException {
53 | InputStream pdfInputStream = PDFToCRFInputTest.class.getResourceAsStream("/P14-1059.pdf");
54 | PDFDoc doc = new PDFExtractor().extractFromInputStream(pdfInputStream);
55 | List<PaperToken> pts = PDFToCRFInput.getSequence(doc);
56 | final ExtractedMetadata em = new ExtractedMetadata(
57 | "How to make words with vectors: Phrase generation in distributional semantics",
58 | Arrays.asList("Georgiana Dinu", "Marco Baroni"),
59 | new Date(1388556000000L));
60 | val labeledData = LabeledData$.MODULE$.fromExtractedMetadata("dummyid", em);
61 | val result = PDFToCRFInput.labelMetadata("P14-1059", pts, labeledData);
62 | log.info(PDFToCRFInput.getLabelString(result));
63 | log.info(pts.stream().map((PaperToken p) -> p.getPdfToken().token).collect(Collectors.toList()).toString());
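// Label scheme exercised below: B_/I_/E_ mark the beginning, inside and end of a field, O marks
// tokens outside any field, the suffix names the field type (T = title, A = author), and the padded
// start/stop tokens carry an empty label.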
64 | Assert.assertEquals(result.get(26).getTwo(), "O");
65 | Assert.assertEquals(result.get(27).getTwo(), "B_T");
66 | Assert.assertEquals(result.get(34).getTwo(), "I_T");
67 | Assert.assertEquals(result.get(37).getTwo(), "E_T");
68 | Assert.assertEquals(result.get(38).getTwo(), "B_A");
69 | Assert.assertEquals(result.get(47).getTwo(), "O");
70 | Assert.assertEquals(result.get(47).getOne(), pts.get(46)); //off by one due to start/stop
71 | Assert.assertEquals(result.get(0).getTwo(), "");
72 | Assert.assertEquals(result.get(result.size() - 1).getTwo(), "");
73 | }
74 |
75 | public void testGetSpans() {
76 | List<String> ls = Arrays.asList("O", "O", "B_A", "I_A", "E_A");
77 | val spans = ExtractedMetadata.getSpans(ls);
78 | Assert.assertEquals(spans.size(), 1);
79 | Assert.assertEquals(spans.get(0).tag, "A");
80 | Assert.assertEquals(spans.get(0).loc, Tuples.pair(2, 5));
81 | }
82 |
83 | public void testAuthorPatterns() {
84 | List<Pair<Pattern, Boolean>> authOpt = PDFToCRFInput.authorToPatternOptPair("Marco C. Baroni");
85 | Assert.assertTrue(authOpt.get(0).getOne().matcher("Marco").matches());
86 | Assert.assertTrue(authOpt.get(1).getOne().matcher("C").matches());
87 | Assert.assertTrue(authOpt.get(2).getOne().matcher("Baroni").matches());
88 | Pair<Integer, Integer> span = PDFToCRFInput.findPatternSequence(Arrays.asList("Marco", "C", "Baroni"), authOpt);
89 | Assert.assertEquals(span, Tuples.pair(0, 3));
90 | span = PDFToCRFInput.findPatternSequence(Arrays.asList("Marco", "Baroni"), authOpt);
91 | Assert.assertEquals(span, Tuples.pair(0, 2));
92 | authOpt = PDFToCRFInput.authorToPatternOptPair("Marco Baroni");
93 | span = PDFToCRFInput.findPatternSequence(Arrays.asList("M.", "G.", "Baroni"), authOpt);
94 | Assert.assertEquals(span, Tuples.pair(0, 3));
95 | span = PDFToCRFInput.findPatternSequence(Arrays.asList("M.", "G.", "B."), authOpt);
96 | Assert.assertEquals(span, null);
97 | }
98 |
99 | public void testAuthor() throws IOException {
100 | InputStream pdfInputStream = PDFToCRFInputTest.class.getResourceAsStream("/P14-1059.pdf");
101 | PDFDoc doc = new PDFExtractor().extractFromInputStream(pdfInputStream);
102 | List<PaperToken> pts = PDFToCRFInput.getSequence(doc);
103 | final ExtractedMetadata em = new ExtractedMetadata(
104 | "How to make words with vectors: Phrase generation in distributional semantics",
105 | Arrays.asList("Georgiana Dinu", "Marco C. Baroni"),
106 | new Date(1388556000000L));
107 | val labeledData = LabeledData$.MODULE$.fromExtractedMetadata("dummyid", em);
108 |
109 | val result = PDFToCRFInput.labelMetadata("P14-1059", pts, labeledData);
110 | Assert.assertEquals(result.get(38).getTwo(), "B_A");
111 | Assert.assertEquals(result.get(39).getTwo(), "E_A");
112 | Assert.assertEquals(result.get(41).getTwo(), "B_A");
113 | Assert.assertEquals(result.get(42).getTwo(), "E_A");
114 | }
115 | }
116 |
--------------------------------------------------------------------------------
/core/src/test/java/org/allenai/scienceparse/ParserLMFeaturesTest.java:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse;
2 |
3 | import com.gs.collections.impl.set.mutable.UnifiedSet;
4 | import junit.framework.Assert;
5 | import lombok.extern.slf4j.Slf4j;
6 | import org.testng.annotations.Test;
7 |
8 | import java.io.File;
9 |
10 | @Test
11 | @Slf4j
12 | public class ParserLMFeaturesTest {
13 |
14 | public String filePathOfResource(String path) {
15 | return this.getClass().getResource(path).getFile();
16 | }
17 |
18 | public void testParserLMFeatures() throws Exception {
19 | File f = new File(filePathOfResource("/groundTruth.json"));
20 | ParserGroundTruth pgt = new ParserGroundTruth(f.getPath());
21 | log.info("pgt 0: " + pgt.papers.get(0));
22 | ParserLMFeatures plf = new ParserLMFeatures(pgt.papers, new UnifiedSet(), f.getParentFile(), 3);
23 | log.info("of count in background: " + plf.backgroundBow.get("of"));
24 | Assert.assertEquals(1.0, plf.authorBow.get("Seebode"));
25 | Assert.assertEquals(1.0, plf.titleBow.get("Disk-based"));
26 | Assert.assertTrue(plf.backgroundBow.get("of") > 2.0);
27 | }
28 |
29 | }
30 |
--------------------------------------------------------------------------------
/core/src/test/java/org/allenai/scienceparse/ParserTest.java:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse;
2 |
3 | import com.fasterxml.jackson.databind.ObjectMapper;
4 | import com.gs.collections.api.tuple.Pair;
5 | import com.gs.collections.impl.tuple.Tuples;
6 | import lombok.extern.slf4j.Slf4j;
7 | import lombok.val;
8 | import org.testng.Assert;
9 | import org.testng.annotations.Test;
10 | import scala.Function0;
11 | import scala.Option;
12 | import scala.collection.JavaConverters;
13 | import scala.runtime.AbstractFunction0;
14 |
15 | import java.io.File;
16 | import java.io.FileInputStream;
17 | import java.io.InputStream;
18 | import java.nio.file.Paths;
19 | import java.util.Arrays;
20 | import java.util.Iterator;
21 | import java.util.List;
22 | import java.util.function.Function;
23 | import java.util.stream.Collectors;
24 |
25 | @Test
26 | @Slf4j
27 | public class ParserTest {
28 |
29 | private final static List<String> pdfKeys = Arrays.asList("/bagnell11", "/seung08", "/ding11", "/mooney05",
30 | "/roark13", "/dyer12", "/bohnet09", "/P14-1059", "/map-reduce", "/fader11", "/proto06",
31 | "/agarwal11", "/smola10", "/senellart10", "/zolotov04", "/pedersen04", "/smith07",
32 | "/aimag10");
33 |
34 | public static String filePathOfResource(String path) {
35 | return ParserTest.class.getResource(path).getFile();
36 | }
37 |
38 | public static String resourceDirectory(String path) {
39 | return (new File(ParserTest.class.getResource(path).getFile())).getParent();
40 | }
41 |
42 | public static InputStream inputStreamOfResource(String path) throws Exception {
43 | return new FileInputStream(new File(filePathOfResource(path)));
44 | }
45 |
46 | private List<File> resolveKeys(List<String> keys) {
47 | return keys.stream().map((String s) -> new File(filePathOfResource(s + ".pdf"))).collect(Collectors.toList());
48 | }
49 |
50 | private Pair<Double, Double> testModel(String id, Parser p) throws Exception {
51 | String jsonPath = id + ".extraction.json";
52 | String pdfPath = id + ".pdf";
53 | InputStream jsonInputStream = getClass().getResourceAsStream(jsonPath);
54 | InputStream pdfInputStream = getClass().getResourceAsStream(pdfPath);
55 | List<List<?>> arr = new ObjectMapper().readValue(jsonInputStream, List.class);
56 | jsonInputStream.close();
57 | ExtractedMetadata em = p.doParse(pdfInputStream, Parser.MAXHEADERWORDS);
58 | pdfInputStream.close();
59 |
60 | double titleTP = 0.0;
61 | double titleFP = 0.0;
62 | double authorTP = 0.0;
63 | double authorFN = 0.0;
64 | for (List<?> elems : arr) {
65 | String type = (String) elems.get(0);
66 | Object expectedValue = elems.get(1);
67 | if (type.equalsIgnoreCase("title")) {
68 | String guessValue = em.title;
69 | if (guessValue != null && guessValue.equals(expectedValue))
70 | titleTP++;
71 | else
72 | titleFP++;
73 | //Assert.assertEquals(guessValue, expectedValue, String.format("Title error on %s", id));
74 | }
75 | if (type.equalsIgnoreCase("author")) {
76 | if (em.authors.contains(expectedValue))
77 | authorTP++;
78 | else
79 | authorFN++;
80 | //Assert.assertTrue(em.authors.contains(expectedValue),
81 | //"could not find author " + expectedValue + " in extracted authors " + em.authors.toString());
82 | }
83 | // if (type.equalsIgnoreCase("year")) {
84 | // Assert.assertEquals(em.year, expectedValue, String.format("Year error on %s", id));
85 | // }
86 | }
87 | return Tuples.pair((titleTP / (titleTP + titleFP + 0.000001)), authorTP / (authorTP + authorFN + 0.000001));
88 | }
89 |
90 | public void testParserWithGroundTruth() throws Exception {
91 | final File testModelFile = File.createTempFile("science-parse-test-model.", ".dat");
92 | testModelFile.deleteOnExit();
93 |
94 | /*
95 | * We'll use this to override the default paper source which pulls from S2. The problem with
96 | * pulling from S2 is that the set of publicly available PDFs changes over time making this
97 | * test rather flappy.
98 | */
99 | PaperSource previousSource = PaperSource.defaultPaperSource;
100 | PaperSource.defaultPaperSource = new DirectoryPaperSource(
101 | new File(resourceDirectory("/groundTruth.json")));
102 |
103 | try {
104 | Parser.ParseOpts opts = new Parser.ParseOpts();
105 | opts.iterations = 10;
106 | opts.threads = 4;
107 | opts.modelFile = testModelFile.getPath();
108 | opts.headerMax = Parser.MAXHEADERWORDS;
109 | opts.backgroundSamples = 3;
110 | opts.gazetteerFile = null;
111 | opts.trainFraction = 0.9;
112 | opts.backgroundDirectory = resourceDirectory("/groundTruth.json");
113 | opts.minYear = -1;
114 | opts.checkAuthors = false;
115 |
116 | File f = new File(opts.modelFile);
117 | f.deleteOnExit();
118 |
119 | final Iterator<LabeledPaper> labeledTrainingData =
120 | JavaConverters.asJavaIteratorConverter(
121 | LabeledPapersFromDBLP.getFromGroundTruth(
122 | Paths.get(filePathOfResource("/groundTruth.json")))).asJava();
123 |
124 | Parser.trainParser(labeledTrainingData, opts);
125 | final Parser p = new Parser(
126 | testModelFile,
127 | Parser.getDefaultGazetteer().toFile(),
128 | Parser.getDefaultBibModel().toFile());
129 | double avgTitlePrec = 0.0;
130 | double avgAuthorRec = 0.0;
131 | double cases = 0.0;
132 | for (String s : pdfKeys) {
133 | val res = testModel(s, p);
134 | cases++;
135 | avgTitlePrec += res.getOne();
136 | avgAuthorRec += res.getTwo();
137 | }
138 | avgTitlePrec /= cases;
139 | avgAuthorRec /= cases;
140 | log.info("Title precision = recall = " + avgTitlePrec);
141 | log.info("Author recall = " + avgAuthorRec);
142 |
143 | testModelFile.delete();
144 | } finally {
145 | PaperSource.defaultPaperSource = previousSource;
146 | }
147 | }
148 |
149 | public void testParserGroundTruth() throws Exception {
150 | ParserGroundTruth pgt = new ParserGroundTruth(filePathOfResource("/groundTruth.json"));
151 | Assert.assertEquals(pgt.papers.size(), 4);
152 | }
153 |
154 | public void testParserRobustness() throws Exception {
155 | // ParserGroundTruth pgt = new ParserGroundTruth(filePathOfResource("/papers-parseBugs.json"));
156 | // Assert.assertEquals(false, true);
157 | }
158 | }
159 |
--------------------------------------------------------------------------------
/core/src/test/resources/2a774230b5328df3f8125da9b84a82d92b46a240.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/2a774230b5328df3f8125da9b84a82d92b46a240.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/403b61d52192d6cf23c92a3da68ba08f03a954e4.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/403b61d52192d6cf23c92a3da68ba08f03a954e4.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/6c46de8a4399840548a056d13d38e1f54da2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/6c46de8a4399840548a056d13d38e1f54da2.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/P07-1088-labels.txt:
--------------------------------------------------------------------------------
1 | TITLE Sparse Information Extraction: Unsupervised Language Models to the Rescue
2 | AUTHOR Doug Downey
3 | AUTHOR StefanSchoenmackers
4 | AUTHOR Oren Etzioni
5 |
--------------------------------------------------------------------------------
/core/src/test/resources/P07-1088.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/P07-1088.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/P14-1059-labels.txt:
--------------------------------------------------------------------------------
1 | TITLE How to make words with vectors: Phrase generation in distributional semantics
2 | AUTHOR Georgiana Dinu
3 | AUTHOR Marco Baroni
4 | ABSTRACT We introduce the problem of generation in distributional semantics: Given a distributional vector representing some meaning, how can we generate the phrase that best expresses that meaning? We motivate this novel challenge on theoretical and practical grounds and propose a simple data-driven approach to the estimation of generation functions. We test this in a monolingual scenario (paraphrase generation) as well as in a cross-lingual setting(translation by synthesizing adjective-noun phrase vectors in English and generating the equivalent expressions in Italian).
5 |
--------------------------------------------------------------------------------
/core/src/test/resources/P14-1059.extraction.json:
--------------------------------------------------------------------------------
1 | [
2 | ["title", "How to make words with vectors: Phrase generation in distributional semantics"],
3 | ["line", "Abstract"],
4 | ["line", "We introduce the problem of generation"],
5 | ["line", "space functioning as interlingua."],
6 | ["year", 2014]
7 | ]
--------------------------------------------------------------------------------
/core/src/test/resources/P14-1059.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/P14-1059.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/a7c25298c607d5bf32e3301b6b209431e2a7f830.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/a7c25298c607d5bf32e3301b6b209431e2a7f830.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/agarwal11.extraction.json:
--------------------------------------------------------------------------------
1 | [
2 | ["title", "Noisy Matrix Decomposition via Convex Relaxation: Optimal Rates in High Dimensions"],
3 | ["line", "Abstract"],
4 | ["line", "In this paper, we consider a family of regularizers"]
5 | ]
--------------------------------------------------------------------------------
/core/src/test/resources/agarwal11.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/agarwal11.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/aimag10.extraction.json:
--------------------------------------------------------------------------------
1 | [
2 | ["line", "Adapting Open"],
3 | ["year", 2010]
4 | ]
--------------------------------------------------------------------------------
/core/src/test/resources/aimag10.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/aimag10.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/bagnell11.extraction.json:
--------------------------------------------------------------------------------
1 | [
2 | ["title", "Computational Rationalization: The Inverse Equilibrium Problem"],
3 | ["line", "Abstract"],
4 | ["line", "techniques that both explains demonstrated behavior"],
5 | ["line", "3.1. Rationality Assumptions"]
6 | ]
--------------------------------------------------------------------------------
/core/src/test/resources/bagnell11.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/bagnell11.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/bohnet09.extraction.json:
--------------------------------------------------------------------------------
1 | [
2 | ["title", "Efficient Parsing of Syntactic and Semantic Dependency Structures"],
3 | ["line", "Abstract"],
4 | ["line", "proach can compute a projective dependency tree"],
5 | ["year", 2009]
6 | ]
--------------------------------------------------------------------------------
/core/src/test/resources/bohnet09.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/bohnet09.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/bunescu-acl07.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/bunescu-acl07.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/bunescu-acl07.txt:
--------------------------------------------------------------------------------
1 | TITLE = Learning to Extract Relations from the Web using Minimal Supervision
2 | AUTHOR = Razvan C. Bunescu
3 | AUTHOR = Raymond J. Mooney
4 |
--------------------------------------------------------------------------------
/core/src/test/resources/c0690a1d74ab781bd54f9fa7e67267cce656.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/c0690a1d74ab781bd54f9fa7e67267cce656.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/c921a74c209e720534939dfa191d639e647dd242.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/c921a74c209e720534939dfa191d639e647dd242.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/coordinate_calibrator.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/coordinate_calibrator.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/coratest.txt:
--------------------------------------------------------------------------------
1 | A. Cau, R. Kuiper, and W.-P. de Roever. Formalising Dijkstra's development strategy within Stark's formalism. In C. B. Jones, R. C. Shaw, and T. Denvir, editors, Proc. 5th. BCS-FACS Refinement Workshop, 1992.
2 |
--------------------------------------------------------------------------------
/core/src/test/resources/ding11.extraction.json:
--------------------------------------------------------------------------------
1 | [
2 | ["title","Maximum Margin Multi-Instance Learning"],
3 | ["line", "Abstract"],
4 | ["line", "Multi-instance data are different from traditional single-instance data, which bring new opportunities"],
5 | ["year", 2011]
6 | ]
--------------------------------------------------------------------------------
/core/src/test/resources/ding11.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/ding11.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/dyer12.extraction.json:
--------------------------------------------------------------------------------
1 | [
2 | ["title", "Joint Feature Selection in Distributed Stochastic Learning for Large-Scale Discriminative Training in SMT"],
3 | ["line", "Abstract"],
4 | ["line", "Since inference for SMT (unlike many other learn-"],
5 | ["line", "data. Feature groups are 12 dense features (default), rule identifiers (id), rule n-gram (ng), and rule shape (shape)."],
6 | ["year", 2012]
7 | ]
--------------------------------------------------------------------------------
/core/src/test/resources/dyer12.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/dyer12.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/e4faf2c1d76b9bf8f8b4524dfb8c5c6b93be5f35.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/e4faf2c1d76b9bf8f8b4524dfb8c5c6b93be5f35.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/fader11.extraction.json:
--------------------------------------------------------------------------------
1 | [
2 | ["title", "Identifying Relations for Open Information Extraction"],
3 | ["line", "Abstract"],
4 | ["line", "Incoherent extractions are cases where the ex-"],
5 | ["year", 2011]
6 | ]
--------------------------------------------------------------------------------
/core/src/test/resources/fader11.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/fader11.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/gazetteer-test/education.university.small.txt:
--------------------------------------------------------------------------------
1 | universidad pontificia bolivariana
2 | apollo college-phoenix inc
3 | marinello school of beauty
4 |
--------------------------------------------------------------------------------
/core/src/test/resources/gazetteer-test/names.male.txt:
--------------------------------------------------------------------------------
1 | # You may use the lists of names for any purpose, so long as credit is given
2 | # in any published work. You may also redistribute the list if you
3 | # provide the recipients with a copy of this mail message. The lists are
4 | # not in the public domain (I retain the copyright on the lists) but are
5 | # freely redistributable.
6 |
7 | # If you have any additions to the lists of names, I would appreciate
8 | # receiving them.
9 |
10 | # My email address is mkant+@cs.cmu.edu.
11 |
12 | # --mark
13 |
14 | # ****************************************************************************
15 |
16 | # List of common male names.
17 | # Copyright (c) January 1991 by Mark Kantrowitz.
18 | # 2924 names
19 | # Thanks to Bill Ross for about 1000 additional names.
20 |
21 | Aaron
22 | Abbey
23 | Abbie
24 | Abbot
25 | Abbott
--------------------------------------------------------------------------------
/core/src/test/resources/groundTruth.json:
--------------------------------------------------------------------------------
1 | { "id": "c921a74c209e720534939dfa191d639e647dd242", "url": "http://s3-us-west-2.amazonaws.com/ai2-s2-pdfs/c921/a74c209e720534939dfa191d639e647dd242.pdf", "title": "Did you notice?: neuronal processing of multimodal mobile phone feedback", "authors": [ "Antons, Jan-Niklas", "Arndt, Sebastian", "Seebode, Julia", "Schleicher, Robert", "M�ller, Sebastian" ], "year": 2013, "venue": "" }
2 | { "id": "2a774230b5328df3f8125da9b84a82d92b46a240", "url": "http://s3-us-west-2.amazonaws.com/ai2-s2-pdfs/2a77/4230b5328df3f8125da9b84a82d92b46a240.pdf", "title": "Disk-based storage for scalable video", "authors": [ "Chang, Edward Y.", "Zakhor, Avideh" ], "year": 1997, "venue": "" }
3 | { "id": "403b61d52192d6cf23c92a3da68ba08f03a954e4", "url": "http://s3-us-west-2.amazonaws.com/ai2-s2-pdfs/403b/61d52192d6cf23c92a3da68ba08f03a954e4.pdf", "title": "Smoothness-Increasing Accuracy-Conserving (SIAC) Postprocessing for Discontinuous Galerkin Solutions over Structured Triangular Meshes", "authors": [ "Mirzaee, Hanieh", "Ji, Liangyue", "Ryan, Jennifer K.", "Kirby, Robert M." ], "year": 2011, "venue": "" }
4 | { "id": "a7c25298c607d5bf32e3301b6b209431e2a7f830", "url": "http://s3-us-west-2.amazonaws.com/ai2-s2-pdfs/a7c2/5298c607d5bf32e3301b6b209431e2a7f830.pdf", "title": "Mining Generalized Association Rules on Biomedical Literature", "authors": [ "Berardi, Margherita", "Lapi, Michele", "Leo, Pietro", "Loglisci, Corrado" ], "year": 2005, "venue": "" }
5 |
--------------------------------------------------------------------------------
/core/src/test/resources/kermittest.txt:
--------------------------------------------------------------------------------
1 | Tracey, et al. , Nature 330 , 662-664 ( 1987 )
2 | Hinshaw, et al. , Circ. Shock 30 , 279-292 ( 1990 )
3 |
--------------------------------------------------------------------------------
/core/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | ${logback_stdoutLevel:-DEBUG}
6 |
7 |
8 | %-5level %logger{36}: %msg%n
9 |
10 |
11 |
12 |
13 | false
14 | EvalErrors.log
15 |
16 | %msg%n
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
--------------------------------------------------------------------------------
/core/src/test/resources/map-reduce.extraction.json:
--------------------------------------------------------------------------------
1 | [
2 | ["line", "As a reaction to this complexity, we designed a new"],
3 | ["line", "most important words that occur in a document or a set"],
4 | ["line", "ments. The user would write code similar to the follow-"]
5 | ]
--------------------------------------------------------------------------------
/core/src/test/resources/map-reduce.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/map-reduce.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/model-bib-crf-test.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/model-bib-crf-test.dat
--------------------------------------------------------------------------------
/core/src/test/resources/mono04.extraction.json:
--------------------------------------------------------------------------------
1 | [
2 | ["title", "Monolingual Machine Translation for Paraphrase Generation"],
3 | ["line", "Abstract"],
4 | ["line", "AER of 20.88% may appear problematic in a sys-"],
5 | ["line", "tion, we were hesitant to incur the exponential increase"],
6 | ["year", 2004]
7 | ]
--------------------------------------------------------------------------------
/core/src/test/resources/mono04.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/mono04.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/mooney05.extraction.json:
--------------------------------------------------------------------------------
1 | [
2 | ["title", "A Statistical Semantic Parser that Integrates Syntax and Semantics"],
3 | ["line", "Abstract"],
4 | ["year", 2005]
5 | ]
--------------------------------------------------------------------------------
/core/src/test/resources/mooney05.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/mooney05.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/papers-parsebugs.json:
--------------------------------------------------------------------------------
1 | [{
2 | "id": "089f6c46de8a4399840548a056d13d38e1f54da2",
3 | "url": "http://s3-us-west-2.amazonaws.com/ai2-s2-pdfs/089f/6c46de8a4399840548a056d13d38e1f54da2.pdf",
4 | "title": "Scheduling problems in transportation networks of line topology",
5 | "authors": [
6 | "Kowalski, Dariusz R.",
7 | "Nussbaum, Eyal",
8 | "Segal, Michael",
9 | "Milyeykovski, Vitaly"
10 | ],
11 | "year": 2014,
12 | "venue": ""
13 | }]
--------------------------------------------------------------------------------
/core/src/test/resources/pedersen04.extraction.json:
--------------------------------------------------------------------------------
1 | [
2 | ["title", "Combating Web Spam with TrustRank"],
3 | ["line", "Abstract"],
4 | ["year", 2004]
5 | ]
--------------------------------------------------------------------------------
/core/src/test/resources/pedersen04.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/pedersen04.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/proto06.extraction.json:
--------------------------------------------------------------------------------
1 | [
2 | ["title", "Prototype-Driven Learning for Sequence Models"],
3 | ["line", "Abstract"],
4 | ["line", "For our part-of-speech tagging experiments, we used"],
5 | ["year", 2006]
6 | ]
--------------------------------------------------------------------------------
/core/src/test/resources/proto06.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/proto06.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/roark13.extraction.json:
--------------------------------------------------------------------------------
1 | [
2 | ["title", "Discriminative Joint Modeling of Lexical Variation and Acoustic Confusion for Automated Narrative Retelling Assessment"],
3 | ["line", "Abstract"],
4 | ["line", "5K25AG033723-02 and P30 AG024978-05 and"],
5 | ["line", "Table 5: Story element F-score achieved by log-linear models (MaxEnt and CRF) when adding context dependent features (CD)"],
6 | ["year", 2013]
7 | ]
--------------------------------------------------------------------------------
/core/src/test/resources/roark13.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/roark13.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/senellart10.extraction.json:
--------------------------------------------------------------------------------
1 | [
2 | ["title", "Probabilistic XML via Markov Chains"],
3 | ["line", "We want to reinterpret probabilistic models on words to"],
4 | ["line", "bility of going into the original component distributed among"],
5 | ["line", "Figure 3: Translations between probabilistic XML representation systems."],
6 | ["line", "ABSTRACT"],
7 | ["year", 2010]
8 | ]
--------------------------------------------------------------------------------
/core/src/test/resources/senellart10.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/senellart10.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/seung08.extraction.json:
--------------------------------------------------------------------------------
1 | [
2 | ["title", "Natural Image Denoising with Convolutional Networks"],
3 | ["line", "Abstract"],
4 | ["line", "One approach to image denoising is to transform an image from pixel intensities into another rep-"],
5 | ["year", 2008]
6 | ]
--------------------------------------------------------------------------------
/core/src/test/resources/seung08.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/seung08.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/smith07.extraction.json:
--------------------------------------------------------------------------------
1 | [
2 | ["title", "Smooth Sensitivity and Sampling in Private Data Analysis"],
3 | ["line", "ABSTRACT"],
4 | ["year", 2007]
5 | ]
--------------------------------------------------------------------------------
/core/src/test/resources/smith07.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/smith07.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/smola10.extraction.json:
--------------------------------------------------------------------------------
1 | [
2 | ["title", "An Architecture for Parallel Topic Models"],
3 | ["line", "ABSTRACT"],
4 | ["line", "from PubMed abstracts, which is equivalent to a processing"],
5 | ["line", "Instead, we use it here to design a sampler for inference of"],
6 | ["line", "ods. Likewise, the same architecture could be used to per-"],
7 | ["year", 2010]
8 | ]
--------------------------------------------------------------------------------
/core/src/test/resources/smola10.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/smola10.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/superscripttest.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/superscripttest.pdf
--------------------------------------------------------------------------------
/core/src/test/resources/testng.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/core/src/test/resources/umasstest.txt:
--------------------------------------------------------------------------------
1 | [30] E. W. Montroll , B. J. West , Fluctuation Phenomena , Elsevier Science Publishers B. V. , Amsterdam , 1979 , Ch . On an enriched collection of stochastic processes , pp . 61--205 .
2 |
--------------------------------------------------------------------------------
/core/src/test/resources/zolotov04.extraction.json:
--------------------------------------------------------------------------------
1 | [
2 | ["line", "Indexing XML Data Stored in a Relational Database"],
3 | ["line", "As XML usage grows for both data-centric and"],
4 | ["line", "In the ORDPATH values shown in Figure 2 (such as"],
5 | ["line", "Abstract"],
6 | ["year", 2004]
7 | ]
--------------------------------------------------------------------------------
/core/src/test/resources/zolotov04.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/science-parse/f83983aa1b898f0e47c97f4997c57427bf277182/core/src/test/resources/zolotov04.pdf
--------------------------------------------------------------------------------
/core/src/test/scala/org/allenai/scienceparse/CoraExtractionSpec.scala:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse
2 |
3 | import org.allenai.common.Resource
4 | import org.allenai.common.testkit.UnitSpec
5 |
6 | import scala.collection.JavaConverters._
7 | import scala.collection.mutable.ArrayBuffer
8 | import scala.io.Source
9 |
10 | import org.scalatest._
11 | import Matchers._
12 |
13 | class CoraExtractionSpec extends UnitSpec {
14 |
15 | case class Reference(
16 | source: String,
17 | authors: Seq[String],
18 | title: String,
19 | date: String
20 | )
21 |
22 | case class TestResult(
23 | reference: Reference,
24 | extracted: Seq[BibRecord],
25 | precision: Float,
26 | recall: Float,
27 | msg: Seq[String] = Seq()
28 | )
29 |
30 | case class TestResults(
31 | precision: Float,
32 | recall: Float,
33 | results: Seq[TestResult]
34 | )
35 |
36 | val refs = new ArrayBuffer[Reference]()
37 | val extractor = new ExtractReferences(Parser.getDefaultGazetteer.toString)
38 |
39 | Resource.using(
40 | Source.fromInputStream(getClass.getResourceAsStream("/tagged_references.txt"))
41 | ) {
42 | source =>
43 | for (
44 | ref <- source.getLines
45 | ) {
46 | val authorMatch = "<author>(.*)</author>".r.findFirstMatchIn(ref)
47 | val authors = authorMatch
48 | .toSeq
49 | .flatMap(_.group(1).split(",|and|&"))
50 | .map(_.trim)
51 |
52 | val title = "<title>(.*)</title>".r.findFirstMatchIn(ref).map(_.group(1).trim)
53 | val date = "<date>(.*)</date>".r.findFirstMatchIn(ref).map(_.group(1).trim)
54 | val raw = ref.replaceAll("<[^>]+>", "").replaceAll("[^>]+>", "").trim
55 | refs.append(Reference(raw, authors, title.getOrElse(""), date.getOrElse("")))
56 | }
57 | }
58 |
59 | // Successful as long as we got exactly one record.
60 | def segmentationTest(ref: Reference, extracted: Seq[BibRecord]): TestResult = {
61 | TestResult(ref, extracted, 1, 1)
62 | }
63 |
64 | def runTest(name: String, test: (Reference, Seq[BibRecord]) => TestResult): TestResults = {
65 | def testRecord(ref: Reference): TestResult = {
66 | val text = Seq("Bibliography", ref.source).asJava
67 | val records = extractor.findReferences(text).getOne.asScala
68 | if (records.size == 0) {
69 | println(s"Missed extraction: ${ref.source}")
70 | TestResult(ref, records, 0, 0, Seq("Missing"))
71 | } else if (records.size > 1) {
72 | TestResult(ref, records, 0, 0, Seq("Too many extractions"))
73 | } else {
74 | test(ref, records)
75 | }
76 | }
77 |
78 | val results: Seq[TestResult] = refs.map(testRecord _)
79 |
80 | val precision = results.map(_.precision).sum / results.size
81 | val recall = results.map(_.recall).sum / results.size
82 |
83 | println(s"$name precision: $precision recall: $recall")
84 |
85 | TestResults(precision, recall, results)
86 | }
87 |
88 | "cora-ie references" should "be extracted" in {
89 | assert(runTest("segmentation", segmentationTest _).recall >= 0.1)
90 | }
91 | }
92 |
--------------------------------------------------------------------------------
/core/src/test/scala/org/allenai/scienceparse/JavaTestSuite.scala:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse
2 |
3 | import org.scalatest.testng.TestNGWrapperSuite
4 |
5 | class JavaTestSuite extends TestNGWrapperSuite(
6 | List("src/test/resources/testng.xml")
7 | )
8 |
9 |
--------------------------------------------------------------------------------
/core/src/test/scala/org/allenai/scienceparse/JsonProtocolSpec.scala:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse
2 |
3 | import java.util
4 | import java.util.regex.Pattern
5 |
6 | import org.allenai.common.testkit.UnitSpec
7 |
8 | class JsonProtocolSpec extends UnitSpec {
9 | import spray.json._
10 | import JsonProtocol._
11 |
12 | "JsonProtocol" should "round trip basic content" in {
13 | val em = new ExtractedMetadata(
14 | "The Brand Personality of Rocks: A Critical Evaluation of a Brand Personality Scale",
15 | util.Arrays.asList("Mark Avis", "Sarah Forbes", "Shelagh Ferguson"),
16 | null)
17 |
18 | em.equals(em.toJson.convertTo[ExtractedMetadata])
19 | }
20 |
21 | it should "round trip empty authors" in {
22 | val em = new ExtractedMetadata(
23 | "The Brand Personality of Rocks: A Critical Evaluation of a Brand Personality Scale",
24 | util.Arrays.asList("", "Sarah Forbes", "Shelagh Ferguson"),
25 | null)
26 |
27 | em.equals(em.toJson.convertTo[ExtractedMetadata])
28 | }
29 |
30 | it should "round trip complex content" in {
31 | val em = new ExtractedMetadata(
32 | "The Brand Personality of Rocks: A Critical Evaluation of a Brand Personality Scale",
33 | util.Arrays.asList("Mark Avis", "Sarah Forbes", "Shelagh Ferguson"),
34 | null)
35 | em.year = 2014
36 | em.sections = util.Arrays.asList(
37 | new Section("Introduction", "In this paper, ..."),
38 | new Section(null, "Furthermore, ...")
39 | )
40 | em.abstractText = "Aaker’s (1997) brand personality (BP) scale is widely used in research and is an important foundation for the theory of BP."
41 | em.creator = "MS Paint"
42 | em.source = ExtractedMetadata.Source.META
43 |
44 | em.equals(em.toJson.convertTo[ExtractedMetadata])
45 | }
46 |
47 | it should "round trip empty content" in {
48 | // Empty content
49 | val em = new ExtractedMetadata(
50 | null,
51 | util.Arrays.asList(),
52 | null)
53 | em.sections = util.Arrays.asList(
54 | new Section("", ""),
55 | new Section(null, "")
56 | )
57 | em.abstractText = ""
58 | em.creator = ""
59 |
60 | em.equals(em.toJson.convertTo[ExtractedMetadata])
61 | }
62 |
63 | it should "round trip references" in {
64 | val em = new ExtractedMetadata(
65 | "The Brand Personality of Rocks: A Critical Evaluation of a Brand Personality Scale",
66 | util.Arrays.asList("Mark Avis", "Sarah Forbes", "Shelagh Ferguson"),
67 | null)
68 |
69 | em.references = util.Arrays.asList(
70 | new BibRecord(
71 | "Managing Brand Equity: Capitalizing on the Value of a Brand Name",
72 | util.Arrays.asList("Aaker, D"),
73 | "The Free Press",
74 | null,
75 | null,
76 | 1991
77 | ),
78 | new BibRecord(
79 | "Dimensions of Brand Personality",
80 | util.Arrays.asList("Aaker, D"),
81 | "Journal of Marketing Research",
82 | Pattern.compile("Aaker et al\\."),
83 | Pattern.compile("\\[2\\]"),
84 | 1997
85 | ),
86 | new BibRecord(
87 | null,
88 | util.Arrays.asList(),
89 | null,
90 | null,
91 | null,
92 | 2001
93 | )
94 | )
95 |
96 | em.referenceMentions = util.Arrays.asList(
97 | new CitationRecord(
98 | 1,
99 | "As [1] held these truths to be self-evident, ...",
100 | 3,
101 | 6
102 | )
103 | )
104 |
105 | em.equals(em.toJson.convertTo[ExtractedMetadata])
106 | }
107 |
108 | "LabeledData" should "round-trip through the JSON format" in {
109 | val sha = "a7c25298c607d5bf32e3301b6b209431e2a7f830"
110 | def getInputStream = this.getClass.getResourceAsStream(s"/$sha.pdf")
111 | val em = Parser.getInstance().doParse(getInputStream)
112 | val labeledData = LabeledData.fromExtractedMetadata(sha, em)
113 | val jsonString = labeledData.toJson.prettyPrint
114 |
115 | val labeledDataFromJson = jsonString.parseJson.convertTo[LabeledData]
116 |
117 | assertResult(labeledData.title)(labeledDataFromJson.title)
118 | assertResult(labeledData.authors)(labeledDataFromJson.authors)
119 | assertResult(labeledData.abstractText)(labeledDataFromJson.abstractText)
120 | assertResult(labeledData.year)(labeledDataFromJson.year)
121 | assertResult(labeledData.venue)(labeledDataFromJson.venue)
122 | assertResult(labeledData.sections)(labeledDataFromJson.sections)
123 | assertResult(labeledData.references)(labeledDataFromJson.references)
124 | //assertResult(labeledData.mentions)(labeledDataFromJson.mentions)
125 | }
126 | }
127 |
--------------------------------------------------------------------------------
/core/src/test/scala/org/allenai/scienceparse/MetaEvalSpec.scala:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse
2 |
3 | import org.allenai.common.testkit.UnitSpec
4 |
5 | class MetaEvalSpec extends UnitSpec {
6 | "MetaEval" should "produce good P/R numbers" in {
7 | val parser = new Parser()
8 | val evaluationResult = Evaluation.evaluate(parser)
9 | Evaluation.printResults(evaluationResult)
10 |
11 | val minimumPR = Map(
12 | "abstract ".trim -> ((0.856, 0.856)),
13 | "abstractNormalized ".trim -> ((0.856, 0.856)),
14 | "authorFullName ".trim -> ((0.821, 0.805)),
15 | "authorFullNameNormalized ".trim -> ((0.851, 0.831)),
16 | "authorLastName ".trim -> ((0.871, 0.847)),
17 | "authorLastNameNormalized ".trim -> ((0.889, 0.862)),
18 | "bibAll ".trim -> ((0.033, 0.031)),
19 | "bibAllButVenuesNormalized".trim -> ((0.619, 0.560)),
20 | "bibAllNormalized ".trim -> ((0.044, 0.041)),
21 | "bibAuthors ".trim -> ((0.726, 0.637)),
22 | "bibAuthorsNormalized ".trim -> ((0.840, 0.743)),
23 | "bibCounts ".trim -> ((1.000, 0.826)),
24 | "bibMentions ".trim -> ((0.232, 0.218)),
25 | "bibMentionsNormalized ".trim -> ((0.273, 0.245)),
26 | "bibTitles ".trim -> ((0.795, 0.709)),
27 | "bibTitlesNormalized ".trim -> ((0.796, 0.710)),
28 | "bibVenues ".trim -> ((0.062, 0.051)),
29 | "bibVenuesNormalized ".trim -> ((0.063, 0.052)),
30 | "bibYears ".trim -> ((0.933, 0.835)),
31 | "title ".trim -> ((0.427, 0.427)),
32 | "titleNormalized ".trim -> ((0.842, 0.842))
33 | )
34 |
35 | val tolerance = 0.002
36 | evaluationResult.scienceParse.foreach { case (metric, eval) =>
37 | val (minimumP, minimumR) = minimumPR(metric.name)
38 | assert(eval.p > minimumP - tolerance, s"Evaluating precision for ${metric.name}")
39 | assert(eval.r > minimumR - tolerance, s"Evaluating recall for ${metric.name}")
40 | }
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/core/src/test/scala/org/allenai/scienceparse/StringUtilsSpec.scala:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse
2 |
3 | import org.allenai.common.testkit.UnitSpec
4 |
5 | class StringUtilsSpec extends UnitSpec {
6 | "author names" should "get split correctly" in {
7 | val tests = Map(
8 | "Aryabhata" -> Tuple2("", "Aryabhata"),
9 | "Peter Clark" -> Tuple2("Peter", "Clark"),
10 | "Peter Clark" -> Tuple2("Peter", " Clark"),
11 | "Arthur C. Clarke" -> Tuple2("Arthur C.", "Clarke"),
12 | "Ludwig van Beethoven" -> Tuple2("Ludwig", "van Beethoven"),
13 | "Ludwig van Beethoven" -> Tuple2("Ludwig", " van Beethoven"),
14 | " Ludwig van Beethoven" -> Tuple2(" Ludwig", " van Beethoven"),
15 | "Ludwig van Beethoven Jr." -> Tuple2("Ludwig", " van Beethoven Jr."),
16 | "Ludwig van Beethoven Jr. " -> Tuple2("Ludwig", " van Beethoven Jr. "),
17 | "Ayrton Senna da Silva" -> Tuple2("Ayrton Senna", "da Silva"),
18 | "" -> Tuple2("", ""),
19 | " " -> Tuple2("", " ")
20 | )
21 |
22 | tests.foreach { case (original, expected) =>
23 | assertResult(expected)(StringUtils.splitName(original))
24 | }
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version=1.2.7
2 |
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.9")
2 |
3 | addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.2-1")
4 |
5 | addSbtPlugin("org.foundweekends" % "sbt-bintray" % "0.5.4")
6 |
--------------------------------------------------------------------------------
/server/.gitignore:
--------------------------------------------------------------------------------
1 | /target/
2 |
--------------------------------------------------------------------------------
/server/README.md:
--------------------------------------------------------------------------------
1 | # Science Parse Server
2 |
3 | This is a wrapper that makes the [SP library](../core/README.md) available as a web service. We have a version running at http://scienceparse.allenai.org, so you can try it yourself: http://scienceparse.allenai.org/v1/498bb0efad6ec15dd09d941fb309aa18d6df9f5f
4 |
5 | This will show a large amount of JSON. Most of it is body text. You can get a slightly more compact output by skipping the body text: http://scienceparse.allenai.org/v1/498bb0efad6ec15dd09d941fb309aa18d6df9f5f?skipFields=sections
6 |
7 | Both of these examples parse the paper with the S2 paper id `498bb0efad6ec15dd09d941fb309aa18d6df9f5f`. You can see that paper here: https://pdfs.semanticscholar.org/498b/b0efad6ec15dd09d941fb309aa18d6df9f5f.pdf
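From the command line, the same two requests can be made with curl:
```
curl "http://scienceparse.allenai.org/v1/498bb0efad6ec15dd09d941fb309aa18d6df9f5f"
curl "http://scienceparse.allenai.org/v1/498bb0efad6ec15dd09d941fb309aa18d6df9f5f?skipFields=sections"
```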
8 |
9 | ## Parsing your own PDF
10 |
11 | If you want to upload your own PDF, you can do that with an HTTP POST:
12 | ```
13 | curl -v -H "Content-type: application/pdf" --data-binary @paper.pdf "http://scienceparse.allenai.org/v1"
14 | ```
15 |
16 | Note that the content type must be `application/pdf`, and the URL must not have a trailing slash.
17 |
18 | ## Running the server yourself
19 |
20 | You can compile the server into a super-jar with `sbt server/assembly`. That will download all dependencies, compile them, and build an executable jar with everything bundled. Then, you can start up the server with `java -Xmx6g -jar jarfile.jar`. On first startup, it will download several gigabytes of model files, and then bind to port 8080 on the machine you run it on.
21 |
22 | The server takes a few command line arguments. Run it with `java -jar jarfile.jar --help` to see what they are.
23 |
24 | Science Parse takes quite a bit of memory, so we recommend running it with `-Xmx6g`. Some documents might require more than that. Science Parse also uses off-heap memory (i.e., memory that's not specified by `-Xmx`), so we recommend that you have at least 2GB free in addition to the heap memory specified with `-Xmx`.
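For example, a typical build-and-run sequence looks like this (substitute for `jarfile.jar` the path of the assembly jar that sbt reports building):
```
sbt server/assembly
java -Xmx6g -jar jarfile.jar
```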
25 |
26 | ## Feedback mechanism
27 |
28 | The server supports something called the "Feedback mechanism". This is a fairly basic way to gather corrections to the extractions SP makes, so we can improve the models. The mechanism is disabled by default, so you shouldn't have to worry about it most of the time.
29 |
30 | We don't support this mechanism publicly, but if you want to play with it, it should be easy to point it at a Postgres database of your choice and start gathering feedback.
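A minimal sketch of what that might look like (the URL and passwords are placeholders; the mechanism is off by default, so check the server's `--help` output for how to turn it on): the connection settings are read from the `org.allenai.scienceparse.Server.db` and `db-as-root` blocks in `application.conf` (shown later in this repo), and since they are loaded via Typesafe Config you can override them with JVM system properties when starting the server:
```
java -Xmx6g \
  -Dorg.allenai.scienceparse.Server.db.url="jdbc:postgresql://localhost:5432/scienceparse" \
  -Dorg.allenai.scienceparse.Server.db.password="..." \
  -Dorg.allenai.scienceparse.Server.db-as-root.password="..." \
  -jar jarfile.jar
```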
31 |
--------------------------------------------------------------------------------
/server/build.sbt:
--------------------------------------------------------------------------------
1 | javaOptions in run += s"-Xmx10G"
2 |
3 | fork := true
4 |
5 | mainClass in assembly := Some("org.allenai.scienceparse.SPServer")
6 |
7 | assemblyMergeStrategy in assembly := {
8 | case "logback.xml" => MergeStrategy.first
9 | case "application.conf" => MergeStrategy.concat
10 | case x => (assemblyMergeStrategy in assembly).value.apply(x)
11 | }
12 |
13 | libraryDependencies ++= Seq(
14 | "org.slf4j" % "jcl-over-slf4j" % "1.7.7",
15 | "org.eclipse.jetty" % "jetty-server" % "9.4.1.v20170120",
16 | "com.typesafe" % "config" % "1.3.1",
17 | "org.scalikejdbc" %% "scalikejdbc" % "2.5.0" exclude ("commons-logging", "commons-logging"),
18 | "org.postgresql" % "postgresql" % "42.0.0"
19 | )
20 |
--------------------------------------------------------------------------------
/server/src/main/resources/application.conf:
--------------------------------------------------------------------------------
1 | org.allenai.scienceparse.Server {
2 | db = {
3 | url = null
4 | user = "scienceparse"
5 | password = null
6 |
7 | connectionPool = enabled
8 | keepAliveConnection = true
9 | }
10 |
11 | db-as-root = {
12 | url = ${org.allenai.scienceparse.Server.db.url}
13 | user = "root"
14 | password = null
15 |
16 | connectionPool = disabled
17 | keepAliveConnection = false
18 | numThreads = 1
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/server/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | ${logback_stdoutLevel:-DEBUG}
6 |
7 |
8 | %-5level %logger{36}: %msg%n
9 |
10 |
11 |
12 |
13 | false
14 | EvalErrors.log
15 |
16 | %msg%n
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
--------------------------------------------------------------------------------
/server/src/main/scala/org/allenai/scienceparse/FeedbackStore.scala:
--------------------------------------------------------------------------------
1 | package org.allenai.scienceparse
2 |
3 | import com.typesafe.config.{ConfigFactory, Config}
4 | import org.allenai.common.{Resource, Logging}
5 | import org.allenai.common.Config._
6 |
7 | import scalikejdbc._
8 |
9 | import java.time.Instant
10 |
11 | object FeedbackStore extends Logging {
12 | { // Set up the DB
13 | Class.forName("org.postgresql.Driver")
14 |
15 | val config = ConfigFactory.load()
16 | val dbConfig: Config = config[Config]("org.allenai.scienceparse.Server.db")
17 |
18 | scalikejdbc.GlobalSettings.loggingSQLAndTime = scalikejdbc.LoggingSQLAndTimeSettings(
19 | enabled = dbConfig.get[Boolean]("logging").getOrElse(false),
20 | logLevel = 'DEBUG,
21 | warningEnabled = true,
22 | warningThresholdMillis = 1000L,
23 | warningLogLevel = 'WARN
24 | )
25 |
26 | val dbUrl = dbConfig.getString("url")
27 | val dbUser = dbConfig.getString("user")
28 | val dbPassword = dbConfig.get[String]("password").getOrElse(
29 | throw new IllegalArgumentException("Password for DB not set. Please set org.allenai.scienceparse.Server.db.password."))
30 | ConnectionPool.singleton(dbUrl, dbUser, dbPassword)
31 |
32 | // upgrade the schema if necessary
33 | {
34 | val dbConfig: Config = config[Config]("org.allenai.scienceparse.Server.db-as-root")
35 | val dbUrl = dbConfig.getString("url")
36 | logger.info(s"Connecting to $dbUrl")
37 | val dbUser = dbConfig.getString("user")
38 | val dbPassword = dbConfig.get[String]("password").getOrElse(
39 | throw new IllegalArgumentException("Root password for DB not set. Please set org.allenai.scienceparse.Server.db-as-root.password."))
40 |
41 | val rootConnectionPoolName = "rootConnectionPool"
42 | val cpSettings = new ConnectionPoolSettings(initialSize = 1, maxSize = 2)
43 | ConnectionPool.add(rootConnectionPoolName, dbUrl, dbUser, dbPassword, cpSettings)
44 | Resource.using(ConnectionPool(rootConnectionPoolName)) { implicit cp =>
45 | DB.localTx { implicit session =>
46 | sql"""
47 | CREATE TABLE IF NOT EXISTS settings (
48 | key VARCHAR NOT NULL PRIMARY KEY,
49 | value VARCHAR NOT NULL)
50 | """.execute().apply()
51 |
52 | def dbSchemaVersion =
53 | sql"SELECT value::integer FROM settings WHERE key = 'version'".map(_.int("value")).single().apply().getOrElse(0)
54 | val desiredSchemaVersion = 1
55 | val schemaUpdateFunctions = Map(
56 | 0 -> (() => {
57 | sql"""
58 | CREATE TABLE feedback (
59 | paperId CHAR(40) NOT NULL,
60 | timeAdded TIMESTAMP NOT NULL,
61 | value JSONB NOT NULL,
62 | PRIMARY KEY(paperId, timeAdded))
63 | """.execute().apply()
64 |
65 | sql"""
66 | INSERT INTO settings (key, value) VALUES ('version', 1)
67 | """.execute().apply()
68 | })
69 | )
70 |
71 | var currentSchemaVersion = dbSchemaVersion
72 | while(currentSchemaVersion != desiredSchemaVersion) {
73 | val updateFunction = schemaUpdateFunctions.getOrElse(
74 | currentSchemaVersion,
75 | throw new RuntimeException(s"Could not find upgrade function for version $currentSchemaVersion."))
76 | updateFunction()
77 |
78 | val newSchemaVersion = dbSchemaVersion
79 | if(newSchemaVersion == currentSchemaVersion)
80 | throw new RuntimeException(s"Upgrade function for version $currentSchemaVersion did not change the version.")
81 | currentSchemaVersion = newSchemaVersion
82 | }
83 | }
84 | }
85 | }
86 | }
87 |
88 | def addFeedback(paperId: String, data: LabeledData): Unit = {
89 | import spray.json._
90 | import JsonProtocol._
91 |
92 | val jsonString = data.toJson.compactPrint
93 | DB.localTx { implicit t =>
94 | sql"""
95 | INSERT INTO feedback (paperId, timeAdded, value) VALUES
96 | ($paperId, current_timestamp, $jsonString::jsonb)
97 | """.update().apply()
98 | }
99 | }
100 |
101 | private val paperSource = PaperSource.getDefault
102 |
103 | def getFeedback(paperId: String): Option[LabeledData] = {
104 | import spray.json._
105 | import JsonProtocol._
106 |
107 | DB.readOnly { implicit t =>
108 | sql"""
109 | SELECT value FROM feedback WHERE paperId=$paperId ORDER BY timeAdded DESC LIMIT 1
110 | """.map { result =>
111 | val jsonString = result.string("value")
112 | jsonString.parseJson.convertTo[LabeledData]
113 | }.first().apply()
114 | }
115 | }
116 |
117 | /**
118 | * @param onOrAfter If given, constrains returned feedback to those added on or after this timestamp.
119 | * @param before If given, constrains returned feedback to those added before this timestamp.
120 | * @return (paper id, time added, labeled data)
121 | */
122 | def getAllFeedback(
123 | onOrAfter: Option[Instant] = None,
124 | before: Option[Instant] = None
125 | ): Traversable[(String, String, LabeledData)] = {
126 | import spray.json._
127 | import JsonProtocol._
128 |
129 | val onOrAfterClause = onOrAfter.map(ts => sqls" AND a.timeadded >= $ts").getOrElse(sqls"")
130 | val beforeClause = before.map(ts => sqls" AND a.timeadded < $ts").getOrElse(sqls"")
131 |
132 | DB.readOnly { implicit t =>
133 | sql"""
134 | SELECT a.paperId, a.timeAdded, a.value FROM feedback AS a JOIN (
135 | SELECT paperId, MAX(timeAdded) AS timeAdded FROM feedback GROUP BY paperId
136 | ) AS b ON a.paperId = b.paperId AND a.timeAdded = b.timeAdded
137 | $onOrAfterClause $beforeClause
138 | """.map { result =>
139 | val paperId = result.string("paperId")
140 | val timeAdded = result.timestamp("timeAdded").toInstant
141 | val jsonString = result.string("value")
142 | (paperId, timeAdded.toString, jsonString.parseJson.convertTo[LabeledData])
143 | }.traversable.apply()
144 | }
145 | }
146 | }
147 |
--------------------------------------------------------------------------------