├── .travis.yml ├── .gitignore ├── data ├── .Rapp.history ├── acars.rda ├── cakcDict.rda ├── skipLens.rda ├── acarsGoogle.rda ├── acarsLens.rda ├── googleNames.rda ├── kindCodes.rda ├── lensColumns.rda ├── lensNames.rda ├── scoreColors.rda ├── skipGoogle.rda ├── excludeWords.rda ├── googleColumns.rda ├── lensDateOrder.rda ├── skipSumobrain.rda ├── docLengthTypes.rda ├── googleDateFields.rda ├── googleDateOrder.rda ├── lensDateFields.rda ├── sumobrainColumns.rda ├── sumobrainNames.rda ├── assigneeStopWords.rda ├── docLengthTypesDict.rda ├── sumobrainDateOrder.rda └── sumobrainDateFields.rda ├── tests ├── testthat.R └── testthat │ ├── .DS_Store │ ├── testData │ ├── .DS_Store │ ├── mtcars.xls │ ├── lens_autonomous_search.csv │ ├── google_autonomous_search.csv │ ├── sumobrain_autonomous_search1.xls │ ├── sumobrain_autonomous_search1.xlsx │ └── sumobrain_autonomous_search2.xlsx │ ├── test-imports.R │ ├── test-process.R │ ├── test-graphics.R │ └── test-cleaning.R ├── vignettes ├── Rplot.png ├── Rplot01.png └── summary.Rmd ├── inst ├── extdata │ ├── kindCodes.xlsx │ ├── docLengthTypes.xlsx │ ├── lens_autonomous_search.csv │ ├── google_autonomous_search.csv │ ├── sumobrain_autonomous_search1.xls │ ├── sumobrain_autonomous_search2.xls │ ├── sumobrain_autonomous_search1.xlsx │ └── sumobrain_autonomous_search2.xlsx ├── CITATION ├── shiny │ └── app │ │ ├── global.R │ │ ├── ui.R │ │ └── server.R └── examples │ └── edaPatentGuide.R ├── .Rbuildignore ├── man ├── runExample.Rd ├── excludeWords.Rd ├── lensDateOrder.Rd ├── scoreColors.Rd ├── googleDateOrder.Rd ├── sumobrainDateOrder.Rd ├── lensDateFields.Rd ├── googleDateFields.Rd ├── sumobrainDateFields.Rd ├── chooseFiles.Rd ├── assigneeStopWords.Rd ├── makeColors.Rd ├── capWord.Rd ├── extractKindCode.Rd ├── cakcDict.Rd ├── lensColumns.Rd ├── googleColumns.Rd ├── skipLens.Rd ├── skipSumobrain.Rd ├── skipGoogle.Rd ├── docLengthTypes.Rd ├── sumobrainColumns.Rd ├── extractPubNumber.Rd ├── googleNames.Rd ├── lensNames.Rd ├── 
sumobrainNames.Rd ├── docLengthTypesDict.Rd ├── extractCleanDate.Rd ├── getClaimsText.Rd ├── extractCountryCode.Rd ├── cleanGoogleURL.Rd ├── addPdfImage.Rd ├── importPatentData.Rd ├── showDups.Rd ├── wordCloudIt.Rd ├── extractDocLength.Rd ├── summarizeColumns.Rd ├── patentr.Rd ├── createGoogleURL.Rd ├── factorForGraph.Rd ├── cleanNames.Rd ├── getClaimFromURL.Rd ├── acars.Rd ├── acarsGoogle.Rd ├── summaryText.Rd ├── flippedHistogram.Rd ├── cleanHeaderNames.Rd ├── generateDocType.Rd ├── facetPlot.Rd ├── removeDups.Rd ├── addFullImagePptx.Rd ├── tilePlot.Rd ├── kindCodes.Rd ├── acarsLens.Rd ├── cleanPatentData.Rd └── addChartRightTextLeftPptx.Rd ├── patentr.Rproj ├── ..Rcheck └── 00check.log ├── R ├── shiny.R ├── patentr.R ├── explorePatentData.R ├── importPatentData.R ├── acars.R ├── processPatentData.R └── reportPatentData.R ├── DESCRIPTION ├── NAMESPACE ├── README.md └── README.Rmd /.travis.yml: -------------------------------------------------------------------------------- 1 | language: r 2 | cache: packages 3 | sudo: false 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | inst/doc 5 | -------------------------------------------------------------------------------- /data/.Rapp.history: -------------------------------------------------------------------------------- 1 | load("/Users/Yao/coding/patentr/data/acars.rda") 2 | -------------------------------------------------------------------------------- /data/acars.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/acars.rda -------------------------------------------------------------------------------- /data/cakcDict.rda: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/cakcDict.rda -------------------------------------------------------------------------------- /data/skipLens.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/skipLens.rda -------------------------------------------------------------------------------- /data/acarsGoogle.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/acarsGoogle.rda -------------------------------------------------------------------------------- /data/acarsLens.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/acarsLens.rda -------------------------------------------------------------------------------- /data/googleNames.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/googleNames.rda -------------------------------------------------------------------------------- /data/kindCodes.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/kindCodes.rda -------------------------------------------------------------------------------- /data/lensColumns.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/lensColumns.rda -------------------------------------------------------------------------------- /data/lensNames.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/lensNames.rda -------------------------------------------------------------------------------- 
/data/scoreColors.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/scoreColors.rda -------------------------------------------------------------------------------- /data/skipGoogle.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/skipGoogle.rda -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(patentr) 3 | 4 | test_check("patentr") 5 | -------------------------------------------------------------------------------- /vignettes/Rplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/vignettes/Rplot.png -------------------------------------------------------------------------------- /data/excludeWords.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/excludeWords.rda -------------------------------------------------------------------------------- /data/googleColumns.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/googleColumns.rda -------------------------------------------------------------------------------- /data/lensDateOrder.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/lensDateOrder.rda -------------------------------------------------------------------------------- /data/skipSumobrain.rda: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/skipSumobrain.rda -------------------------------------------------------------------------------- /vignettes/Rplot01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/vignettes/Rplot01.png -------------------------------------------------------------------------------- /data/docLengthTypes.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/docLengthTypes.rda -------------------------------------------------------------------------------- /data/googleDateFields.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/googleDateFields.rda -------------------------------------------------------------------------------- /data/googleDateOrder.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/googleDateOrder.rda -------------------------------------------------------------------------------- /data/lensDateFields.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/lensDateFields.rda -------------------------------------------------------------------------------- /data/sumobrainColumns.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/sumobrainColumns.rda -------------------------------------------------------------------------------- /data/sumobrainNames.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/sumobrainNames.rda 
-------------------------------------------------------------------------------- /tests/testthat/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/tests/testthat/.DS_Store -------------------------------------------------------------------------------- /data/assigneeStopWords.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/assigneeStopWords.rda -------------------------------------------------------------------------------- /data/docLengthTypesDict.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/docLengthTypesDict.rda -------------------------------------------------------------------------------- /data/sumobrainDateOrder.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/sumobrainDateOrder.rda -------------------------------------------------------------------------------- /inst/extdata/kindCodes.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/inst/extdata/kindCodes.xlsx -------------------------------------------------------------------------------- /data/sumobrainDateFields.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/sumobrainDateFields.rda -------------------------------------------------------------------------------- /inst/extdata/docLengthTypes.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/inst/extdata/docLengthTypes.xlsx 
-------------------------------------------------------------------------------- /tests/testthat/testData/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/tests/testthat/testData/.DS_Store -------------------------------------------------------------------------------- /tests/testthat/testData/mtcars.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/tests/testthat/testData/mtcars.xls -------------------------------------------------------------------------------- /inst/extdata/lens_autonomous_search.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/inst/extdata/lens_autonomous_search.csv -------------------------------------------------------------------------------- /inst/extdata/google_autonomous_search.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/inst/extdata/google_autonomous_search.csv -------------------------------------------------------------------------------- /inst/extdata/sumobrain_autonomous_search1.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/inst/extdata/sumobrain_autonomous_search1.xls -------------------------------------------------------------------------------- /inst/extdata/sumobrain_autonomous_search2.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/inst/extdata/sumobrain_autonomous_search2.xls -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | 
^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^README\.Rmd$ 4 | ^cran-comments\.md$ 5 | ^NEWS\.md$ 6 | ^\.travis\.yml$ 7 | ^README-.*\.png$ 8 | -------------------------------------------------------------------------------- /inst/extdata/sumobrain_autonomous_search1.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/inst/extdata/sumobrain_autonomous_search1.xlsx -------------------------------------------------------------------------------- /inst/extdata/sumobrain_autonomous_search2.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/inst/extdata/sumobrain_autonomous_search2.xlsx -------------------------------------------------------------------------------- /tests/testthat/testData/lens_autonomous_search.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/tests/testthat/testData/lens_autonomous_search.csv -------------------------------------------------------------------------------- /tests/testthat/testData/google_autonomous_search.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/tests/testthat/testData/google_autonomous_search.csv -------------------------------------------------------------------------------- /tests/testthat/testData/sumobrain_autonomous_search1.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/tests/testthat/testData/sumobrain_autonomous_search1.xls -------------------------------------------------------------------------------- /tests/testthat/testData/sumobrain_autonomous_search1.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/kamilien1/patentr/HEAD/tests/testthat/testData/sumobrain_autonomous_search1.xlsx -------------------------------------------------------------------------------- /tests/testthat/testData/sumobrain_autonomous_search2.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/tests/testthat/testData/sumobrain_autonomous_search2.xlsx -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citHeader("To cite patentr in publications, use:") citEntry( textVersion = paste("Kamil Bojanczyk, Yao Yang (2017).", "patentr: A patent analysis toolkit in R.", "URL https://github.com/kamilien1/patentr") ) -------------------------------------------------------------------------------- /man/runExample.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/shiny.R 3 | \name{runExample} 4 | \alias{runExample} 5 | \title{Shiny app} 6 | \usage{ 7 | runExample() 8 | } 9 | \description{ 10 | this is a shiny app that loads patent data, views it, 11 | and does a simple visualization. 12 | 13 | NOTE: This only works with xlsx files. 
14 | } 15 | -------------------------------------------------------------------------------- /patentr.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | Encoding: UTF-8 9 | 10 | AutoAppendNewline: Yes 11 | StripTrailingWhitespace: Yes 12 | 13 | BuildType: Package 14 | PackageUseDevtools: Yes 15 | PackageInstallArgs: --no-multiarch --with-keep.source 16 | PackageRoxygenize: rd,collate,namespace 17 | -------------------------------------------------------------------------------- /..Rcheck/00check.log: -------------------------------------------------------------------------------- 1 | * using log directory ‘/Users/Kamil/Documents/src/Data Science/stats290 project/patentr/..Rcheck’ 2 | * using R version 3.3.2 (2016-10-31) 3 | * using platform: x86_64-apple-darwin13.4.0 (64-bit) 4 | * using session charset: UTF-8 5 | * checking for file ‘./DESCRIPTION’ ... ERROR 6 | Required fields missing or empty: 7 | ‘Author’ ‘Maintainer’ 8 | * DONE 9 | Status: 1 ERROR 10 | -------------------------------------------------------------------------------- /man/excludeWords.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{excludeWords} 5 | \alias{excludeWords} 6 | \title{A standard list of words to exclude in a patent word cloud.} 7 | \format{A character vector. 8 | 9 | \describe{ 10 | \item{excludeWords}{A character vector of words to exclude} 11 | }} 12 | \usage{ 13 | excludeWords 14 | } 15 | \description{ 16 | A standard list of words to exclude from a patent data word cloud. 
17 | } 18 | \keyword{data} 19 | -------------------------------------------------------------------------------- /man/lensDateOrder.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{lensDateOrder} 5 | \alias{lensDateOrder} 6 | \title{Date order for lens.org data.} 7 | \format{A character value. 8 | 9 | \describe{ 10 | \item{lensDateOrder}{A character variable of date order.} 11 | }} 12 | \usage{ 13 | lensDateOrder 14 | } 15 | \description{ 16 | A date order to be used in lens.org date data. 17 | } 18 | \seealso{ 19 | \code{\link{extractCleanDate}} 20 | } 21 | \keyword{data} 22 | -------------------------------------------------------------------------------- /man/scoreColors.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{scoreColors} 5 | \alias{scoreColors} 6 | \title{Score colors used in graphing.} 7 | \format{A character vector. 8 | 9 | \describe{ 10 | \item{scoreColors}{A character variable of four score colors for 0 to 3.} 11 | }} 12 | \usage{ 13 | scoreColors 14 | } 15 | \description{ 16 | A character vector of Hexadecimal score colors. 17 | } 18 | \seealso{ 19 | \code{\link{flippedHistogram}} 20 | } 21 | \keyword{data} 22 | -------------------------------------------------------------------------------- /R/shiny.R: -------------------------------------------------------------------------------- 1 | #' Shiny app 2 | #' 3 | #' @description this is a shiny app that loads patent data, views it, 4 | #' and does a simple visualization. 5 | #' 6 | #' NOTE: This only works with xlsx files. 
7 | #' 8 | #' @export 9 | 10 | ## yang yao start 11 | runExample <- function() { 12 | appDir <- system.file("shiny", "app" ,package = "patentr") 13 | if (appDir == "") { 14 | stop("Could not find example directory. Try re-installing `patentr`.", call. = FALSE) 15 | } 16 | shiny::runApp(appDir, display.mode = "normal") 17 | } 18 | ## yang yao end -------------------------------------------------------------------------------- /man/googleDateOrder.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{googleDateOrder} 5 | \alias{googleDateOrder} 6 | \title{Date order for Google Patents data.} 7 | \format{A character value. 8 | 9 | \describe{ 10 | \item{googleDateOrder}{A character variable of date order.} 11 | }} 12 | \usage{ 13 | googleDateOrder 14 | } 15 | \description{ 16 | A date order to be used in Google patent date data. 17 | } 18 | \seealso{ 19 | \code{\link{extractCleanDate}} 20 | } 21 | \keyword{data} 22 | -------------------------------------------------------------------------------- /man/sumobrainDateOrder.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{sumobrainDateOrder} 5 | \alias{sumobrainDateOrder} 6 | \title{Date order for sumobrain data.} 7 | \format{A character value. 8 | 9 | \describe{ 10 | \item{sumobrainDateOrder}{A character variable of date order.} 11 | }} 12 | \usage{ 13 | sumobrainDateOrder 14 | } 15 | \description{ 16 | A date order to be used in sumobrain date data. 
17 | } 18 | \seealso{ 19 | \code{\link{extractCleanDate}} 20 | } 21 | \keyword{data} 22 | -------------------------------------------------------------------------------- /man/lensDateFields.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{lensDateFields} 5 | \alias{lensDateFields} 6 | \title{A simple list of date column names in lens.org data.} 7 | \format{A character vector. 8 | 9 | \describe{ 10 | \item{lensDateFields}{A character vector of date fields.} 11 | }} 12 | \usage{ 13 | lensDateFields 14 | } 15 | \description{ 16 | A character vector of date fields in lens.org data. 17 | } 18 | \seealso{ 19 | \code{\link{cleanHeaderNames}} 20 | } 21 | \keyword{data} 22 | -------------------------------------------------------------------------------- /man/googleDateFields.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{googleDateFields} 5 | \alias{googleDateFields} 6 | \title{A simple list of date column names in Google patent data.} 7 | \format{A character vector. 8 | 9 | \describe{ 10 | \item{googleDateFields}{A character vector of date fields.} 11 | }} 12 | \usage{ 13 | googleDateFields 14 | } 15 | \description{ 16 | A character vector of date fields in Google patent data. 
17 | } 18 | \seealso{ 19 | \code{\link{cleanHeaderNames}} 20 | } 21 | \keyword{data} 22 | -------------------------------------------------------------------------------- /man/sumobrainDateFields.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{sumobrainDateFields} 5 | \alias{sumobrainDateFields} 6 | \title{A simple list of date column names in sumobrain data.} 7 | \format{A character vector 8 | 9 | \describe{ 10 | \item{sumobrainDateFields}{A character vector of date fields.} 11 | }} 12 | \usage{ 13 | sumobrainDateFields 14 | } 15 | \description{ 16 | A character vector of date fields in sumobrain data. 17 | } 18 | \seealso{ 19 | \code{\link{cleanHeaderNames}} 20 | } 21 | \keyword{data} 22 | -------------------------------------------------------------------------------- /man/chooseFiles.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/importPatentData.R 3 | \name{chooseFiles} 4 | \alias{chooseFiles} 5 | \title{Allow the user to navigate to files manually.} 6 | \usage{ 7 | chooseFiles() 8 | } 9 | \value{ 10 | A list of character vectors with absolute pathnames to files. 11 | } 12 | \description{ 13 | Uses a popup window (Tk file dialog) to allow the user to choose a list of zero or more files interactively. 
14 | } 15 | \examples{ 16 | \dontrun{ 17 | filePaths <- chooseFiles() 18 | allData <- importPatentData(filePaths) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /man/assigneeStopWords.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{assigneeStopWords} 5 | \alias{assigneeStopWords} 6 | \title{A simple stop word list for assignee names.} 7 | \format{A character vector 8 | 9 | \describe{ 10 | \item{assigneeStopWords}{A character vector of stop words.} 11 | }} 12 | \usage{ 13 | assigneeStopWords 14 | } 15 | \description{ 16 | A character vector of common stop words to remove from assignee names for 17 | name standardization, such as "inc". 18 | } 19 | \seealso{ 20 | \code{\link{cleanNames}} 21 | } 22 | \keyword{data} 23 | -------------------------------------------------------------------------------- /man/makeColors.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualizePatentData.R 3 | \name{makeColors} 4 | \alias{makeColors} 5 | \title{Make color hues} 6 | \usage{ 7 | makeColors(numColors) 8 | } 9 | \arguments{ 10 | \item{numColors}{Number of colors, a numeric input.} 11 | } 12 | \value{ 13 | A character vector of colors. 14 | } 15 | \description{ 16 | Generate an evenly-spaced number of color hues. 17 | 18 | Credit for this function goes to \href{http://stackoverflow.com/questions/8197559/emulate-ggplot2-default-color-palette}{John Colby's} 19 | Stack Overflow post. 
20 | } 21 | \examples{ 22 | makeColors(5) 23 | 24 | } 25 | \seealso{ 26 | \code{\link{flippedHistogram}} 27 | } 28 | -------------------------------------------------------------------------------- /man/capWord.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualizePatentData.R 3 | \name{capWord} 4 | \alias{capWord} 5 | \title{Capitalize the first letter of a character} 6 | \usage{ 7 | capWord(s) 8 | } 9 | \arguments{ 10 | \item{s}{Character string to input. Default set to \code{"word"}.} 11 | } 12 | \value{ 13 | A character string with the first letter capitalized. 14 | } 15 | \description{ 16 | A quick shortcut function to capitalize the first letter 17 | of a character. Useful for making data frame column names quickly look like 18 | plain english. 19 | } 20 | \examples{ 21 | 22 | capWord("hello") 23 | capWord("") 24 | capWord("Hi") 25 | 26 | } 27 | \seealso{ 28 | \code{\link{flippedHistogram}} 29 | } 30 | -------------------------------------------------------------------------------- /man/extractKindCode.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cleanPatentData.R 3 | \name{extractKindCode} 4 | \alias{extractKindCode} 5 | \title{Extract the kind code, if available, from the publication number.} 6 | \usage{ 7 | extractKindCode(docNum) 8 | } 9 | \arguments{ 10 | \item{docNum}{The character vector of document numbers.} 11 | } 12 | \value{ 13 | A character vector of kind codes. If none found, a blank character is returned. 14 | } 15 | \description{ 16 | Extracts the kind code, a one-to-two character code with a letter and 17 | typically a number, if found in the document (published) number. 
18 | } 19 | \examples{ 20 | acars$kindCode <- extractKindCode(acars$docNum) 21 | head(acars[,c("docNum","kindCode")]) 22 | 23 | } 24 | -------------------------------------------------------------------------------- /man/cakcDict.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{cakcDict} 5 | \alias{cakcDict} 6 | \title{A country and kind code dictionary.} 7 | \format{A named character vector 8 | 9 | \describe{ 10 | \item{cakcDict}{A named character vector representing key/value pairs 11 | of country codes, kind codes, and type of patent document.} 12 | 13 | } 14 | 15 | Built with the following code: 16 | 17 | \code{cakcDict <- kindCodes$docType} 18 | 19 | \code{names(cakcDict) <- kindCodes$countryAndKindCode}} 20 | \usage{ 21 | cakcDict 22 | } 23 | \description{ 24 | A named vector of key/value pairs for country codes and kind codes used to 25 | determine the type of document. 26 | } 27 | \seealso{ 28 | \code{\link{generateDocType}}, \code{\link{kindCodes}} 29 | } 30 | \keyword{data} 31 | -------------------------------------------------------------------------------- /man/lensColumns.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{lensColumns} 5 | \alias{lensColumns} 6 | \title{The number of columns in a lens.org csv data export.} 7 | \format{A numeric value. 8 | 9 | \describe{ 10 | \item{lensColumns}{A numeric value of the number of columns in a lens.org 11 | patent data export. } 12 | 13 | }} 14 | \usage{ 15 | lensColumns 16 | } 17 | \description{ 18 | The number of columns in a lens.org csv data export. 19 | } 20 | \details{ 21 | Used with \code{\link{acarsLens}} data. 
22 | } 23 | \seealso{ 24 | \code{\link{skipGoogle}}, \code{\link{skipLens}}, \code{\link{skipSumobrain}}, 25 | \code{\link{googleColumns}}, \code{\link{sumobrainColumns}}, 26 | \code{\link{sumobrainNames}}, \code{\link{lensNames}}, \code{\link{googleNames}} 27 | } 28 | \keyword{data} 29 | -------------------------------------------------------------------------------- /man/googleColumns.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{googleColumns} 5 | \alias{googleColumns} 6 | \title{Number of columns in Google Patents export data.} 7 | \format{A numeric value. 8 | 9 | \describe{ 10 | \item{googleColumns}{A numeric value of number of columns in a csv export from 11 | Google Patents.} 12 | 13 | }} 14 | \usage{ 15 | googleColumns 16 | } 17 | \description{ 18 | The number of columns in a Google Patents CSV export. 19 | } 20 | \details{ 21 | Used with \code{\link{acarsGoogle}} data. 22 | } 23 | \seealso{ 24 | \code{\link{skipGoogle}}, \code{\link{skipLens}}, \code{\link{skipSumobrain}}, 25 | \code{\link{lensColumns}}, \code{\link{sumobrainColumns}}, 26 | \code{\link{sumobrainNames}}, \code{\link{lensNames}}, \code{\link{googleNames}} 27 | } 28 | \keyword{data} 29 | -------------------------------------------------------------------------------- /man/skipLens.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{skipLens} 5 | \alias{skipLens} 6 | \title{How many lines to skip in a lens.org patent data export.} 7 | \format{A numeric value. 
8 | 9 | \describe{ 10 | \item{skipLens}{A numeric value representing the number of rows to skip in a 11 | lens.org csv data export.} 12 | 13 | }} 14 | \usage{ 15 | skipLens 16 | } 17 | \description{ 18 | How many lines to skip in a lens.org patent data export. 19 | } 20 | \details{ 21 | Used with \code{\link{acarsLens}} data. 22 | } 23 | \seealso{ 24 | \code{\link{skipGoogle}}, \code{\link{skipSumobrain}}, 25 | \code{\link{googleColumns}},\code{\link{lensColumns}}, \code{\link{sumobrainColumns}}, 26 | \code{\link{sumobrainNames}}, \code{\link{lensNames}}, \code{\link{googleNames}} 27 | } 28 | \keyword{data} 29 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: patentr 2 | Title: A Toolbox for Analyzing Patent Data 3 | Version: 0.0.1 4 | Authors@R: c(person("Bojanczyk", "Kamil", email = "kamil.bojanczyk@gmail.com", role = c("aut", "cre")), 5 | person("Yang", "Yao", email = "yangyaonju@gmail.com", role = "aut")) 6 | Description: A toolkit for patent data analysis. 
7 | Depends: R (>= 3.3.2) 8 | License: GPL (> 2) 9 | Encoding: UTF-8 10 | LazyData: true 11 | Imports: readxl (>= 0.1.1), 12 | plyr (>= 1.8.4), 13 | stringr (>= 1.2.0), 14 | lubridate (>= 1.6.0), 15 | XML (>= 3.98-1.5), 16 | httr (>= 1.2.1), 17 | dplyr (>= 0.5.0), 18 | magrittr (>= 1.5), 19 | ggplot2 (>= 2.2.1), 20 | RColorBrewer (>= 1.1-2), 21 | tm (>= 0.7-1), 22 | wordcloud (>= 2.5), 23 | ReporteRs (>= 0.8.8), 24 | shiny (>= 1.0.0) 25 | Suggests: testthat, 26 | rprojroot 27 | RoxygenNote: 6.0.1 28 | BugReports: http://github.com/kamilien1/patentr/issues 29 | -------------------------------------------------------------------------------- /man/skipSumobrain.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{skipSumobrain} 5 | \alias{skipSumobrain} 6 | \title{The number of lines to skip in a data read for a sumobrain.com export file.} 7 | \format{A numeric value. 8 | 9 | \describe{ 10 | \item{skipSumobrain}{A hard-coded numeric value for how many lines to skip 11 | in a sumobrain.com data export.} 12 | 13 | }} 14 | \usage{ 15 | skipSumobrain 16 | } 17 | \description{ 18 | The number of lines to skip in a data read for a sumobrain.com export file. 19 | Used with \code{\link{acars}} data. 
20 | } 21 | \seealso{ 22 | \code{\link{skipGoogle}}, \code{\link{skipLens}}, 23 | \code{\link{googleColumns}},\code{\link{lensColumns}}, \code{\link{sumobrainColumns}}, 24 | \code{\link{sumobrainNames}}, \code{\link{lensNames}}, \code{\link{googleNames}} 25 | } 26 | \keyword{data} 27 | -------------------------------------------------------------------------------- /man/skipGoogle.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{skipGoogle} 5 | \alias{skipGoogle} 6 | \title{How many lines to skip in a Google Patents CSV export file.} 7 | \format{A numeric value. 8 | 9 | \describe{ 10 | \item{skipGoogle}{A numeric value for number of lines to skip in a Google 11 | Patents csv export.} 12 | 13 | }} 14 | \usage{ 15 | skipGoogle 16 | } 17 | \description{ 18 | A hard-coded value for the number of lines to skip in a Google Patents csv 19 | export. 20 | } 21 | \details{ 22 | Used with \code{\link{acarsGoogle}} data. 23 | } 24 | \seealso{ 25 | \code{\link{skipLens}}, \code{\link{skipSumobrain}}, 26 | \code{\link{googleColumns}},\code{\link{lensColumns}}, \code{\link{sumobrainColumns}}, 27 | \code{\link{sumobrainNames}}, \code{\link{lensNames}}, \code{\link{googleNames}} 28 | } 29 | \keyword{data} 30 | -------------------------------------------------------------------------------- /man/docLengthTypes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{docLengthTypes} 5 | \alias{docLengthTypes} 6 | \title{A document mapper for country codes and document digit length to the type of 7 | document.} 8 | \format{A data frame with a key and value pair. 
9 | 10 | \describe{ 11 | \item{key}{A concatenated country code and length of the numeric portion of a 12 | document number. For example: US7 is a US document with 7 digits.} 13 | \item{value}{The type of patent document based on the country code and document 14 | length value.} 15 | 16 | }} 17 | \usage{ 18 | docLengthTypes 19 | } 20 | \description{ 21 | A simple table that helps map the country code and length of the numeric portion 22 | of the data to the type of document. 23 | } 24 | \details{ 25 | May need to add the USAPP for sumobrain. For now it is not needed. 26 | } 27 | \keyword{data} 28 | -------------------------------------------------------------------------------- /man/sumobrainColumns.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{sumobrainColumns} 5 | \alias{sumobrainColumns} 6 | \title{The number of columns in a sumobrain.com data export.} 7 | \format{A numeric value. 8 | 9 | \describe{ 10 | \item{sumobrainColumns}{A hard-coded numeric value for the number of columns in a 11 | sumobrain.com data export.} 12 | 13 | }} 14 | \usage{ 15 | sumobrainColumns 16 | } 17 | \description{ 18 | A convenient hard-coded value that can be used when reading in sumobrain.com 19 | exported patent data files. 20 | } 21 | \details{ 22 | Used with \code{\link{acars}} data. 
23 | } 24 | \seealso{ 25 | \code{\link{skipGoogle}}, \code{\link{skipLens}}, \code{\link{skipSumobrain}}, 26 | \code{\link{googleColumns}},\code{\link{lensColumns}}, 27 | \code{\link{sumobrainNames}}, \code{\link{lensNames}}, \code{\link{googleNames}} 28 | } 29 | \keyword{data} 30 | -------------------------------------------------------------------------------- /man/extractPubNumber.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cleanPatentData.R 3 | \name{extractPubNumber} 4 | \alias{extractPubNumber} 5 | \title{Extract the numeric portion of the document (published) number.} 6 | \usage{ 7 | extractPubNumber(docNum) 8 | } 9 | \arguments{ 10 | \item{docNum}{The character vector of document numbers.} 11 | } 12 | \value{ 13 | A character vector of same length inputted, with varying length. 14 | Typical lengths are 7 to 11 characters. Only numbers are returned. All other 15 | characters are stripped. 16 | } 17 | \description{ 18 | Extract the numeric portion of the document number. 19 | This is useful for a number of processing applications, and thus is beneficial 20 | to isolate from the entire publication number. 21 | } 22 | \examples{ 23 | acars$pubNum <- extractPubNumber(acars$docNum) 24 | head(acars[,c("docNum","pubNum")]) 25 | 26 | } 27 | \seealso{ 28 | \code{\link{createGoogleURL}} 29 | } 30 | -------------------------------------------------------------------------------- /man/googleNames.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{googleNames} 5 | \alias{googleNames} 6 | \title{Header names for a data upload sourced from Google Patents data exports.} 7 | \format{A character vector. 
8 | 9 | \describe{ 10 | \item{googleNames}{A 9-element character vector of clean Google patent names.} 11 | 12 | }} 13 | \usage{ 14 | googleNames 15 | } 16 | \description{ 17 | A character vector to standardize the headers of the imported csv from a 18 | Google Patents patent data export. Used with \code{\link{acarsGoogle}} data. 19 | } 20 | \seealso{ 21 | Go to \href{patents.google.com}{Google Patents} to download the data. 22 | 23 | \code{\link{skipGoogle}}, \code{\link{skipLens}}, \code{\link{skipSumobrain}}, 24 | \code{\link{googleColumns}},\code{\link{lensColumns}}, \code{\link{sumobrainColumns}}, 25 | \code{\link{sumobrainNames}}, \code{\link{lensNames}} 26 | } 27 | \keyword{data} 28 | -------------------------------------------------------------------------------- /man/lensNames.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{lensNames} 5 | \alias{lensNames} 6 | \title{Header names for a data upload sourced from lens.org.} 7 | \format{A character vector. 8 | 9 | \describe{ 10 | \item{sumobrainNames}{A 26-element character vector of clean lens.org header names.} 11 | 12 | }} 13 | \usage{ 14 | lensNames 15 | } 16 | \description{ 17 | A character vector to standardize the headers of the imported csv from a 18 | lens.org patent data export. 19 | } 20 | \details{ 21 | Used with \code{\link{acarsLens}} data. 22 | } 23 | \seealso{ 24 | Go to \href{lens.org}{Lens}, optionally create a free account, and 25 | download the data. 
26 | 27 | \code{\link{skipGoogle}}, \code{\link{skipLens}}, \code{\link{skipSumobrain}}, 28 | \code{\link{googleColumns}},\code{\link{lensColumns}}, \code{\link{sumobrainColumns}}, 29 | \code{\link{sumobrainNames}}, \code{\link{googleNames}} 30 | } 31 | \keyword{data} 32 | -------------------------------------------------------------------------------- /man/sumobrainNames.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{sumobrainNames} 5 | \alias{sumobrainNames} 6 | \title{Header names for a data upload sourced from sumobrain.com} 7 | \format{A character vector 8 | 9 | \describe{ 10 | \item{sumobrainNames}{An 11-element character vector of clean sumobrain names.} 11 | 12 | }} 13 | \usage{ 14 | sumobrainNames 15 | } 16 | \description{ 17 | A character vector to standardize the headers of the imported excel from a 18 | sumobrain.com patent data export. 19 | } 20 | \details{ 21 | Used with \code{\link{acars}} data. 22 | } 23 | \seealso{ 24 | Go to \href{www.sumobrain.com}{Sumobrain}, create a free account, and 25 | download the data. 
26 | 27 | \code{\link{skipGoogle}}, \code{\link{skipLens}}, \code{\link{skipSumobrain}}, 28 | \code{\link{googleColumns}},\code{\link{lensColumns}}, \code{\link{sumobrainColumns}}, 29 | \code{\link{lensNames}}, \code{\link{googleNames}} 30 | } 31 | \keyword{data} 32 | -------------------------------------------------------------------------------- /man/docLengthTypesDict.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{docLengthTypesDict} 5 | \alias{docLengthTypesDict} 6 | \title{A named vector of key/value pairs for country codes and publication number 7 | document lengths used to determine the type of document.} 8 | \format{A named character vector 9 | 10 | \describe{ 11 | \item{docLengthTypesDict}{A named character vector representing key/value pairs 12 | of doc lengths, country codes, and type of patent document.} 13 | 14 | } 15 | 16 | Built with the following code: 17 | 18 | \code{docLengthTypesDict <- docLengthTypes$value} 19 | 20 | \code{names(docLengthTypesDict) <- docLengthTypes$key}} 21 | \usage{ 22 | docLengthTypesDict 23 | } 24 | \description{ 25 | A named vector of key/value pairs for country codes and publication number 26 | document lengths used to determine the type of document. 
27 | } 28 | \seealso{ 29 | \code{\link{generateDocType}}, \code{\link{docLengthTypes}} 30 | } 31 | \keyword{data} 32 | -------------------------------------------------------------------------------- /inst/shiny/app/global.R: -------------------------------------------------------------------------------- 1 | ##This should detect and install missing packages before loading them 2 | ## yang yao and kamil bojanczyk 3 | ## motivation: R Shiny gallery and look at urls in ui.R 4 | list.of.packages <- c("shiny","ggplot2", "dplyr") 5 | new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])] 6 | if(length(new.packages)) install.packages(new.packages) 7 | lapply(list.of.packages,function(x){library(x,character.only=TRUE)}) 8 | 9 | 10 | # TODO 11 | #' 1) successfully read in csv from 12 | #' 1a) lens.org data 13 | #' 1b) Google patents data 14 | #' 2) successfull read in excel file from sumobrain data 15 | #' 3) successfully visualize patent data frame data by 16 | #' 3a) columns (choose which ones to display) 17 | #' 3b) values within rows: example, choose assignees to display 18 | #' 4) successfully display simple patent summaries 19 | #' 4a) total number of documents by docType 20 | #' 4b) number of documents by assignee 21 | #' 5) be able to export data with the following types 22 | #' 5a) csv export 23 | #' 5b) excel export (xlsx) 24 | #' -------------------------------------------------------------------------------- /man/extractCleanDate.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cleanPatentData.R 3 | \name{extractCleanDate} 4 | \alias{extractCleanDate} 5 | \title{Format patent dates.} 6 | \usage{ 7 | extractCleanDate(dateVector, orders = "ymd") 8 | } 9 | \arguments{ 10 | \item{dateVector}{A vector of character dates.} 11 | 12 | \item{orders}{The orders the dates appear in. 
13 | Sumobrain is "ymd" and Lens.org and Google data are "mdy". Hardcoded values include 14 | \code{\link{googleDateOrder}},\code{\link{lensDateOrder}}, 15 | and \code{\link{sumobrainDateOrder}}.} 16 | } 17 | \value{ 18 | A date vector of year, month, day dates. 19 | } 20 | \description{ 21 | Create a clean year, month, day date. 22 | 23 | Reading data in and aout of R may cause date mistakes, using a simple set 24 | function will ensure data types are the right format and class type. This 25 | data format is cleaned up to be in the format yyyy-mm-dd with no hours, 26 | minutes, seconds, or time zone attached. 27 | } 28 | \examples{ 29 | acars$pubDate <- extractCleanDate(dateVector = acars$pubDate, orders = "ymd") 30 | 31 | 32 | } 33 | -------------------------------------------------------------------------------- /man/getClaimsText.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/processPatentData.R 3 | \name{getClaimsText} 4 | \alias{getClaimsText} 5 | \title{Get claims data for all rows in a data frame} 6 | \usage{ 7 | getClaimsText(googleURLs, langCode = "en", whichClaim = 1) 8 | } 9 | \arguments{ 10 | \item{googleURLs}{A character vector of Google URLs} 11 | 12 | \item{langCode}{A language code, default set to "en"} 13 | 14 | \item{whichClaim}{Which claim (if available) to return. Default set to 1st.} 15 | } 16 | \description{ 17 | Generate claims data for all rows in a data frame. 18 | 19 | This is a wrapper function for the \code{\link{getClaimFromURL}} function. 
20 | } 21 | \examples{ 22 | 23 | \dontrun{ 24 | cc <- extractCountryCode(acars$docNum) 25 | pn <- extractPubNumber(acars$docNum) 26 | kc <- extractKindCode(acars$docNum) 27 | urls <- createGoogleURL(countryCode = cc, pubNum = pn ,kindCode = kc) 28 | urls <- urls[1:4] 29 | clms <- getClaimsText(urls) 30 | clms[1] 31 | } 32 | } 33 | \seealso{ 34 | \code{\link{createGoogleURL}}, \code{\link{cleanGoogleURL}}, 35 | \code{\link{getClaimFromURL}} 36 | } 37 | -------------------------------------------------------------------------------- /man/extractCountryCode.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cleanPatentData.R 3 | \name{extractCountryCode} 4 | \alias{extractCountryCode} 5 | \title{Extract the country code from a vector string of document numbers.} 6 | \usage{ 7 | extractCountryCode(docNum) 8 | } 9 | \arguments{ 10 | \item{docNum}{The character vector of document numbers.} 11 | } 12 | \value{ 13 | A character vector of the same length inputted, with 2-4 characters 14 | representing the country code of the ptaent document. 15 | } 16 | \description{ 17 | Extract the country code from a patent document number, which is the 18 | first two to four letters in a patent document number. 19 | 20 | For example, if a patent number is US8880270, the country code is US. In rare 21 | cases, we have more than two letters. Typical country codes are US (United States), 22 | EP (Europe), JP (Japan), and WO (World, meaning a PCT application). 
23 | } 24 | \examples{ 25 | # create a new column called countryCode in the acars data set 26 | acars$countryCode <- extractCountryCode(acars$docNum) 27 | head(acars[,c("docNum","countryCode")]) 28 | 29 | } 30 | -------------------------------------------------------------------------------- /man/cleanGoogleURL.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/processPatentData.R 3 | \name{cleanGoogleURL} 4 | \alias{cleanGoogleURL} 5 | \title{Sanitize a Google URL before attempting to extract data} 6 | \usage{ 7 | cleanGoogleURL(googleURL, langCode = "en") 8 | } 9 | \arguments{ 10 | \item{googleURL}{A character value of a google URL.} 11 | 12 | \item{langCode}{A language code, default set to "en" English.} 13 | } 14 | \value{ 15 | A clean character vector of a Google Patents URL. 16 | } 17 | \description{ 18 | Clean up the google URL to make sure it will be read properly. 19 | 20 | If you use the \code{\link{createGoogleURL}} function, you won't have to use this function. 21 | However, if you use your own generator or want to change the language, use this 22 | function to do so. 
23 | } 24 | \examples{ 25 | 26 | cleanGoogleURL("https://patents.google.com/patent/US8818682B1/mx") 27 | cleanGoogleURL("https://patents.google.com/patent/US8818682B1/") 28 | cleanGoogleURL("https://patents.google.com/patent/US8818682B1") 29 | cleanGoogleURL("https://patents.google.com/patent/US8818682B1/en") 30 | 31 | } 32 | \seealso{ 33 | \code{\link{createGoogleURL}} 34 | } 35 | -------------------------------------------------------------------------------- /man/addPdfImage.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/reportPatentData.R 3 | \name{addPdfImage} 4 | \alias{addPdfImage} 5 | \title{Make a PDF output of a plot} 6 | \usage{ 7 | addPdfImage(graph, name = "image", w = 12, h = 12) 8 | } 9 | \arguments{ 10 | \item{graph}{The graph object to input} 11 | 12 | \item{name}{A character name to name your file. It can have a filepath as well.} 13 | 14 | \item{w}{The width, in inches, of your image, default set to 12.} 15 | 16 | \item{h}{The height, in inches, of your image, default set to 12.} 17 | } 18 | \value{ 19 | No ret 20 | } 21 | \description{ 22 | Make a PDF output of a plot. 
23 | } 24 | \examples{ 25 | 26 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 27 | cleanNames = sumobrainNames, 28 | dateFields = sumobrainDateFields, 29 | dateOrders = sumobrainDateOrder, 30 | deduplicate = TRUE, 31 | cakcDict = patentr::cakcDict, 32 | docLengthTypesDict = patentr::docLengthTypesDict, 33 | keepType = "grant", 34 | firstAssigneeOnly = TRUE, 35 | assigneeSep = ";", 36 | stopWords = patentr::assigneeStopWords) 37 | 38 | # df <- dplyr::select(sumo, title, abstract) 39 | df <- sumo[,c("title","abstract")] 40 | addPdfImage(wordCloudIt(df, excludeWords, minfreq = 20, 41 | random.order = FALSE, rot.per = 0.25),"wordCloud") 42 | 43 | } 44 | -------------------------------------------------------------------------------- /man/importPatentData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/importPatentData.R 3 | \name{importPatentData} 4 | \alias{importPatentData} 5 | \title{Read in a data file or list of files from excel spreadsheets.} 6 | \usage{ 7 | importPatentData(rawDataFilePath = NA, skipLines = 1) 8 | } 9 | \arguments{ 10 | \item{rawDataFilePath}{A filepath, or list of filespaths, for xls files.} 11 | 12 | \item{skipLines}{Number of lines to skip before reading in your data file.} 13 | } 14 | \value{ 15 | A single data frame of all data. NULL if no data. 16 | } 17 | \description{ 18 | Import, read, and connect patent data files. Currently: xls files 19 | from a filepath. 20 | Future use: can read from a URL, an xlsx file, google doc, and a csv. 21 | } 22 | \examples{ 23 | \dontrun{ 24 | 25 | # access the files here and put them in a data/ folder of your working directory. 
26 | file1 <- system.file("extdata/", "sumobrain_autonomous_search1.xlsx", package="patentr") 27 | file2 <- system.file("extdata/", "sumobrain_autonomous_search2.xlsx", package="patentr") 28 | files <- list(file1, file2) 29 | ipData <- importPatentData(rawDataFilePath = files, skipLines = 1) 30 | 31 | # example 2 32 | # assume csv files are in the data folder 33 | ipData <- importPatentData(rawDataFilePath = list.files('data/', full.names=T), skipLines = 1) 34 | } 35 | 36 | 37 | } 38 | -------------------------------------------------------------------------------- /man/showDups.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cleanPatentData.R 3 | \name{showDups} 4 | \alias{showDups} 5 | \title{View all your duplicate entries to decide which to remove} 6 | \usage{ 7 | showDups(input) 8 | } 9 | \arguments{ 10 | \item{input}{A vector or a data frame which you wish to view duplicated values.} 11 | } 12 | \value{ 13 | A logical vector of TRUE / FALSE with all entries that contain two 14 | or more duplicate values. 15 | } 16 | \description{ 17 | Return a logical vector of all duplicate entries. 18 | 19 | Often times, you want to review your duplicate results to determine which 20 | rows you want to keep and which you want to erase. 21 | 22 | For example, if you have 23 | an application number that is an application, and another that is a search report, 24 | then you will want to keep the application data and remove the search report 25 | entry. 26 | 27 | Or, if you have an application number that has both a grant and an 28 | application entry, you may want to remove the application from your analysis 29 | and focus on the grant data, as the claim scope is most important for the 30 | granted patent. 
31 | } 32 | \examples{ 33 | 34 | acarsDups <- acars[showDups(acars$appNum),] 35 | head(acarsDups[order(acarsDups$appNum),c("docNum","docTypeSumobrain","appNum")]) 36 | 37 | } 38 | \seealso{ 39 | \code{\link[base]{duplicated}}, \code{\link{removeDups}} 40 | } 41 | -------------------------------------------------------------------------------- /man/wordCloudIt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualizePatentData.R 3 | \name{wordCloudIt} 4 | \alias{wordCloudIt} 5 | \title{Generate a word cloud with a given subset of patent data fields.} 6 | \usage{ 7 | wordCloudIt(file, rmwords, minfreq = 20, maxwords = 150, ...) 8 | } 9 | \arguments{ 10 | \item{file}{The data frame you want word cloud, typically the abstract, title, 11 | and claims subset.} 12 | 13 | \item{rmwords}{A character vector of words you exclude from your analysis. Default 14 | is \code{\link{excludeWords}}.} 15 | 16 | \item{minfreq}{From \code{\link[wordcloud]{wordcloud}}, the min frequency 17 | to include a word. Default is 10.} 18 | 19 | \item{maxwords}{From \code{\link[wordcloud]{wordcloud}}, the max number of 20 | words to show. Default is 150.} 21 | 22 | \item{...}{\code{\link[wordcloud]{wordcloud}} options} 23 | } 24 | \value{ 25 | NULL, prints out a wordcloud 26 | } 27 | \description{ 28 | Create a word cloud from a patent data set. 
29 | } 30 | \examples{ 31 | 32 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 33 | cleanNames = sumobrainNames, 34 | dateFields = sumobrainDateFields, 35 | dateOrders = sumobrainDateOrder, 36 | deduplicate = TRUE, 37 | cakcDict = patentr::cakcDict, 38 | docLengthTypesDict = patentr::docLengthTypesDict, 39 | keepType = "grant", 40 | firstAssigneeOnly = TRUE, 41 | assigneeSep = ";", 42 | stopWords = patentr::assigneeStopWords) 43 | 44 | # df <- dplyr::select(sumo, title, abstract) 45 | df <- sumo[,c("title","abstract")] 46 | wordCloudIt(df, excludeWords, minfreq = 20, 47 | random.order = FALSE, rot.per = 0.25) 48 | 49 | } 50 | -------------------------------------------------------------------------------- /man/extractDocLength.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cleanPatentData.R 3 | \name{extractDocLength} 4 | \alias{extractDocLength} 5 | \title{Get a code for length of doc and country code} 6 | \usage{ 7 | extractDocLength(countryCode, pubNum) 8 | } 9 | \arguments{ 10 | \item{countryCode}{A string vector of country codes} 11 | 12 | \item{pubNum}{A string vector of the numeric portion of a publication number.} 13 | } 14 | \value{ 15 | A string vector of concatenated country code and publication number 16 | length, such as US11 or EP9. 17 | } 18 | \description{ 19 | Generate a custom concatenation of country code and length of 20 | the publication number, for document type identification purposes. 21 | 22 | Given limited metadata available on free sites, often times the downloaded 23 | data set does not include the type of patent document. There are two easy ways to 24 | discover the type of a patent document. A dictionary stored with the 25 | package can compare the output to match up the type of patent document. 
26 | 27 | \enumerate{ 28 | \item The kind code, if present, is typically the same for each country. 29 | \code{B} is usually a patent and \code{A} is usually an application. 30 | \item The length of the publication number, along with the country code, is 31 | another great indicator. Applications in USA have 11 numbers, and, for now, 32 | 9 numbers for granted patents. 33 | } 34 | } 35 | \examples{ 36 | acars$pubNum <- extractPubNumber(acars$docNum) 37 | acars$countryCode <- extractCountryCode(acars$docNum) 38 | acars$officeDocLength <- extractDocLength(countryCode = acars$countryCode, 39 | pubNum = acars$pubNum) 40 | head(acars[,c("officeDocLength","docNum")]) 41 | 42 | } 43 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(addChartRightTextLeftPptx) 4 | export(addFullImagePptx) 5 | export(addPdfImage) 6 | export(capWord) 7 | export(chooseFiles) 8 | export(cleanGoogleURL) 9 | export(cleanHeaderNames) 10 | export(cleanNames) 11 | export(cleanPatentData) 12 | export(createGoogleURL) 13 | export(extractCleanDate) 14 | export(extractCountryCode) 15 | export(extractDocLength) 16 | export(extractKindCode) 17 | export(extractPubNumber) 18 | export(facetPlot) 19 | export(factorForGraph) 20 | export(flippedHistogram) 21 | export(generateDocType) 22 | export(getClaimFromURL) 23 | export(getClaimsText) 24 | export(importPatentData) 25 | export(makeColors) 26 | export(removeDups) 27 | export(runExample) 28 | export(showDups) 29 | export(summarizeColumns) 30 | export(summaryText) 31 | export(tilePlot) 32 | export(wordCloudIt) 33 | import(ReporteRs) 34 | import(ggplot2) 35 | importFrom(RColorBrewer,brewer.pal) 36 | importFrom(ReporteRs,addPlot) 37 | importFrom(ReporteRs,addSlide) 38 | importFrom(XML,getNodeSet) 39 | importFrom(XML,htmlParse) 40 | importFrom(XML,xmlValue) 41 | 
importFrom(XML,xpathSApply) 42 | importFrom(dplyr,arrange) 43 | importFrom(dplyr,group_by_) 44 | importFrom(dplyr,n) 45 | importFrom(dplyr,select_) 46 | importFrom(dplyr,summarize) 47 | importFrom(httr,GET) 48 | importFrom(lubridate,parse_date_time) 49 | importFrom(magrittr,"%>%") 50 | importFrom(plyr,ldply) 51 | importFrom(readxl,read_excel) 52 | importFrom(stringr,str_extract) 53 | importFrom(tm,Corpus) 54 | importFrom(tm,VectorSource) 55 | importFrom(tm,content_transformer) 56 | importFrom(tm,removePunctuation) 57 | importFrom(tm,removeWords) 58 | importFrom(tm,stopwords) 59 | importFrom(tm,tm_map) 60 | importFrom(wordcloud,wordcloud) 61 | -------------------------------------------------------------------------------- /inst/shiny/app/ui.R: -------------------------------------------------------------------------------- 1 | ## yang yao 2 | ## motivation: R Shiny gallery and look at urls below 3 | library(shiny) 4 | 5 | fluidPage( 6 | titlePanel("Uploading Files"), 7 | sidebarLayout( 8 | sidebarPanel( 9 | # http://shiny.rstudio.com/gallery/file-upload.html 10 | # https://shiny.rstudio.com/reference/shiny/latest/fileInput.html 11 | # http://stackoverflow.com/questions/29201155/how-to-validate-the-file-type-of-a-file-uploaded-by-the-user-in-a-shiny-app 12 | # http://stackoverflow.com/questions/30624201/read-excel-in-a-shiny-app 13 | fileInput('file1', 'Choose a File', 14 | accept=c('text/csv', 15 | 'text/comma-separated-values,text/plain', 16 | '.csv', 17 | '.xls', 18 | '.xlsx')), 19 | tags$hr(), 20 | checkboxInput('header', 'Header', TRUE), 21 | radioButtons('sep', 'Separator', 22 | c(Comma=',', 23 | Semicolon=';', 24 | Tab='\t'), 25 | ','), 26 | radioButtons('quote', 'Quote', 27 | c(None='', 28 | 'Double Quote'='"', 29 | 'Single Quote'="'"), 30 | '"'), 31 | actionButton('cleanDataButton',"Clean Data"), 32 | p("Click this button to clean the raw data"), 33 | downloadButton('downloadData',"Download Clean Data") 34 | ), 35 | mainPanel( 36 | tabsetPanel(type = 
"tabs", 37 | tabPanel("Data Table",tableOutput("contents"), tableOutput("cleanContents")), 38 | tabPanel("Score Count Plot",plotOutput("outplot1")) 39 | ) 40 | ) 41 | ) 42 | ) 43 | 44 | ## yang yao 45 | ## motivation: R Shiny gallery and look at urls below -------------------------------------------------------------------------------- /man/summarizeColumns.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/explorePatentData.R 3 | \name{summarizeColumns} 4 | \alias{summarizeColumns} 5 | \title{Summarize columns of a data frame} 6 | \usage{ 7 | summarizeColumns(df, names, naOmit = FALSE) 8 | } 9 | \arguments{ 10 | \item{df}{A data frame of patent data.} 11 | 12 | \item{names}{a character vector of header names that you want to summarize.} 13 | 14 | \item{naOmit}{Logical. Optionally, remove NA values at the end of the summary. 15 | Useful when comparing fields that have NA values, such as features.} 16 | } 17 | \value{ 18 | A dataframe of summarize values. 19 | } 20 | \description{ 21 | Summarize columns of a data frame. 22 | 23 | Summarize a data frame \code{df} by a \code{names} character vector of 24 | header names. 
25 | } 26 | \examples{ 27 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 28 | cleanNames = sumobrainNames, 29 | dateFields = sumobrainDateFields, 30 | dateOrders = sumobrainDateOrder, 31 | deduplicate = TRUE, 32 | cakcDict = patentr::cakcDict, 33 | docLengthTypesDict = patentr::docLengthTypesDict, 34 | keepType = "grant", 35 | firstAssigneeOnly = TRUE, 36 | assigneeSep = ";", 37 | stopWords = patentr::assigneeStopWords) 38 | 39 | # note that in reality, you need a patent analyst to carefully score 40 | # these patents, the score here is for demonstrational purposes 41 | score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 42 | score[score>3] <- 3 43 | score[score<0] <- 0 44 | sumo$score <- score 45 | scoreSum <- summarizeColumns(sumo, "score") 46 | scoreSum 47 | # load library(ggplot2) for the below part to run 48 | # ggplot(scoreSum, aes(x=score, y = total, fill=factor(score) )) + geom_bar(stat="identity") 49 | nameAndScore <- summarizeColumns(sumo, c("assigneeClean","score")) 50 | # tail(nameAndScore) 51 | 52 | } 53 | -------------------------------------------------------------------------------- /tests/testthat/test-imports.R: -------------------------------------------------------------------------------- 1 | # test that import works 2 | 3 | # files 4 | file1 <- system.file("extdata/", "sumobrain_autonomous_search1.xlsx", package="patentr") 5 | file2 <- system.file("extdata/", "sumobrain_autonomous_search2.xlsx", package="patentr") 6 | files <- list(file1, file2) 7 | # read it in 8 | patData <- importPatentData(files) 9 | 10 | # should be a data frame 11 | expect_true(is.data.frame(patData)) 12 | 13 | 14 | # test_that("importing a data file works",{ 15 | # # files 16 | # file1 <- system.file("inst/extdata/", "sumobrain_autonomous_search1.xlsx", package="patentr") 17 | # file2 <- system.file("inst/extdata/", "sumobrain_autonomous_search2.xlsx", package="patentr") 18 | # files <- list(file1, file2) 19 | # # read it in 
20 | # patData <- importPatentData(files) 21 | # 22 | # # should be a data frame 23 | # expect_true(is.data.frame(patData)) 24 | # }) 25 | 26 | # motivation 27 | # test taken from readxl package in tidyverse and modified 28 | # credit goes to tidyverse team 29 | # https://github.com/tidyverse/readxl/blob/83af028bcc577d23b01c4a1f47d2dfc314497253/tests/testthat/helper.R 30 | # NOTE: may need to cancel this test as readxl 0.1.1 still has error, only github version does not have error 31 | # this only works on 0.1.1.9000 (current at time of writing) and above 32 | test_that("can tolerate xls(x) that underreports number of columns",{ 33 | # tidyverse test 34 | #df <- readxl::read_excel(rprojroot::find_testthat_root_file("testData","mtcars.xls")) 35 | #expect_identical(ncol(df),ncol(mtcars)) 36 | # test modified 37 | # note, test cases modified to remove dependency on 0.1.1.9000 readxl, future versions (0.1.2+) 38 | # expected to fix this issue 39 | df2 <- readxl::read_excel(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), skip=1) 40 | expect_identical(ncol(df2),ncol(acars)) 41 | 42 | }) 43 | 44 | -------------------------------------------------------------------------------- /R/patentr.R: -------------------------------------------------------------------------------- 1 | ## yang yao start 2 | #' patentr: A package for analyzing patent data. 3 | #' 4 | #' The package is a data processing and reporting tool of patent data sets for patent analysts. 5 | #' 6 | #' 7 | #' The package is aimed at patent agents, lawyers, managers, analysts, and 8 | #' academics who are working on patents. This may be used in a patent landscape 9 | #' analysis, company IP portfolio analysis, or a freedom to operate search. 
10 | #' 11 | #' 12 | #' The patentr package provides four categories of important functions: 13 | #' 14 | #' \enumerate{ 15 | #' \item Data input and cleaning 16 | #' \item Data (re)processing 17 | #' \item Data exploration & visualization 18 | #' \item Visualization & reporting 19 | #' } 20 | #' 21 | #' 22 | #' @section load functions: 23 | #' 24 | #' \code{\link{importPatentData}}: Import xls patent data from filepaths. 25 | #' \code{\link{chooseFiles}}: Uses a popup window (Tk file dialog) to 26 | #' allow the user to choose a list of zero or more files interactively. 27 | #' 28 | #' @section clean functions: 29 | #' 30 | #' \code{\link{cleanHeaderNames}}, \code{\link{extractCountryCode}}, 31 | #' \code{\link{extractPubNumber}}, \code{\link{extractKindCode}}, 32 | #' \code{\link{extractDocLength}}, \code{\link{extractCleanDate}}, 33 | #' \code{\link{showDups}}, \code{\link{removeDups}}, \code{\link{generateDocType}}, 34 | #' \code{\link{cleanNames}}, \code{\link{cleanPatentData}} 35 | #' 36 | #' 37 | #' @section patentr data: 38 | #' \code{\link{acars}} To pay respect to the \code{\link[datasets]{mtcars}} data, 39 | #' this is a data set of autonomous driving car patents from major companies. 40 | #' \code{\link{kindCodes}} A data frame of kind codes by country with associated 41 | #' descriptions. 42 | #' \code{\link{docLengthTypes}} A data frame mapping doc length to the type of 43 | #' patent document. 
44 | #' 45 | #' @docType package 46 | #' @name patentr 47 | NULL 48 | 49 | ## yang yao end -------------------------------------------------------------------------------- /man/patentr.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/patentr.R 3 | \docType{package} 4 | \name{patentr} 5 | \alias{patentr} 6 | \alias{patentr-package} 7 | \title{patentr: A package for analyzing patent data.} 8 | \description{ 9 | The package is a data processing and reporting tool of patent data sets for patent analysts. 10 | } 11 | \details{ 12 | The package is aimed at patent agents, lawyers, managers, analysts, and 13 | academics who are working on patents. This may be used in a patent landscape 14 | analysis, company IP portfolio analysis, or a freedom to operate search. 15 | 16 | The patentr package provides four categories of important functions: 17 | 18 | \enumerate{ 19 | \item Data input and cleaning 20 | \item Data (re)processing 21 | \item Data exploration & visualization 22 | \item Visualization & reporting 23 | } 24 | } 25 | \section{load functions}{ 26 | 27 | 28 | \code{\link{importPatentData}}: Import xls patent data from filepaths. 29 | \code{\link{chooseFiles}}: Uses a popup window (Tk file dialog) to 30 | allow the user to choose a list of zero or more files interactively. 
31 | } 32 | 33 | \section{clean functions}{ 34 | 35 | 36 | \code{\link{cleanHeaderNames}}, \code{\link{extractCountryCode}}, 37 | \code{\link{extractPubNumber}}, \code{\link{extractKindCode}}, 38 | \code{\link{extractDocLength}}, \code{\link{extractCleanDate}}, 39 | \code{\link{showDups}}, \code{\link{removeDups}}, \code{\link{generateDocType}}, 40 | \code{\link{cleanNames}}, \code{\link{cleanPatentData}} 41 | } 42 | 43 | \section{patentr data}{ 44 | 45 | \code{\link{acars}} To pay respect to the \code{\link[datasets]{mtcars}} data, 46 | this is a data set of autonomous driving car patents from major companies. 47 | \code{\link{kindCodes}} A data frame of kind codes by country with associated 48 | descriptions. 49 | \code{\link{docLengthTypes}} A data frame mapping doc length to the type of 50 | patent document. 51 | } 52 | 53 | -------------------------------------------------------------------------------- /man/createGoogleURL.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/processPatentData.R 3 | \name{createGoogleURL} 4 | \alias{createGoogleURL} 5 | \title{Create a URL link to Google patents} 6 | \usage{ 7 | createGoogleURL(countryCode, pubNum, kindCode, 8 | googleURL = "https://patents.google.com/patent/", lang = "en") 9 | } 10 | \arguments{ 11 | \item{countryCode}{A character vector of the country code of the document. 12 | Typically a two-letter character.} 13 | 14 | \item{pubNum}{A character vector of the numeric portion of a publication number.} 15 | 16 | \item{kindCode}{character vector of the kind code of a document. 
If not available, 17 | enter a blank string "".} 18 | 19 | \item{googleURL}{A character string of the URL to Google Patents, with working 20 | default value.} 21 | 22 | \item{lang}{The language you want to read the patent, default set to "en" english.} 23 | } 24 | \value{ 25 | A character vector of properly formatted URL strings. 26 | } 27 | \description{ 28 | Create a URL string to link you to Google Patents. 29 | 30 | By concatenating the country code, publication number, and kind code, you can 31 | generate a URL to link you to google patents for further exploration. This 32 | feature is especially useful when browsing the data in a spreadsheet or in 33 | a Shiny app. It is also useful for extracting data from the HTML content. 34 | 35 | As each website (Google, lens.org, sumobrain.com, etc..) has a different 36 | method for generating patent URLs, these functions are website-specific. 37 | 38 | The original Google patents version still works as of March 2017 and the 39 | \code{googleURL} value is \code{https://www.google.com/patents/}. This older 40 | version may be easier to extract data. 
41 | } 42 | \examples{ 43 | acars$kindCode <- extractKindCode(acars$docNum) 44 | acars$pubName <- extractPubNumber(acars$docNum) 45 | acars$googleURL <- createGoogleURL(countryCode = acars$countryCode, 46 | pubNum = acars$pubNum, kindCode =acars$kindCode) 47 | head(acars$googleURL) 48 | 49 | } 50 | -------------------------------------------------------------------------------- /man/factorForGraph.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualizePatentData.R 3 | \name{factorForGraph} 4 | \alias{factorForGraph} 5 | \title{Factor one column by another column's popularity} 6 | \usage{ 7 | factorForGraph(df, xVal, fillVal, decFill = TRUE) 8 | } 9 | \arguments{ 10 | \item{df}{A data frame containing the x and fill value columns.} 11 | 12 | \item{xVal}{A character value from a header name in \code{df} 13 | that will be used as 14 | the x value in a ggplot2 plot.} 15 | 16 | \item{fillVal}{A character value from a header name in \code{df} 17 | that will be used as the 18 | fill value in a ggplot2 plot.} 19 | 20 | \item{decFill}{Sort fill value in decreasing order.} 21 | } 22 | \value{ 23 | A data frame with two of the columns factored. 24 | } 25 | \description{ 26 | Factor (or refactor) a data frame of values to be used 27 | for graphing in the correct order. 28 | 29 | Many graphs require a reordering when plotting with a fill value. This 30 | helper function factors the x-value of a plot that will be stacked by 31 | \code{fillVal}. 
32 | } 33 | \examples{ 34 | 35 | 36 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 37 | cleanNames = sumobrainNames, 38 | dateFields = sumobrainDateFields, 39 | dateOrders = sumobrainDateOrder, 40 | deduplicate = TRUE, 41 | cakcDict = patentr::cakcDict, 42 | docLengthTypesDict = patentr::docLengthTypesDict, 43 | keepType = "grant", 44 | firstAssigneeOnly = TRUE, 45 | assigneeSep = ";", 46 | stopWords = patentr::assigneeStopWords) 47 | 48 | # note that in reality, you need a patent analyst to carefully score 49 | # these patents, the score here is for demonstrational purposes 50 | score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 51 | score[score>3] <- 3 52 | score[score<0] <- 0 53 | sumo$score <- score 54 | dim(sumo) 55 | sumoFactor <- factorForGraph(sumo, "assigneeClean", "score") 56 | # if you want to view, uncomment and load ggplot2 57 | # ggplot(sumoFactor, aes(x=assigneeClean, y=score, fill=factor(score))) + 58 | # geom_bar(stat="identity") 59 | 60 | 61 | } 62 | -------------------------------------------------------------------------------- /man/cleanNames.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cleanPatentData.R 3 | \name{cleanNames} 4 | \alias{cleanNames} 5 | \title{Clean up string names.} 6 | \usage{ 7 | cleanNames(rawNames, firstAssigneeOnly = TRUE, sep = ";", 8 | removeStopWords = TRUE, stopWords = patentr::assigneeStopWords) 9 | } 10 | \arguments{ 11 | \item{rawNames}{The character vector you want to clean up} 12 | 13 | \item{firstAssigneeOnly}{A logical value, default set to TRUE, keeping only the first 14 | assignee if multiple exist.} 15 | 16 | \item{sep}{The separating character for multiple assignees, default set to semi-colon.} 17 | 18 | \item{removeStopWords}{Logical default TRUE, if want to remove common company stopwords 19 | found in the \code{stopWords} parameter.} 20 
| 21 | \item{stopWords}{An optional character vector of words you want to remove. Default to 22 | \code{\link{assigneeStopWords}}.} 23 | } 24 | \value{ 25 | A character vector of cleaned up character names. 26 | } 27 | \description{ 28 | Quick cleanup of characters in a string, 29 | typically assignee (company names) and the inventors. 30 | 31 | If you have issues with this, you may need to convert to UTF-8 or ASCII. 32 | Use the \code{iconv(thisVector, to="UTF-8")} or \code{to="ASCII"} and it should 33 | fix the problem. See the examples for the code. 34 | 35 | 36 | This function: 37 | \enumerate{ 38 | \item{Removes values between spaces, such as (US)} 39 | \item{Changes all names to lower case} 40 | } 41 | } 42 | \examples{ 43 | 44 | assigneeNames <- cleanNames(acars$assignee) 45 | # get a feel for the less-messy data 46 | head(sort(table(assigneeNames), decreasing = TRUE)) 47 | 48 | # for a messier example, note you need to convert to ASCII/UTF-8 to get rid of errors 49 | # associated with tolower 50 | rawGoogleData <- system.file("extdata", "google_autonomous_search.csv", package = "patentr") 51 | rawGoogleData <- read.csv(rawGoogleData, stringsAsFactors = FALSE, skip = patentr::skipGoogle) 52 | rawGoogleData <- data.frame(lapply(rawGoogleData, 53 | function(x){iconv(x, to = "ASCII")}), stringsAsFactors = FALSE) 54 | assigneeClean <- cleanNames(rawGoogleData$assignee) 55 | head(sort(table(assigneeClean), decreasing = TRUE)) 56 | 57 | } 58 | -------------------------------------------------------------------------------- /man/getClaimFromURL.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/processPatentData.R 3 | \name{getClaimFromURL} 4 | \alias{getClaimFromURL} 5 | \title{Get a claim from a Google Patents URL} 6 | \usage{ 7 | getClaimFromURL(googleURL, langCode = "en", whichClaim = 1) 8 | } 9 | \arguments{ 10 | \item{googleURL}{The 
well-formatted google URL built from \code{\link{createGoogleURL}}. 11 | It is a character value.} 12 | 13 | \item{langCode}{The language code, used check for non-english text.} 14 | 15 | \item{whichClaim}{Default set to 1, a numeric determining which claim to get. Note 16 | if claim is out of bounds, the return claim will be a blank character.} 17 | } 18 | \value{ 19 | A character vector of the claim from each Google URL. If no claim exists, 20 | or if the country code is not on the inclusion list, an empty character value is returned 21 | for that index. 22 | } 23 | \description{ 24 | Input a valid Google Patents URL of the form given below and 25 | then get back a claim from the index of your choosing. If no claim exists or 26 | if your index is out of bounds, an empty character string returns. 27 | 28 | The function works on strings that begin with the following sequence: 29 | \code{https://patents.google.com/patent/}. If the string sequence afterwards 30 | is invalid, a 404 status returns from the GET command and eventually an empty 31 | string returns. 
32 | } 33 | \examples{ 34 | 35 | \dontrun{ 36 | # works for USA 37 | aclaim <- getClaimFromURL("https://patents.google.com/patent/US8818682B1/en") 38 | print(aclaim) 39 | # test WO, EP 40 | woclaim <- getClaimFromURL("https://patents.google.com/patent/WO2015134152A1/en") 41 | print(woclaim) 42 | epclaim <- getClaimFromURL("https://patents.google.com/patent/EP2991875A1/en") 43 | print(epclaim) 44 | # test KR, JP, CN 45 | krclaim <- getClaimFromURL("https://patents.google.com/patent/KR20150127745A/en") 46 | cnclaim <- getClaimFromURL("https://patents.google.com/patent/CN104786953A/en") 47 | jpclaim <- getClaimFromURL("https://patents.google.com/patent/JP2016173842A/en") 48 | declaim <- getClaimFromURL("https://patents.google.com/patent/DE102014219223A1/en") 49 | } 50 | 51 | } 52 | \seealso{ 53 | \code{\link{createGoogleURL}}, \code{\link{extractCountryCode}}, 54 | \code{\link{cleanGoogleURL}} 55 | } 56 | -------------------------------------------------------------------------------- /man/acars.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/acars.R 3 | \docType{data} 4 | \name{acars} 5 | \alias{acars} 6 | \title{Autonomous Vehicle Patent Data from Sumobrain.com} 7 | \format{A data frame with 499 observations on 10 variables. 8 | \describe{ 9 | \item{docNum}{A published document number including the kind code, publication number, 10 | and kind code for the patent document.} 11 | \item{docTypeSumobrain}{Very similar to the country code, with minor additions, USAPP being the 12 | most noticable difference. 
} 13 | \item{pubDate}{Publication Date} 14 | \item{title}{Title} 15 | \item{abstract}{Abstract} 16 | \item{inventors}{Inventor Name} 17 | \item{assignee}{Assignee} 18 | \item{appNum}{Application Number} 19 | \item{dateFiled}{Filing Date} 20 | \item{classPrimary}{Primary Class} 21 | \item{classOthers}{Other Classes} 22 | }} 23 | \usage{ 24 | acars 25 | } 26 | \description{ 27 | An example data set of autonomous vehicle IP from major assignees. 28 | } 29 | \details{ 30 | The data search was performd on Monday, March 13, 2017 from sumobrain.com, and the exact 31 | search term was: 32 | 33 | \code{ABST/"autonomous" AN/(Apple* OR Google* OR Waymo* OR Tesla*} 34 | 35 | \code{OR Ford* OR General*) PD/12/13/1790->3/13/2017} 36 | 37 | View the search \href{http://www.sumobrain.com/result.html?p=1&stemming=on&sort=chron&uspat=on&usapp=on&eupat=on&jp=on&pct=on&collections=&srch=xprtsrch&date_range=all&hits=502&from_ss=&srch_id=&srch_name=&search_name=&selected_doc_flag=&selected_newdoc_flag=&selected_portfolio=&portfolio_name=&query_txt=ABST\%2F\%22autonomous\%22+AN\%2F\%28Apple*+OR+Google*+OR+Waymo*+OR+Tesla*+OR+Ford*+OR+General*\%29+PD\%2F12\%2F13\%2F1790-\%3E3\%2F13\%2F2017&search.x=0&search.y=0&search=search_ezy}{here}. 38 | 39 | For all collections (US patents, applications, EP documents, abstracts of Japan, and WIPO). 40 | 41 | Can get raw data with the following commands: 42 | 43 | \code{system.file("extdata", "sumobrain_autonomous_search1.xls", package = "patentr")} 44 | 45 | \code{system.file("extdata", "sumobrain_autonomous_search2.xls", package = "patentr")} 46 | } 47 | \seealso{ 48 | \url{http://www.sumobrain.com} You will need to create a free account to export data. 49 | 50 | \code{\link{acarsGoogle}} provides a similar search from Google. 51 | \code{\link{acarsLens}} provides a simialr search from Lens.org. 
52 | } 53 | \keyword{data} 54 | -------------------------------------------------------------------------------- /man/acarsGoogle.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/acars.R 3 | \docType{data} 4 | \name{acarsGoogle} 5 | \alias{acarsGoogle} 6 | \title{Autonomous Vehicle Patent Data from Google Patents} 7 | \format{A data frame with 316 observations on 9 variables. 8 | \describe{ 9 | \item{\code{docNum}}{A published document number including the kind code, publication number, 10 | and kind code for the patent document.} 11 | \item{\code{title}}{The title of the invention.} 12 | \item{\code{assignee}}{The owner of the document.} 13 | \item{\code{inventors}}{The name(s) of the inventor(s), separated by commas.} 14 | \item{\code{priorityDate}}{The earliest priority date on the application.} 15 | \item{\code{dateFiled}}{Date the document was filed. They call it filing/creation date.} 16 | \item{\code{pubDate}}{Date document became publicly available.} 17 | \item{\code{grantDate}}{Date the application became a grant. NA if there is no associated grant.} 18 | \item{\code{googleURL}}{The link to the Google Patents page for the document.} 19 | }} 20 | \usage{ 21 | acarsGoogle 22 | } 23 | \description{ 24 | An example data set of autonomous vehicle IP from major assignees. 25 | } 26 | \details{ 27 | The first row in the raw CSV export contains the search URL and is skipped. 28 | 29 | The data search was performed on Saturday, March 18, 2017 from patents.google.com, and the exact 30 | search: \href{https://patents.google.com/?q=AB\%3dautonomous&assignee=Apple*,Google*,Waymo*,Tesla*,Ford*,General*&before=filing:20170318}{Google Patents Search} 31 | For all countries available on Google.
32 | 33 | You process the raw data with the following commands: 34 | 35 | \code{temp <- system.file("extdata", "google_autonomous_search.csv", package = "patentr")} 36 | 37 | \code{# from the source package you can navigate to } 38 | 39 | \code{temp <- read.csv("inst/extdata/google_autonomous_search.csv", skip = 1, stringsAsFactors = FALSE)} 40 | 41 | \code{names(temp) <- googleNames} 42 | 43 | \code{temp <- data.frame(lapply(temp, function(x){iconv(x,to="ASCII")}),stringsAsFactors = FALSE)} 44 | 45 | \code{dateFields <- c("priorityDate","dateFiled","pubDate","grantDate")} 46 | 47 | \code{temp[dateFields] <- as.data.frame(lapply(temp[dateFields], as.Date, format="\%m/\%d/\%y"))} 48 | } 49 | \seealso{ 50 | \url{https://patents.google.com/} 51 | 52 | \code{\link{acars}} provides a similar search from Sumobrain. 53 | \code{\link{acarsLens}} provides a simialr search from Lens.org. 54 | } 55 | \keyword{data} 56 | -------------------------------------------------------------------------------- /man/summaryText.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/reportPatentData.R 3 | \name{summaryText} 4 | \alias{summaryText} 5 | \title{Add summary text to be used in a pptx slide} 6 | \usage{ 7 | summaryText(df, singular, plural, sumVar) 8 | } 9 | \arguments{ 10 | \item{df}{A summarized patent data frame, summarized by one variable. 11 | See \code{\link{summarizeColumns}}.} 12 | 13 | \item{singular}{The name of the variable, singular version. A character string. 14 | For example: assignee.} 15 | 16 | \item{plural}{The name of the variable, plural version. A character string. 17 | For example: assignees, with an 's'.} 18 | 19 | \item{sumVar}{The vector of the variable to summarize, taken from the original 20 | patent data set. For example \code{sumo$score} to summarize the score range.} 21 | } 22 | \value{ 23 | A length four character vector. 
24 | } 25 | \description{ 26 | Add a standard summarized text that will be used in 27 | association with a plot. 28 | } 29 | \examples{ 30 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 31 | cleanNames = sumobrainNames, 32 | dateFields = sumobrainDateFields, 33 | dateOrders = sumobrainDateOrder, 34 | deduplicate = TRUE, 35 | cakcDict = patentr::cakcDict, 36 | docLengthTypesDict = patentr::docLengthTypesDict, 37 | keepType = "grant", 38 | firstAssigneeOnly = TRUE, 39 | assigneeSep = ";", 40 | stopWords = patentr::assigneeStopWords) 41 | 42 | # note that in reality, you need a patent analyst to carefully score 43 | # these patents, the score here is for demonstrational purposes 44 | score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 45 | score[score>3] <- 3; score[score<0] <- 0 46 | sumo$score <- score 47 | sumo$assigneeSmall <- strtrim(sumo$assigneeClean,12) 48 | category <- c("system","control algorithm","product","control system", "communication") 49 | c <- round(rnorm(dim(sumo)[1],mean=2.5,sd=1.5)) 50 | c[c>5] <- 5; c[c<1] <- 1 51 | sumo$category <- category[c] 52 | feature1 <- c("adaptive", "park", "lane", NA,NA,NA,NA,NA, 53 | "brake", "steer","accelerate","deactivate") 54 | f <- round(rnorm(dim(sumo)[1],mean=5,sd=1)) 55 | l <- length(feature1) 56 | f[f>l] <- l; f[f<1] <- 1 57 | sumo$feature1 <- c(feature1,feature1[f])[1:dim(sumo)[1]] 58 | 59 | # Summarize the assignees 60 | as <- summarizeColumns(sumo, 'assigneeSmall') 61 | summaryText(as, 'assignee','assignees',sumo$score) 62 | # summarize the number of features 63 | f <- summarizeColumns(sumo, 'feature1', naOmit = TRUE) 64 | summaryText(f, 'feature','features',sumo$feature1) 65 | 66 | } 67 | -------------------------------------------------------------------------------- /man/flippedHistogram.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in 
R/visualizePatentData.R 3 | \name{flippedHistogram} 4 | \alias{flippedHistogram} 5 | \title{Plot a flipped histogram with a fill value} 6 | \usage{ 7 | flippedHistogram(df, xVal, fillVal, colors = patentr::scoreColors, 8 | recolor = FALSE) 9 | } 10 | \arguments{ 11 | \item{df}{The original data frame of patent data} 12 | 13 | \item{xVal}{A character value of a name in \code{df}} 14 | 15 | \item{fillVal}{A character value of a name in \code{df} to color the chart.} 16 | 17 | \item{colors}{A character vector of colors, the same length as the number of 18 | unique values in the column of \code{xVal[,fillVal]}. Default set to 19 | \code{scoreColors}.} 20 | 21 | \item{recolor}{A logical allowing you to choose to recolor the plot if the 22 | colors vector is not applicable to you. Default set to \code{FALSE}. Uses 23 | the helper function \code{\link{makeColors}} to generate colors. Note that your 24 | plot may fail if \code{colors} is not the same length as the number of unique 25 | values in fillVal and \code{recolor} is set to \code{FALSE}.} 26 | } 27 | \value{ 28 | A plot 29 | } 30 | \description{ 31 | Plot a flipped histogram with fill values. 32 | 33 | Often times, you want to plot a histogram showing patent documents 34 | faceted by one value and filled by another. 
35 | } 36 | \examples{ 37 | 38 | 39 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 40 | cleanNames = sumobrainNames, 41 | dateFields = sumobrainDateFields, 42 | dateOrders = sumobrainDateOrder, 43 | deduplicate = TRUE, 44 | cakcDict = patentr::cakcDict, 45 | docLengthTypesDict = patentr::docLengthTypesDict, 46 | keepType = "grant", 47 | firstAssigneeOnly = TRUE, 48 | assigneeSep = ";", 49 | stopWords = patentr::assigneeStopWords) 50 | 51 | # note that in reality, you need a patent analyst to carefully score 52 | # these patents, the score here is for demonstrational purposes 53 | score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 54 | score[score>3] <- 3 55 | score[score<0] <- 0 56 | sumo$score <- score 57 | sumo$assigneeSmall <- strtrim(sumo$assigneeClean,12) 58 | flippedHistogram(sumo, "assigneeSmall","score",colors=scoreColors) 59 | flippedHistogram(subset(sumo, score > 0), "assigneeSmall","score",colors=scoreColors) 60 | flippedHistogram(subset(sumo, score > 2) ,"score","assigneeSmall",colors=scoreColors, 61 | recolor = TRUE) 62 | flippedHistogram(subset(sumo, score > 2) ,"assigneeSmall","docType",colors=scoreColors, 63 | recolor = TRUE) 64 | 65 | } 66 | \seealso{ 67 | \code{\link{makeColors}}, \code{\link{capWord}} 68 | } 69 | -------------------------------------------------------------------------------- /R/explorePatentData.R: -------------------------------------------------------------------------------- 1 | # explore patent data, goes hand-in-hand with visualization 2 | ## yang yao start 3 | 4 | #' Summarize columns of a data frame 5 | #' 6 | #' @description Summarize columns of a data frame. 7 | #' 8 | #' Summarize a data frame \code{df} by a \code{names} character vector of 9 | #' header names. 10 | #' 11 | #' @param df A data frame of patent data. 12 | #' @param names a character vector of header names that you want to summarize. 13 | #' @param naOmit Logical. 
Optionally, remove NA values at the end of the summary. 14 | #' Useful when comparing fields that have NA values, such as features. 15 | #' 16 | #' @return A data frame of summarized values. 17 | #' 18 | #' @examples 19 | #' sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 20 | #' cleanNames = sumobrainNames, 21 | #' dateFields = sumobrainDateFields, 22 | #' dateOrders = sumobrainDateOrder, 23 | #' deduplicate = TRUE, 24 | #' cakcDict = patentr::cakcDict, 25 | #' docLengthTypesDict = patentr::docLengthTypesDict, 26 | #' keepType = "grant", 27 | #' firstAssigneeOnly = TRUE, 28 | #' assigneeSep = ";", 29 | #' stopWords = patentr::assigneeStopWords) 30 | #' 31 | #' # note that in reality, you need a patent analyst to carefully score 32 | #' # these patents, the score here is for demonstrational purposes 33 | #' score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 34 | #' score[score>3] <- 3 35 | #' score[score<0] <- 0 36 | #' sumo$score <- score 37 | #' scoreSum <- summarizeColumns(sumo, "score") 38 | #' scoreSum 39 | #' # load library(ggplot2) for the below part to run 40 | #' # ggplot(scoreSum, aes(x=score, y = total, fill=factor(score) )) + geom_bar(stat="identity") 41 | #' nameAndScore <- summarizeColumns(sumo, c("assigneeClean","score")) 42 | #' # tail(nameAndScore) 43 | #' 44 | #' @export 45 | #' 46 | #' @importFrom dplyr group_by across all_of 47 | #' @importFrom dplyr summarize 48 | #' @importFrom dplyr arrange 49 | #' @importFrom dplyr n 50 | #' @importFrom magrittr %>% 51 | #' 52 | summarizeColumns <- function(df, names, naOmit = FALSE){ 53 | 54 | # group_by_() is defunct in current dplyr; group_by(across(all_of(names))) 55 | # accepts the character vector of column names directly, so the old 56 | # lapply(names, as.name) conversion is no longer needed 57 | 58 | # for an error fix 59 | # http://stackoverflow.com/questions/9439256/how-can-i- 60 | # handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when 61 | total <- NULL 62 | # group by names, count the rows per group, and arrange smallest to largest; 63 | # .groups = "drop_last" keeps the historical summarize() grouping behavior 64 | df <- df %>% 65 | dplyr::group_by(dplyr::across(dplyr::all_of(names))) %>% 66 | dplyr::summarize(total = dplyr::n(), .groups = "drop_last") %>% 67 | dplyr::arrange(total) 68 | 69 | if(naOmit){ 70 | df <- df %>% 71 | stats::na.omit() 72 | } 73 | return(df) 74 | } 75 | 76 | 77 | 78 | ## yang yao end -------------------------------------------------------------------------------- /man/cleanHeaderNames.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cleanPatentData.R 3 | \name{cleanHeaderNames} 4 | \alias{cleanHeaderNames} 5 | \title{Generate a standard set of header names for import data} 6 | \usage{ 7 | cleanHeaderNames(patentData = NA, 8 | columnsExpected = patentr::sumobrainColumns, 9 | cleanNames = patentr::sumobrainNames) 10 | } 11 | \arguments{ 12 | \item{patentData}{A data frame. Default is NA.} 13 | 14 | \item{columnsExpected}{An expected number of columns. 15 | Default is Sumobrain \code{\link{sumobrainColumns}} data.} 16 | 17 | \item{cleanNames}{A standard list of clean names. Default is Sumobrain 18 | \code{\link{sumobrainNames}} data.} 19 | } 20 | \value{ 21 | A data frame 11 columns wide, with standard column names used in other 22 | functions. 23 | } 24 | \description{ 25 | Create a standard nameset from Sumobrain import data. 26 | See \code{\link{acars}} for the name set. 27 | 28 | There are three main sources of free and exportable patent data from the internet: 29 | \enumerate{ 30 | \item{\href{www.sumobrain.com}{Sumobrain}} 31 | \item{\href{www.lens.org}{The Lens}} 32 | \item{\href{www.patents.google.com}{Google}} 33 | } 34 | 35 | These three popular sites have varying levels of exportable data available. 36 | Sumobrain tends to be the most comprehensive, followed by Lens, and finally 37 | by Google. Thus, all three have hardcoded data available in the \code{patentr} 38 | package. 39 | 40 | To download Sumobrain data, go to \url{http://www.sumobrain.com} and create a free 41 | account.
Then run your search, export the data (250 max at a time), and use the 42 | \code{\link{chooseFiles}} and \code{\link{importPatentData}} functions to load 43 | the data into R. 44 | 45 | To download Lens data, go to \url{www.lens.org}. You do not need to create an 46 | account. Run your search, and in the header section, look for the cloud icon 47 | with a downward arrow. Choose the CSV option. 48 | 49 | To download Google patent data, visit \url{www.patents.google.com}, run 50 | your search, and click "Download (CSV)" in the upper left-hand corner. 51 | } 52 | \examples{ 53 | cleanData <- cleanHeaderNames(patentData = acars) 54 | cleanDataLens <- cleanHeaderNames(patentData = acarsLens, 55 | columnsExpected = lensColumns, cleanNames = lensNames) 56 | 57 | } 58 | \seealso{ 59 | \enumerate{ 60 | \item{\code{\link{sumobrainColumns}}} 61 | \item{\code{\link{sumobrainNames}}} 62 | \item{\code{\link{skipSumobrain}}} 63 | \item{\code{\link{googleColumns}}} 64 | \item{\code{\link{googleNames}}} 65 | \item{\code{\link{skipGoogle}}} 66 | \item{\code{\link{lensColumns}}} 67 | \item{\code{\link{lensNames}}} 68 | \item{\code{\link{skipLens}}} 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /man/generateDocType.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cleanPatentData.R 3 | \name{generateDocType} 4 | \alias{generateDocType} 5 | \title{Determine the patent document type} 6 | \usage{ 7 | generateDocType(officeDocLength, countryAndKindCode, 8 | cakcDict = patentr::cakcDict, 9 | docLengthTypesDict = patentr::docLengthTypesDict) 10 | } 11 | \arguments{ 12 | \item{officeDocLength}{The concat value of country code and number of numerical digits. 13 | Extracted using the \code{\link{extractDocLength}} function.} 14 | 15 | \item{countryAndKindCode}{The concat value of the country code and kind code. 
16 | Extracted using the \code{\link{extractCountryCode}} and \code{\link{extractKindCode}} 17 | functions.} 18 | 19 | \item{cakcDict}{A country and kind code dictionary. Default is \code{\link{cakcDict}}.} 20 | 21 | \item{docLengthTypesDict}{A document length and type dictionary. Default is \code{\link{docLengthTypesDict}}.} 22 | } 23 | \value{ 24 | A vector of characters labeling the document type, with NA for when 25 | no match was found. 26 | } 27 | \description{ 28 | Determine the type of document from the patent publication data. 29 | 30 | Often times, data exports from publicly available sources do not provide the 31 | type of patent document, or, if provided, still requires standardization. By 32 | using the kind code, country code, and pre-developed dictionaries for doc length 33 | and country code, you can get a great approximation of the types of documents. 34 | 35 | Note that you can use View(lens[lens$docType=="NA",]) to view the not-found 36 | document types. Often times, these are small countries. You can add to the 37 | \code{\link{cakcDict}} to fix these. They are also useful to ignore if you 38 | only want to focus on the larger countries, which are all covered.
39 | } 40 | \examples{ 41 | 42 | acars <- acars 43 | acars$pubNum <- extractPubNumber(acars$docNum) # pubnum, ex #### 44 | acars$countryCode <- extractCountryCode(acars$docNum) # country code, ex USAPP, USD 45 | acars$officeDocLength <- extractDocLength(countryCode = acars$countryCode, 46 | pubNum = acars$pubNum) # cc + pub num length concat 47 | acars$kindCode <- extractKindCode(acars$docNum) 48 | acars$countryAndKindCode <- with(acars, paste0(countryCode, kindCode)) 49 | 50 | acars$docType <- generateDocType(officeDocLength = acars$officeDocLength, 51 | countryAndKindCode = acars$countryAndKindCode, 52 | cakcDict = cakcDict, 53 | docLengthTypesDict = docLengthTypesDict) 54 | table(acars$docType) 55 | 56 | 57 | 58 | } 59 | \seealso{ 60 | \code{\link{cakcDict}}, \code{\link{docLengthTypesDict}} 61 | } 62 | -------------------------------------------------------------------------------- /inst/shiny/app/server.R: -------------------------------------------------------------------------------- 1 | ## yang yao 2 | ## motivation: R Shiny gallery and look at urls in ui.R 3 | library(shiny) 4 | 5 | 6 | ## kamil bojanczyk start 7 | function(input, output) { 8 | #read in file as rawdata 9 | rawData <- reactive({ 10 | inFile <- input$file1 11 | if (is.null(inFile)) 12 | return(NULL) 13 | if("csv" %in% unlist(strsplit(inFile$type,"[/]"))){ 14 | # print("reading file") 15 | read.csv(inFile$datapath, header=input$header, sep=input$sep, 16 | quote=input$quote) 17 | } else{ 18 | ext <- tools::file_ext(inFile$name) 19 | # print(paste("extention is",ext)) 20 | file.rename(inFile$datapath, 21 | paste(inFile$datapath, ext, sep=".")) 22 | readxl::read_excel(paste(inFile$datapath, ext, sep="."), 1) 23 | } 24 | }) 25 | ## kamil bojanczyk end 26 | #show raw data in a table 27 | output$contents <- renderTable({rawData()}) 28 | 29 | # clean the raw data 30 | cleanData <- eventReactive(input$cleanDataButton, { 31 | df <- rawData() 32 | if(is.null(df)) return(NULL) 33 | cleanPatentData(df, 
columnsExpected = sumobrainColumns, 34 | cleanNames = sumobrainNames, 35 | dateFields = sumobrainDateFields, 36 | dateOrders = sumobrainDateOrder, 37 | deduplicate = TRUE, 38 | cakcDict = patentr::cakcDict, 39 | docLengthTypesDict = patentr::docLengthTypesDict, 40 | keepType = "grant", 41 | firstAssigneeOnly = TRUE, 42 | assigneeSep = ";", 43 | stopWords = patentr::assigneeStopWords) 44 | }) 45 | 46 | #show the clean data in the tab 47 | output$cleanContents <- renderTable({ 48 | cleanData() 49 | }) 50 | 51 | #show the first plot 52 | output$outplot1 <- renderPlot({ 53 | df2<-cleanData() 54 | df2$assigneeSmall <- strtrim(df2$assigneeClean,12) 55 | score <- round(rnorm(dim(df2)[1],mean=1.4,sd=0.9)) 56 | score[score>3] <- 3 57 | score[score<0] <- 0 58 | df2$score <- score 59 | scoreSum <- summarizeColumns(df2, "score") 60 | ggplot(scoreSum, aes(x=score, y = total, fill=factor(score) )) + geom_bar(stat="identity") 61 | # score[score>3] <- 3 62 | # score[score<0] <- 0 63 | # df2$score <- score 64 | # df2 <- df2[score >2,] 65 | # flippedHistogram(df2, "assigneeSmall","score",colors=scoreColors) 66 | 67 | }) 68 | 69 | #download the clean data 70 | output$downloadData <- downloadHandler( 71 | filename = ('clean.csv'), 72 | content = function(file) { 73 | write.csv(cleanData(), file) 74 | } 75 | ) 76 | } 77 | 78 | 79 | 80 | ## yang yao 81 | ## motivation: R Shiny gallery and look at urls in ui.R -------------------------------------------------------------------------------- /man/facetPlot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualizePatentData.R 3 | \name{facetPlot} 4 | \alias{facetPlot} 5 | \title{Make a tiled plot} 6 | \usage{ 7 | facetPlot(df, xVal, fillVal, facetVal, colors = patentr::scoreColors, 8 | recolor = FALSE) 9 | } 10 | \arguments{ 11 | \item{df}{A data frame of the cleaned data you want to plot.} 12 | 13 | \item{xVal}{A 
character string of the x value you want for your plot, must be a 14 | name of the header in \code{df}.} 15 | 16 | \item{fillVal}{A character string of the fill value you want for your plot, must be a 17 | name of the header in \code{df}.} 18 | 19 | \item{facetVal}{A character string of the facet you want for your plot, must be a 20 | name of the header in \code{df}.} 21 | 22 | \item{colors}{A character vector of colors, the same length as the number of 23 | unique values in the column of \code{xVal[,fillVal]}. Default set to 24 | \code{scoreColors}.} 25 | 26 | \item{recolor}{A logical allowing you to choose to recolor the plot if the 27 | colors vector is not applicable to you. Default set to \code{FALSE}. Uses 28 | the helper function \code{\link{makeColors}} to generate colors. Note that your 29 | plot may fail if \code{colors} is not the same length as the number of unique 30 | values in fillVal and \code{recolor} is set to \code{FALSE}.} 31 | } 32 | \value{ 33 | A ggplot2 plot object. 34 | } 35 | \description{ 36 | Tile plot an x and y variable by facet z. 37 | 38 | Tile plots are a great way to show a dense amount of information in one 39 | plot sequence. Plotting document count by category, and plotting by assignee, 40 | is one example.
41 | } 42 | \examples{ 43 | 44 | \dontrun{ 45 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 46 | cleanNames = sumobrainNames, 47 | dateFields = sumobrainDateFields, 48 | dateOrders = sumobrainDateOrder, 49 | deduplicate = TRUE, 50 | cakcDict = patentr::cakcDict, 51 | docLengthTypesDict = patentr::docLengthTypesDict, 52 | keepType = "grant", 53 | firstAssigneeOnly = TRUE, 54 | assigneeSep = ";", 55 | stopWords = patentr::assigneeStopWords) 56 | 57 | # note that in reality, you need a patent analyst to carefully score 58 | # these patents, the score here is for demonstrational purposes 59 | score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 60 | score[score>3] <- 3; score[score<0] <- 0 61 | sumo$score <- score 62 | sumo$assigneeSmall <- strtrim(sumo$assigneeClean,12) 63 | category <- c("system","control algorithm","product","control system", "communication") 64 | c <- round(rnorm(dim(sumo)[1],mean=2.5,sd=1.5)) 65 | c[c>5] <- 5; c[c<1] <- 1 66 | sumo$category <- category[c] 67 | 68 | xVal = "category" 69 | fillVal = "score" 70 | facetVal = "assigneeSmall" 71 | 72 | facetPlot(subset(sumo, score > 0), xVal, fillVal, facetVal, colors = patentr::scoreColors, 73 | recolor = FALSE) 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /man/removeDups.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cleanPatentData.R 3 | \name{removeDups} 4 | \alias{removeDups} 5 | \title{Remove duplicate entries in a patent data set} 6 | \usage{ 7 | removeDups(input, hasDup = NA, docType = NA, keepType = "grant") 8 | } 9 | \arguments{ 10 | \item{input}{A vector or a data frame which you wish to remove duplicate values. 11 | When choosing a data frame, you are more selective. 
For example, you may want to 12 | remove a patent document only if it has the same docNum and country code.} 13 | 14 | \item{hasDup}{A logical vector noting if a duplicate exists. If NA, ignore. The 15 | \code{\link{showDups}} function helps with this input.} 16 | 17 | \item{docType}{A character vector of the type of patent document (app, grant, etc.). 18 | If NA, ignore.} 19 | 20 | \item{keepType}{A character variable denoting which document type to keep. Default is "grant". 21 | If NA, ignore.} 22 | } 23 | \value{ 24 | A logical vector used to remove duplicate documents not fitting the one 25 | chosen. TRUE is for the document to keep. 26 | } 27 | \description{ 28 | Remove duplicate values in the patent data. Typically you will 29 | want to check if you have repeat document numbers. A document number should be 30 | a unique number in your dataset, thus, having a duplicate document number in your 31 | data set should be avoided. You can optionally specify which document type to keep. 32 | 33 | Often times, your data sets contain duplicate patent entries. This function is 34 | a wrapper function of the \code{\link[base]{duplicated}} function, 35 | applied to a dataframe or vector. 36 | 37 | For example, if you have the vector [US123, US123, US456], you will get the value 38 | TRUE FALSE TRUE and the duplicate value is removed. 39 | 40 | You can go deeper with the optional variables. For many analyses, we want to exclude the 41 | second document, typically the application. This function allows you to choose 42 | which document type to keep and the rest get thrown out.
43 | } 44 | \examples{ 45 | 46 | # simple removal: see how many rows were removed 47 | dim(acars) - dim(acars[removeDups(acars$appNum),]) 48 | 49 | # specific removal: keep the grant docs 50 | hasDup <- showDups(acars$appNum) 51 | pubNum <- extractPubNumber(acars$docNum) 52 | countryCode <- extractCountryCode(acars$docNum) 53 | officeDocLength <- extractDocLength(countryCode = countryCode, pubNum = pubNum) 54 | kindCode <- extractKindCode(acars$docNum) 55 | countryAndKindCode <- paste0(countryCode, kindCode) 56 | docType <- generateDocType(officeDocLength = officeDocLength, 57 | countryAndKindCode = countryAndKindCode, 58 | cakcDict = patentr::cakcDict, 59 | docLengthTypesDict = patentr::docLengthTypesDict) 60 | keepType <- "grant" 61 | toKeep <- removeDups(acars$appNum, hasDup = hasDup, docType = docType, keepType = keepType) 62 | table(toKeep) 63 | acarsDedup <- acars[toKeep, ] 64 | 65 | 66 | } 67 | \seealso{ 68 | \code{\link[base]{duplicated}}, \code{\link{showDups}} 69 | } 70 | -------------------------------------------------------------------------------- /man/addFullImagePptx.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/reportPatentData.R 3 | \name{addFullImagePptx} 4 | \alias{addFullImagePptx} 5 | \title{Add a full-sized plot image to a pptx} 6 | \usage{ 7 | addFullImagePptx(ppt, plot, slide_layout = "Title and Content", w = 13.3, 8 | h = 7.5) 9 | } 10 | \arguments{ 11 | \item{ppt}{A ppt object to add a slide to.} 12 | 13 | \item{plot}{A plot output object from ggplto2.} 14 | 15 | \item{slide_layout}{A character value, slide layout, default value is 16 | \code{"Title and Content"}.} 17 | 18 | \item{w}{Width in inches, default set to max width 13.3} 19 | 20 | \item{h}{Height in inches, default set to max height 7.5} 21 | } 22 | \value{ 23 | a pptx object. 
24 | } 25 | \description{ 26 | Take a plot image from ggplot2 and size it to fit an entire 27 | slide. 28 | } 29 | \examples{ 30 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 31 | cleanNames = sumobrainNames, 32 | dateFields = sumobrainDateFields, 33 | dateOrders = sumobrainDateOrder, 34 | deduplicate = TRUE, 35 | cakcDict = patentr::cakcDict, 36 | docLengthTypesDict = patentr::docLengthTypesDict, 37 | keepType = "grant", 38 | firstAssigneeOnly = TRUE, 39 | assigneeSep = ";", 40 | stopWords = patentr::assigneeStopWords) 41 | 42 | # note that in reality, you need a patent analyst to carefully score 43 | # these patents, the score here is for demonstrational purposes 44 | score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 45 | score[score>3] <- 3; score[score<0] <- 0 46 | sumo$score <- score 47 | sumo$assigneeSmall <- strtrim(sumo$assigneeClean,12) 48 | category <- c("system","control algorithm","product","control system", "communication") 49 | c <- round(rnorm(dim(sumo)[1],mean=2.5,sd=1.5)) 50 | c[c>5] <- 5; c[c<1] <- 1 51 | sumo$category <- category[c] 52 | 53 | xVal = "category" 54 | fillVal = "score" 55 | facetVal = "assigneeSmall" 56 | 57 | fp <- facetPlot(subset(sumo, score > 0), xVal, fillVal, facetVal, colors = patentr::scoreColors, 58 | recolor = FALSE) 59 | 60 | 61 | 62 | # create a ppt 63 | ppt <- ReporteRs::pptx(title="IP Update") 64 | # view the types of layouts available by default 65 | # slide.layouts(ppt) 66 | layoutTitleContent = "Title and Content" 67 | 68 | fp <- facetPlot(subset(sumo, score > 0), xVal, fillVal, facetVal, colors = patentr::scoreColors, 69 | recolor = FALSE) 70 | ppt <- addFullImagePptx(ppt, plot = fp, slide_layout = layoutTitleContent) 71 | fp <- facetPlot(subset(sumo, score > 1), xVal, fillVal, facetVal, colors = patentr::scoreColors, 72 | recolor = FALSE) 73 | ppt <- addFullImagePptx(ppt, plot = fp, slide_layout = layoutTitleContent) 74 | fp <- facetPlot(subset(sumo, score > 2), xVal, 
fillVal, facetVal, colors = patentr::scoreColors, 75 | recolor = FALSE) 76 | ppt <- addFullImagePptx(ppt, plot = fp, slide_layout = layoutTitleContent) 77 | 78 | 79 | # find a data folder and write it out to your folder 80 | # out <- paste("data/",Sys.Date(),"_exampleChartRightTextLeft.pptx",sep='') 81 | # ReporteRs::writeDoc(ppt, out) 82 | 83 | 84 | } 85 | \seealso{ 86 | \code{\link{addChartRightTextLeftPptx}} 87 | } 88 | -------------------------------------------------------------------------------- /R/importPatentData.R: -------------------------------------------------------------------------------- 1 | ## kamil bojanczyk start 2 | #' Read in a data file or list of files from excel spreadsheets. 3 | #' 4 | #' @description Import, read, and connect patent data files. Currently: xls files 5 | #' from a filepath. 6 | #' Future use: can read from a URL, an xlsx file, google doc, and a csv. 7 | #' 8 | #' @param rawDataFilePath A filepath, or list of filespaths, for xls files. 9 | #' @param skipLines Number of lines to skip before reading in your data file. 10 | #' @return A single data frame of all data. NULL if no data. 11 | #' @examples \dontrun{ 12 | #' 13 | #' # access the files here and put them in a data/ folder of your working directory. 14 | #' file1 <- system.file("extdata/", "sumobrain_autonomous_search1.xlsx", package="patentr") 15 | #' file2 <- system.file("extdata/", "sumobrain_autonomous_search2.xlsx", package="patentr") 16 | #' files <- list(file1, file2) 17 | #' ipData <- importPatentData(rawDataFilePath = files, skipLines = 1) 18 | #' 19 | #' # example 2 20 | #' # assume csv files are in the data folder 21 | #' ipData <- importPatentData(rawDataFilePath = list.files('data/', full.names=T), skipLines = 1) 22 | #' } 23 | #' 24 | #' 25 | #' @export 26 | #' 27 | #' @importFrom readxl read_excel 28 | #' @importFrom plyr ldply 29 | #' 30 | importPatentData <- function(rawDataFilePath = NA, skipLines = 1){ 31 | 32 | # grep all files that end in "xls". 
This is a lazy-mans error-check. 33 | filePaths <- rawDataFilePath[grep(".*.xls",rawDataFilePath,ignore.case=T)] 34 | if (length(filePaths) == 0){ 35 | warning("Inputted filepath list: ",rawDataFilePath,"\ndoes not contain any xls files.") 36 | # exit 37 | return(NULL) 38 | } 39 | else { 40 | # use read_excel from the readxl package 41 | # note: on xls files the last column might get dropped 42 | # a fix was supposed to have worked in Feb 2017 43 | # https://github.com/tidyverse/readxl/issues/152 44 | rawData <- lapply(rawDataFilePath, readxl::read_excel, skip = skipLines) 45 | 46 | # clean the data with ldply, unlists data and creates single data frame 47 | cleanData <- plyr::ldply(rawData) 48 | print(paste("Successfull loaded in a file with",dim(cleanData)[1], "rows and",dim(cleanData)[2],"columns.")) 49 | return(cleanData) 50 | } 51 | 52 | } 53 | 54 | 55 | 56 | #' Allow the user to navigate to files manually. 57 | #' 58 | #' @description Uses a popup window (Tk file dialog) to allow the user to choose a list of zero or more files interactively. 59 | #' 60 | #' @return A list of character vectors with absolute pathnames to files. 
61 | #' 62 | #' @examples \dontrun{ 63 | #' filePaths <- chooseFiles() 64 | #' allData <- importPatentData(filePaths) 65 | #' } 66 | #' @export 67 | #' 68 | chooseFiles <- function() { 69 | ## Note: adding in @importFrom tcltk tk_choose.files breaks the build 70 | # on linux machines, it may be for the reason below (tcltk may not be installed 71 | # on some R builds) 72 | # http://r.789695.n4.nabble.com/Where-is-the-tcltk-package-td3434915.html 73 | # apparently do not need to necessarily include tcltk in Depends 74 | 75 | files <- tcltk::tk_choose.files(caption = "Select the file(s) you wish to read") 76 | files 77 | } 78 | 79 | ## kamil bojanczyk end -------------------------------------------------------------------------------- /inst/examples/edaPatentGuide.R: -------------------------------------------------------------------------------- 1 | ### simple exploratory data anaylsis guide 2 | 3 | # 1 read in data 4 | # access the files here and put them in a data/ folder of your working directory. 
5 | file1 <- system.file("extdata/", "sumobrain_autonomous_search1.xls", package="patentr") 6 | file2 <- system.file("extdata/", "sumobrain_autonomous_search2.xls", package="patentr") 7 | files <- list(file1, file2) 8 | ipData <- importPatentData(rawDataFilePath = files, skipLines = 1) 9 | 10 | 11 | # 2 clean data that was read in 12 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 13 | cleanNames = sumobrainNames, 14 | dateFields = sumobrainDateFields, 15 | dateOrders = sumobrainDateOrder, 16 | deduplicate = TRUE, 17 | cakcDict = patentr::cakcDict, 18 | docLengthTypesDict = patentr::docLengthTypesDict, 19 | keepType = "grant", 20 | firstAssigneeOnly = TRUE, 21 | assigneeSep = ";", 22 | stopWords = patentr::assigneeStopWords) 23 | View(sumo) 24 | 25 | 26 | # 3 explore data 27 | # note that in reality, you need a patent analyst to carefully score 28 | # these patents, the score here is for demonstrational purposes 29 | score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 30 | score[score>3] <- 3 31 | score[score<0] <- 0 32 | sumo$score <- score 33 | scoreSum <- summarizeColumns(sumo, "score") 34 | scoreSum 35 | # load library(ggplot2) for the below part to run 36 | # ggplot(scoreSum, aes(x=score, y = total, fill=factor(score) )) + geom_bar(stat="identity") 37 | nameAndScore <- summarizeColumns(sumo, c("assigneeClean","score")) 38 | tail(nameAndScore) 39 | names(sumo) 40 | tail(summarizeColumns(sumo, c("docType","score","countryCode"))) 41 | 42 | # 4 visualize 43 | ## 4-1 histogram 44 | sumo$assigneeSmall <- strtrim(sumo$assigneeClean,12) 45 | flippedHistogram(sumo, "assigneeSmall","score",colors=scoreColors) 46 | 47 | flippedHistogram(subset(sumo, score > 0), "assigneeSmall","score",colors=scoreColors) 48 | 49 | flippedHistogram(subset(sumo, score > 2) ,"assigneeSmall","docType",colors=scoreColors, 50 | recolor = TRUE) 51 | 52 | ## 4-2 facet plot 53 | category <- c("system","control algorithm","product","control system", 
"communication") 54 | c <- round(rnorm(dim(sumo)[1],mean=2.5,sd=1.5)) 55 | c[c>5] <- 5; c[c<1] <- 1 56 | sumo$category <- category[c] 57 | 58 | xVal = "category" 59 | fillVal = "score" 60 | facetVal = "assigneeSmall" 61 | 62 | # warning, if xVal has more than 10 unique vals, it is hard to see 63 | facetPlot(subset(sumo, score > 1), xVal, fillVal, facetVal, colors = patentr::scoreColors, 64 | recolor = FALSE) 65 | 66 | 67 | ## 4-3 tile plots 68 | feature1 <- c("adaptive", "park", "lane", NA,NA,NA,NA,NA, 69 | "brake", "steer","accelerate","deactivate") 70 | f <- round(rnorm(dim(sumo)[1],mean=5,sd=1)) 71 | l <- length(feature1) 72 | f[f>l] <- l; f[f<1] <- 1 73 | sumo$feature1 <- c(feature1,feature1[f])[1:dim(sumo)[1]] 74 | 75 | tilePlot(sumo, "category", "feature1") 76 | 77 | tilePlot(sumo, xVal = "assigneeSmall", tileVal = "feature1", fillVal = "category", 78 | xangle=90, xhjust=0, showLegend = TRUE) 79 | 80 | tilePlot(sumo, xVal = "assigneeSmall", tileVal = "feature1", fillVal = "category", 81 | xangle=90, xhjust=0, showLegend = TRUE, facetVal = "docType", fscale = "fixed") 82 | 83 | tilePlot(sumo, xVal = "assigneeSmall", tileVal = "feature1", fillVal = "category", 84 | xangle=90, xhjust=0, showLegend = TRUE, facetVal = "docType", fscale = "free") 85 | 86 | tilePlot(sumo, xVal = "assigneeSmall", tileVal = "feature1", fillVal = "category", 87 | xangle=90, xhjust=0, showLegend = TRUE, facetVal = "score", fscale = "free") 88 | -------------------------------------------------------------------------------- /man/tilePlot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualizePatentData.R 3 | \name{tilePlot} 4 | \alias{tilePlot} 5 | \title{Make a facet tile plot to view two features.} 6 | \usage{ 7 | tilePlot(df, xVal, tileVal, fillVal = NA, xangle = 0, xhjust = 0.5, 8 | showLegend = FALSE, facetVal = NA, fscale = c("free", "fixed")) 9 | } 10 | 
\arguments{ 11 | \item{df}{The patent data frame you want to graph.} 12 | 13 | \item{xVal}{The x value you will be plotting, a character value that is a 14 | name of \code{df}.} 15 | 16 | \item{tileVal}{The tile value you will be plotting, a character value that is a 17 | name of \code{df}.} 18 | 19 | \item{fillVal}{An optional value for filling the color of the tiles on a third 20 | variable. Default set to \code{NA} and evaluates to \code{xVal}.} 21 | 22 | \item{xangle}{A numeric 0 to 360 value for the angle of the x axis text} 23 | 24 | \item{xhjust}{Double value between 0 and 1. 0 Means left justified, 1 means right justified, 25 | default set to 0.5 (middle), for the x axis text.} 26 | 27 | \item{showLegend}{A logical to allow you to show or hide the legend, which is 28 | mapped to the fillVal} 29 | 30 | \item{facetVal}{Optional faceting. 31 | A character string of the facet you want for your plot, must be a 32 | name of the header in \code{df}. Default set to \code{NA}.} 33 | 34 | \item{fscale}{Facet scale, a character value chosen from \code{c("free","fixed")}. 35 | Default set to \code{fixed}. It changes the y axis to adjust to each facet 36 | and drop unused y (tile) values or keeps them all constant.} 37 | } 38 | \value{ 39 | A ggplot2 facet plot object. 40 | } 41 | \description{ 42 | Scan for patent market gaps. 43 | Visualize the features of a set of patents by a category. Can view up to 44 | four dimensions of data with this plot (x, y, and optionals fill and facet). 45 | 46 | Quickly scan this chart to look for gaps in the feature sets. 
47 | } 48 | \examples{ 49 | 50 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 51 | cleanNames = sumobrainNames, 52 | dateFields = sumobrainDateFields, 53 | dateOrders = sumobrainDateOrder, 54 | deduplicate = TRUE, 55 | cakcDict = patentr::cakcDict, 56 | docLengthTypesDict = patentr::docLengthTypesDict, 57 | keepType = "grant", 58 | firstAssigneeOnly = TRUE, 59 | assigneeSep = ";", 60 | stopWords = patentr::assigneeStopWords) 61 | 62 | # note that in reality, you need a patent analyst to carefully score 63 | # these patents, the score here is for demonstrational purposes 64 | score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 65 | score[score>3] <- 3; score[score<0] <- 0 66 | sumo$score <- score 67 | sumo$assigneeSmall <- strtrim(sumo$assigneeClean,12) 68 | category <- c("system","control algorithm","product","control system", "communication") 69 | c <- round(rnorm(dim(sumo)[1],mean=2.5,sd=1.5)) 70 | c[c>5] <- 5; c[c<1] <- 1 71 | sumo$category <- category[c] 72 | feature1 <- c("adaptive", "park", "lane", NA,NA,NA,NA,NA, 73 | "brake", "steer","accelerate","deactivate") 74 | f <- round(rnorm(dim(sumo)[1],mean=5,sd=1)) 75 | l <- length(feature1) 76 | f[f>l] <- l; f[f<1] <- 1 77 | sumo$feature1 <- c(feature1,feature1[f])[1:dim(sumo)[1]] 78 | 79 | tilePlot(sumo, "category", "feature1") 80 | 81 | tilePlot(sumo, xVal = "assigneeSmall", tileVal = "feature1", fillVal = "category", 82 | xangle=90, xhjust=0, showLegend = TRUE) 83 | 84 | tilePlot(sumo, xVal = "assigneeSmall", tileVal = "feature1", fillVal = "category", 85 | xangle=90, xhjust=0, showLegend = TRUE, facetVal = "docType", fscale = "fixed") 86 | 87 | tilePlot(sumo, xVal = "assigneeSmall", tileVal = "feature1", fillVal = "category", 88 | xangle=90, xhjust=0, showLegend = TRUE, facetVal = "docType", fscale = "free") 89 | 90 | tilePlot(sumo, xVal = "assigneeSmall", tileVal = "feature1", fillVal = "category", 91 | xangle=90, xhjust=0, showLegend = TRUE, facetVal = "score", 
fscale = "free") 92 | 93 | } 94 | -------------------------------------------------------------------------------- /man/kindCodes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{kindCodes} 5 | \alias{kindCodes} 6 | \title{A kind codes database to show the type of document for each patent document.} 7 | \format{A data frame. 8 | 9 | \describe{ 10 | \item{\code{countryCode}}{The country code for the originating office where the application 11 | was filed.} 12 | \item{kindCode}{The letter/number code to signify the type of document. Codes may 13 | change after a certain date, so pay attention to \code{dateStarted} and \code{dateDeprecated}} 14 | \item{isDeprecated}{Logical TRUE/FALSE if the kind code for the country is no longer in use.} 15 | \item{dateDeprecated}{The date the kind code stopped being in use.} 16 | \item{isNew}{If the kind code is a replacement for a former kind code, TRUE, else FALSE.} 17 | \item{dateStarted}{If isNew == TRUE, the date the new kind code began being used.} 18 | \item{comment}{Additional information explaining the details of the kind code.} 19 | \item{docTypeLong}{The long version of the document type.} 20 | \item{docType}{A shorter, standardized version of \code{docTypeLong}.} 21 | \item{expectDuplicate}{A logical TRUE/FALSE to help the analyst understand if the 22 | published document is expected to have a duplicate publication. For example, USB2 is 23 | a granted patent that has an application that was also published, whereas USB1 has no 24 | previous documents published. This helps speed up the deduplication process. } 25 | \item{countryAndKindCode}{A concatenation of country code and kind code. 
Used in 26 | the deduplication process and to determine the type of document.} 27 | }} 28 | \usage{ 29 | kindCodes 30 | } 31 | \description{ 32 | Patent documents have associated kind codes, which are letter/number code 33 | combinations that signify the type of document, such as application, granted 34 | patent, utility patent, etc. These kind codes vary by country and are a useful 35 | approach to classifying patent document types. Most, however, not all, downloaded 36 | data from free services such as sumobrain.com or lens.org includes the kind code 37 | at the end of the patent document number. 38 | } 39 | \details{ 40 | For example, from the sumobrain.com download from the \code{\link{acars}} data set, 41 | here are three documents: 42 | \enumerate{ 43 | \item{US6523912} 44 | \item{US20030060197} 45 | \item{EP1310400A1} 46 | } 47 | 48 | The first two items are missing kind codes. The third item has kind code A1 49 | and the country code is EP. 50 | 51 | To clean the data yourself: 52 | 53 | \code{temp <- readxl::read_excel(system.file("extdata", "kindCodes.xlsx", package = "patentr"))} 54 | 55 | \code{temp <- replace(temp, is.na(temp), "NA")} 56 | 57 | \code{temp$dateDeprecated <- as.numeric(temp$dateDeprecated)} 58 | 59 | \code{temp$dateDeprecated <- as.Date(temp$dateDeprecated, origin = "1899-12-30")} 60 | 61 | \code{temp$dateStarted <- as.numeric(temp$dateStarted)} 62 | 63 | \code{temp$dateStarted <- as.Date(temp$dateStarted, origin = "1899-12-30")} 64 | 65 | \code{temp$countryAndKindCode <- with(temp,paste0(countryCode, kindCode))} 66 | 67 | See \url{https://www.r-bloggers.com/date-formats-in-r/} for excel mac/windows 68 | and confirm this origin works for you by reviewing the source file. 
69 | 70 | View the data sources: 71 | \enumerate{ 72 | \item{\href{https://www.uspto.gov/learning-and-resources/support-centers/electronic-business-center/kind-codes-included-uspto-patent}{USPTO kind codes}} 73 | \item{\href{https://www.cas.org/content/references/patkind}{CAS list of kind codes}} 74 | \item{\href{http://ipbookcompanion.org/links/pk_codes.pdf}{IP Book kind codes}} 75 | \item{\href{http://www.thomsonfilehistories.com/docs/RESOURCES_Kind\%20Codes\%20by\%20Country.pdf}{Thomson File Histories}} 76 | } 77 | For esp@cenet at the EPO, try the following link: 78 | \code{https://worldwide.espacenet.com/help?locale=en_EP&method=} 79 | \code{handleHelpTopic&topic=kindcodes\%5C} 80 | } 81 | \keyword{data} 82 | -------------------------------------------------------------------------------- /man/acarsLens.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/acars.R 3 | \docType{data} 4 | \name{acarsLens} 5 | \alias{acarsLens} 6 | \title{Autonomous Vehicle Patent Data from Lens Patent Search} 7 | \format{A data frame with 863 observations on 26 variables. 8 | \describe{ 9 | \item{resultNum}{The search result number.} 10 | \item{countryCode}{The jurisdiction of the patent document.} 11 | \item{\code{kindCode}}{The kind code.} 12 | \item{docNum}{The published document number with country code and kind code included.} 13 | \item{lensID}{The unique identification number of the document on lens.org} 14 | \item{pubDate}{Date the document was published.} 15 | \item{pubYear}{Year the document published.} 16 | \item{appNum}{The filing number of the application (country code, number, and abridged kind code, typically 'A')} 17 | \item{dateFiled}{Date the application for the patent document was filed.} 18 | \item{priorityApps}{Applications this patent document claims priority. 
19 | Format: Country code, application number, A = application or P = provisional, YYYYMMDD of priority. 20 | Multiple applications separated by a double semi-colon.} 21 | \item{title}{The title of the document.} 22 | \item{assignee}{The name of the applicant(s) at the time of filing.} 23 | \item{inventors}{The inventor(s).} 24 | \item{lensURL}{The lens.org URL for the document.} 25 | \item{docTypeLens}{A lens.org mapping of the doc type. 26 | Granted, application, ambiguous, unknown, search report, and possibly more values.} 27 | \item{hasFullText}{A logical value to show if there is a full text available from lens.org} 28 | \item{citeCount}{The number of times this document is cited, also known as forward citations.} 29 | \item{familySimpleCount}{The number of unique documents in the immediate patent family.} 30 | \item{familyExtendedCount}{The number of unique documents sharing a priority application in the extended family.} 31 | \item{seqCount}{Used in biological applications -- the number of sequences in the application.} 32 | \item{cpcClasses}{The CPC classification codes, separated by a double semi-colon.} 33 | \item{ipcrClasses}{The IPCR classification codes, separated by a double semi-colon.} 34 | \item{usClasses}{The US classification codes, separated by a double semi-colon.} 35 | \item{pubmedID}{A pubmed ID to any related research.} 36 | \item{DOI}{A digital object identifier. 37 | Go to doi.org and paste the value to get the associated research paper.} 38 | \item{npl}{Non-patent literature, or citations of non-patent sources. 39 | Separated with double semi-colons.} 40 | 41 | 42 | }} 43 | \usage{ 44 | acarsLens 45 | } 46 | \description{ 47 | An example data set of autonomous vehicle IP from major assignees.
48 | } 49 | \details{ 50 | The data search was performed on Saturday, March 18, 2017 from lens.org, and the exact 51 | search: 52 | 53 | \href{https://www.lens.org/lens/search?q=abstract\%3Aautonomous+\%26\%26+applicant\%3A\%28Apple*+OR+Google*+OR+Waymo*+OR+Tesla*+OR+Ford*+OR+General*\%29&predicate=\%26\%26&l=en}{Lens Patents Search} 54 | 55 | For all countries available on Lens. 56 | 57 | Can get raw data with the following commands: 58 | 59 | \code{temp <- system.file("extdata", "lens_autonomous_search.csv", package = "patentr")} 60 | 61 | \code{temp <- read.csv(temp, stringsAsFactors = FALSE)} 62 | 63 | \code{temp <- data.frame(lapply(temp, function(x){iconv(x,to="ASCII")}),stringsAsFactors = FALSE)} 64 | 65 | \code{names(temp) <- lensNames} 66 | 67 | \code{temp$dateFiled <- as.Date(temp$dateFiled, format = '\%m/\%d/\%y')} 68 | 69 | \code{temp$pubDate <- as.Date(temp$pubDate, format='\%m/\%d/\%y')} # note that \%y is system-specific and may not work everywhere. 70 | 71 | \code{colsNum <- c("resultNum","citeCount","familySimpleCount","familyExtendedCount", "seqCount")} 72 | 73 | \code{temp[colsNum] <- sapply(temp[colsNum], as.numeric)} 74 | 75 | \code{temp$hasFullText <- sapply(temp$hasFullText, function(x) ifelse(x=="yes",TRUE,FALSE))} 76 | } 77 | \seealso{ 78 | \url{www.lens.org} You can export without an account, or can create 79 | an account to save your searches. 80 | 81 | \code{\link{acarsGoogle}} provides a similar search from Google. 82 | \code{\link{acars}} provides a similar search from sumobrain.
83 | } 84 | \keyword{data} 85 | -------------------------------------------------------------------------------- /man/cleanPatentData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cleanPatentData.R 3 | \name{cleanPatentData} 4 | \alias{cleanPatentData} 5 | \title{Generate a clean data set from the imported raw data.} 6 | \usage{ 7 | cleanPatentData(patentData = NULL, columnsExpected, cleanNames, 8 | dateFields = NA, dateOrders, deduplicate = TRUE, 9 | cakcDict = patentr::cakcDict, 10 | docLengthTypesDict = patentr::docLengthTypesDict, keepType = "grant", 11 | firstAssigneeOnly = TRUE, assigneeSep = ";", 12 | stopWords = patentr::assigneeStopWords) 13 | } 14 | \arguments{ 15 | \item{patentData}{The data frame of initial raw patent data.} 16 | 17 | \item{columnsExpected}{The expected width of the data frame, numeric.} 18 | 19 | \item{cleanNames}{A character vector of length columnsExpected to rename the 20 | data frame with.} 21 | 22 | \item{dateFields}{A character vector of the date column names which will be 23 | converted to `Date` format.} 24 | 25 | \item{dateOrders}{A character string of the format required to convert string 26 | data into `Date` data. Sumobrain is "ymd" and lens and Google data are "mdy". 27 | Hardcoded values include \code{\link{googleDateOrder}},\code{\link{lensDateOrder}}, 28 | and \code{\link{sumobrainDateOrder}}.} 29 | 30 | \item{deduplicate}{A logical, default set to TRUE, if you want to deduplicate 31 | any patent documents that have both an app and a grant.} 32 | 33 | \item{cakcDict}{A country and kind code dictionary. Default is \code{\link{cakcDict}}.} 34 | 35 | \item{docLengthTypesDict}{A document length and type dictionary. Default is \code{\link{docLengthTypesDict}}.} 36 | 37 | \item{keepType}{A character variable denoting which document type to keep. Default is "grant".
38 | If NA, ignore.} 39 | 40 | \item{firstAssigneeOnly}{For cleaning names, use the first assignee only, default TRUE.} 41 | 42 | \item{assigneeSep}{The separation character if there is more than one assignee. Default 43 | is ";" semicolon.} 44 | 45 | \item{stopWords}{The stopword list to remove from assignee names. Default is 46 | \code{\link{assigneeStopWords}}.} 47 | } 48 | \value{ 49 | A data frame of tidy patent data. 50 | } 51 | \description{ 52 | Generate a clean data set from the imported raw data set. The 53 | data available dictates the number of columns of attributes that can be 54 | generated. 55 | 56 | Sumobrain, Lens.org, and Google Patents have varying levels of data available. 57 | 58 | If you import your own data, be sure to adhere to the template format, or 59 | read carefully to create your own. 60 | } 61 | \examples{ 62 | 63 | 64 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 65 | cleanNames = sumobrainNames, 66 | dateFields = sumobrainDateFields, 67 | dateOrders = sumobrainDateOrder, 68 | deduplicate = TRUE, 69 | cakcDict = patentr::cakcDict, 70 | docLengthTypesDict = patentr::docLengthTypesDict, 71 | keepType = "grant", 72 | firstAssigneeOnly = TRUE, 73 | assigneeSep = ";", 74 | stopWords = patentr::assigneeStopWords) 75 | 76 | # use a fresh Google export csv 77 | # in a new csv download, however, it would not be the case 78 | 79 | 80 | rawGoogleData <- system.file("extdata", "google_autonomous_search.csv", 81 | package = "patentr") 82 | rawGoogleData <- read.csv(rawGoogleData, 83 | skip = skipGoogle, stringsAsFactors = FALSE) 84 | rawGoogleData <- data.frame(lapply(rawGoogleData, 85 | function(x){iconv(x, to = "ASCII")}), stringsAsFactors = FALSE) 86 | google <- cleanPatentData(patentData = rawGoogleData, columnsExpected = googleColumns, 87 | cleanNames = googleNames, 88 | dateFields = googleDateFields, 89 | dateOrders = googleDateOrder, 90 | deduplicate = TRUE, 91 | cakcDict = patentr::cakcDict, 92 
| docLengthTypesDict = patentr::docLengthTypesDict, 93 | keepType = "grant", 94 | firstAssigneeOnly = TRUE, 95 | assigneeSep = ",", 96 | stopWords = patentr::assigneeStopWords) 97 | 98 | 99 | lensRawData <- system.file("extdata", "lens_autonomous_search.csv", 100 | package = "patentr") 101 | lensRawData <- read.csv(lensRawData, stringsAsFactors = FALSE, skip = skipLens) 102 | lensRawData <- data.frame(lapply(lensRawData, 103 | function(x){iconv(x, to = "ASCII")}), stringsAsFactors = FALSE) 104 | lens <- cleanPatentData(patentData = lensRawData, columnsExpected = lensColumns, 105 | cleanNames = lensNames, 106 | dateFields = lensDateFields, 107 | dateOrders = lensDateOrder, 108 | deduplicate = TRUE, 109 | cakcDict = patentr::cakcDict, 110 | docLengthTypesDict = patentr::docLengthTypesDict, 111 | keepType = "grant", 112 | firstAssigneeOnly = TRUE, 113 | assigneeSep = ";;", 114 | stopWords = patentr::assigneeStopWords) 115 | 116 | } 117 | \seealso{ 118 | For data formats: \code{\link{acars}} for Sumobrain, 119 | \code{\link{acarsGoogle}} for Google Patents data, and \code{\link{acarsLens}} 120 | for Lens.org data. 
121 | } 122 | -------------------------------------------------------------------------------- /tests/testthat/test-process.R: -------------------------------------------------------------------------------- 1 | # used with processPatentData.R 2 | 3 | 4 | 5 | 6 | # sumobrain data 7 | test_that("Sumobrain data has clean google URLS.",{ 8 | df <- importPatentData(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), skipLines = skipSumobrain) 9 | df <- cleanPatentData(patentData = df, columnsExpected = sumobrainColumns, 10 | cleanNames = sumobrainNames, dateFields = sumobrainDateFields, 11 | dateOrders = sumobrainDateOrder, deduplicate = TRUE, 12 | cakcDict = cakcDict, docLengthTypesDict = docLengthTypesDict, 13 | keepType = "grant",firstAssigneeOnly = TRUE, assigneeSep = ";", 14 | stopWords = assigneeStopWords) 15 | kc <- extractKindCode(df$docNum) 16 | pn <- extractPubNumber(df$docNum) 17 | cc <- extractCountryCode(df$docNum) 18 | gurl <- createGoogleURL(countryCode = cc, pubNum = pn, kindCode = kc) 19 | expect_equal(length(gurl) ,dim(df)[1]) 20 | }) 21 | 22 | 23 | 24 | test_that("Google data has clean google URLS.",{ 25 | df <- read.csv(rprojroot::find_testthat_root_file("testData","google_autonomous_search.csv"), 26 | skip = skipGoogle, stringsAsFactors = FALSE) 27 | df <- data.frame(lapply(df,function(x){iconv(x, to = "ASCII")}), stringsAsFactors = FALSE) 28 | 29 | df <- cleanPatentData(patentData = df, columnsExpected = googleColumns, 30 | cleanNames = googleNames, dateFields = googleDateFields, 31 | dateOrders = googleDateOrder, deduplicate = TRUE, 32 | cakcDict = cakcDict, docLengthTypesDict = docLengthTypesDict, 33 | keepType = "grant",firstAssigneeOnly = TRUE, assigneeSep = ",", 34 | stopWords = assigneeStopWords) 35 | kc <- extractKindCode(df$docNum) 36 | pn <- extractPubNumber(df$docNum) 37 | cc <- extractCountryCode(df$docNum) 38 | gurl <- createGoogleURL(countryCode = cc, pubNum = pn, kindCode = kc) 39 | 
expect_equal(length(gurl) ,dim(df)[1]) 40 | }) 41 | 42 | 43 | test_that("Lens.org patent data has clean google URLS.",{ 44 | df <- read.csv(rprojroot::find_testthat_root_file("testData","lens_autonomous_search.csv"), 45 | skip = skipLens, stringsAsFactors = FALSE) 46 | df <- data.frame(lapply(df,function(x){iconv(x, to = "ASCII")}), stringsAsFactors = FALSE) 47 | 48 | df <- cleanPatentData(patentData = df, columnsExpected = lensColumns, 49 | cleanNames = lensNames, dateFields = lensDateFields, 50 | dateOrders = lensDateOrder, deduplicate = TRUE, 51 | cakcDict = cakcDict, docLengthTypesDict = docLengthTypesDict, 52 | keepType = "grant",firstAssigneeOnly = TRUE, assigneeSep = ";;", 53 | stopWords = assigneeStopWords) 54 | kc <- extractKindCode(df$docNum) 55 | pn <- extractPubNumber(df$docNum) 56 | cc <- extractCountryCode(df$docNum) 57 | gurl <- createGoogleURL(countryCode = cc, pubNum = pn, kindCode = kc) 58 | expect_equal(length(gurl) ,dim(df)[1]) 59 | }) 60 | 61 | 62 | test_that("getClaimFromURL returns character value.",{ 63 | aclaim <- getClaimFromURL("https://patents.google.com/patent/US8818682B1/en") 64 | expect_is(aclaim ,"character") 65 | }) 66 | 67 | test_that("getClaimFromURL for an old patent should return blank.",{ 68 | anOldClaim <- getClaimFromURL("https://patents.google.com/patent/US881/en") 69 | expect_is(anOldClaim ,"character") 70 | }) 71 | 72 | test_that("getClaimFromURL from a bad (well-formatted, 404 error) URL should return blank.",{ 73 | aBadURLClaim <- getClaimFromURL("https://patents.google.com/patent/USsss881/en") 74 | expect_is(aBadURLClaim ,"character") 75 | }) 76 | 77 | 78 | test_that("cleanGoogleURL from /mx returns character.",{ 79 | expect_is(cleanGoogleURL("https://patents.google.com/patent/US8818682B1/mx") ,"character") 80 | }) 81 | 82 | test_that("cleanGoogleURL from / returns character.",{ 83 | expect_is(cleanGoogleURL("https://patents.google.com/patent/US8818682B1/") ,"character") 84 | }) 85 | 86 | 87 | test_that("cleanGoogleURL 
from no backslash returns character.",{ 88 | expect_is(cleanGoogleURL("https://patents.google.com/patent/US8818682B1") ,"character") 89 | }) 90 | 91 | test_that("cleanGoogleURL from /en returns character.",{ 92 | expect_is(cleanGoogleURL("https://patents.google.com/patent/US8818682B1/en") ,"character") 93 | }) 94 | 95 | 96 | 97 | test_that("getClaimFromURL should return a character of length 1.",{ 98 | krclaim <- getClaimFromURL("https://patents.google.com/patent/KR20150127745A/en") 99 | expect_equal(length(krclaim), 1) 100 | expect_is(krclaim, "character") 101 | }) 102 | 103 | 104 | test_that("getClaimsText reads in 3 urls and returns a character vector of length 3.",{ 105 | df <- importPatentData(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), skipLines = skipSumobrain) 106 | df <- cleanPatentData(patentData = df, columnsExpected = sumobrainColumns, 107 | cleanNames = sumobrainNames, dateFields = sumobrainDateFields, 108 | dateOrders = sumobrainDateOrder, deduplicate = TRUE, 109 | cakcDict = cakcDict, docLengthTypesDict = docLengthTypesDict, 110 | keepType = "grant",firstAssigneeOnly = TRUE, assigneeSep = ";", 111 | stopWords = assigneeStopWords) 112 | urls <- df$googleURL[1:3] 113 | clms <- getClaimsText(urls) 114 | expect_equal(length(clms), 3) 115 | expect_is(urls, "character") 116 | }) 117 | 118 | -------------------------------------------------------------------------------- /tests/testthat/test-graphics.R: -------------------------------------------------------------------------------- 1 | # test graphics 2 | ## kamil bojanczyk start 3 | # svg("tests/testthat/testData/sb0.svg") 4 | # flippedHistogram(subset(sumo, score > 0), "assigneeSmall","score",colors=scoreColors) 5 | # dev.off() 6 | # graphics are the same 7 | test_that("Sumobrain flipped histogram outputs a plot.",{ 8 | 9 | file1 <- rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx") 10 | file2 <- 
rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search2.xlsx") 11 | files <- list(file1, file2) 12 | df <- importPatentData(rawDataFilePath = files, skipLines = skipSumobrain) 13 | df <- cleanPatentData(patentData = df, columnsExpected = sumobrainColumns, 14 | cleanNames = sumobrainNames, dateFields = sumobrainDateFields, 15 | dateOrders = sumobrainDateOrder, deduplicate = TRUE, 16 | cakcDict = cakcDict, docLengthTypesDict = docLengthTypesDict, 17 | keepType = "grant",firstAssigneeOnly = TRUE, assigneeSep = ";", 18 | stopWords = assigneeStopWords) 19 | 20 | 21 | # https://github.com/hadley/evaluate/blob/master/tests/testthat/ggplot-loop.r 22 | df$assigneeSmall <- strtrim(df$assigneeClean,12) 23 | score <- round(rnorm(dim(df)[1],mean=1.4,sd=0.9)) 24 | score[score>3] <- 3 25 | score[score<0] <- 0 26 | df$score <- score 27 | aPlot <- flippedHistogram(df, "assigneeSmall","score") 28 | 29 | # taken from hadley's ggplot2 tests 30 | # https://github.com/tidyverse/ggplot2/blob/master/tests/testthat/test-geom-hex.R 31 | out <- layer_data(aPlot) 32 | 33 | temp <- summarizeColumns(df, c("assigneeSmall","score")) 34 | expect_equal(nrow(out), nrow(temp)) 35 | expect_equal(sort(out$count), temp$total) 36 | expect_is(aPlot, c("gg","ggplot")) 37 | 38 | # Note: I read a SO post saying it is not wise to test svg outputs against 39 | # current plots, thus, the exact plot is not compared to an svg file 40 | }) 41 | 42 | 43 | test_that("facetPlot plots makes a plot object",{ 44 | 45 | file1 <- rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx") 46 | file2 <- rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search2.xlsx") 47 | files <- list(file1, file2) 48 | df <- importPatentData(rawDataFilePath = files, skipLines = skipSumobrain) 49 | df <- cleanPatentData(patentData = df, columnsExpected = sumobrainColumns, 50 | cleanNames = sumobrainNames, dateFields = sumobrainDateFields, 51 | dateOrders = sumobrainDateOrder, 
deduplicate = TRUE, 52 | cakcDict = cakcDict, docLengthTypesDict = docLengthTypesDict, 53 | keepType = "grant",firstAssigneeOnly = TRUE, assigneeSep = ";", 54 | stopWords = assigneeStopWords) 55 | 56 | 57 | # https://github.com/hadley/evaluate/blob/master/tests/testthat/ggplot-loop.r 58 | df$assigneeSmall <- strtrim(df$assigneeClean,12) 59 | score <- round(rnorm(dim(df)[1],mean=1.4,sd=0.9)) 60 | score[score>3] <- 3 61 | score[score<0] <- 0 62 | df$score <- score 63 | category <- c("system","control algorithm","product","control system", "communication") 64 | c <- round(rnorm(dim(df)[1],mean=2.5,sd=1.5)) 65 | c[c>5] <- 5; c[c<1] <- 1 66 | df$category <- category[c] 67 | 68 | xVal = "category" 69 | fillVal = "score" 70 | facetVal = "assigneeSmall" 71 | 72 | aPlot <- facetPlot(subset(df, score > 0), xVal, fillVal, facetVal, colors = patentr::scoreColors, 73 | recolor = FALSE) 74 | 75 | 76 | # taken from hadley's ggplot2 tests 77 | # https://github.com/tidyverse/ggplot2/blob/master/tests/testthat/test-geom-hex.R 78 | out <- layer_data(aPlot) 79 | expect_is(aPlot, c("gg","ggplot")) 80 | 81 | }) 82 | 83 | 84 | 85 | 86 | test_that("tilePlot plots makes a plot object",{ 87 | 88 | file1 <- rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx") 89 | file2 <- rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search2.xlsx") 90 | files <- list(file1, file2) 91 | df <- importPatentData(rawDataFilePath = files, skipLines = skipSumobrain) 92 | df <- cleanPatentData(patentData = df, columnsExpected = sumobrainColumns, 93 | cleanNames = sumobrainNames, dateFields = sumobrainDateFields, 94 | dateOrders = sumobrainDateOrder, deduplicate = TRUE, 95 | cakcDict = cakcDict, docLengthTypesDict = docLengthTypesDict, 96 | keepType = "grant",firstAssigneeOnly = TRUE, assigneeSep = ";", 97 | stopWords = assigneeStopWords) 98 | 99 | 100 | # https://github.com/hadley/evaluate/blob/master/tests/testthat/ggplot-loop.r 101 | df$assigneeSmall <- 
strtrim(df$assigneeClean,12) 102 | score <- round(rnorm(dim(df)[1],mean=1.4,sd=0.9)) 103 | score[score>3] <- 3 104 | score[score<0] <- 0 105 | df$score <- score 106 | category <- c("system","control algorithm","product","control system", "communication") 107 | c <- round(rnorm(dim(df)[1],mean=2.5,sd=1.5)) 108 | c[c>5] <- 5; c[c<1] <- 1 109 | df$category <- category[c] 110 | 111 | category <- c("system","control algorithm","product","control system", "communication") 112 | c <- round(rnorm(dim(df)[1],mean=2.5,sd=1.5)) 113 | c[c>5] <- 5; c[c<1] <- 1 114 | df$category <- category[c] 115 | feature1 <- c("adaptive", "park", "lane", NA,NA,NA,NA,NA, "brake", "steer","accelerate","deactivate") 116 | f <- round(rnorm(dim(df)[1],mean=5,sd=1)) 117 | l <- length(feature1) 118 | f[f>l] <- l; f[f<1] <- 1 119 | df$feature1 <- c(feature1,feature1[f])[1:dim(df)[1]] 120 | 121 | aPlot <- tilePlot(df, "category", "feature1") 122 | 123 | # taken from hadley's ggplot2 tests 124 | # https://github.com/tidyverse/ggplot2/blob/master/tests/testthat/test-geom-hex.R 125 | out <- layer_data(aPlot) 126 | expect_is(aPlot, c("gg","ggplot")) 127 | 128 | }) 129 | 130 | 131 | 132 | ## kamil bojanczyk end -------------------------------------------------------------------------------- /man/addChartRightTextLeftPptx.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/reportPatentData.R 3 | \name{addChartRightTextLeftPptx} 4 | \alias{addChartRightTextLeftPptx} 5 | \title{Add a PPTX slide with chart on the right and text on the left} 6 | \usage{ 7 | addChartRightTextLeftPptx(ppt, plot, text, title, 8 | slide_layout = "Title and Content", Poffx = 5.3, Poffy = 0, 9 | Pwidth = 8, Pheight = 7.5, Toffx = 1, Toffy = 2, Twidth = 5, 10 | Theight = 5.5) 11 | } 12 | \arguments{ 13 | \item{ppt}{A ppt object.} 14 | 15 | \item{plot}{A plot object from ggplot2.} 16 | 17 | \item{text}{A character 
vector of text, typically less than one paragraph 18 | in size.} 19 | 20 | \item{title}{A character title for a page. Default is NULL} 21 | 22 | \item{slide_layout}{The name of a slide layout, the same name as the names in a .potx 23 | powerpoint template file. Default is a Title and Content blank layout.} 24 | 25 | \item{Poffx}{Plot image x position from left top, inches. 26 | See \code{\link[ReporteRs]{addPlot}}. Default is 5.3.} 27 | 28 | \item{Poffy}{Plot image y position from left top, inches. 29 | See \code{\link[ReporteRs]{addPlot}}. Default is 0.} 30 | 31 | \item{Pwidth}{Plot image width, inches. 32 | See \code{\link[ReporteRs]{addPlot}}. Default is 8.} 33 | 34 | \item{Pheight}{Plot image height, inches. 35 | See \code{\link[ReporteRs]{addPlot}}. Default is 7.5} 36 | 37 | \item{Toffx}{Text image x position from left top, inches. 38 | See \code{\link[ReporteRs]{addPlot}}. Default is 1.} 39 | 40 | \item{Toffy}{Text image y position from left top, inches. 41 | See \code{\link[ReporteRs]{addPlot}}. Default is 2.} 42 | 43 | \item{Twidth}{Text image width, inches. 44 | See \code{\link[ReporteRs]{addPlot}}. Default is 5.} 45 | 46 | \item{Theight}{Text image height, inches. 47 | See \code{\link[ReporteRs]{addPlot}}. Default is 5.5.} 48 | } 49 | \description{ 50 | Generate a commonly-used PPTX slide format where the patent 51 | chart is on the right and some text is on the left. 52 | 53 | This function automates a number of steps used in formatting a pptx slide. 54 | It returns the ppt object with the new slide included. 
55 | } 56 | \examples{ 57 | 58 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 59 | cleanNames = sumobrainNames, 60 | dateFields = sumobrainDateFields, 61 | dateOrders = sumobrainDateOrder, 62 | deduplicate = TRUE, 63 | cakcDict = patentr::cakcDict, 64 | docLengthTypesDict = patentr::docLengthTypesDict, 65 | keepType = "grant", 66 | firstAssigneeOnly = TRUE, 67 | assigneeSep = ";", 68 | stopWords = patentr::assigneeStopWords) 69 | 70 | # note that in reality, you need a patent analyst to carefully score 71 | # these patents, the score here is for demonstrational purposes 72 | score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 73 | score[score>3] <- 3 74 | score[score<0] <- 0 75 | sumo$score <- score 76 | sumo$assigneeSmall <- strtrim(sumo$assigneeClean,12) 77 | category <- c("system","control algorithm","product","control system", "communication") 78 | c <- round(rnorm(dim(sumo)[1],mean=2.5,sd=1.5)) 79 | c[c>5] <- 5; c[c<1] <- 1 80 | sumo$category <- category[c] 81 | feature1 <- c("adaptive", "park", "lane", NA,NA,NA,NA,NA, 82 | "brake", "steer","accelerate","deactivate") 83 | f <- round(rnorm(dim(sumo)[1],mean=5,sd=1)) 84 | l <- length(feature1) 85 | f[f>l] <- l; f[f<1] <- 1 86 | sumo$feature1 <- c(feature1,feature1[f])[1:dim(sumo)[1]] 87 | 88 | flippedHistogram(sumo, "assigneeSmall","score",colors=scoreColors) 89 | flippedHistogram(subset(sumo, score > 0), "assigneeSmall","score",colors=scoreColors) 90 | 91 | flippedHistogram(subset(sumo, score > 2) ,"assigneeSmall","docType",colors=scoreColors, 92 | recolor = TRUE) 93 | 94 | 95 | 96 | 97 | # create a ppt 98 | ppt <- ReporteRs::pptx(title="IP Update") 99 | # view the types of layouts available by default 100 | # slide.layouts(ppt) 101 | layoutTitleContent = "Title and Content" 102 | 103 | # first plot of top score (3) 104 | asdt <- summarizeColumns(subset(sumo,score > 2),'docType') 105 | ppt <- 106 | addChartRightTextLeftPptx(ppt = ppt, 107 | plot = 
flippedHistogram(subset(sumo, score > 2) , 108 | "assigneeSmall","docType", 109 | colors=scoreColors, 110 | recolor = TRUE), 111 | text = summaryText(asdt, "doc type", "doc types", 112 | subset(sumo,score>2)$docType), 113 | title = "Doc Types for Top Score Docs", 114 | slide_layout = layoutTitleContent) 115 | 116 | # top scores by assignee 117 | ascore <- summarizeColumns(subset(sumo,score > 2),'assigneeSmall') 118 | ppt <- 119 | addChartRightTextLeftPptx(ppt = ppt, 120 | plot = flippedHistogram(subset(sumo, score > 2) , 121 | "assigneeSmall","score", 122 | colors=scoreColors, 123 | recolor = FALSE), 124 | text = summaryText(ascore, "assignee", "assignees", 125 | subset(sumo,score>2)$assigneeSmall), 126 | title = "Assignees with Top Scores", 127 | slide_layout = layoutTitleContent) 128 | 129 | 130 | # last plot is category 131 | sc <- summarizeColumns(sumo,'category') 132 | ppt <- 133 | addChartRightTextLeftPptx(ppt = ppt, 134 | plot = flippedHistogram(sumo ,"category", 135 | "score", colors = scoreColors, 136 | recolor = TRUE), 137 | text = summaryText(sc, "category", "categories", sumo$category), 138 | title = "Categories and Scores", 139 | slide_layout = layoutTitleContent) 140 | 141 | # find a data folder and write it out to your folder 142 | # out <- paste("data/",Sys.Date(),"_exampleChartRightTextLeft.pptx",sep='') 143 | # ReporteRs::writeDoc(ppt, out) 144 | 145 | 146 | } 147 | \seealso{ 148 | \code{\link[ReporteRs]{pptx}}, \code{\link{addFullImagePptx}} 149 | } 150 | -------------------------------------------------------------------------------- /tests/testthat/test-cleaning.R: -------------------------------------------------------------------------------- 1 | # test cleaning data 2 | 3 | 4 | # clean names 5 | test_that("Imported Sumobrain csv to data frame has names standardized",{ 6 | df <- importPatentData(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), skipLines = 1) 7 | df <- cleanHeaderNames(patentData = df) 8 | 
expect_identical(names(df),names(acars)) 9 | 10 | }) 11 | 12 | 13 | # same length when extracting country code 14 | test_that("Country code extracted from document number, and all country codes are chars of length 2-4",{ 15 | df <- importPatentData(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), skipLines = 1) 16 | df <- cleanHeaderNames(patentData = df) 17 | expect_length(extractCountryCode(df$docNum),dim(df)[1]) 18 | }) 19 | 20 | # same length when extracting publication number 21 | test_that("Publication number, numeric portion extracted from document number properly",{ 22 | df <- importPatentData(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), skipLines = 1) 23 | df <- cleanHeaderNames(patentData = df) 24 | # should return the same length 25 | expect_length(extractPubNumber(df$docNum),dim(df)[1]) 26 | }) 27 | 28 | 29 | # same length when extracting kind code 30 | test_that("Kind code extracted returns same length as number of rows of data frame",{ 31 | df <- importPatentData(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), skipLines = 1) 32 | df <- cleanHeaderNames(patentData = df) 33 | # should return the same length 34 | expect_length(extractKindCode(df$docNum),dim(df)[1]) 35 | }) 36 | 37 | 38 | # same length when extracting office doc length 39 | test_that("Office doc length extracted returns same length as number of rows of data frame",{ 40 | df <- importPatentData(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), skipLines = 1) 41 | df <- cleanHeaderNames(patentData = df) 42 | df$pubNum <- extractPubNumber(df$docNum) 43 | df$countryCode <- extractCountryCode(df$docNum) 44 | df$officeDocLength <- extractDocLength(countryCode = df$countryCode, pubNum = df$pubNum) 45 | # should return the same length 46 | expect_length(df$officeDocLength ,dim(df)[1]) 47 | }) 48 | 49 | 50 | # Dates converted properly 51 | test_that("Dates
converted properly from characters",{ 52 | df <- importPatentData(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), skipLines = 1) 53 | df <- cleanHeaderNames(patentData = df) 54 | df$pubDate <- extractCleanDate(df$pubDate) 55 | # should return the same length 56 | expect_equal(inherits(df$pubDate, "Date") ,TRUE) 57 | }) 58 | 59 | # same length when extracting kind code 60 | test_that("Google URL vector returns same length as number of rows of data frame",{ 61 | df <- importPatentData(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), skipLines = 1) 62 | df <- cleanHeaderNames(patentData = df) 63 | df$pubNum <- extractPubNumber(df$docNum) 64 | df$countryCode <- extractCountryCode(df$docNum) 65 | df$kindCode <- extractKindCode(df$docNum) 66 | # should return the same length 67 | expect_length(createGoogleURL(countryCode = df$countryCode, 68 | pubNum = df$pubNum, 69 | kindCode =df$kindCode) ,dim(df)[1]) 70 | }) 71 | 72 | 73 | # duplicates are removed if exist 74 | test_that("Removing dups is a logical vector",{ 75 | df <- importPatentData(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), skipLines = 1) 76 | df <- cleanHeaderNames(patentData = df) 77 | # should be of type logical 78 | expect_type(removeDups(df$docNum) ,"logical") 79 | }) 80 | 81 | 82 | # duplicates are shown 83 | test_that("Showing all duplicates and showDups is a logical vector",{ 84 | df <- importPatentData(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), skipLines = 1) 85 | df <- cleanHeaderNames(patentData = df) 86 | # should be of type logical 87 | expect_type(showDups(df$appNum) ,"logical") 88 | }) 89 | 90 | 91 | # same length when generating the type of document 92 | test_that("generateDocType returns same length as number of rows of data frame",{ 93 | df <- importPatentData(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), 
skipLines = 1) 94 | df <- cleanHeaderNames(patentData = df) 95 | df$pubNum <- extractPubNumber(df$docNum) 96 | df$countryCode <- extractCountryCode(df$docNum) 97 | df$kindCode <- extractKindCode(df$docNum) 98 | df$officeDocLength <- extractDocLength(countryCode = df$countryCode, pubNum = df$pubNum) 99 | df$countryAndKindCode <- with(df, paste0(countryCode, kindCode)) 100 | # should return the same length 101 | temp <- generateDocType(officeDocLength = df$officeDocLength, 102 | countryAndKindCode = df$countryAndKindCode, 103 | cakcDict = patentr::cakcDict, 104 | docLengthTypesDict = patentr::docLengthTypesDict) 105 | expect_length(temp ,dim(df)[1]) 106 | }) 107 | 108 | 109 | 110 | # names returns the same length 111 | test_that("Google URL vector returns same length as number of rows of data frame",{ 112 | df <- importPatentData(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), skipLines = 1) 113 | df <- cleanHeaderNames(patentData = df) 114 | expect_length(cleanNames(df$assignee), dim(df)[1]) 115 | }) 116 | 117 | 118 | # sumobrain full clean returns data frame 119 | test_that("Sumobrain data cleanPatentData returns a data frame.",{ 120 | df <- importPatentData(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), skipLines = skipSumobrain) 121 | df <- cleanPatentData(patentData = df, columnsExpected = sumobrainColumns, 122 | cleanNames = sumobrainNames, dateFields = sumobrainDateFields, 123 | dateOrders = sumobrainDateOrder, deduplicate = TRUE, 124 | cakcDict = cakcDict, docLengthTypesDict = docLengthTypesDict, 125 | keepType = "grant",firstAssigneeOnly = TRUE, assigneeSep = ";", 126 | stopWords = assigneeStopWords) 127 | # should be of type logical 128 | expect_is(df ,"data.frame") 129 | }) 130 | 131 | # google patent data full clean returns data frame 132 | test_that("Google patent data cleanPatentData returns a data frame.",{ 133 | df <- 
read.csv(rprojroot::find_testthat_root_file("testData","google_autonomous_search.csv"), 134 | skip = skipGoogle, stringsAsFactors = FALSE) 135 | df <- data.frame(lapply(df,function(x){iconv(x, to = "ASCII")}), stringsAsFactors = FALSE) 136 | 137 | df <- cleanPatentData(patentData = df, columnsExpected = googleColumns, 138 | cleanNames = googleNames, dateFields = googleDateFields, 139 | dateOrders = googleDateOrder, deduplicate = TRUE, 140 | cakcDict = cakcDict, docLengthTypesDict = docLengthTypesDict, 141 | keepType = "grant",firstAssigneeOnly = TRUE, assigneeSep = ",", 142 | stopWords = assigneeStopWords) 143 | # should be of type logical 144 | expect_is(df ,"data.frame") 145 | }) 146 | 147 | 148 | # lens.org data file 149 | test_that("Lens.org patent data cleanPatentData returns a data frame.",{ 150 | df <- read.csv(rprojroot::find_testthat_root_file("testData","lens_autonomous_search.csv"), 151 | skip = skipLens, stringsAsFactors = FALSE) 152 | df <- data.frame(lapply(df,function(x){iconv(x, to = "ASCII")}), stringsAsFactors = FALSE) 153 | 154 | df <- cleanPatentData(patentData = df, columnsExpected = lensColumns, 155 | cleanNames = lensNames, dateFields = lensDateFields, 156 | dateOrders = lensDateOrder, deduplicate = TRUE, 157 | cakcDict = cakcDict, docLengthTypesDict = docLengthTypesDict, 158 | keepType = "grant",firstAssigneeOnly = TRUE, assigneeSep = ";;", 159 | stopWords = assigneeStopWords) 160 | # should be of type logical 161 | expect_is(df ,"data.frame") 162 | }) 163 | 164 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | patentr 2 | ================ 3 | Kamil Bojanczyk 4 | 2017-03-22 5 | 6 | 7 | Introducing `patentr`, the toolkit for patent data analysis in R. The summary md file currently holds all documentation. 
8 | 9 | The package is aimed at patent agents, lawyers, managers, analysts, and academics who are working on patents. This may be used in a patent landscape analysis, company IP portfolio analysis, or a freedom to operate search. 10 | 11 | This is a data processing and reporting tool on patent data sets for patent analysts. The motivation comes from a lack of useful, exportable patent data. `patentr` builds upon the free data available from Sumobrain.com, Lens.org, and Google Patents, leveraging their data to summarize and analyze patents. 12 | 13 | `patentr` performs four key functions: 14 | 15 | 1. *Data input:* Easily **import** patent excel and csv files from the top patent websites 16 | 17 | - `CSV` from Google Patents and lens.org 18 | - `xlsx` from sumobrain.com 19 | 20 | 1. *Data cleaning:* **Sanitize** patent data and extract useful metadata for custom analyses 21 | 22 | - Clean up important fields such as names, dates, country codes, and kind codes 23 | - Infer the document type so that you don't analyze the same patent twice 24 | - Deduplicate data sets and prioritize grants over applications 25 | - Use the generated Google URL to jump to the patent document or to download claim data using the included `httr` and `XML` functions 26 | 27 | 1. *Exploratory data analysis:* **Explore** patent data and quickly **visualize** important attributes 28 | 29 | - Quickly summarize patent data by relevant columns to get document count 30 | - View standard histogram, tile, and facet plots of important information 31 | - Extract claim information for **wordcloud** analysis 32 | - Interact with your data on the **Shiny** user interface 33 | 34 | 1. *Reporting:* Export your data as **powerpoints** and **PDFs** 35 | 36 | - Browse through the **many example plots** 37 | - Download your charts locally as a **PDF** or make your own **PPTX** 38 | 39 | **Note:** The Shiny app works only with `xlsx` data. 
Simply upload the data file, click "clean", and then you can view a straightforward graph and the raw data. 40 | 41 | There are three core date sets available, all based on autonomous car patent sets: `acars` (from Sumobrain.com), `acarsLens`, and `acarsGoogle`. All data sets are reproducible and their sources can be found in their documentation. 42 | 43 | Data Input and Data Sources 44 | --------------------------- 45 | 46 | Choose your data from Sumobrain.com for excel files, or Lens.org and Google Patents for `csv` files. 47 | 48 | You can read in patent data files from publicly available sources and clean the data into a more useful, usable format for further analysis. `patentr` has an **interactive** browser that allows you to choose a **list** of files of xlsx format. Alternatively, you can read in your own `csv` files. 49 | 50 | ``` r 51 | # read in xlsx files 52 | file1 <- system.file("extdata/", "sumobrain_autonomous_search1.xlsx", package="patentr") 53 | file2 <- system.file("extdata/", "sumobrain_autonomous_search2.xlsx", package="patentr") 54 | files <- list(file1, file2) 55 | ipData <- importPatentData(rawDataFilePath = files, skipLines = 1) 56 | # example 2 -- a popup window appears for you to choose xlsx files 57 | filePaths <- chooseFiles() 58 | allData <- importPatentData(filePaths) 59 | # example 3 -- read in csv files 60 | google <- read.csv(system.file("testData/","google_autonomous_search.csv", package ="patentr") 61 | skip = skipGoogle, stringsAsFactors = FALSE) 62 | google <- data.frame(lapply(lens,function(x){iconv(x, to = "ASCII")}), stringsAsFactors = FALSE) 63 | 64 | ``` 65 | 66 | Clean Data 67 | ---------- 68 | 69 | There are ten different cleaning functions available, all wrapped up nicely into the `cleanPatentData` function. This single function can save you hours of work cleaning and processing your data. 
Read the documentation carefully, as there are a number of time-saving preloaded variables to name the columns, process the dates, clean up the assignee names, and much more. 70 | 71 | For excel files, use the `cleanPatentData` function directly. For csv files, use the pre-processing lines below. 72 | 73 | Clean data uses `extract` functions that take in character vectors and return extracted metadata useful in patent data analysis. A master cleaner function bundles all these functions together. The user also has the ability to use the functions one-by-one for custom analysis. 74 | 75 | ``` r 76 | lensRawData <- system.file("extdata", "lens_autonomous_search.csv", package = "patentr") 77 | lensRawData <- read.csv(lensRawData, stringsAsFactors = FALSE, skip = skipLens) 78 | lensRawData <- data.frame(lapply(lensRawData, 79 | function(x){iconv(x, to = "ASCII")}), stringsAsFactors = FALSE) 80 | lens <- cleanPatentData(patentData = lensRawData, columnsExpected = lensColumns, 81 | cleanNames = lensNames, dateFields = lensDateFields, dateOrders = lensDateOrder, 82 | deduplicate = TRUE, cakcDict = patentr::cakcDict, docLengthTypesDict = patentr::docLengthTypesDict, 83 | keepType = "grant", firstAssigneeOnly = TRUE, assigneeSep = ";;", stopWords = patentr::assigneeStopWords) 84 | ``` 85 | 86 | Exploratory Analysis 87 | -------------------- 88 | 89 | The exploratory analysis includes simple summaries and numerous graphings. Ideally, a patent analyst needs to add the following columns to the cleaned data to make full use of the package: \* score \* category \* feature 1 (main feature) \* feature 2 (secondary feature) 90 | 91 | For the purpose of this first package, all examples come with a pre-built 0 to 3 score, 3 being the highest. Categories are also predefined, as is feature 1. These are important variables that require days to weeks of a patent analysts time, thus, in future realeases an expert-tagged data set will be available. 
92 | 93 | A simple example is the word cloud. We load a file, deduplicate it, and then quickly view the top phrases. Another example is a simple facet that shows the category of a patent technology, along with the major feature of that patent. For example, every autonomous car technology category has a lane feature, as staying in the lane for a car is important. 94 | 95 | ![Wordcloud](vignettes/Rplot01.png) ![Tile plot](vignettes/Rplot.png) 96 | 97 | Reporting 98 | --------- 99 | 100 | The package allows the user to output a set of pre-defined plots and summary information. There are pdf and pptx options. 101 | 102 | ``` r 103 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 104 | cleanNames = sumobrainNames, dateFields = sumobrainDateFields, dateOrders = sumobrainDateOrder, 105 | deduplicate = TRUE, cakcDict = patentr::cakcDict, docLengthTypesDict = patentr::docLengthTypesDict, 106 | keepType = "grant", firstAssigneeOnly = TRUE, assigneeSep = ";", stopWords = patentr::assigneeStopWords) 107 | 108 | score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 109 | score[score>3] <- 3; score[score<0] <- 0; sumo$score <- score 110 | sumo$assigneeSmall <- strtrim(sumo$assigneeClean,12) 111 | category <- c("system","control algorithm","product","control system", "communication") 112 | c <- round(rnorm(dim(sumo)[1],mean=2.5,sd=1.5)); c[c>5] <- 5; c[c<1] <- 1; sumo$category <- category[c] 113 | xVal = "category"; fillVal = "score"; facetVal = "assigneeSmall" 114 | # create a ppt 115 | ppt <- ReporteRs::pptx(title="IP Update") 116 | layoutTitleContent = "Title and Content" 117 | fp <- facetPlot(subset(sumo, score > 0), xVal, fillVal, facetVal, colors = patentr::scoreColors, recolor = FALSE) 118 | ppt <- addFullImagePptx(ppt, plot = fp, slide_layout = layoutTitleContent) 119 | # find a data folder and write it out to your folder 120 | out <- paste("data/",Sys.Date(),"_exampleChartRightTextLeft.pptx",sep='') 121 | ReporteRs::writeDoc(ppt, 
out) 122 | ``` 123 | 124 | Upcoming Features in the Second Release 125 | --------------------------------------- 126 | 127 | The next round of patent features will include: 128 | 129 | - Directly edit patent data in Shiny, or upload and redownload the data to Google Spreadsheets or excel 130 | - Utilize a custom template set to auto-generate a powerpoint presentation 131 | - Use supervised learning to semi-automate the classification of: 132 | - score 133 | - category 134 | - feature 1 (main feature) 135 | - feature 2 (secondary feature) 136 | -------------------------------------------------------------------------------- /R/acars.R: -------------------------------------------------------------------------------- 1 | ## kamil bojanczyk start 2 | #' Autonomous Vehicle Patent Data from Sumobrain.com 3 | #' 4 | #' An example data set of autonomous vehicle IP from major assignees. 5 | #' 6 | #' The data search was performd on Monday, March 13, 2017 from sumobrain.com, and the exact 7 | #' search term was: 8 | #' 9 | #' \code{ABST/"autonomous" AN/(Apple* OR Google* OR Waymo* OR Tesla*} 10 | #' 11 | #' \code{OR Ford* OR General*) PD/12/13/1790->3/13/2017} 12 | #' 13 | #' View the search \href{http://www.sumobrain.com/result.html?p=1&stemming=on&sort=chron&uspat=on&usapp=on&eupat=on&jp=on&pct=on&collections=&srch=xprtsrch&date_range=all&hits=502&from_ss=&srch_id=&srch_name=&search_name=&selected_doc_flag=&selected_newdoc_flag=&selected_portfolio=&portfolio_name=&query_txt=ABST\%2F\%22autonomous\%22+AN\%2F\%28Apple*+OR+Google*+OR+Waymo*+OR+Tesla*+OR+Ford*+OR+General*\%29+PD\%2F12\%2F13\%2F1790-\%3E3\%2F13\%2F2017&search.x=0&search.y=0&search=search_ezy}{here}. 14 | #' 15 | #' 16 | #' For all collections (US patents, applications, EP documents, abstracts of Japan, and WIPO). 
17 | #' 18 | #' Can get raw data with the following commands: 19 | #' 20 | #' \code{system.file("extdata", "sumobrain_autonomous_search1.xls", package = "patentr")} 21 | #' 22 | #' \code{system.file("extdata", "sumobrain_autonomous_search2.xls", package = "patentr")} 23 | #' 24 | #' 25 | #' @name acars 26 | #' @docType data 27 | #' @keywords data 28 | #' 29 | #' 30 | #' @format 31 | #' A data frame with 499 observations on 10 variables. 32 | #' \describe{ 33 | #' \item{docNum}{A published document number including the kind code, publication number, 34 | #' and kind code for the patent document.} 35 | #' \item{docTypeSumobrain}{Very similar to the country code, with minor additions, USAPP being the 36 | #' most noticable difference. } 37 | #' \item{pubDate}{Publication Date} 38 | #' \item{title}{Title} 39 | #' \item{abstract}{Abstract} 40 | #' \item{inventors}{Inventor Name} 41 | #' \item{assignee}{Assignee} 42 | #' \item{appNum}{Application Number} 43 | #' \item{dateFiled}{Filing Date} 44 | #' \item{classPrimary}{Primary Class} 45 | #' \item{classOthers}{Other Classes} 46 | #' } 47 | #' 48 | #' @seealso \url{http://www.sumobrain.com} You will need to create a free account to export data. 49 | #' 50 | #' \code{\link{acarsGoogle}} provides a similar search from Google. 51 | #' \code{\link{acarsLens}} provides a simialr search from Lens.org. 52 | #' 53 | "acars" 54 | 55 | 56 | #' Autonomous Vehicle Patent Data from Google Patents 57 | #' 58 | #' An example data set of autonomous vehicle IP from major assignees. 59 | #' 60 | #' The first row in the raw CSV export contains the search URL and is skipped. 61 | #' 62 | #' The data search was performd on Saturday, March 18, 2017 from patents.google.com, and the exact 63 | #' search: \href{https://patents.google.com/?q=AB\%3dautonomous&assignee=Apple*,Google*,Waymo*,Tesla*,Ford*,General*&before=filing:20170318}{Google Patents Search} 64 | #' For all countries available on Google. 
#'
#' You process the raw data with the following commands:
#'
#' \code{temp <- system.file("extdata", "google_autonomous_search.csv", package = "patentr")}
#'
#' \code{# from the source package you can navigate to }
#'
#' \code{temp <- read.csv("inst/extdata/google_autonomous_search.csv", skip = 1, stringsAsFactors = FALSE)}
#'
#' \code{names(temp) <- googleNames}
#'
#' \code{temp <- data.frame(lapply(temp, function(x){iconv(x,to="ASCII")}),stringsAsFactors = FALSE)}
#'
#' \code{dateFields <- c("priorityDate","dateFiled","pubDate","grantDate")}
#'
#' \code{temp[dateFields] <- as.data.frame(lapply(temp[dateFields], as.Date, format="\%m/\%d/\%y"))}
#'
#'
#' @name acarsGoogle
#' @docType data
#' @keywords data
#'
#'
#' @format
#' A data frame with 316 observations on 9 variables.
#' \describe{
#' \item{\code{docNum}}{A published document number including the country code, publication number,
#' and kind code for the patent document.}
#' \item{\code{title}}{The title of the invention.}
#' \item{\code{assignee}}{The owner of the document.}
#' \item{\code{inventors}}{The name(s) of the inventor(s), separated by commas.}
#' \item{\code{priorityDate}}{The earliest priority date on the application.}
#' \item{\code{dateFiled}}{Date the document was filed. They call it filing/creation date.}
#' \item{\code{pubDate}}{Date document became publicly available.}
#' \item{\code{grantDate}}{Date the application became a grant. NA if there is no associated grant.}
#' \item{\code{googleURL}}{The link to the Google Patents page for the document.}
#' }
#'
#' @seealso \url{https://patents.google.com/}
#'
#' \code{\link{acars}} provides a similar search from Sumobrain.
#' \code{\link{acarsLens}} provides a similar search from Lens.org.
107 | #' 108 | "acarsGoogle" 109 | 110 | ## kamil bojanczyk end 111 | ## yang yao start 112 | #' Autonomous Vehicle Patent Data from Lens Patent Search 113 | #' 114 | #' An example data set of autonomous vehicle IP from major assignees. 115 | #' 116 | #' The data search was performd on Saturday, March 18, 2017 from lens.org, and the exact 117 | #' search: 118 | #' 119 | #' \href{https://www.lens.org/lens/search?q=abstract\%3Aautonomous+\%26\%26+applicant\%3A\%28Apple*+OR+Google*+OR+Waymo*+OR+Tesla*+OR+Ford*+OR+General*\%29&predicate=\%26\%26&l=en}{Lens Patents Search} 120 | #' 121 | #' For all countries available on Lens. 122 | #' 123 | #' Can get raw data with the following commands: 124 | #' 125 | #' \code{temp <- system.file("extdata", "lens_autonomous_search.csv", package = "patentr")} 126 | #' 127 | #' \code{temp <- read.csv(temp, stringsAsFactors = FALSE)} 128 | #' 129 | #' \code{temp <- data.frame(lapply(temp, function(x){iconv(x,to="ASCII")}),stringsAsFactors = FALSE)} 130 | #' 131 | #' \code{names(temp) <- lensNames} 132 | #' 133 | #' \code{temp$dateFiled <- as.Date(temp$dateFiled, format = '\%m/\%d/\%y')} 134 | #' 135 | #' \code{temp$pubDate <- as.Date(temp$pubDate, format='\%m/\%d/\%y')} # note that % y is system-specific and may not work everywhere. 136 | #' 137 | #' \code{colsNum <- c("resultNum","citeCount","familySimpleCount","familyExtendedCount", "seqCount")} 138 | #' 139 | #' \code{temp[colsNum] <- sapply(temp[colsNum], as.numeric)} 140 | #' 141 | #' \code{temp$hasFullText <- sapply(temp$hasFullText, function(x) ifelse(x=="yes",TRUE,FALSE))} 142 | #' 143 | #' @name acarsLens 144 | #' @docType data 145 | #' @keywords data 146 | #' 147 | #' 148 | #' @format 149 | #' A data frame with 863 observations on 26 variables. 
150 | #' \describe{ 151 | #' \item{resultNum}{The search result number.} 152 | #' \item{countryCode}{The jurisdiction of the patent document.} 153 | #' \item{\code{kindCode}}{The kind code.} 154 | #' \item{docNum}{The published document number with country code and kind code included.} 155 | #' \item{lensID}{The unique identification number of the document on lens.org} 156 | #' \item{pubDate}{Date the document was published.} 157 | #' \item{pubYear}{Year the document published.} 158 | #' \item{appNum}{The filing number of the application (country code, number, and abridged kind code, typically 'A')} 159 | #' \item{dateFiled}{Date the application for the patent document was filed.} 160 | #' \item{priorityApps}{Applications this patent document claims priority. 161 | #' Format: Country code, application number, A = application or P = provisional, YYYYMMDD of priority. 162 | #' Multiple application separated by a double semi-colon.} 163 | #' \item{title}{The title of the document.} 164 | #' \item{assignee}{The name of the applicant(s) at the time of filing.} 165 | #' \item{inventors}{The inventor(s).} 166 | #' \item{lensURL}{The lens.org URL for the document.} 167 | #' \item{docTypeLens}{A lens.org mapping of the doc type. 
168 | #' Granted, application, ambiguous, unknown, search report, and possibly more values.} 169 | #' \item{hasFullText}{A logical value to show if there is a full text available from lens.org} 170 | #' \item{citeCount}{The number of times this document is cited, also known as forward citations.} 171 | #' \item{familySimpleCount}{The number of unique documents in the immediate patent family.} 172 | #' \item{familyExtendedCount}{The number of unique documents sharing a priority applicaiton in the extended family.} 173 | #' \item{seqCount}{Used in biological applications -- the number of sequences in the application.} 174 | #' \item{cpcClasses}{The CPC classification codes, separated by a double semi-colon.} 175 | #' \item{ipcrClasses}{The IPCR classification codes, separated by a double semi-colon.} 176 | #' \item{usClasses}{The US classification codes, separated by a double semi-colon.} 177 | #' \item{pubmedID}{A pubmed ID to any related research.} 178 | #' \item{DOI}{A digital object identifier. 179 | #' Go to doi.org and paste the value to get the associated research paper.} 180 | #' \item{npl}{Non-patent literature, or citations of non-patent sources. 181 | #' Separated with double semi-colons.} 182 | #' 183 | #' 184 | #' } 185 | #' 186 | #' @seealso \url{www.lens.org} You can export without an account, or can create 187 | #' an account to save your searches. 188 | #' 189 | #' \code{\link{acarsGoogle}} provides a similar search from Google. 190 | #' \code{\link{acars}} provides a similar search from sumobrain. 
191 | #' 192 | "acarsLens" 193 | ## yang yao end -------------------------------------------------------------------------------- /vignettes/summary.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "patentr" 3 | author: "Kamil Bojanczyk; Yao Yang" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{patentr} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | The package is aimed at patent agents, lawyers, managers, analysts, and 13 | academics who are working on patents. This may be used in a patent landscape 14 | analysis, company IP portfolio analysis, or a freedom to operate search. 15 | 16 | This is a data processing and reporting tool on patent data sets for 17 | patent analysts. The motivation comes from a lack of useful, exportable 18 | patent data. `patentr` builds upon the free data available from Sumobrain.com, 19 | Lens.org, and Google Patents, leveraging their data to summarize and analyze 20 | patents. 21 | 22 | `patentr` performs four key functions: 23 | 24 | 1. *Data input:* Easily **import** patent excel and csv files from the top patent websites 25 | + `CSV` from Google Patents and lens.org 26 | + `xlsx` from sumobrain.com 27 | 2. *Data cleaning:* **Sanitize** patent data and extract useful metadata for custom analyses 28 | + Clean up important fields such as names, dates, country codes, and kind codes 29 | + Infer the document type so that you don't analyze the same patent twice 30 | + Deduplicate data sets and prioritize grants over applications 31 | + Use the generated Google URL to jump to the patent document or to download 32 | claim data using the included `httr` and `XML` functions 33 | 3. 
*Exploratory data analysis:* **Explore** patent data and quickly **visualize** important attributes 34 | + Quickly summarize patent data by relevant columns to get document count 35 | + View standard histogram, tile, and facet plots of important information 36 | + Extract claim information for **wordcloud** analysis 37 | + Interact with your data on the **Shiny** user interface 38 | 4. *Reporting:* Export your data as **powerpoints** and **PDFs** 39 | + Browse through the **many example plots** 40 | + Download your charts locally as a **PDF** or make your own **PPTX** 41 | 42 | **Note:** The Shiny app works only with `xlsx` data. Simply upload the data file, 43 | click "clean", and then you can view a straightforward graph and the raw data. 44 | 45 | There are three core date sets available, all based on autonomous car patent sets: 46 | `acars` (from Sumobrain.com), `acarsLens`, and `acarsGoogle`. All data sets are 47 | reproducible and their sources can be found in their documentation. 48 | 49 | ## Data Input and Data Sources 50 | 51 | Choose your data from Sumobrain.com for excel files, or Lens.org and Google Patents 52 | for `csv` files. 53 | 54 | You can read in patent data files from publicly available sources and clean the 55 | data into a more useful, usable format for further analysis. `patentr` has an 56 | **interactive** browser that allows you to choose a **list** of files of xlsx 57 | format. Alternatively, you can read in your own `csv` files. 
58 | ```{r, eval=FALSE} 59 | # read in xlsx files 60 | file1 <- system.file("extdata/", "sumobrain_autonomous_search1.xlsx", package="patentr") 61 | file2 <- system.file("extdata/", "sumobrain_autonomous_search2.xlsx", package="patentr") 62 | files <- list(file1, file2) 63 | ipData <- importPatentData(rawDataFilePath = files, skipLines = 1) 64 | # example 2 -- a popup window appears for you to choose xlsx files 65 | filePaths <- chooseFiles() 66 | allData <- importPatentData(filePaths) 67 | # example 3 -- read in csv files 68 | google <- read.csv(system.file("testData/","google_autonomous_search.csv", package ="patentr") 69 | skip = skipGoogle, stringsAsFactors = FALSE) 70 | google <- data.frame(lapply(lens,function(x){iconv(x, to = "ASCII")}), stringsAsFactors = FALSE) 71 | 72 | 73 | ``` 74 | 75 | 76 | ## Clean Data 77 | 78 | There are ten different cleaning functions available, all wrapped up nicely into 79 | the `cleanPatentData` function. This single function can save you hours of work 80 | cleaning and processing your data. Read the documentation carefully, as there are 81 | a number of time-saving preloaded variables to name the columns, process the 82 | dates, clean up the assignee names, and much more. 83 | 84 | For excel files, use the `cleanPatentData` function directly. For csv files, 85 | use the pre-processing lines below. 86 | 87 | Clean data uses `extract` functions that take in character vectors and return 88 | extracted metadata useful in patent data analysis. A master cleaner function 89 | bundles all these functions together. The user also has the ability to use the 90 | functions one-by-one for custom analysis. 
91 | 92 | ```{r, eval=FALSE} 93 | lensRawData <- system.file("extdata", "lens_autonomous_search.csv", package = "patentr") 94 | lensRawData <- read.csv(lensRawData, stringsAsFactors = FALSE, skip = skipLens) 95 | lensRawData <- data.frame(lapply(lensRawData, 96 | function(x){iconv(x, to = "ASCII")}), stringsAsFactors = FALSE) 97 | lens <- cleanPatentData(patentData = lensRawData, columnsExpected = lensColumns, 98 | cleanNames = lensNames, dateFields = lensDateFields, dateOrders = lensDateOrder, 99 | deduplicate = TRUE, cakcDict = patentr::cakcDict, docLengthTypesDict = patentr::docLengthTypesDict, 100 | keepType = "grant", firstAssigneeOnly = TRUE, assigneeSep = ";;", stopWords = patentr::assigneeStopWords) 101 | ``` 102 | 103 | 104 | ## Exploratory Analysis 105 | 106 | The exploratory analysis includes simple summaries and numerous graphings. Ideally, 107 | a patent analyst needs to add the following columns to the cleaned data to make 108 | full use of the package: 109 | * score 110 | * category 111 | * feature 1 (main feature) 112 | * feature 2 (secondary feature) 113 | 114 | For the purpose of this first package, all examples come with a pre-built 0 to 3 115 | score, 3 being the highest. Categories are also predefined, as is feature 1. These 116 | are important variables that require days to weeks of a patent analysts time, thus, 117 | in future realeases an expert-tagged data set will be available. 118 | 119 | A simple example is the word cloud. We load a file, deduplicate it, and then 120 | quickly view the top phrases. Another example is a simple facet that shows 121 | the category of a patent technology, along with the major feature of that patent. 122 | For example, every autonomous car technology category has a lane feature, as 123 | staying in the lane for a car is important. 
124 | 125 | ![Wordcloud](Rplot01.png) ![Tile plot](Rplot.png) 126 | ```{r warning=FALSE, echo = FALSE, message=FALSE, eval=FALSE} 127 | devtools::load_all() 128 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 129 | cleanNames = sumobrainNames, 130 | dateFields = sumobrainDateFields, 131 | dateOrders = sumobrainDateOrder, 132 | deduplicate = TRUE, 133 | cakcDict = patentr::cakcDict, 134 | docLengthTypesDict = patentr::docLengthTypesDict, 135 | keepType = "grant", 136 | firstAssigneeOnly = TRUE, 137 | assigneeSep = ";", 138 | stopWords = patentr::assigneeStopWords) 139 | 140 | # df <- dplyr::select(sumo, title, abstract) 141 | df <- sumo[,c("title","abstract")] 142 | wordCloudIt(df, excludeWords, minfreq = 20, 143 | random.order = FALSE, rot.per = 0.25) 144 | 145 | 146 | # note that in reality, you need a patent analyst to carefully score 147 | # these patents, the score here is for demonstrational purposes 148 | score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 149 | score[score>3] <- 3; score[score<0] <- 0 150 | sumo$score <- score 151 | sumo$assigneeSmall <- strtrim(sumo$assigneeClean,12) 152 | category <- c("system","control algorithm","product","control system", "communication") 153 | c <- round(rnorm(dim(sumo)[1],mean=2.5,sd=1.5)) 154 | c[c>5] <- 5; c[c<1] <- 1 155 | sumo$category <- category[c] 156 | feature1 <- c("adaptive", "park", "lane", NA,NA,NA,NA,NA, 157 | "brake", "steer","accelerate","deactivate") 158 | f <- round(rnorm(dim(sumo)[1],mean=5,sd=1)) 159 | l <- length(feature1) 160 | f[f>l] <- l; f[f<1] <- 1 161 | sumo$feature1 <- c(feature1,feature1[f])[1:dim(sumo)[1]] 162 | 163 | tilePlot(sumo, "category", "feature1", xangle = 90, xhjust=0) 164 | 165 | 166 | ``` 167 | 168 | 169 | 170 | ## Reporting 171 | The package allows the user to output a set of pre-defined plots and 172 | summary information. There are pdf and pptx options. 
173 | 174 | ```{r, eval = FALSE} 175 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 176 | cleanNames = sumobrainNames, dateFields = sumobrainDateFields, dateOrders = sumobrainDateOrder, 177 | deduplicate = TRUE, cakcDict = patentr::cakcDict, docLengthTypesDict = patentr::docLengthTypesDict, 178 | keepType = "grant", firstAssigneeOnly = TRUE, assigneeSep = ";", stopWords = patentr::assigneeStopWords) 179 | 180 | score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 181 | score[score>3] <- 3; score[score<0] <- 0; sumo$score <- score 182 | sumo$assigneeSmall <- strtrim(sumo$assigneeClean,12) 183 | category <- c("system","control algorithm","product","control system", "communication") 184 | c <- round(rnorm(dim(sumo)[1],mean=2.5,sd=1.5)); c[c>5] <- 5; c[c<1] <- 1; sumo$category <- category[c] 185 | xVal = "category"; fillVal = "score"; facetVal = "assigneeSmall" 186 | # create a ppt 187 | ppt <- ReporteRs::pptx(title="IP Update") 188 | layoutTitleContent = "Title and Content" 189 | fp <- facetPlot(subset(sumo, score > 0), xVal, fillVal, facetVal, colors = patentr::scoreColors, recolor = FALSE) 190 | ppt <- addFullImagePptx(ppt, plot = fp, slide_layout = layoutTitleContent) 191 | # find a data folder and write it out to your folder 192 | out <- paste("data/",Sys.Date(),"_exampleChartRightTextLeft.pptx",sep='') 193 | ReporteRs::writeDoc(ppt, out) 194 | ``` 195 | 196 | 197 | ## Upcoming Features in the Second Release 198 | 199 | The next round of patent features will include: 200 | 201 | * Directly edit patent data in Shiny, or upload and redownload the data to 202 | Google Spreadsheets or excel 203 | * Utilize a custom template set to auto-generate a powerpoint presentation 204 | * Use supervised learning to semi-automate the classification of: 205 | + score 206 | + category 207 | + feature 1 (main feature) 208 | + feature 2 (secondary feature) -------------------------------------------------------------------------------- 
/R/processPatentData.R: -------------------------------------------------------------------------------- 1 | ## kamil bojanczk start 2 | #' Create a URL link to Google patents 3 | #' 4 | #' @description Create a URL string to link you to Google Patents. 5 | #' 6 | #' By concatenating the country code, publication number, and kind code, you can 7 | #' generate a URL to link you to google patents for further exploration. This 8 | #' feature is especially useful when browsing the data in a spreadsheet or in 9 | #' a Shiny app. It is also useful for extracting data from the HTML content. 10 | #' 11 | #' As each website (Google, lens.org, sumobrain.com, etc..) has a different 12 | #' method for generating patent URLs, these functions are website-specific. 13 | #' 14 | #' The original Google patents version still works as of March 2017 and the 15 | #' \code{googleURL} value is \code{https://www.google.com/patents/}. This older 16 | #' version may be easier to extract data. 17 | #' 18 | #' @param countryCode A character vector of the country code of the document. 19 | #' Typically a two-letter character. 20 | #' @param pubNum A character vector of the numeric portion of a publication number. 21 | #' @param kindCode character vector of the kind code of a document. If not available, 22 | #' enter a blank string "". 23 | #' @param googleURL A character string of the URL to Google Patents, with working 24 | #' default value. 25 | #' @param lang The language you want to read the patent, default set to "en" english. 26 | #' 27 | #' @return A character vector of properly formatted URL strings. 
#'
#' @examples
#' acars$countryCode <- extractCountryCode(acars$docNum)
#' acars$kindCode <- extractKindCode(acars$docNum)
#' acars$pubNum <- extractPubNumber(acars$docNum)
#' acars$googleURL <- createGoogleURL(countryCode = acars$countryCode,
#'   pubNum = acars$pubNum, kindCode = acars$kindCode)
#' head(acars$googleURL)
#'
#' @export
createGoogleURL <- function(countryCode, pubNum, kindCode,
                            googleURL = "https://patents.google.com/patent/",
                            lang = "en") {
  # Concatenate base URL + country code + publication number + kind code +
  # "/<lang>". paste0 is vectorized, so whole data-frame columns can be passed
  # in at once; it is the idiomatic form of paste(..., sep = '').
  paste0(googleURL, countryCode, pubNum, kindCode, "/", lang)
  # TODO: validate the URL
  # http://stackoverflow.com/questions/28527100/check-if-https-hypertext-transfer-protocol-secure-url-is-valid
}



#' Get a claim from a Google Patents URL
#'
#' @description Input a valid Google Patents URL of the form given below and
#' then get back a claim from the index of your choosing. If no claim exists or
#' if your index is out of bounds, an empty character string returns.
#'
#' The function works on strings that begin with the following sequence:
#' \code{https://patents.google.com/patent/}. If the string sequence afterwards
#' is invalid, a 404 status returns from the GET command and eventually an empty
#' string returns.
#'
#'
#'
#'
#' @return A character vector of the claim from each Google URL. If no claim exists,
#' or if the country code is not on the inclusion list, an empty character value is returned
#' for that index.
#'
#' @param googleURL The well-formatted google URL built from \code{\link{createGoogleURL}}.
#' It is a character value.
#' @param langCode The language code, used to check for non-english text.
#' @param whichClaim Default set to 1, a numeric determining which claim to get. Note
#' if claim is out of bounds, the return claim will be a blank character.
#'
#'
#' @seealso \code{\link{createGoogleURL}}, \code{\link{extractCountryCode}},
#' \code{\link{cleanGoogleURL}}
#'
#' @examples
#'
#' \dontrun{
#' # works for USA
#' aclaim <- getClaimFromURL("https://patents.google.com/patent/US8818682B1/en")
#' print(aclaim)
#' # test WO, EP
#' woclaim <- getClaimFromURL("https://patents.google.com/patent/WO2015134152A1/en")
#' print(woclaim)
#' epclaim <- getClaimFromURL("https://patents.google.com/patent/EP2991875A1/en")
#' print(epclaim)
#' # test KR, JP, CN
#' krclaim <- getClaimFromURL("https://patents.google.com/patent/KR20150127745A/en")
#' cnclaim <- getClaimFromURL("https://patents.google.com/patent/CN104786953A/en")
#' jpclaim <- getClaimFromURL("https://patents.google.com/patent/JP2016173842A/en")
#' declaim <- getClaimFromURL("https://patents.google.com/patent/DE102014219223A1/en")
#' }
#'
#' @export
#'
#' @importFrom XML xpathSApply
#' @importFrom XML xmlValue
#' @importFrom XML getNodeSet
#' @importFrom XML htmlParse
#' @importFrom httr GET
#'
getClaimFromURL <- function(googleURL, langCode="en", whichClaim = 1){

  # Normalize the URL so it ends in "/<langCode>" before fetching; see
  # cleanGoogleURL() for the repair rules.
  googleURL <- cleanGoogleURL(googleURL = googleURL, langCode = langCode)

  # pd = patent data: fetch the page and parse the HTML into a node tree.
  pd1 <- httr::GET(url = googleURL)
  pd2 <- XML::htmlParse(pd1)
  # US-style grant pages mark each claim with <div class="claim">.
  # future mode will have an input vector of options to choose from
  pd3 <- XML::getNodeSet(pd2, "//div[@class='claim']")

  # pc = patent claim: extract the whichClaim-th claim node if it exists.
  if(length(pd3)>=whichClaim){

    # US-style markup matched: take the text of the requested claim div.
    pc <- XML::xmlValue(pd3[[whichClaim]])

  }
  else{
    # WO/EP-style pages use <claim> elements instead of divs.
    pd3 <- XML::getNodeSet(pd2, "//claim")
    if(length(pd3)>=whichClaim){
      pc <- XML::xmlValue(pd3[[whichClaim]])


    } else{
      # catch all: neither markup style yielded enough claims.
      pc <- ""
    }

  }

  # For English pages, try to keep only the untranslated ("notranslate") spans
  # of the claim, dropping machine-translated duplicate text.
  # NOTE(review): at this point pd3 may hold the "//claim" node set assigned in
  # the else-branch above, so this length check runs against whichever query
  # executed last -- confirm that is intended.
  if(langCode == "en" && (length(pd3) >= whichClaim) ){

    # NOTE(review): this first assignment is dead code -- it is immediately
    # overwritten by the broader contains() query on the next line.
    pd3 <- XML::getNodeSet(pd2, "//div[@class='claim']")
    pd3 <- XML::getNodeSet(pd2, "//div[contains(@class,'claim')]")
    # Collect the text of the notranslate spans inside the numbered claim div.
    pd4 <- XML::getNodeSet(pd3[[whichClaim]],
                           paste0("//div[@num=",whichClaim,"]//span[@class='notranslate']/text()"))
    pd5 <- paste((sapply(pd4, XML::xmlValue)), collapse = "")
    # if returns a value and less than the original printout
    # replace pc with the new "cleaner" version
    # note this may have issues and "too much" may be returned
    # requires further testing, (nchar(pd5) < nchar(pc)) may need to
    if( nchar(pd5) > 1 ){
      pc <- pd5
    }
    # Strip everything except letters, digits, and spaces.
    pc <- gsub("[^[:alnum:] ]","",pc)
  }


  # trim to remove new lines and numbering with a period
  # NOTE(review): the dot in "[0-9]." is unescaped, so this deletes each digit
  # plus whatever character follows it (not just "digit-period"); likely meant
  # to be "[0-9]\\." -- confirm before changing, as tests may depend on it.
  pc <- trimws(gsub("\\n|[0-9].", "", pc))
  # Collapse runs of whitespace into single spaces.
  pc <- gsub("\\s+"," ", pc)
  # return a trimmed version of the claim
  return(pc)

}
# later want to get the # of claims
# http://stackoverflow.com/questions/8702039/how-to-find-the-max-attribute-from-an-xml-document-using-xpath-1-0
# do something like this /library/book[@id = max(/library/book/@id)]
# may need to sleep to not call too many
# https://stat.ethz.ch/R-manual/R-devel/library/base/html/Sys.sleep.html




#' Sanitize a Google URL before attempting to extract data
#'
#' @description Clean up the google URL to make sure it will be read properly.
#'
#' If you use the \code{\link{createGoogleURL}} function, you won't have to use this function.
#' However, if you use your own generator or want to change the language, use this
#' function to do so.
180 | #' 181 | #' @param googleURL A character value of a google URL. 182 | #' @param langCode A language code, default set to "en" English. 183 | #' 184 | #' @return A clean character vector of a Google Patents URL. 185 | #' 186 | #' @export 187 | #' 188 | #' @examples 189 | #' 190 | #' cleanGoogleURL("https://patents.google.com/patent/US8818682B1/mx") 191 | #' cleanGoogleURL("https://patents.google.com/patent/US8818682B1/") 192 | #' cleanGoogleURL("https://patents.google.com/patent/US8818682B1") 193 | #' cleanGoogleURL("https://patents.google.com/patent/US8818682B1/en") 194 | #' 195 | #' @seealso \code{\link{createGoogleURL}} 196 | #' 197 | cleanGoogleURL <- function(googleURL, langCode="en"){ 198 | 199 | expr <- paste0("\\/",langCode) 200 | # if the last two digist are not the language code, attempt to fix it 201 | if(regexpr(expr,googleURL)==-1L){ 202 | 203 | # 3 types of errors 204 | # 1 /en <--> /mx replace lang code 205 | # 2 /en <--> / add lang code 206 | # 3 /en <--> '' doesn't exist, add lang code and backslash 207 | 208 | if(regexpr("\\/[A-Za-z]{2}$",googleURL)>-1L){ 209 | googleURL <- gsub("\\/[A-Za-z]{2}$",paste0("/",langCode),googleURL) 210 | } else if(regexpr("\\/$",googleURL)>-1L){ 211 | googleURL <- gsub("\\/$",paste0("/",langCode),googleURL) 212 | } else{ 213 | # warning, attempting to generate URL, this may fail 214 | googleURL <- gsub("$",paste0("/",langCode),googleURL) 215 | } 216 | 217 | } 218 | return(googleURL) 219 | } 220 | 221 | #' Get claims data for all rows in a data frame 222 | #' 223 | #' @description Generate claims data for all rows in a data frame. 224 | #' 225 | #' This is a wrapper function for the \code{\link{getClaimFromURL}} function. 226 | #' 227 | #' @param googleURLs A character vector of Google URLs 228 | #' @param langCode A language code, default set to "en" 229 | #' @param whichClaim Which claim (if available) to return. Default set to 1st. 
230 | #' 231 | #' @export 232 | #' 233 | #' @examples 234 | #' 235 | #' \dontrun{ 236 | #' cc <- extractCountryCode(acars$docNum) 237 | #' pn <- extractPubNumber(acars$docNum) 238 | #' kc <- extractKindCode(acars$docNum) 239 | #' urls <- createGoogleURL(countryCode = cc, pubNum = pn ,kindCode = kc) 240 | #' urls <- urls[1:4] 241 | #' clms <- getClaimsText(urls) 242 | #' clms[1] 243 | #' } 244 | #' @seealso \code{\link{createGoogleURL}}, \code{\link{cleanGoogleURL}}, 245 | #' \code{\link{getClaimFromURL}} 246 | #' 247 | getClaimsText <- function(googleURLs, langCode="en",whichClaim=1){ 248 | sapply(googleURLs, function(x){ 249 | getClaimFromURL(googleURL = x, langCode = langCode, whichClaim = whichClaim) 250 | }) 251 | } 252 | 253 | 254 | 255 | ## kamil bojanczk end -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | title: "patentr" 4 | date: "`r Sys.Date()`" 5 | author: "Kamil Bojanczyk, Yao Yang" 6 | --- 7 | 8 | 9 | 10 | ```{r, echo = FALSE} 11 | knitr::opts_chunk$set( 12 | collapse = TRUE, 13 | comment = "#>", 14 | fig.path = "README-" 15 | ) 16 | ``` 17 | 18 | Introducing `patentr`, the toolkit for patent data analysis in R. The summary md 19 | file currently holds all documentation. 20 | 21 | 22 | The package is aimed at patent agents, lawyers, managers, analysts, and 23 | academics who are working on patents. This may be used in a patent landscape 24 | analysis, company IP portfolio analysis, or a freedom to operate search. 25 | 26 | This is a data processing and reporting tool on patent data sets for 27 | patent analysts. The motivation comes from a lack of useful, exportable 28 | patent data. `patentr` builds upon the free data available from Sumobrain.com, 29 | Lens.org, and Google Patents, leveraging their data to summarize and analyze 30 | patents. 
`patentr` performs four key functions:

1. *Data input:* Easily **import** patent excel and csv files from the top patent websites
    + `CSV` from Google Patents and lens.org
    + `xlsx` from sumobrain.com
2. *Data cleaning:* **Sanitize** patent data and extract useful metadata for custom analyses
    + Clean up important fields such as names, dates, country codes, and kind codes
    + Infer the document type so that you don't analyze the same patent twice
    + Deduplicate data sets and prioritize grants over applications
    + Use the generated Google URL to jump to the patent document or to download
    claim data using the included `httr` and `XML` functions
3. *Exploratory data analysis:* **Explore** patent data and quickly **visualize** important attributes
    + Quickly summarize patent data by relevant columns to get document count
    + View standard histogram, tile, and facet plots of important information
    + Extract claim information for **wordcloud** analysis
    + Interact with your data on the **Shiny** user interface
4. *Reporting:* Export your data as **powerpoints** and **PDFs**
    + Browse through the **many example plots**
    + Download your charts locally as a **PDF** or make your own **PPTX**

**Note:** The Shiny app works only with `xlsx` data. Simply upload the data file,
click "clean", and then you can view a straightforward graph and the raw data.

There are three core data sets available, all based on autonomous car patent sets:
`acars` (from Sumobrain.com), `acarsLens`, and `acarsGoogle`. All data sets are
reproducible and their sources can be found in their documentation.

## Data Input and Data Sources

Choose your data from Sumobrain.com for excel files, or Lens.org and Google Patents
for `csv` files.
You can read in patent data files from publicly available sources and clean the
data into a more useful, usable format for further analysis. `patentr` has an
**interactive** browser that allows you to choose a **list** of files of xlsx
format. Alternatively, you can read in your own `csv` files.

```{r, eval=FALSE}
# read in xlsx files
file1 <- system.file("extdata/", "sumobrain_autonomous_search1.xlsx", package="patentr")
file2 <- system.file("extdata/", "sumobrain_autonomous_search2.xlsx", package="patentr")
files <- list(file1, file2)
ipData <- importPatentData(rawDataFilePath = files, skipLines = 1)
# example 2 -- a popup window appears for you to choose xlsx files
filePaths <- chooseFiles()
allData <- importPatentData(filePaths)
# example 3 -- read in csv files
google <- read.csv(system.file("testData/", "google_autonomous_search.csv", package = "patentr"),
                   skip = skipGoogle, stringsAsFactors = FALSE)
google <- data.frame(lapply(google, function(x){iconv(x, to = "ASCII")}), stringsAsFactors = FALSE)
```


## Clean Data

There are ten different cleaning functions available, all wrapped up nicely into
the `cleanPatentData` function. This single function can save you hours of work
cleaning and processing your data. Read the documentation carefully, as there are
a number of time-saving preloaded variables to name the columns, process the
dates, clean up the assignee names, and much more.

For excel files, use the `cleanPatentData` function directly. For csv files,
use the pre-processing lines below.

Clean data uses `extract` functions that take in character vectors and return
extracted metadata useful in patent data analysis. A master cleaner function
bundles all these functions together. The user also has the ability to use the
functions one-by-one for custom analysis.
```{r, eval=FALSE}
lensRawData <- system.file("extdata", "lens_autonomous_search.csv", package = "patentr")
lensRawData <- read.csv(lensRawData, stringsAsFactors = FALSE, skip = skipLens)
lensRawData <- data.frame(lapply(lensRawData,
  function(x){iconv(x, to = "ASCII")}), stringsAsFactors = FALSE)
lens <- cleanPatentData(patentData = lensRawData, columnsExpected = lensColumns,
  cleanNames = lensNames, dateFields = lensDateFields, dateOrders = lensDateOrder,
  deduplicate = TRUE, cakcDict = patentr::cakcDict, docLengthTypesDict = patentr::docLengthTypesDict,
  keepType = "grant", firstAssigneeOnly = TRUE, assigneeSep = ";;", stopWords = patentr::assigneeStopWords)
```


## Exploratory Analysis

The exploratory analysis includes simple summaries and numerous graphs. Ideally,
a patent analyst needs to add the following columns to the cleaned data to make
full use of the package:

* score
* category
* feature 1 (main feature)
* feature 2 (secondary feature)

For the purpose of this first package, all examples come with a pre-built 0 to 3
score, 3 being the highest. Categories are also predefined, as is feature 1. These
are important variables that require days to weeks of a patent analyst's time, thus,
in future releases an expert-tagged data set will be available.

A simple example is the word cloud. We load a file, deduplicate it, and then
quickly view the top phrases. Another example is a simple facet that shows
the category of a patent technology, along with the major feature of that patent.
For example, every autonomous car technology category has a lane feature, as
staying in the lane for a car is important.
134 | 135 | ![Wordcloud](vignettes/Rplot01.png) ![Tile plot](vignettes/Rplot.png) 136 | ```{r warning=FALSE, echo = FALSE, message=FALSE, eval=FALSE} 137 | devtools::load_all() 138 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 139 | cleanNames = sumobrainNames, 140 | dateFields = sumobrainDateFields, 141 | dateOrders = sumobrainDateOrder, 142 | deduplicate = TRUE, 143 | cakcDict = patentr::cakcDict, 144 | docLengthTypesDict = patentr::docLengthTypesDict, 145 | keepType = "grant", 146 | firstAssigneeOnly = TRUE, 147 | assigneeSep = ";", 148 | stopWords = patentr::assigneeStopWords) 149 | 150 | # df <- dplyr::select(sumo, title, abstract) 151 | df <- sumo[,c("title","abstract")] 152 | wordCloudIt(df, excludeWords, minfreq = 20, 153 | random.order = FALSE, rot.per = 0.25) 154 | 155 | 156 | # note that in reality, you need a patent analyst to carefully score 157 | # these patents, the score here is for demonstrational purposes 158 | score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 159 | score[score>3] <- 3; score[score<0] <- 0 160 | sumo$score <- score 161 | sumo$assigneeSmall <- strtrim(sumo$assigneeClean,12) 162 | category <- c("system","control algorithm","product","control system", "communication") 163 | c <- round(rnorm(dim(sumo)[1],mean=2.5,sd=1.5)) 164 | c[c>5] <- 5; c[c<1] <- 1 165 | sumo$category <- category[c] 166 | feature1 <- c("adaptive", "park", "lane", NA,NA,NA,NA,NA, 167 | "brake", "steer","accelerate","deactivate") 168 | f <- round(rnorm(dim(sumo)[1],mean=5,sd=1)) 169 | l <- length(feature1) 170 | f[f>l] <- l; f[f<1] <- 1 171 | sumo$feature1 <- c(feature1,feature1[f])[1:dim(sumo)[1]] 172 | 173 | tilePlot(sumo, "category", "feature1", xangle = 90, xhjust=0) 174 | 175 | 176 | ``` 177 | 178 | 179 | 180 | ## Reporting 181 | The package allows the user to output a set of pre-defined plots and 182 | summary information. There are pdf and pptx options. 
183 | 184 | ```{r, eval = FALSE} 185 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 186 | cleanNames = sumobrainNames, dateFields = sumobrainDateFields, dateOrders = sumobrainDateOrder, 187 | deduplicate = TRUE, cakcDict = patentr::cakcDict, docLengthTypesDict = patentr::docLengthTypesDict, 188 | keepType = "grant", firstAssigneeOnly = TRUE, assigneeSep = ";", stopWords = patentr::assigneeStopWords) 189 | 190 | score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 191 | score[score>3] <- 3; score[score<0] <- 0; sumo$score <- score 192 | sumo$assigneeSmall <- strtrim(sumo$assigneeClean,12) 193 | category <- c("system","control algorithm","product","control system", "communication") 194 | c <- round(rnorm(dim(sumo)[1],mean=2.5,sd=1.5)); c[c>5] <- 5; c[c<1] <- 1; sumo$category <- category[c] 195 | xVal = "category"; fillVal = "score"; facetVal = "assigneeSmall" 196 | # create a ppt 197 | ppt <- ReporteRs::pptx(title="IP Update") 198 | layoutTitleContent = "Title and Content" 199 | fp <- facetPlot(subset(sumo, score > 0), xVal, fillVal, facetVal, colors = patentr::scoreColors, recolor = FALSE) 200 | ppt <- addFullImagePptx(ppt, plot = fp, slide_layout = layoutTitleContent) 201 | # find a data folder and write it out to your folder 202 | out <- paste("data/",Sys.Date(),"_exampleChartRightTextLeft.pptx",sep='') 203 | ReporteRs::writeDoc(ppt, out) 204 | ``` 205 | 206 | 207 | ## Upcoming Features in the Second Release 208 | 209 | The next round of patent features will include: 210 | 211 | * Directly edit patent data in Shiny, or upload and redownload the data to 212 | Google Spreadsheets or excel 213 | * Utilize a custom template set to auto-generate a powerpoint presentation 214 | * Use supervised learning to semi-automate the classification of: 215 | + score 216 | + category 217 | + feature 1 (main feature) 218 | + feature 2 (secondary feature) -------------------------------------------------------------------------------- 
/R/reportPatentData.R:
--------------------------------------------------------------------------------
# reporting-related functions to generate ppt slides
## yang yao start



#' Add summary text to be used in a pptx slide
#'
#' @description Add a standard summarized text that will be used in
#' association with a plot.
#'
#' @param df A summarized patent data frame, summarized by one variable.
#' See \code{\link{summarizeColumns}}.
#' @param singular The name of the variable, singular version. A character string.
#' For example: assignee.
#' @param plural The name of the variable, plural version. A character string.
#' For example: assignees, with an 's'.
#' @param sumVar The vector of the variable to summarize, taken from the original
#' patent data set. For example \code{sumo$score} to summarize the score range.
#'
#' @return A length four character vector.
#'
#' @examples
#' sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns,
#'                         cleanNames = sumobrainNames,
#'                         dateFields = sumobrainDateFields,
#'                         dateOrders = sumobrainDateOrder,
#'                         deduplicate = TRUE,
#'                         cakcDict = patentr::cakcDict,
#'                         docLengthTypesDict = patentr::docLengthTypesDict,
#'                         keepType = "grant",
#'                         firstAssigneeOnly = TRUE,
#'                         assigneeSep = ";",
#'                         stopWords = patentr::assigneeStopWords)
#'
#' # note that in reality, you need a patent analyst to carefully score
#' # these patents, the score here is for demonstrational purposes
#' score <- round(rnorm(dim(sumo)[1], mean = 1.4, sd = 0.9))
#' score[score > 3] <- 3; score[score < 0] <- 0
#' sumo$score <- score
#' sumo$assigneeSmall <- strtrim(sumo$assigneeClean, 12)
#' category <- c("system","control algorithm","product","control system", "communication")
#' c <- round(rnorm(dim(sumo)[1], mean = 2.5, sd = 1.5))
#' c[c > 5] <- 5; c[c < 1] <- 1
#' sumo$category <- category[c]
#' feature1 <- c("adaptive", "park", "lane", NA,NA,NA,NA,NA,
#'               "brake", "steer","accelerate","deactivate")
#' f <- round(rnorm(dim(sumo)[1], mean = 5, sd = 1))
#' l <- length(feature1)
#' f[f > l] <- l; f[f < 1] <- 1
#' sumo$feature1 <- c(feature1, feature1[f])[1:dim(sumo)[1]]
#'
#' # Summarize the assignees
#' as <- summarizeColumns(sumo, 'assigneeSmall')
#' summaryText(as, 'assignee','assignees', sumo$score)
#' # summarize the number of features
#' f <- summarizeColumns(sumo, 'feature1', naOmit = TRUE)
#' summaryText(f, 'feature','features', sumo$feature1)
#'
#' @export
#'
summaryText <- function(df, singular, plural, sumVar){

  # range of the raw variable, e.g. "For entry range 0 to 3..."
  m1range <- paste0("For entry range ", capWord(min(sumVar, na.rm = TRUE)), " to ",
                    capWord(max(sumVar, na.rm = TRUE)), "...")

  # number of distinct summarized entries in the summary table
  m2size <- paste0("There are ", nrow(df), " ", plural, ".")

  # takes the LAST row of df as the top entry -- assumes summarizeColumns
  # returns rows sorted ascending by count (TODO confirm against that function)
  m3top <- paste0("Top ", singular, " is ", capWord(as.character(utils::tail(unlist(df[, 1]), 1))),
                  ", with ", as.character(utils::tail(unlist(df[, 2]), 1)), " documents.")

  # total document count across all summarized entries
  m4total <- paste0("Total IP count is ", sum(as.numeric(unlist(df[, 2]))), " documents.")

  c(m1range, m2size, m3top, m4total)
}




#' Add a PPTX slide with chart on the right and text on the left
#'
#' @description Generate a commonly-used PPTX slide format where the patent
#' chart is on the right and some text is on the left.
#'
#' This function automates a number of steps used in formatting a pptx slide.
#' It returns the ppt object with the new slide included.
#'
#' @param ppt A ppt object.
#' @param plot A plot object from ggplot2.
#' @param text A character vector of text, typically less than one paragraph
#' in size.
#' @param title A character title for a page.
#' @param slide_layout The name of a slide layout, the same name as the names in a .potx
#' powerpoint template file. Default is a Title and Content blank layout.
#' @param Poffx Plot image x position from left top, inches.
#' See \code{\link[ReporteRs]{addPlot}}. Default is 5.3.
#' @param Poffy Plot image y position from left top, inches.
#' See \code{\link[ReporteRs]{addPlot}}. Default is 0.
#' @param Pwidth Plot image width, inches.
#' See \code{\link[ReporteRs]{addPlot}}. Default is 8.
#' @param Pheight Plot image height, inches.
#' See \code{\link[ReporteRs]{addPlot}}. Default is 7.5
#' @param Toffx Text block x position from left top, inches.
#' See \code{\link[ReporteRs]{addParagraph}}. Default is 1.
#' @param Toffy Text block y position from left top, inches.
#' See \code{\link[ReporteRs]{addParagraph}}. Default is 2.
#' @param Twidth Text block width, inches.
#' See \code{\link[ReporteRs]{addParagraph}}. Default is 5.
#' @param Theight Text block height, inches.
#' See \code{\link[ReporteRs]{addParagraph}}. Default is 5.5.
#'
#' @return The pptx object with the new slide appended.
#'
#' @examples
#'
#' sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns,
#'                         cleanNames = sumobrainNames,
#'                         dateFields = sumobrainDateFields,
#'                         dateOrders = sumobrainDateOrder,
#'                         deduplicate = TRUE,
#'                         cakcDict = patentr::cakcDict,
#'                         docLengthTypesDict = patentr::docLengthTypesDict,
#'                         keepType = "grant",
#'                         firstAssigneeOnly = TRUE,
#'                         assigneeSep = ";",
#'                         stopWords = patentr::assigneeStopWords)
#'
#' # note that in reality, you need a patent analyst to carefully score
#' # these patents, the score here is for demonstrational purposes
#' score <- round(rnorm(dim(sumo)[1], mean = 1.4, sd = 0.9))
#' score[score > 3] <- 3
#' score[score < 0] <- 0
#' sumo$score <- score
#' sumo$assigneeSmall <- strtrim(sumo$assigneeClean, 12)
#' category <- c("system","control algorithm","product","control system", "communication")
#' c <- round(rnorm(dim(sumo)[1], mean = 2.5, sd = 1.5))
#' c[c > 5] <- 5; c[c < 1] <- 1
#' sumo$category <- category[c]
#' feature1 <- c("adaptive", "park", "lane", NA,NA,NA,NA,NA,
#'               "brake", "steer","accelerate","deactivate")
#' f <- round(rnorm(dim(sumo)[1], mean = 5, sd = 1))
#' l <- length(feature1)
#' f[f > l] <- l; f[f < 1] <- 1
#' sumo$feature1 <- c(feature1, feature1[f])[1:dim(sumo)[1]]
#'
#' flippedHistogram(sumo, "assigneeSmall","score", colors = scoreColors)
#' flippedHistogram(subset(sumo, score > 0), "assigneeSmall","score", colors = scoreColors)
#'
#' flippedHistogram(subset(sumo, score > 2), "assigneeSmall","docType", colors = scoreColors,
#'                  recolor = TRUE)
#'
#'
#' # create a ppt
#' ppt <- ReporteRs::pptx(title="IP Update")
#' # view the types of layouts available by default
#' # slide.layouts(ppt)
#' layoutTitleContent = "Title and Content"
#'
#' # first plot of top score (3)
#' asdt <- summarizeColumns(subset(sumo, score > 2), 'docType')
#' ppt <-
#'   addChartRightTextLeftPptx(ppt = ppt,
#'                             plot = flippedHistogram(subset(sumo, score > 2),
#'                                                     "assigneeSmall","docType",
#'                                                     colors = scoreColors,
#'                                                     recolor = TRUE),
#'                             text = summaryText(asdt, "doc type", "doc types",
#'                                                subset(sumo, score > 2)$docType),
#'                             title = "Doc Types for Top Score Docs",
#'                             slide_layout = layoutTitleContent)
#'
#' # top scores by assignee
#' ascore <- summarizeColumns(subset(sumo, score > 2), 'assigneeSmall')
#' ppt <-
#'   addChartRightTextLeftPptx(ppt = ppt,
#'                             plot = flippedHistogram(subset(sumo, score > 2),
#'                                                     "assigneeSmall","score",
#'                                                     colors = scoreColors,
#'                                                     recolor = FALSE),
#'                             text = summaryText(ascore, "assignee", "assignees",
#'                                                subset(sumo, score > 2)$assigneeSmall),
#'                             title = "Assignees with Top Scores",
#'                             slide_layout = layoutTitleContent)
#'
#'
#' # last plot is category
#' sc <- summarizeColumns(sumo, 'category')
#' ppt <-
#'   addChartRightTextLeftPptx(ppt = ppt,
#'                             plot = flippedHistogram(sumo, "category",
#'                                                     "score", colors = scoreColors,
#'                                                     recolor = TRUE),
#'                             text = summaryText(sc, "category", "categories", sumo$category),
#'                             title = "Categories and Scores",
#'                             slide_layout = layoutTitleContent)
#'
#' # find a data folder and write it out to your folder
#' # out <- paste("data/",Sys.Date(),"_exampleChartRightTextLeft.pptx",sep='')
#' # ReporteRs::writeDoc(ppt, out)
#'
#'
#' @seealso \code{\link[ReporteRs]{pptx}}, \code{\link{addFullImagePptx}}
#'
#'
#' @export
#'
#' @import ReporteRs
#'
addChartRightTextLeftPptx <- function(ppt, plot, text, title, slide_layout = "Title and Content",
                                      Poffx = 5.3, Poffy = 0, Pwidth = 8, Pheight = 7.5,
                                      Toffx = 1, Toffy = 2, Twidth = 5, Theight = 5.5){
  # add a new slide
  ppt <- ReporteRs::addSlide(ppt, slide.layout = slide_layout)
  # add the plot, it takes up slightly more than half (13.3in by 7.5in per slide)
  ppt <- ReporteRs::addPlot(ppt, print, x = plot,
                            offx = Poffx, offy = Poffy,
                            width = Pwidth, height = Pheight)
  # add in bullet point text
  ppt <- ReporteRs::addParagraph(ppt, text,
                                 par.properties = ReporteRs::parProperties(list.style = 'unordered', level = 1),
                                 offx = Toffx, offy = Toffy,
                                 width = Twidth, height = Theight)
  # add in title overlaid last
  ppt <- ReporteRs::addTitle(ppt, title)
  ppt
}




#' Add a full-sized plot image to a pptx
#'
#' @description Take a plot image from ggplot2 and size it to fit an entire
#' slide.
#'
#' @param ppt A ppt object to add a slide to.
#' @param plot A plot output object from ggplot2.
#' @param slide_layout A character value, slide layout, default value is
#' \code{"Title and Content"}.
#' @param w Width in inches, default set to max width 13.3
#' @param h Height in inches, default set to max height 7.5
#'
#'
#' @return a pptx object.
#'
#' @importFrom ReporteRs addSlide
#' @importFrom ReporteRs addPlot
#'
#' @examples
#' sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns,
#'                         cleanNames = sumobrainNames,
#'                         dateFields = sumobrainDateFields,
#'                         dateOrders = sumobrainDateOrder,
#'                         deduplicate = TRUE,
#'                         cakcDict = patentr::cakcDict,
#'                         docLengthTypesDict = patentr::docLengthTypesDict,
#'                         keepType = "grant",
#'                         firstAssigneeOnly = TRUE,
#'                         assigneeSep = ";",
#'                         stopWords = patentr::assigneeStopWords)
#'
#' # note that in reality, you need a patent analyst to carefully score
#' # these patents, the score here is for demonstrational purposes
#' score <- round(rnorm(dim(sumo)[1], mean = 1.4, sd = 0.9))
#' score[score > 3] <- 3; score[score < 0] <- 0
#' sumo$score <- score
#' sumo$assigneeSmall <- strtrim(sumo$assigneeClean, 12)
#' category <- c("system","control algorithm","product","control system", "communication")
#' c <- round(rnorm(dim(sumo)[1], mean = 2.5, sd = 1.5))
#' c[c > 5] <- 5; c[c < 1] <- 1
#' sumo$category <- category[c]
#'
#' xVal = "category"
#' fillVal = "score"
#' facetVal = "assigneeSmall"
#'
#' fp <- facetPlot(subset(sumo, score > 0), xVal, fillVal, facetVal, colors = patentr::scoreColors,
#'                 recolor = FALSE)
#'
#'
#' # create a ppt
#' ppt <- ReporteRs::pptx(title="IP Update")
#' # view the types of layouts available by default
#' # slide.layouts(ppt)
#' layoutTitleContent = "Title and Content"
#'
#' fp <- facetPlot(subset(sumo, score > 0), xVal, fillVal, facetVal, colors = patentr::scoreColors,
#'                 recolor = FALSE)
#' ppt <- addFullImagePptx(ppt, plot = fp, slide_layout = layoutTitleContent)
#' fp <- facetPlot(subset(sumo, score > 1), xVal, fillVal, facetVal, colors = patentr::scoreColors,
#'                 recolor = FALSE)
#' ppt <- addFullImagePptx(ppt, plot = fp, slide_layout = layoutTitleContent)
#' fp <- facetPlot(subset(sumo, score > 2), xVal, fillVal, facetVal, colors = patentr::scoreColors,
#'                 recolor = FALSE)
#' ppt <- addFullImagePptx(ppt, plot = fp, slide_layout = layoutTitleContent)
#'
#'
#' # find a data folder and write it out to your folder
#' # out <- paste("data/",Sys.Date(),"_exampleChartRightTextLeft.pptx",sep='')
#' # ReporteRs::writeDoc(ppt, out)
#'
#'
#' @export
#'
#' @seealso \code{\link{addChartRightTextLeftPptx}}
#'
addFullImagePptx <- function(ppt, plot, slide_layout = "Title and Content",
                             w = 13.3, h = 7.5){
  # new slide, then stretch the plot across the full 13.3in x 7.5in canvas
  ppt <- ReporteRs::addSlide(ppt, slide.layout = slide_layout)
  ppt <- ReporteRs::addPlot(ppt, print, x = plot, offx = 0, offy = 0,
                            width = w, height = h)

  return(ppt)

}


#' Make a PDF output of a plot
#'
#' @description Make a PDF output of a plot.
#'
#' @param graph The graph object to input
#' @param name A character name to name your file, without the ".pdf" extension.
#' It can have a filepath as well.
#' @param w The width, in inches, of your image, default set to 12.
#' @param h The height, in inches, of your image, default set to 12.
#'
#'
#' @return Invisibly returns the path of the written PDF file.
#'
#' @examples
#'
#' sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns,
#'                         cleanNames = sumobrainNames,
#'                         dateFields = sumobrainDateFields,
#'                         dateOrders = sumobrainDateOrder,
#'                         deduplicate = TRUE,
#'                         cakcDict = patentr::cakcDict,
#'                         docLengthTypesDict = patentr::docLengthTypesDict,
#'                         keepType = "grant",
#'                         firstAssigneeOnly = TRUE,
#'                         assigneeSep = ";",
#'                         stopWords = patentr::assigneeStopWords)
#'
#' # df <- dplyr::select(sumo, title, abstract)
#' df <- sumo[, c("title","abstract")]
#' addPdfImage(wordCloudIt(df, excludeWords, minfreq = 20,
#'                         random.order = FALSE, rot.per = 0.25), "wordCloud")
#'
#' @export
#'
#'
addPdfImage <- function(graph, name = "image", w = 12, h = 12){
  name <- paste0(name, ".pdf")
  grDevices::pdf(name, width = w, height = h)
  # close the device even if print() errors, so no dangling graphics
  # device is left open (the original leaked the device on error)
  on.exit(grDevices::dev.off(), add = TRUE)
  print(graph)
  invisible(name)
}


## yang yao end
--------------------------------------------------------------------------------