├── .travis.yml ├── .gitignore ├── data ├── .Rapp.history ├── acars.rda ├── cakcDict.rda ├── skipLens.rda ├── acarsGoogle.rda ├── acarsLens.rda ├── googleNames.rda ├── kindCodes.rda ├── lensColumns.rda ├── lensNames.rda ├── scoreColors.rda ├── skipGoogle.rda ├── excludeWords.rda ├── googleColumns.rda ├── lensDateOrder.rda ├── skipSumobrain.rda ├── docLengthTypes.rda ├── googleDateFields.rda ├── googleDateOrder.rda ├── lensDateFields.rda ├── sumobrainColumns.rda ├── sumobrainNames.rda ├── assigneeStopWords.rda ├── docLengthTypesDict.rda ├── sumobrainDateOrder.rda └── sumobrainDateFields.rda ├── tests ├── testthat.R └── testthat │ ├── .DS_Store │ ├── testData │ ├── .DS_Store │ ├── mtcars.xls │ ├── lens_autonomous_search.csv │ ├── google_autonomous_search.csv │ ├── sumobrain_autonomous_search1.xls │ ├── sumobrain_autonomous_search1.xlsx │ └── sumobrain_autonomous_search2.xlsx │ ├── test-imports.R │ ├── test-process.R │ ├── test-graphics.R │ └── test-cleaning.R ├── vignettes ├── Rplot.png ├── Rplot01.png └── summary.Rmd ├── inst ├── extdata │ ├── kindCodes.xlsx │ ├── docLengthTypes.xlsx │ ├── lens_autonomous_search.csv │ ├── google_autonomous_search.csv │ ├── sumobrain_autonomous_search1.xls │ ├── sumobrain_autonomous_search2.xls │ ├── sumobrain_autonomous_search1.xlsx │ └── sumobrain_autonomous_search2.xlsx ├── CITATION ├── shiny │ └── app │ │ ├── global.R │ │ ├── ui.R │ │ └── server.R └── examples │ └── edaPatentGuide.R ├── .Rbuildignore ├── man ├── runExample.Rd ├── excludeWords.Rd ├── lensDateOrder.Rd ├── scoreColors.Rd ├── googleDateOrder.Rd ├── sumobrainDateOrder.Rd ├── lensDateFields.Rd ├── googleDateFields.Rd ├── sumobrainDateFields.Rd ├── chooseFiles.Rd ├── assigneeStopWords.Rd ├── makeColors.Rd ├── capWord.Rd ├── extractKindCode.Rd ├── cakcDict.Rd ├── lensColumns.Rd ├── googleColumns.Rd ├── skipLens.Rd ├── skipSumobrain.Rd ├── skipGoogle.Rd ├── docLengthTypes.Rd ├── sumobrainColumns.Rd ├── extractPubNumber.Rd ├── googleNames.Rd ├── lensNames.Rd ├── 
sumobrainNames.Rd ├── docLengthTypesDict.Rd ├── extractCleanDate.Rd ├── getClaimsText.Rd ├── extractCountryCode.Rd ├── cleanGoogleURL.Rd ├── addPdfImage.Rd ├── importPatentData.Rd ├── showDups.Rd ├── wordCloudIt.Rd ├── extractDocLength.Rd ├── summarizeColumns.Rd ├── patentr.Rd ├── createGoogleURL.Rd ├── factorForGraph.Rd ├── cleanNames.Rd ├── getClaimFromURL.Rd ├── acars.Rd ├── acarsGoogle.Rd ├── summaryText.Rd ├── flippedHistogram.Rd ├── cleanHeaderNames.Rd ├── generateDocType.Rd ├── facetPlot.Rd ├── removeDups.Rd ├── addFullImagePptx.Rd ├── tilePlot.Rd ├── kindCodes.Rd ├── acarsLens.Rd ├── cleanPatentData.Rd └── addChartRightTextLeftPptx.Rd ├── patentr.Rproj ├── ..Rcheck └── 00check.log ├── R ├── shiny.R ├── patentr.R ├── explorePatentData.R ├── importPatentData.R ├── acars.R ├── processPatentData.R └── reportPatentData.R ├── DESCRIPTION ├── NAMESPACE ├── README.md └── README.Rmd /.travis.yml: -------------------------------------------------------------------------------- 1 | language: r 2 | cache: packages 3 | sudo: false 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | inst/doc 5 | -------------------------------------------------------------------------------- /data/.Rapp.history: -------------------------------------------------------------------------------- 1 | load("/Users/Yao/coding/patentr/data/acars.rda") 2 | -------------------------------------------------------------------------------- /data/acars.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/acars.rda -------------------------------------------------------------------------------- /data/cakcDict.rda: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/cakcDict.rda -------------------------------------------------------------------------------- /data/skipLens.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/skipLens.rda -------------------------------------------------------------------------------- /data/acarsGoogle.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/acarsGoogle.rda -------------------------------------------------------------------------------- /data/acarsLens.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/acarsLens.rda -------------------------------------------------------------------------------- /data/googleNames.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/googleNames.rda -------------------------------------------------------------------------------- /data/kindCodes.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/kindCodes.rda -------------------------------------------------------------------------------- /data/lensColumns.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/lensColumns.rda -------------------------------------------------------------------------------- /data/lensNames.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/lensNames.rda -------------------------------------------------------------------------------- 
/data/scoreColors.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/scoreColors.rda -------------------------------------------------------------------------------- /data/skipGoogle.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/skipGoogle.rda -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(patentr) 3 | 4 | test_check("patentr") 5 | -------------------------------------------------------------------------------- /vignettes/Rplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/vignettes/Rplot.png -------------------------------------------------------------------------------- /data/excludeWords.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/excludeWords.rda -------------------------------------------------------------------------------- /data/googleColumns.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/googleColumns.rda -------------------------------------------------------------------------------- /data/lensDateOrder.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/lensDateOrder.rda -------------------------------------------------------------------------------- /data/skipSumobrain.rda: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/skipSumobrain.rda -------------------------------------------------------------------------------- /vignettes/Rplot01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/vignettes/Rplot01.png -------------------------------------------------------------------------------- /data/docLengthTypes.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/docLengthTypes.rda -------------------------------------------------------------------------------- /data/googleDateFields.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/googleDateFields.rda -------------------------------------------------------------------------------- /data/googleDateOrder.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/googleDateOrder.rda -------------------------------------------------------------------------------- /data/lensDateFields.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/lensDateFields.rda -------------------------------------------------------------------------------- /data/sumobrainColumns.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/sumobrainColumns.rda -------------------------------------------------------------------------------- /data/sumobrainNames.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/sumobrainNames.rda 
-------------------------------------------------------------------------------- /tests/testthat/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/tests/testthat/.DS_Store -------------------------------------------------------------------------------- /data/assigneeStopWords.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/assigneeStopWords.rda -------------------------------------------------------------------------------- /data/docLengthTypesDict.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/docLengthTypesDict.rda -------------------------------------------------------------------------------- /data/sumobrainDateOrder.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/sumobrainDateOrder.rda -------------------------------------------------------------------------------- /inst/extdata/kindCodes.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/inst/extdata/kindCodes.xlsx -------------------------------------------------------------------------------- /data/sumobrainDateFields.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/data/sumobrainDateFields.rda -------------------------------------------------------------------------------- /inst/extdata/docLengthTypes.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/inst/extdata/docLengthTypes.xlsx 
-------------------------------------------------------------------------------- /tests/testthat/testData/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/tests/testthat/testData/.DS_Store -------------------------------------------------------------------------------- /tests/testthat/testData/mtcars.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/tests/testthat/testData/mtcars.xls -------------------------------------------------------------------------------- /inst/extdata/lens_autonomous_search.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/inst/extdata/lens_autonomous_search.csv -------------------------------------------------------------------------------- /inst/extdata/google_autonomous_search.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/inst/extdata/google_autonomous_search.csv -------------------------------------------------------------------------------- /inst/extdata/sumobrain_autonomous_search1.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/inst/extdata/sumobrain_autonomous_search1.xls -------------------------------------------------------------------------------- /inst/extdata/sumobrain_autonomous_search2.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/inst/extdata/sumobrain_autonomous_search2.xls -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | 
^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^README\.Rmd$ 4 | ^cran-comments\.md$ 5 | ^NEWS\.md$ 6 | ^\.travis\.yml$ 7 | ^README-.*\.png$ 8 | -------------------------------------------------------------------------------- /inst/extdata/sumobrain_autonomous_search1.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/inst/extdata/sumobrain_autonomous_search1.xlsx -------------------------------------------------------------------------------- /inst/extdata/sumobrain_autonomous_search2.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/inst/extdata/sumobrain_autonomous_search2.xlsx -------------------------------------------------------------------------------- /tests/testthat/testData/lens_autonomous_search.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/tests/testthat/testData/lens_autonomous_search.csv -------------------------------------------------------------------------------- /tests/testthat/testData/google_autonomous_search.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/tests/testthat/testData/google_autonomous_search.csv -------------------------------------------------------------------------------- /tests/testthat/testData/sumobrain_autonomous_search1.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/tests/testthat/testData/sumobrain_autonomous_search1.xls -------------------------------------------------------------------------------- /tests/testthat/testData/sumobrain_autonomous_search1.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/kamilien1/patentr/HEAD/tests/testthat/testData/sumobrain_autonomous_search1.xlsx -------------------------------------------------------------------------------- /tests/testthat/testData/sumobrain_autonomous_search2.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamilien1/patentr/HEAD/tests/testthat/testData/sumobrain_autonomous_search2.xlsx -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citHeader("To cite patentr in publications, use:") citEntry( textVersion = paste("Kamil Bojanczyk, Yao Yang (2017).", "patentr: A patent analysis toolkit in R.", "URL https://github.com/kamilien1/patentr") ) -------------------------------------------------------------------------------- /man/runExample.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/shiny.R 3 | \name{runExample} 4 | \alias{runExample} 5 | \title{Shiny app} 6 | \usage{ 7 | runExample() 8 | } 9 | \description{ 10 | this is a shiny app that loads patent data, views it, 11 | and does a simple visualization. 12 | 13 | NOTE: This only works with xlsx files. 
14 | } 15 | -------------------------------------------------------------------------------- /patentr.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | Encoding: UTF-8 9 | 10 | AutoAppendNewline: Yes 11 | StripTrailingWhitespace: Yes 12 | 13 | BuildType: Package 14 | PackageUseDevtools: Yes 15 | PackageInstallArgs: --no-multiarch --with-keep.source 16 | PackageRoxygenize: rd,collate,namespace 17 | -------------------------------------------------------------------------------- /..Rcheck/00check.log: -------------------------------------------------------------------------------- 1 | * using log directory ‘/Users/Kamil/Documents/src/Data Science/stats290 project/patentr/..Rcheck’ 2 | * using R version 3.3.2 (2016-10-31) 3 | * using platform: x86_64-apple-darwin13.4.0 (64-bit) 4 | * using session charset: UTF-8 5 | * checking for file ‘./DESCRIPTION’ ... ERROR 6 | Required fields missing or empty: 7 | ‘Author’ ‘Maintainer’ 8 | * DONE 9 | Status: 1 ERROR 10 | -------------------------------------------------------------------------------- /man/excludeWords.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{excludeWords} 5 | \alias{excludeWords} 6 | \title{A standard list of words to exclude in a patent word cloud.} 7 | \format{A character vector. 8 | 9 | \describe{ 10 | \item{excludeWords}{A character vector of words to exclude} 11 | }} 12 | \usage{ 13 | excludeWords 14 | } 15 | \description{ 16 | A standard list of words to exclude from a patent data word cloud. 
17 | } 18 | \keyword{data} 19 | -------------------------------------------------------------------------------- /man/lensDateOrder.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{lensDateOrder} 5 | \alias{lensDateOrder} 6 | \title{Date order for lens.org data.} 7 | \format{A character value. 8 | 9 | \describe{ 10 | \item{lensDateOrder}{A character variable of date order.} 11 | }} 12 | \usage{ 13 | lensDateOrder 14 | } 15 | \description{ 16 | A date order to be used in lens.org date data. 17 | } 18 | \seealso{ 19 | \code{\link{extractCleanDate}} 20 | } 21 | \keyword{data} 22 | -------------------------------------------------------------------------------- /man/scoreColors.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{scoreColors} 5 | \alias{scoreColors} 6 | \title{Score colors used in graphing.} 7 | \format{A character vector. 8 | 9 | \describe{ 10 | \item{scoreColors}{A character variable of four score colors for 0 to 3.} 11 | }} 12 | \usage{ 13 | scoreColors 14 | } 15 | \description{ 16 | A character vector of Hexadecimal score colors. 17 | } 18 | \seealso{ 19 | \code{\link{flippedHistogram}} 20 | } 21 | \keyword{data} 22 | -------------------------------------------------------------------------------- /R/shiny.R: -------------------------------------------------------------------------------- 1 | #' Shiny app 2 | #' 3 | #' @description this is a shiny app that loads patent data, views it, 4 | #' and does a simple visualization. 5 | #' 6 | #' NOTE: This only works with xlsx files. 
7 | #' 8 | #' @export 9 | 10 | ## yang yao start 11 | runExample <- function() { 12 | appDir <- system.file("shiny", "app" ,package = "patentr") 13 | if (appDir == "") { 14 | stop("Could not find example directory. Try re-installing `patentr`.", call. = FALSE) 15 | } 16 | shiny::runApp(appDir, display.mode = "normal") 17 | } 18 | ## yang yao end -------------------------------------------------------------------------------- /man/googleDateOrder.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{googleDateOrder} 5 | \alias{googleDateOrder} 6 | \title{Date order for Google Patents data.} 7 | \format{A character value. 8 | 9 | \describe{ 10 | \item{googleDateOrder}{A character variable of date order.} 11 | }} 12 | \usage{ 13 | googleDateOrder 14 | } 15 | \description{ 16 | A date order to be used in Google patent date data. 17 | } 18 | \seealso{ 19 | \code{\link{extractCleanDate}} 20 | } 21 | \keyword{data} 22 | -------------------------------------------------------------------------------- /man/sumobrainDateOrder.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{sumobrainDateOrder} 5 | \alias{sumobrainDateOrder} 6 | \title{Date order for sumobrain data.} 7 | \format{A character value. 8 | 9 | \describe{ 10 | \item{sumobrainDateOrder}{A character variable of date order.} 11 | }} 12 | \usage{ 13 | sumobrainDateOrder 14 | } 15 | \description{ 16 | A date order to be used in sumobrain date data. 
17 | } 18 | \seealso{ 19 | \code{\link{extractCleanDate}} 20 | } 21 | \keyword{data} 22 | -------------------------------------------------------------------------------- /man/lensDateFields.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{lensDateFields} 5 | \alias{lensDateFields} 6 | \title{A simple list of date column names in lens.org data.} 7 | \format{A character vector. 8 | 9 | \describe{ 10 | \item{lensDateFields}{A character vector of date fields.} 11 | }} 12 | \usage{ 13 | lensDateFields 14 | } 15 | \description{ 16 | A character vector of date fields in lens.org data. 17 | } 18 | \seealso{ 19 | \code{\link{cleanHeaderNames}} 20 | } 21 | \keyword{data} 22 | -------------------------------------------------------------------------------- /man/googleDateFields.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{googleDateFields} 5 | \alias{googleDateFields} 6 | \title{A simple list of date column names in Google patent data.} 7 | \format{A character vector. 8 | 9 | \describe{ 10 | \item{googleDateFields}{A character vector of date fields.} 11 | }} 12 | \usage{ 13 | googleDateFields 14 | } 15 | \description{ 16 | A character vector of date fields in Google patent data. 
17 | } 18 | \seealso{ 19 | \code{\link{cleanHeaderNames}} 20 | } 21 | \keyword{data} 22 | -------------------------------------------------------------------------------- /man/sumobrainDateFields.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{sumobrainDateFields} 5 | \alias{sumobrainDateFields} 6 | \title{A simple list of date column names in sumobrain data.} 7 | \format{A character vector 8 | 9 | \describe{ 10 | \item{sumobrainDateFields}{A character vector of date fields.} 11 | }} 12 | \usage{ 13 | sumobrainDateFields 14 | } 15 | \description{ 16 | A character vector of date fields in sumobrain data. 17 | } 18 | \seealso{ 19 | \code{\link{cleanHeaderNames}} 20 | } 21 | \keyword{data} 22 | -------------------------------------------------------------------------------- /man/chooseFiles.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/importPatentData.R 3 | \name{chooseFiles} 4 | \alias{chooseFiles} 5 | \title{Allow the user to navigate to files manually.} 6 | \usage{ 7 | chooseFiles() 8 | } 9 | \value{ 10 | A list of character vectors with absolute pathnames to files. 11 | } 12 | \description{ 13 | Uses a popup window (Tk file dialog) to allow the user to choose a list of zero or more files interactively. 
14 | } 15 | \examples{ 16 | \dontrun{ 17 | filePaths <- chooseFiles() 18 | allData <- importPatentData(filePaths) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /man/assigneeStopWords.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{assigneeStopWords} 5 | \alias{assigneeStopWords} 6 | \title{A simple stop word list for assignee names.} 7 | \format{A character vector 8 | 9 | \describe{ 10 | \item{assigneeStopWords}{A character vector of stop words.} 11 | }} 12 | \usage{ 13 | assigneeStopWords 14 | } 15 | \description{ 16 | A character vector of common stop words to remove from assignee names for 17 | name standardization, such as "inc". 18 | } 19 | \seealso{ 20 | \code{\link{cleanNames}} 21 | } 22 | \keyword{data} 23 | -------------------------------------------------------------------------------- /man/makeColors.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualizePatentData.R 3 | \name{makeColors} 4 | \alias{makeColors} 5 | \title{Make color hues} 6 | \usage{ 7 | makeColors(numColors) 8 | } 9 | \arguments{ 10 | \item{numColors}{Number of colors, a numeric input.} 11 | } 12 | \value{ 13 | A character vector of colors. 14 | } 15 | \description{ 16 | Generate an evenly-spaced number of color hues. 17 | 18 | Credit for this function goes to \href{http://stackoverflow.com/questions/8197559/emulate-ggplot2-default-color-palette}{John Colby's} 19 | Stack Overflow post. 
20 | } 21 | \examples{ 22 | makeColors(5) 23 | 24 | } 25 | \seealso{ 26 | \code{\link{flippedHistogram}} 27 | } 28 | -------------------------------------------------------------------------------- /man/capWord.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualizePatentData.R 3 | \name{capWord} 4 | \alias{capWord} 5 | \title{Capitalize the first letter of a character} 6 | \usage{ 7 | capWord(s) 8 | } 9 | \arguments{ 10 | \item{s}{Character string to input. Default set to \code{"word"}.} 11 | } 12 | \value{ 13 | A character string with the first letter capitalized. 14 | } 15 | \description{ 16 | A quick shortcut function to capitalize the first letter 17 | of a character. Useful for making data frame column names quickly look like 18 | plain english. 19 | } 20 | \examples{ 21 | 22 | capWord("hello") 23 | capWord("") 24 | capWord("Hi") 25 | 26 | } 27 | \seealso{ 28 | \code{\link{flippedHistogram}} 29 | } 30 | -------------------------------------------------------------------------------- /man/extractKindCode.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cleanPatentData.R 3 | \name{extractKindCode} 4 | \alias{extractKindCode} 5 | \title{Extract the kind code, if available, from the publication number.} 6 | \usage{ 7 | extractKindCode(docNum) 8 | } 9 | \arguments{ 10 | \item{docNum}{The character vector of document numbers.} 11 | } 12 | \value{ 13 | A character vector of kind codes. If none found, a blank character is returned. 14 | } 15 | \description{ 16 | Extracts the kind code, a one-to-two character code with a letter and 17 | typically a number, if found in the document (published) number. 
18 | } 19 | \examples{ 20 | acars$kindCode <- extractKindCode(acars$docNum) 21 | head(acars[,c("docNum","kindCode")]) 22 | 23 | } 24 | -------------------------------------------------------------------------------- /man/cakcDict.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{cakcDict} 5 | \alias{cakcDict} 6 | \title{A country and kind code dictionary.} 7 | \format{A named character vector 8 | 9 | \describe{ 10 | \item{cakcDict}{A named character vector representing key/value pairs 11 | of country codes, kind codes, and type of patent document.} 12 | 13 | } 14 | 15 | Built with the following code: 16 | 17 | \code{cakcDict <- kindCodes$docType} 18 | 19 | \code{names(cakcDict) <- kindCodes$countryAndKindCode}} 20 | \usage{ 21 | cakcDict 22 | } 23 | \description{ 24 | A named vector of key/value pairs for country codes and kind codes used to 25 | determine the type of document. 26 | } 27 | \seealso{ 28 | \code{\link{generateDocType}}, \code{\link{kindCodes}} 29 | } 30 | \keyword{data} 31 | -------------------------------------------------------------------------------- /man/lensColumns.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{lensColumns} 5 | \alias{lensColumns} 6 | \title{The number of columns in a lens.org csv data export.} 7 | \format{A numeric value. 8 | 9 | \describe{ 10 | \item{lensColumns}{A numeric value of the number of columns in a lens.org 11 | patent data export. } 12 | 13 | }} 14 | \usage{ 15 | lensColumns 16 | } 17 | \description{ 18 | The number of columns in a lens.org csv data export. 19 | } 20 | \details{ 21 | Used with \code{\link{acarsLens}} data. 
22 | } 23 | \seealso{ 24 | \code{\link{skipGoogle}}, \code{\link{skipLens}}, \code{\link{skipSumobrain}}, 25 | \code{\link{googleColumns}}, \code{\link{sumobrainColumns}}, 26 | \code{\link{sumobrainNames}}, \code{\link{lensNames}}, \code{\link{googleNames}} 27 | } 28 | \keyword{data} 29 | -------------------------------------------------------------------------------- /man/googleColumns.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{googleColumns} 5 | \alias{googleColumns} 6 | \title{Number of columns in Google Patents export data.} 7 | \format{A numeric value. 8 | 9 | \describe{ 10 | \item{googleColumns}{A numeric value of number of columns in a csv export from 11 | Google Patents.} 12 | 13 | }} 14 | \usage{ 15 | googleColumns 16 | } 17 | \description{ 18 | The number of columns in a Google Patents CSV export. 19 | } 20 | \details{ 21 | Used with \code{\link{acarsGoogle}} data. 22 | } 23 | \seealso{ 24 | \code{\link{skipGoogle}}, \code{\link{skipLens}}, \code{\link{skipSumobrain}}, 25 | \code{\link{lensColumns}}, \code{\link{sumobrainColumns}}, 26 | \code{\link{sumobrainNames}}, \code{\link{lensNames}}, \code{\link{googleNames}} 27 | } 28 | \keyword{data} 29 | -------------------------------------------------------------------------------- /man/skipLens.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{skipLens} 5 | \alias{skipLens} 6 | \title{How many lines to skip in a lens.org patent data export.} 7 | \format{A numeric value. 
8 | 9 | \describe{ 10 | \item{skipLens}{A numeric value representing the number of rows to skip in a 11 | lens.org csv data export.} 12 | 13 | }} 14 | \usage{ 15 | skipLens 16 | } 17 | \description{ 18 | How many lines to skip in a lens.org patent data export. 19 | } 20 | \details{ 21 | Used with \code{\link{acarsLens}} data. 22 | } 23 | \seealso{ 24 | \code{\link{skipGoogle}}, \code{\link{skipSumobrain}}, 25 | \code{\link{googleColumns}},\code{\link{lensColumns}}, \code{\link{sumobrainColumns}}, 26 | \code{\link{sumobrainNames}}, \code{\link{lensNames}}, \code{\link{googleNames}} 27 | } 28 | \keyword{data} 29 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: patentr 2 | Title: A Toolbox for Analyzing Patent Data 3 | Version: 0.0.1 4 | Authors@R: c(person("Bojanczyk", "Kamil", email = "kamil.bojanczyk@gmail.com", role = c("aut", "cre")), 5 | person("Yang", "Yao", email = "yangyaonju@gmail.com", role = "aut")) 6 | Description: A toolkit for patent data analysis. 
7 | Depends: R (>= 3.3.2) 8 | License: GPL (> 2) 9 | Encoding: UTF-8 10 | LazyData: true 11 | Imports: readxl (>= 0.1.1), 12 | plyr (>= 1.8.4), 13 | stringr (>= 1.2.0), 14 | lubridate (>= 1.6.0), 15 | XML (>= 3.98-1.5), 16 | httr (>= 1.2.1), 17 | dplyr (>= 0.5.0), 18 | magrittr (>= 1.5), 19 | ggplot2 (>= 2.2.1), 20 | RColorBrewer (>= 1.1-2), 21 | tm (>= 0.7-1), 22 | wordcloud (>= 2.5), 23 | ReporteRs (>= 0.8.8), 24 | shiny (>= 1.0.0) 25 | Suggests: testthat, 26 | rprojroot 27 | RoxygenNote: 6.0.1 28 | BugReports: http://github.com/kamilien1/patentr/issues 29 | -------------------------------------------------------------------------------- /man/skipSumobrain.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{skipSumobrain} 5 | \alias{skipSumobrain} 6 | \title{The number of lines to skip in a data read for a sumobrain.com export file.} 7 | \format{A numeric value. 8 | 9 | \describe{ 10 | \item{skipSumobrain}{A hard-coded numeric value for how many lines to skip 11 | in a sumobrain.com data export.} 12 | 13 | }} 14 | \usage{ 15 | skipSumobrain 16 | } 17 | \description{ 18 | The number of lines to skip in a data read for a sumobrain.com export file. 19 | Used with \code{\link{acars}} data. 
20 | } 21 | \seealso{ 22 | \code{\link{skipGoogle}}, \code{\link{skipLens}}, 23 | \code{\link{googleColumns}},\code{\link{lensColumns}}, \code{\link{sumobrainColumns}}, 24 | \code{\link{sumobrainNames}}, \code{\link{lensNames}}, \code{\link{googleNames}} 25 | } 26 | \keyword{data} 27 | -------------------------------------------------------------------------------- /man/skipGoogle.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{skipGoogle} 5 | \alias{skipGoogle} 6 | \title{How many lines to skip in a Google Patents CSV export file.} 7 | \format{A numeric value. 8 | 9 | \describe{ 10 | \item{skipGoogle}{A numeric value for number of lines to skip in a Google 11 | Patents csv export.} 12 | 13 | }} 14 | \usage{ 15 | skipGoogle 16 | } 17 | \description{ 18 | A hard-coded value for the number of lines to skip in a Google Patents csv 19 | export. 20 | } 21 | \details{ 22 | Used with \code{\link{acarsGoogle}} data. 23 | } 24 | \seealso{ 25 | \code{\link{skipLens}}, \code{\link{skipSumobrain}}, 26 | \code{\link{googleColumns}},\code{\link{lensColumns}}, \code{\link{sumobrainColumns}}, 27 | \code{\link{sumobrainNames}}, \code{\link{lensNames}}, \code{\link{googleNames}} 28 | } 29 | \keyword{data} 30 | -------------------------------------------------------------------------------- /man/docLengthTypes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{docLengthTypes} 5 | \alias{docLengthTypes} 6 | \title{A document mapper for country codes and document digit length to the type of 7 | document.} 8 | \format{A data frame with a key and value pair. 
9 | 10 | \describe{ 11 | \item{key}{A concatenated country code and length of the numeric portion of a 12 | document number. For example: US7 is a US document with 7 digits.} 13 | \item{value}{The type of patent document based on the country code and document 14 | length value.} 15 | 16 | }} 17 | \usage{ 18 | docLengthTypes 19 | } 20 | \description{ 21 | A simple table that helps map the country code and length of the numeric portion 22 | of the data to the type of document. 23 | } 24 | \details{ 25 | May need to add the USAPP for sumobrain. For now it is not needed. 26 | } 27 | \keyword{data} 28 | -------------------------------------------------------------------------------- /man/sumobrainColumns.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{sumobrainColumns} 5 | \alias{sumobrainColumns} 6 | \title{The number of columns in a sumobrain.com data export.} 7 | \format{A numeric value. 8 | 9 | \describe{ 10 | \item{sumobrainColumns}{A hard-coded numeric value for the number of columns in a 11 | sumobrain.com data export.} 12 | 13 | }} 14 | \usage{ 15 | sumobrainColumns 16 | } 17 | \description{ 18 | A convenient hard-coded value that can be used when reading in sumobrain.com 19 | exported patent data files. 20 | } 21 | \details{ 22 | Used with \code{\link{acars}} data. 
23 | } 24 | \seealso{ 25 | \code{\link{skipGoogle}}, \code{\link{skipLens}}, \code{\link{skipSumobrain}}, 26 | \code{\link{googleColumns}},\code{\link{lensColumns}}, 27 | \code{\link{sumobrainNames}}, \code{\link{lensNames}}, \code{\link{googleNames}} 28 | } 29 | \keyword{data} 30 | -------------------------------------------------------------------------------- /man/extractPubNumber.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cleanPatentData.R 3 | \name{extractPubNumber} 4 | \alias{extractPubNumber} 5 | \title{Extract the numeric portion of the document (published) number.} 6 | \usage{ 7 | extractPubNumber(docNum) 8 | } 9 | \arguments{ 10 | \item{docNum}{The character vector of document numbers.} 11 | } 12 | \value{ 13 | A character vector of same length inputted, with varying length. 14 | Typical lengths are 7 to 11 characters. Only numbers are returned. All other 15 | characters are stripped. 16 | } 17 | \description{ 18 | Extract the numeric portion of the document number. 19 | This is useful for a number of processing applications, and thus is beneficial 20 | to isolate from the entire publication number. 21 | } 22 | \examples{ 23 | acars$pubNum <- extractPubNumber(acars$docNum) 24 | head(acars[,c("docNum","pubNum")]) 25 | 26 | } 27 | \seealso{ 28 | \code{\link{createGoogleURL}} 29 | } 30 | -------------------------------------------------------------------------------- /man/googleNames.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{googleNames} 5 | \alias{googleNames} 6 | \title{Header names for a data upload sourced from Google Patents data exports.} 7 | \format{A character vector. 
8 | 9 | \describe{ 10 | \item{googleNames}{A 9-element character vector of clean Google patent names.} 11 | 12 | }} 13 | \usage{ 14 | googleNames 15 | } 16 | \description{ 17 | A character vector to standardize the headers of the imported csv from a 18 | Google Patents patent data export. Used with \code{\link{acarsGoogle}} data. 19 | } 20 | \seealso{ 21 | Go to \href{patents.google.com}{Google Patents} to download the data. 22 | 23 | \code{\link{skipGoogle}}, \code{\link{skipLens}}, \code{\link{skipSumobrain}}, 24 | \code{\link{googleColumns}},\code{\link{lensColumns}}, \code{\link{sumobrainColumns}}, 25 | \code{\link{sumobrainNames}}, \code{\link{lensNames}} 26 | } 27 | \keyword{data} 28 | -------------------------------------------------------------------------------- /man/lensNames.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{lensNames} 5 | \alias{lensNames} 6 | \title{Header names for a data upload sourced from lens.org.} 7 | \format{A character vector. 8 | 9 | \describe{ 10 | \item{sumobrainNames}{A 26-element character vector of clean lens.org header names.} 11 | 12 | }} 13 | \usage{ 14 | lensNames 15 | } 16 | \description{ 17 | A character vector to standardize the headers of the imported csv from a 18 | lens.org patent data export. 19 | } 20 | \details{ 21 | Used with \code{\link{acarsLens}} data. 22 | } 23 | \seealso{ 24 | Go to \href{lens.org}{Lens}, optionally create a free account, and 25 | download the data. 
26 | 27 | \code{\link{skipGoogle}}, \code{\link{skipLens}}, \code{\link{skipSumobrain}}, 28 | \code{\link{googleColumns}},\code{\link{lensColumns}}, \code{\link{sumobrainColumns}}, 29 | \code{\link{sumobrainNames}}, \code{\link{googleNames}} 30 | } 31 | \keyword{data} 32 | -------------------------------------------------------------------------------- /man/sumobrainNames.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{sumobrainNames} 5 | \alias{sumobrainNames} 6 | \title{Header names for a data upload sourced from sumobrain.com} 7 | \format{A character vector 8 | 9 | \describe{ 10 | \item{sumobrainNames}{An 11-element character vector of clean sumobrain names.} 11 | 12 | }} 13 | \usage{ 14 | sumobrainNames 15 | } 16 | \description{ 17 | A character vector to standardize the headers of the imported excel from a 18 | sumobrain.com patent data export. 19 | } 20 | \details{ 21 | Used with \code{\link{acars}} data. 22 | } 23 | \seealso{ 24 | Go to \href{www.sumobrain.com}{Sumobrain}, create a free account, and 25 | download the data. 
26 | 27 | \code{\link{skipGoogle}}, \code{\link{skipLens}}, \code{\link{skipSumobrain}}, 28 | \code{\link{googleColumns}},\code{\link{lensColumns}}, \code{\link{sumobrainColumns}}, 29 | \code{\link{lensNames}}, \code{\link{googleNames}} 30 | } 31 | \keyword{data} 32 | -------------------------------------------------------------------------------- /man/docLengthTypesDict.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{docLengthTypesDict} 5 | \alias{docLengthTypesDict} 6 | \title{A named vector of key/value pairs for country codes and publication number 7 | document lengths used to determine the type of document.} 8 | \format{A named character vector 9 | 10 | \describe{ 11 | \item{docLengthTypesDict}{A named character vector representing key/value pairs 12 | of doc lengths, country codes, and type of patent document.} 13 | 14 | } 15 | 16 | Built with the following code: 17 | 18 | \code{docLengthTypesDict <- docLengthTypes$value} 19 | 20 | \code{names(docLengthTypesDict) <- docLengthTypes$key}} 21 | \usage{ 22 | docLengthTypesDict 23 | } 24 | \description{ 25 | A named vector of key/value pairs for country codes and publication number 26 | document lengths used to determine the type of document. 
27 | } 28 | \seealso{ 29 | \code{\link{generateDocType}}, \code{\link{docLengthTypes}} 30 | } 31 | \keyword{data} 32 | -------------------------------------------------------------------------------- /inst/shiny/app/global.R: -------------------------------------------------------------------------------- 1 | ##This should detect and install missing packages before loading them 2 | ## yang yao and kamil bojanczyk 3 | ## motivation: R Shiny gallery and look at urls in ui.R 4 | list.of.packages <- c("shiny","ggplot2", "dplyr") 5 | new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])] 6 | if(length(new.packages)) install.packages(new.packages) 7 | lapply(list.of.packages,function(x){library(x,character.only=TRUE)}) 8 | 9 | 10 | # TODO 11 | #' 1) successfully read in csv from 12 | #' 1a) lens.org data 13 | #' 1b) Google patents data 14 | #' 2) successfull read in excel file from sumobrain data 15 | #' 3) successfully visualize patent data frame data by 16 | #' 3a) columns (choose which ones to display) 17 | #' 3b) values within rows: example, choose assignees to display 18 | #' 4) successfully display simple patent summaries 19 | #' 4a) total number of documents by docType 20 | #' 4b) number of documents by assignee 21 | #' 5) be able to export data with the following types 22 | #' 5a) csv export 23 | #' 5b) excel export (xlsx) 24 | #' -------------------------------------------------------------------------------- /man/extractCleanDate.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cleanPatentData.R 3 | \name{extractCleanDate} 4 | \alias{extractCleanDate} 5 | \title{Format patent dates.} 6 | \usage{ 7 | extractCleanDate(dateVector, orders = "ymd") 8 | } 9 | \arguments{ 10 | \item{dateVector}{A vector of character dates.} 11 | 12 | \item{orders}{The orders the dates appear in. 
13 | Sumobrain is "ymd" and Lens.org and Google data are "mdy". Hardcoded values include 14 | \code{\link{googleDateOrder}},\code{\link{lensDateOrder}}, 15 | and \code{\link{sumobrainDateOrder}}.} 16 | } 17 | \value{ 18 | A date vector of year, month, day dates. 19 | } 20 | \description{ 21 | Create a clean year, month, day date. 22 | 23 | Reading data in and aout of R may cause date mistakes, using a simple set 24 | function will ensure data types are the right format and class type. This 25 | data format is cleaned up to be in the format yyyy-mm-dd with no hours, 26 | minutes, seconds, or time zone attached. 27 | } 28 | \examples{ 29 | acars$pubDate <- extractCleanDate(dateVector = acars$pubDate, orders = "ymd") 30 | 31 | 32 | } 33 | -------------------------------------------------------------------------------- /man/getClaimsText.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/processPatentData.R 3 | \name{getClaimsText} 4 | \alias{getClaimsText} 5 | \title{Get claims data for all rows in a data frame} 6 | \usage{ 7 | getClaimsText(googleURLs, langCode = "en", whichClaim = 1) 8 | } 9 | \arguments{ 10 | \item{googleURLs}{A character vector of Google URLs} 11 | 12 | \item{langCode}{A language code, default set to "en"} 13 | 14 | \item{whichClaim}{Which claim (if available) to return. Default set to 1st.} 15 | } 16 | \description{ 17 | Generate claims data for all rows in a data frame. 18 | 19 | This is a wrapper function for the \code{\link{getClaimFromURL}} function. 
20 | } 21 | \examples{ 22 | 23 | \dontrun{ 24 | cc <- extractCountryCode(acars$docNum) 25 | pn <- extractPubNumber(acars$docNum) 26 | kc <- extractKindCode(acars$docNum) 27 | urls <- createGoogleURL(countryCode = cc, pubNum = pn ,kindCode = kc) 28 | urls <- urls[1:4] 29 | clms <- getClaimsText(urls) 30 | clms[1] 31 | } 32 | } 33 | \seealso{ 34 | \code{\link{createGoogleURL}}, \code{\link{cleanGoogleURL}}, 35 | \code{\link{getClaimFromURL}} 36 | } 37 | -------------------------------------------------------------------------------- /man/extractCountryCode.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cleanPatentData.R 3 | \name{extractCountryCode} 4 | \alias{extractCountryCode} 5 | \title{Extract the country code from a vector string of document numbers.} 6 | \usage{ 7 | extractCountryCode(docNum) 8 | } 9 | \arguments{ 10 | \item{docNum}{The character vector of document numbers.} 11 | } 12 | \value{ 13 | A character vector of the same length inputted, with 2-4 characters 14 | representing the country code of the ptaent document. 15 | } 16 | \description{ 17 | Extract the country code from a patent document number, which is the 18 | first two to four letters in a patent document number. 19 | 20 | For example, if a patent number is US8880270, the country code is US. In rare 21 | cases, we have more than two letters. Typical country codes are US (United States), 22 | EP (Europe), JP (Japan), and WO (World, meaning a PCT application). 
23 | } 24 | \examples{ 25 | # create a new column called countryCode in the acars data set 26 | acars$countryCode <- extractCountryCode(acars$docNum) 27 | head(acars[,c("docNum","countryCode")]) 28 | 29 | } 30 | -------------------------------------------------------------------------------- /man/cleanGoogleURL.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/processPatentData.R 3 | \name{cleanGoogleURL} 4 | \alias{cleanGoogleURL} 5 | \title{Sanitize a Google URL before attempting to extract data} 6 | \usage{ 7 | cleanGoogleURL(googleURL, langCode = "en") 8 | } 9 | \arguments{ 10 | \item{googleURL}{A character value of a google URL.} 11 | 12 | \item{langCode}{A language code, default set to "en" English.} 13 | } 14 | \value{ 15 | A clean character vector of a Google Patents URL. 16 | } 17 | \description{ 18 | Clean up the google URL to make sure it will be read properly. 19 | 20 | If you use the \code{\link{createGoogleURL}} function, you won't have to use this function. 21 | However, if you use your own generator or want to change the language, use this 22 | function to do so. 
23 | } 24 | \examples{ 25 | 26 | cleanGoogleURL("https://patents.google.com/patent/US8818682B1/mx") 27 | cleanGoogleURL("https://patents.google.com/patent/US8818682B1/") 28 | cleanGoogleURL("https://patents.google.com/patent/US8818682B1") 29 | cleanGoogleURL("https://patents.google.com/patent/US8818682B1/en") 30 | 31 | } 32 | \seealso{ 33 | \code{\link{createGoogleURL}} 34 | } 35 | -------------------------------------------------------------------------------- /man/addPdfImage.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/reportPatentData.R 3 | \name{addPdfImage} 4 | \alias{addPdfImage} 5 | \title{Make a PDF output of a plot} 6 | \usage{ 7 | addPdfImage(graph, name = "image", w = 12, h = 12) 8 | } 9 | \arguments{ 10 | \item{graph}{The graph object to input} 11 | 12 | \item{name}{A character name to name your file. It can have a filepath as well.} 13 | 14 | \item{w}{The width, in inches, of your image, default set to 12.} 15 | 16 | \item{h}{The height, in inches, of your image, default set to 12.} 17 | } 18 | \value{ 19 | No ret 20 | } 21 | \description{ 22 | Make a PDF output of a plot. 
23 | } 24 | \examples{ 25 | 26 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 27 | cleanNames = sumobrainNames, 28 | dateFields = sumobrainDateFields, 29 | dateOrders = sumobrainDateOrder, 30 | deduplicate = TRUE, 31 | cakcDict = patentr::cakcDict, 32 | docLengthTypesDict = patentr::docLengthTypesDict, 33 | keepType = "grant", 34 | firstAssigneeOnly = TRUE, 35 | assigneeSep = ";", 36 | stopWords = patentr::assigneeStopWords) 37 | 38 | # df <- dplyr::select(sumo, title, abstract) 39 | df <- sumo[,c("title","abstract")] 40 | addPdfImage(wordCloudIt(df, excludeWords, minfreq = 20, 41 | random.order = FALSE, rot.per = 0.25),"wordCloud") 42 | 43 | } 44 | -------------------------------------------------------------------------------- /man/importPatentData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/importPatentData.R 3 | \name{importPatentData} 4 | \alias{importPatentData} 5 | \title{Read in a data file or list of files from excel spreadsheets.} 6 | \usage{ 7 | importPatentData(rawDataFilePath = NA, skipLines = 1) 8 | } 9 | \arguments{ 10 | \item{rawDataFilePath}{A filepath, or list of filespaths, for xls files.} 11 | 12 | \item{skipLines}{Number of lines to skip before reading in your data file.} 13 | } 14 | \value{ 15 | A single data frame of all data. NULL if no data. 16 | } 17 | \description{ 18 | Import, read, and connect patent data files. Currently: xls files 19 | from a filepath. 20 | Future use: can read from a URL, an xlsx file, google doc, and a csv. 21 | } 22 | \examples{ 23 | \dontrun{ 24 | 25 | # access the files here and put them in a data/ folder of your working directory. 
26 | file1 <- system.file("extdata/", "sumobrain_autonomous_search1.xlsx", package="patentr") 27 | file2 <- system.file("extdata/", "sumobrain_autonomous_search2.xlsx", package="patentr") 28 | files <- list(file1, file2) 29 | ipData <- importPatentData(rawDataFilePath = files, skipLines = 1) 30 | 31 | # example 2 32 | # assume csv files are in the data folder 33 | ipData <- importPatentData(rawDataFilePath = list.files('data/', full.names=T), skipLines = 1) 34 | } 35 | 36 | 37 | } 38 | -------------------------------------------------------------------------------- /man/showDups.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cleanPatentData.R 3 | \name{showDups} 4 | \alias{showDups} 5 | \title{View all your duplicate entries to decide which to remove} 6 | \usage{ 7 | showDups(input) 8 | } 9 | \arguments{ 10 | \item{input}{A vector or a data frame which you wish to view duplicated values.} 11 | } 12 | \value{ 13 | A logical vector of TRUE / FALSE with all entries that contain two 14 | or more duplicate values. 15 | } 16 | \description{ 17 | Return a logical vector of all duplicate entries. 18 | 19 | Often times, you want to review your duplicate results to determine which 20 | rows you want to keep and which you want to erase. 21 | 22 | For example, if you have 23 | an application number that is an application, and another that is a search report, 24 | then you will want to keep the application data and remove the search report 25 | entry. 26 | 27 | Or, if you have an application number that has both a grant and an 28 | application entry, you may want to remove the application from your analysis 29 | and focus on the grant data, as the claim scope is most important for the 30 | granted patent. 
31 | } 32 | \examples{ 33 | 34 | acarsDups <- acars[showDups(acars$appNum),] 35 | head(acarsDups[order(acarsDups$appNum),c("docNum","docTypeSumobrain","appNum")]) 36 | 37 | } 38 | \seealso{ 39 | \code{\link[base]{duplicated}}, \code{\link{removeDups}} 40 | } 41 | -------------------------------------------------------------------------------- /man/wordCloudIt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualizePatentData.R 3 | \name{wordCloudIt} 4 | \alias{wordCloudIt} 5 | \title{Generate a word cloud with a given subset of patent data fields.} 6 | \usage{ 7 | wordCloudIt(file, rmwords, minfreq = 20, maxwords = 150, ...) 8 | } 9 | \arguments{ 10 | \item{file}{The data frame you want word cloud, typically the abstract, title, 11 | and claims subset.} 12 | 13 | \item{rmwords}{A character vector of words you exclude from your analysis. Default 14 | is \code{\link{excludeWords}}.} 15 | 16 | \item{minfreq}{From \code{\link[wordcloud]{wordcloud}}, the min frequency 17 | to include a word. Default is 10.} 18 | 19 | \item{maxwords}{From \code{\link[wordcloud]{wordcloud}}, the max number of 20 | words to show. Default is 150.} 21 | 22 | \item{...}{\code{\link[wordcloud]{wordcloud}} options} 23 | } 24 | \value{ 25 | NULL, prints out a wordcloud 26 | } 27 | \description{ 28 | Create a word cloud from a patent data set. 
29 | } 30 | \examples{ 31 | 32 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 33 | cleanNames = sumobrainNames, 34 | dateFields = sumobrainDateFields, 35 | dateOrders = sumobrainDateOrder, 36 | deduplicate = TRUE, 37 | cakcDict = patentr::cakcDict, 38 | docLengthTypesDict = patentr::docLengthTypesDict, 39 | keepType = "grant", 40 | firstAssigneeOnly = TRUE, 41 | assigneeSep = ";", 42 | stopWords = patentr::assigneeStopWords) 43 | 44 | # df <- dplyr::select(sumo, title, abstract) 45 | df <- sumo[,c("title","abstract")] 46 | wordCloudIt(df, excludeWords, minfreq = 20, 47 | random.order = FALSE, rot.per = 0.25) 48 | 49 | } 50 | -------------------------------------------------------------------------------- /man/extractDocLength.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cleanPatentData.R 3 | \name{extractDocLength} 4 | \alias{extractDocLength} 5 | \title{Get a code for length of doc and country code} 6 | \usage{ 7 | extractDocLength(countryCode, pubNum) 8 | } 9 | \arguments{ 10 | \item{countryCode}{A string vector of country codes} 11 | 12 | \item{pubNum}{A string vector of the numeric portion of a publication number.} 13 | } 14 | \value{ 15 | A string vector of concatenated country code and publication number 16 | length, such as US11 or EP9. 17 | } 18 | \description{ 19 | Generate a custom concatenation of country code and length of 20 | the publication number, for document type identification purposes. 21 | 22 | Given limited metadata available on free sites, often times the downloaded 23 | data set does not include the type of patent document. There are two easy ways to 24 | discover the type of a patent document. A dictionary stored with the 25 | package can compare the output to match up the type of patent document. 
26 | 27 | \enumerate{ 28 | \item The kind code, if present, is typically the same for each country. 29 | \code{B} is usually a patent and \code{A} is usually an application. 30 | \item The length of the publication number, along with the country code, is 31 | another great indicator. Applications in USA have 11 numbers, and, for now, 32 | 9 numbers for granted patents. 33 | } 34 | } 35 | \examples{ 36 | acars$pubNum <- extractPubNumber(acars$docNum) 37 | acars$countryCode <- extractCountryCode(acars$docNum) 38 | acars$officeDocLength <- extractDocLength(countryCode = acars$countryCode, 39 | pubNum = acars$pubNum) 40 | head(acars[,c("officeDocLength","docNum")]) 41 | 42 | } 43 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(addChartRightTextLeftPptx) 4 | export(addFullImagePptx) 5 | export(addPdfImage) 6 | export(capWord) 7 | export(chooseFiles) 8 | export(cleanGoogleURL) 9 | export(cleanHeaderNames) 10 | export(cleanNames) 11 | export(cleanPatentData) 12 | export(createGoogleURL) 13 | export(extractCleanDate) 14 | export(extractCountryCode) 15 | export(extractDocLength) 16 | export(extractKindCode) 17 | export(extractPubNumber) 18 | export(facetPlot) 19 | export(factorForGraph) 20 | export(flippedHistogram) 21 | export(generateDocType) 22 | export(getClaimFromURL) 23 | export(getClaimsText) 24 | export(importPatentData) 25 | export(makeColors) 26 | export(removeDups) 27 | export(runExample) 28 | export(showDups) 29 | export(summarizeColumns) 30 | export(summaryText) 31 | export(tilePlot) 32 | export(wordCloudIt) 33 | import(ReporteRs) 34 | import(ggplot2) 35 | importFrom(RColorBrewer,brewer.pal) 36 | importFrom(ReporteRs,addPlot) 37 | importFrom(ReporteRs,addSlide) 38 | importFrom(XML,getNodeSet) 39 | importFrom(XML,htmlParse) 40 | importFrom(XML,xmlValue) 41 | 
importFrom(XML,xpathSApply) 42 | importFrom(dplyr,arrange) 43 | importFrom(dplyr,group_by_) 44 | importFrom(dplyr,n) 45 | importFrom(dplyr,select_) 46 | importFrom(dplyr,summarize) 47 | importFrom(httr,GET) 48 | importFrom(lubridate,parse_date_time) 49 | importFrom(magrittr,"%>%") 50 | importFrom(plyr,ldply) 51 | importFrom(readxl,read_excel) 52 | importFrom(stringr,str_extract) 53 | importFrom(tm,Corpus) 54 | importFrom(tm,VectorSource) 55 | importFrom(tm,content_transformer) 56 | importFrom(tm,removePunctuation) 57 | importFrom(tm,removeWords) 58 | importFrom(tm,stopwords) 59 | importFrom(tm,tm_map) 60 | importFrom(wordcloud,wordcloud) 61 | -------------------------------------------------------------------------------- /inst/shiny/app/ui.R: -------------------------------------------------------------------------------- 1 | ## yang yao 2 | ## motivation: R Shiny gallery and look at urls below 3 | library(shiny) 4 | 5 | fluidPage( 6 | titlePanel("Uploading Files"), 7 | sidebarLayout( 8 | sidebarPanel( 9 | # http://shiny.rstudio.com/gallery/file-upload.html 10 | # https://shiny.rstudio.com/reference/shiny/latest/fileInput.html 11 | # http://stackoverflow.com/questions/29201155/how-to-validate-the-file-type-of-a-file-uploaded-by-the-user-in-a-shiny-app 12 | # http://stackoverflow.com/questions/30624201/read-excel-in-a-shiny-app 13 | fileInput('file1', 'Choose a File', 14 | accept=c('text/csv', 15 | 'text/comma-separated-values,text/plain', 16 | '.csv', 17 | '.xls', 18 | '.xlsx')), 19 | tags$hr(), 20 | checkboxInput('header', 'Header', TRUE), 21 | radioButtons('sep', 'Separator', 22 | c(Comma=',', 23 | Semicolon=';', 24 | Tab='\t'), 25 | ','), 26 | radioButtons('quote', 'Quote', 27 | c(None='', 28 | 'Double Quote'='"', 29 | 'Single Quote'="'"), 30 | '"'), 31 | actionButton('cleanDataButton',"Clean Data"), 32 | p("Click this button to clean the raw data"), 33 | downloadButton('downloadData',"Download Clean Data") 34 | ), 35 | mainPanel( 36 | tabsetPanel(type = 
"tabs", 37 | tabPanel("Data Table",tableOutput("contents"), tableOutput("cleanContents")), 38 | tabPanel("Score Count Plot",plotOutput("outplot1")) 39 | ) 40 | ) 41 | ) 42 | ) 43 | 44 | ## yang yao 45 | ## motivation: R Shiny gallery and look at urls below -------------------------------------------------------------------------------- /man/summarizeColumns.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/explorePatentData.R 3 | \name{summarizeColumns} 4 | \alias{summarizeColumns} 5 | \title{Summarize columns of a data frame} 6 | \usage{ 7 | summarizeColumns(df, names, naOmit = FALSE) 8 | } 9 | \arguments{ 10 | \item{df}{A data frame of patent data.} 11 | 12 | \item{names}{a character vector of header names that you want to summarize.} 13 | 14 | \item{naOmit}{Logical. Optionally, remove NA values at the end of the summary. 15 | Useful when comparing fields that have NA values, such as features.} 16 | } 17 | \value{ 18 | A dataframe of summarize values. 19 | } 20 | \description{ 21 | Summarize columns of a data frame. 22 | 23 | Summarize a data frame \code{df} by a \code{names} character vector of 24 | header names. 
25 | } 26 | \examples{ 27 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 28 | cleanNames = sumobrainNames, 29 | dateFields = sumobrainDateFields, 30 | dateOrders = sumobrainDateOrder, 31 | deduplicate = TRUE, 32 | cakcDict = patentr::cakcDict, 33 | docLengthTypesDict = patentr::docLengthTypesDict, 34 | keepType = "grant", 35 | firstAssigneeOnly = TRUE, 36 | assigneeSep = ";", 37 | stopWords = patentr::assigneeStopWords) 38 | 39 | # note that in reality, you need a patent analyst to carefully score 40 | # these patents, the score here is for demonstrational purposes 41 | score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 42 | score[score>3] <- 3 43 | score[score<0] <- 0 44 | sumo$score <- score 45 | scoreSum <- summarizeColumns(sumo, "score") 46 | scoreSum 47 | # load library(ggplot2) for the below part to run 48 | # ggplot(scoreSum, aes(x=score, y = total, fill=factor(score) )) + geom_bar(stat="identity") 49 | nameAndScore <- summarizeColumns(sumo, c("assigneeClean","score")) 50 | # tail(nameAndScore) 51 | 52 | } 53 | -------------------------------------------------------------------------------- /tests/testthat/test-imports.R: -------------------------------------------------------------------------------- 1 | # test that import works 2 | 3 | # files 4 | file1 <- system.file("extdata/", "sumobrain_autonomous_search1.xlsx", package="patentr") 5 | file2 <- system.file("extdata/", "sumobrain_autonomous_search2.xlsx", package="patentr") 6 | files <- list(file1, file2) 7 | # read it in 8 | patData <- importPatentData(files) 9 | 10 | # should be a data frame 11 | expect_true(is.data.frame(patData)) 12 | 13 | 14 | # test_that("importing a data file works",{ 15 | # # files 16 | # file1 <- system.file("inst/extdata/", "sumobrain_autonomous_search1.xlsx", package="patentr") 17 | # file2 <- system.file("inst/extdata/", "sumobrain_autonomous_search2.xlsx", package="patentr") 18 | # files <- list(file1, file2) 19 | # # read it in 
20 | # patData <- importPatentData(files) 21 | # 22 | # # should be a data frame 23 | # expect_true(is.data.frame(patData)) 24 | # }) 25 | 26 | # motivation 27 | # test taken from readxl package in tidyverse and modified 28 | # credit goes to tidyverse team 29 | # https://github.com/tidyverse/readxl/blob/83af028bcc577d23b01c4a1f47d2dfc314497253/tests/testthat/helper.R 30 | # NOTE: may need to cancel this test as readxl 0.1.1 still has error, only github version does not have error 31 | # this only works on 0.1.1.9000 (current at time of writing) and above 32 | test_that("can tolerate xls(x) that underreports number of columns",{ 33 | # tidyverse test 34 | #df <- readxl::read_excel(rprojroot::find_testthat_root_file("testData","mtcars.xls")) 35 | #expect_identical(ncol(df),ncol(mtcars)) 36 | # test modified 37 | # note, test cases modified to remove dependency on 0.1.1.9000 readxl, future versions (0.1.2+) 38 | # expected to fix this issue 39 | df2 <- readxl::read_excel(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), skip=1) 40 | expect_identical(ncol(df2),ncol(acars)) 41 | 42 | }) 43 | 44 | -------------------------------------------------------------------------------- /R/patentr.R: -------------------------------------------------------------------------------- 1 | ## yang yao start 2 | #' patentr: A package for analyzing patent data. 3 | #' 4 | #' The package is a data processing and reporting tool of patent data sets for patent analysts. 5 | #' 6 | #' 7 | #' The package is aimed at patent agents, lawyers, managers, analysts, and 8 | #' academics who are working on patents. This may be used in a patent landscape 9 | #' analysis, company IP portfolio analysis, or a freedom to operate search. 
10 | #' 11 | #' 12 | #' The patentr package provides four categories of important functions: 13 | #' 14 | #' \enumerate{ 15 | #' \item Data input and cleaning 16 | #' \item Data (re)processing 17 | #' \item Data exploration & visualization 18 | #' \item Visualization & reporting 19 | #' } 20 | #' 21 | #' 22 | #' @section load functions: 23 | #' 24 | #' \code{\link{importPatentData}}: Import xls patent data from filepaths. 25 | #' \code{\link{chooseFiles}}: Uses a popup window (Tk file dialog) to 26 | #' allow the user to choose a list of zero or more files interactively. 27 | #' 28 | #' @section clean functions: 29 | #' 30 | #' \code{\link{cleanHeaderNames}}, \code{\link{extractCountryCode}}, 31 | #' \code{\link{extractPubNumber}}, \code{\link{extractKindCode}}, 32 | #' \code{\link{extractDocLength}}, \code{\link{extractCleanDate}}, 33 | #' \code{\link{showDups}}, \code{\link{removeDups}}, \code{\link{generateDocType}}, 34 | #' \code{\link{cleanNames}}, \code{\link{cleanPatentData}} 35 | #' 36 | #' 37 | #' @section patentr data: 38 | #' \code{\link{acars}} To pay respect to the \code{\link[datasets]{mtcars}} data, 39 | #' this is a data set of autonomous driving car patents from major companies. 40 | #' \code{\link{kindCodes}} A data frame of kind codes by country with associated 41 | #' descriptions. 42 | #' \code{\link{docLengthTypes}} A data frame mapping doc length to the type of 43 | #' patent document. 
44 | #' 45 | #' @docType package 46 | #' @name patentr 47 | NULL 48 | 49 | ## yang yao end -------------------------------------------------------------------------------- /man/patentr.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/patentr.R 3 | \docType{package} 4 | \name{patentr} 5 | \alias{patentr} 6 | \alias{patentr-package} 7 | \title{patentr: A package for analyzing patent data.} 8 | \description{ 9 | The package is a data processing and reporting tool of patent data sets for patent analysts. 10 | } 11 | \details{ 12 | The package is aimed at patent agents, lawyers, managers, analysts, and 13 | academics who are working on patents. This may be used in a patent landscape 14 | analysis, company IP portfolio analysis, or a freedom to operate search. 15 | 16 | The patentr package provides four categories of important functions: 17 | 18 | \enumerate{ 19 | \item Data input and cleaning 20 | \item Data (re)processing 21 | \item Data exploration & visualization 22 | \item Visualization & reporting 23 | } 24 | } 25 | \section{load functions}{ 26 | 27 | 28 | \code{\link{importPatentData}}: Import xls patent data from filepaths. 29 | \code{\link{chooseFiles}}: Uses a popup window (Tk file dialog) to 30 | allow the user to choose a list of zero or more files interactively. 
31 | } 32 | 33 | \section{clean functions}{ 34 | 35 | 36 | \code{\link{cleanHeaderNames}}, \code{\link{extractCountryCode}}, 37 | \code{\link{extractPubNumber}}, \code{\link{extractKindCode}}, 38 | \code{\link{extractDocLength}}, \code{\link{extractCleanDate}}, 39 | \code{\link{showDups}}, \code{\link{removeDups}}, \code{\link{generateDocType}}, 40 | \code{\link{cleanNames}}, \code{\link{cleanPatentData}} 41 | } 42 | 43 | \section{patentr data}{ 44 | 45 | \code{\link{acars}} To pay respect to the \code{\link[datasets]{mtcars}} data, 46 | this is a data set of autonomous driving car patents from major companies. 47 | \code{\link{kindCodes}} A data frame of kind codes by country with associated 48 | descriptions. 49 | \code{\link{docLengthTypes}} A data frame mapping doc length to the type of 50 | patent document. 51 | } 52 | 53 | -------------------------------------------------------------------------------- /man/createGoogleURL.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/processPatentData.R 3 | \name{createGoogleURL} 4 | \alias{createGoogleURL} 5 | \title{Create a URL link to Google patents} 6 | \usage{ 7 | createGoogleURL(countryCode, pubNum, kindCode, 8 | googleURL = "https://patents.google.com/patent/", lang = "en") 9 | } 10 | \arguments{ 11 | \item{countryCode}{A character vector of the country code of the document. 12 | Typically a two-letter character.} 13 | 14 | \item{pubNum}{A character vector of the numeric portion of a publication number.} 15 | 16 | \item{kindCode}{character vector of the kind code of a document. 
If not available, 17 | enter a blank string "".} 18 | 19 | \item{googleURL}{A character string of the URL to Google Patents, with working 20 | default value.} 21 | 22 | \item{lang}{The language you want to read the patent, default set to "en" english.} 23 | } 24 | \value{ 25 | A character vector of properly formatted URL strings. 26 | } 27 | \description{ 28 | Create a URL string to link you to Google Patents. 29 | 30 | By concatenating the country code, publication number, and kind code, you can 31 | generate a URL to link you to google patents for further exploration. This 32 | feature is especially useful when browsing the data in a spreadsheet or in 33 | a Shiny app. It is also useful for extracting data from the HTML content. 34 | 35 | As each website (Google, lens.org, sumobrain.com, etc..) has a different 36 | method for generating patent URLs, these functions are website-specific. 37 | 38 | The original Google patents version still works as of March 2017 and the 39 | \code{googleURL} value is \code{https://www.google.com/patents/}. This older 40 | version may be easier to extract data. 
41 | } 42 | \examples{ 43 | acars$kindCode <- extractKindCode(acars$docNum) 44 | acars$pubName <- extractPubNumber(acars$docNum) 45 | acars$googleURL <- createGoogleURL(countryCode = acars$countryCode, 46 | pubNum = acars$pubNum, kindCode =acars$kindCode) 47 | head(acars$googleURL) 48 | 49 | } 50 | -------------------------------------------------------------------------------- /man/factorForGraph.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualizePatentData.R 3 | \name{factorForGraph} 4 | \alias{factorForGraph} 5 | \title{Factor one column by another column's popularity} 6 | \usage{ 7 | factorForGraph(df, xVal, fillVal, decFill = TRUE) 8 | } 9 | \arguments{ 10 | \item{df}{A data frame containing the x and fill value columns.} 11 | 12 | \item{xVal}{A character value from a header name in \code{df} 13 | that will be used as 14 | the x value in a ggplot2 plot.} 15 | 16 | \item{fillVal}{A character value from a header name in \code{df} 17 | that will be used as the 18 | fill value in a ggplot2 plot.} 19 | 20 | \item{decFill}{Sort fill value in decreasing order.} 21 | } 22 | \value{ 23 | A data frame with two of the columns factored. 24 | } 25 | \description{ 26 | Factor (or refactor) a data frame of values to be used 27 | for graphing in the correct order. 28 | 29 | Many graphs require a reordering when plotting with a fill value. This 30 | helper function factors the x-value of a plot that will be stacked by 31 | \code{fillVal}. 
32 | } 33 | \examples{ 34 | 35 | 36 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 37 | cleanNames = sumobrainNames, 38 | dateFields = sumobrainDateFields, 39 | dateOrders = sumobrainDateOrder, 40 | deduplicate = TRUE, 41 | cakcDict = patentr::cakcDict, 42 | docLengthTypesDict = patentr::docLengthTypesDict, 43 | keepType = "grant", 44 | firstAssigneeOnly = TRUE, 45 | assigneeSep = ";", 46 | stopWords = patentr::assigneeStopWords) 47 | 48 | # note that in reality, you need a patent analyst to carefully score 49 | # these patents, the score here is for demonstrational purposes 50 | score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 51 | score[score>3] <- 3 52 | score[score<0] <- 0 53 | sumo$score <- score 54 | dim(sumo) 55 | sumoFactor <- factorForGraph(sumo, "assigneeClean", "score") 56 | # if you want to view, uncomment and load ggplot2 57 | # ggplot(sumoFactor, aes(x=assigneeClean, y=score, fill=factor(score))) + 58 | # geom_bar(stat="identity") 59 | 60 | 61 | } 62 | -------------------------------------------------------------------------------- /man/cleanNames.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cleanPatentData.R 3 | \name{cleanNames} 4 | \alias{cleanNames} 5 | \title{Clean up string names.} 6 | \usage{ 7 | cleanNames(rawNames, firstAssigneeOnly = TRUE, sep = ";", 8 | removeStopWords = TRUE, stopWords = patentr::assigneeStopWords) 9 | } 10 | \arguments{ 11 | \item{rawNames}{The character vector you want to clean up} 12 | 13 | \item{firstAssigneeOnly}{A logical value, default set to TRUE, keeping only the first 14 | assignee if multiple exist.} 15 | 16 | \item{sep}{The separating character for multiple assignees, default set to semi-colon.} 17 | 18 | \item{removeStopWords}{Logical default TRUE, if want to remove common company stopwords 19 | found in the \code{stopWords} parameter.} 20 
| 21 | \item{stopWords}{An optional character vector of words you want to remove. Default to 22 | \code{\link{assigneeStopWords}}.} 23 | } 24 | \value{ 25 | A character vector of cleaned up character names. 26 | } 27 | \description{ 28 | Quick cleanup of characters in a string, 29 | typically assignee (company names) and the inventors. 30 | 31 | If you have issues with this, you may need to convert to UTF-8 or ASCII. 32 | Use the \code{iconv(thisVector, to="UTF-8")} or \code{to="ASCII"} and it should 33 | fix the problem. See the examples for the code. 34 | 35 | 36 | This function: 37 | \enumerate{ 38 | \item{Removes values between spaces, such as (US)} 39 | \item{Changes all names to lower case} 40 | } 41 | } 42 | \examples{ 43 | 44 | assigneeNames <- cleanNames(acars$assignee) 45 | # get a feel for the less-messy data 46 | head(sort(table(assigneeNames), decreasing = TRUE)) 47 | 48 | # for a messier example, note you need to convert to ASCII/UTF-8 to get rid of errors 49 | # associated with tolower 50 | rawGoogleData <- system.file("extdata", "google_autonomous_search.csv", package = "patentr") 51 | rawGoogleData <- read.csv(rawGoogleData, stringsAsFactors = FALSE, skip = patentr::skipGoogle) 52 | rawGoogleData <- data.frame(lapply(rawGoogleData, 53 | function(x){iconv(x, to = "ASCII")}), stringsAsFactors = FALSE) 54 | assigneeClean <- cleanNames(rawGoogleData$assignee) 55 | head(sort(table(assigneeClean), decreasing = TRUE)) 56 | 57 | } 58 | -------------------------------------------------------------------------------- /man/getClaimFromURL.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/processPatentData.R 3 | \name{getClaimFromURL} 4 | \alias{getClaimFromURL} 5 | \title{Get a claim from a Google Patents URL} 6 | \usage{ 7 | getClaimFromURL(googleURL, langCode = "en", whichClaim = 1) 8 | } 9 | \arguments{ 10 | \item{googleURL}{The 
well-formatted google URL built from \code{\link{createGoogleURL}}. 11 | It is a character value.} 12 | 13 | \item{langCode}{The language code, used check for non-english text.} 14 | 15 | \item{whichClaim}{Default set to 1, a numeric determining which claim to get. Note 16 | if claim is out of bounds, the return claim will be a blank character.} 17 | } 18 | \value{ 19 | A character vector of the claim from each Google URL. If no claim exists, 20 | or if the country code is not on the inclusion list, an empty character value is returned 21 | for that index. 22 | } 23 | \description{ 24 | Input a valid Google Patents URL of the form given below and 25 | then get back a claim from the index of your choosing. If no claim exists or 26 | if your index is out of bounds, an empty character string returns. 27 | 28 | The function works on strings that begin with the following sequence: 29 | \code{https://patents.google.com/patent/}. If the string sequence afterwards 30 | is invalid, a 404 status returns from the GET command and eventually an empty 31 | string returns. 
32 | } 33 | \examples{ 34 | 35 | \dontrun{ 36 | # works for USA 37 | aclaim <- getClaimFromURL("https://patents.google.com/patent/US8818682B1/en") 38 | print(aclaim) 39 | # test WO, EP 40 | woclaim <- getClaimFromURL("https://patents.google.com/patent/WO2015134152A1/en") 41 | print(woclaim) 42 | epclaim <- getClaimFromURL("https://patents.google.com/patent/EP2991875A1/en") 43 | print(epclaim) 44 | # test KR, JP, CN 45 | krclaim <- getClaimFromURL("https://patents.google.com/patent/KR20150127745A/en") 46 | cnclaim <- getClaimFromURL("https://patents.google.com/patent/CN104786953A/en") 47 | jpclaim <- getClaimFromURL("https://patents.google.com/patent/JP2016173842A/en") 48 | declaim <- getClaimFromURL("https://patents.google.com/patent/DE102014219223A1/en") 49 | } 50 | 51 | } 52 | \seealso{ 53 | \code{\link{createGoogleURL}}, \code{\link{extractCountryCode}}, 54 | \code{\link{cleanGoogleURL}} 55 | } 56 | -------------------------------------------------------------------------------- /man/acars.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/acars.R 3 | \docType{data} 4 | \name{acars} 5 | \alias{acars} 6 | \title{Autonomous Vehicle Patent Data from Sumobrain.com} 7 | \format{A data frame with 499 observations on 10 variables. 8 | \describe{ 9 | \item{docNum}{A published document number including the kind code, publication number, 10 | and kind code for the patent document.} 11 | \item{docTypeSumobrain}{Very similar to the country code, with minor additions, USAPP being the 12 | most noticable difference. 
} 13 | \item{pubDate}{Publication Date} 14 | \item{title}{Title} 15 | \item{abstract}{Abstract} 16 | \item{inventors}{Inventor Name} 17 | \item{assignee}{Assignee} 18 | \item{appNum}{Application Number} 19 | \item{dateFiled}{Filing Date} 20 | \item{classPrimary}{Primary Class} 21 | \item{classOthers}{Other Classes} 22 | }} 23 | \usage{ 24 | acars 25 | } 26 | \description{ 27 | An example data set of autonomous vehicle IP from major assignees. 28 | } 29 | \details{ 30 | The data search was performd on Monday, March 13, 2017 from sumobrain.com, and the exact 31 | search term was: 32 | 33 | \code{ABST/"autonomous" AN/(Apple* OR Google* OR Waymo* OR Tesla*} 34 | 35 | \code{OR Ford* OR General*) PD/12/13/1790->3/13/2017} 36 | 37 | View the search \href{http://www.sumobrain.com/result.html?p=1&stemming=on&sort=chron&uspat=on&usapp=on&eupat=on&jp=on&pct=on&collections=&srch=xprtsrch&date_range=all&hits=502&from_ss=&srch_id=&srch_name=&search_name=&selected_doc_flag=&selected_newdoc_flag=&selected_portfolio=&portfolio_name=&query_txt=ABST\%2F\%22autonomous\%22+AN\%2F\%28Apple*+OR+Google*+OR+Waymo*+OR+Tesla*+OR+Ford*+OR+General*\%29+PD\%2F12\%2F13\%2F1790-\%3E3\%2F13\%2F2017&search.x=0&search.y=0&search=search_ezy}{here}. 38 | 39 | For all collections (US patents, applications, EP documents, abstracts of Japan, and WIPO). 40 | 41 | Can get raw data with the following commands: 42 | 43 | \code{system.file("extdata", "sumobrain_autonomous_search1.xls", package = "patentr")} 44 | 45 | \code{system.file("extdata", "sumobrain_autonomous_search2.xls", package = "patentr")} 46 | } 47 | \seealso{ 48 | \url{http://www.sumobrain.com} You will need to create a free account to export data. 49 | 50 | \code{\link{acarsGoogle}} provides a similar search from Google. 51 | \code{\link{acarsLens}} provides a simialr search from Lens.org. 
52 | } 53 | \keyword{data} 54 | -------------------------------------------------------------------------------- /man/acarsGoogle.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/acars.R 3 | \docType{data} 4 | \name{acarsGoogle} 5 | \alias{acarsGoogle} 6 | \title{Autonomous Vehicle Patent Data from Google Patents} 7 | \format{A data frame with 316 observations on 9 variables. 8 | \describe{ 9 | \item{\code{docNum}}{A published document number including the kind code, publication number, 10 | and kind code for the patent document.} 11 | \item{\code{title}}{The title of the invention.} 12 | \item{\code{assignee}}{The owner of the document.} 13 | \item{\code{inventors}}{The name(s) of the inventor(s), separated by commas.} 14 | \item{\code{priorityDate}}{The earliest priority date on the application.} 15 | \item{\code{dateFiled}}{Date the document was filed. They call it filing/creation date.} 16 | \item{\code{pubDate}}{Date document became publicly available.} 17 | \item{\code{grantDate}}{Date the application became a grant. NA if there is no associated grant.} 18 | \item{\code{googleURL}}{The link to the Google Patents page for the document.} 19 | }} 20 | \usage{ 21 | acarsGoogle 22 | } 23 | \description{ 24 | An example data set of autonomous vehicle IP from major assignees. 25 | } 26 | \details{ 27 | The first row in the raw CSV export contains the search URL and is skipped. 28 | 29 | The data search was performed on Saturday, March 18, 2017 from patents.google.com, and the exact 30 | search: \href{https://patents.google.com/?q=AB\%3dautonomous&assignee=Apple*,Google*,Waymo*,Tesla*,Ford*,General*&before=filing:20170318}{Google Patents Search} 31 | For all countries available on Google.
32 | 33 | You process the raw data with the following commands: 34 | 35 | \code{temp <- system.file("extdata", "google_autonomous_search.csv", package = "patentr")} 36 | 37 | \code{# from the source package you can navigate to } 38 | 39 | \code{temp <- read.csv("inst/extdata/google_autonomous_search.csv", skip = 1, stringsAsFactors = FALSE)} 40 | 41 | \code{names(temp) <- googleNames} 42 | 43 | \code{temp <- data.frame(lapply(temp, function(x){iconv(x,to="ASCII")}),stringsAsFactors = FALSE)} 44 | 45 | \code{dateFields <- c("priorityDate","dateFiled","pubDate","grantDate")} 46 | 47 | \code{temp[dateFields] <- as.data.frame(lapply(temp[dateFields], as.Date, format="\%m/\%d/\%y"))} 48 | } 49 | \seealso{ 50 | \url{https://patents.google.com/} 51 | 52 | \code{\link{acars}} provides a similar search from Sumobrain. 53 | \code{\link{acarsLens}} provides a simialr search from Lens.org. 54 | } 55 | \keyword{data} 56 | -------------------------------------------------------------------------------- /man/summaryText.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/reportPatentData.R 3 | \name{summaryText} 4 | \alias{summaryText} 5 | \title{Add summary text to be used in a pptx slide} 6 | \usage{ 7 | summaryText(df, singular, plural, sumVar) 8 | } 9 | \arguments{ 10 | \item{df}{A summarized patent data frame, summarized by one variable. 11 | See \code{\link{summarizeColumns}}.} 12 | 13 | \item{singular}{The name of the variable, singular version. A character string. 14 | For example: assignee.} 15 | 16 | \item{plural}{The name of the variable, plural version. A character string. 17 | For example: assignees, with an 's'.} 18 | 19 | \item{sumVar}{The vector of the variable to summarize, taken from the original 20 | patent data set. For example \code{sumo$score} to summarize the score range.} 21 | } 22 | \value{ 23 | A length four character vector. 
24 | } 25 | \description{ 26 | Add a standard summarized text that will be used in 27 | association with a plot. 28 | } 29 | \examples{ 30 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 31 | cleanNames = sumobrainNames, 32 | dateFields = sumobrainDateFields, 33 | dateOrders = sumobrainDateOrder, 34 | deduplicate = TRUE, 35 | cakcDict = patentr::cakcDict, 36 | docLengthTypesDict = patentr::docLengthTypesDict, 37 | keepType = "grant", 38 | firstAssigneeOnly = TRUE, 39 | assigneeSep = ";", 40 | stopWords = patentr::assigneeStopWords) 41 | 42 | # note that in reality, you need a patent analyst to carefully score 43 | # these patents, the score here is for demonstrational purposes 44 | score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 45 | score[score>3] <- 3; score[score<0] <- 0 46 | sumo$score <- score 47 | sumo$assigneeSmall <- strtrim(sumo$assigneeClean,12) 48 | category <- c("system","control algorithm","product","control system", "communication") 49 | c <- round(rnorm(dim(sumo)[1],mean=2.5,sd=1.5)) 50 | c[c>5] <- 5; c[c<1] <- 1 51 | sumo$category <- category[c] 52 | feature1 <- c("adaptive", "park", "lane", NA,NA,NA,NA,NA, 53 | "brake", "steer","accelerate","deactivate") 54 | f <- round(rnorm(dim(sumo)[1],mean=5,sd=1)) 55 | l <- length(feature1) 56 | f[f>l] <- l; f[f<1] <- 1 57 | sumo$feature1 <- c(feature1,feature1[f])[1:dim(sumo)[1]] 58 | 59 | # Summarize the assignees 60 | as <- summarizeColumns(sumo, 'assigneeSmall') 61 | summaryText(as, 'assignee','assignees',sumo$score) 62 | # summarize the number of features 63 | f <- summarizeColumns(sumo, 'feature1', naOmit = TRUE) 64 | summaryText(f, 'feature','features',sumo$feature1) 65 | 66 | } 67 | -------------------------------------------------------------------------------- /man/flippedHistogram.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in 
R/visualizePatentData.R 3 | \name{flippedHistogram} 4 | \alias{flippedHistogram} 5 | \title{Plot a flipped histogram with a fill value} 6 | \usage{ 7 | flippedHistogram(df, xVal, fillVal, colors = patentr::scoreColors, 8 | recolor = FALSE) 9 | } 10 | \arguments{ 11 | \item{df}{The original data frame of patent data} 12 | 13 | \item{xVal}{A character value of a name in \code{df}} 14 | 15 | \item{fillVal}{A character value of a name in \code{df} to color the chart.} 16 | 17 | \item{colors}{A character vector of colors, the same length as the number of 18 | unique values in the column of \code{xVal[,fillVal]}. Default set to 19 | \code{scoreColors}.} 20 | 21 | \item{recolor}{A logical allowing you to choose to recolor the plot if the 22 | colors vector is not applicable to you. Default set to \code{FALSE}. Uses 23 | the helper function \code{\link{makeColors}} to generate colors. Note that your 24 | plot may fail if \code{colors} is not the same length as the number of unique 25 | values in fillVal and \code{recolor} is set to \code{FALSE}.} 26 | } 27 | \value{ 28 | A plot 29 | } 30 | \description{ 31 | Plot a flipped histogram with fill values. 32 | 33 | Often times, you want to plot a histogram showing patent documents 34 | faceted by one value and filled by another. 
35 | } 36 | \examples{ 37 | 38 | 39 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 40 | cleanNames = sumobrainNames, 41 | dateFields = sumobrainDateFields, 42 | dateOrders = sumobrainDateOrder, 43 | deduplicate = TRUE, 44 | cakcDict = patentr::cakcDict, 45 | docLengthTypesDict = patentr::docLengthTypesDict, 46 | keepType = "grant", 47 | firstAssigneeOnly = TRUE, 48 | assigneeSep = ";", 49 | stopWords = patentr::assigneeStopWords) 50 | 51 | # note that in reality, you need a patent analyst to carefully score 52 | # these patents, the score here is for demonstrational purposes 53 | score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 54 | score[score>3] <- 3 55 | score[score<0] <- 0 56 | sumo$score <- score 57 | sumo$assigneeSmall <- strtrim(sumo$assigneeClean,12) 58 | flippedHistogram(sumo, "assigneeSmall","score",colors=scoreColors) 59 | flippedHistogram(subset(sumo, score > 0), "assigneeSmall","score",colors=scoreColors) 60 | flippedHistogram(subset(sumo, score > 2) ,"score","assigneeSmall",colors=scoreColors, 61 | recolor = TRUE) 62 | flippedHistogram(subset(sumo, score > 2) ,"assigneeSmall","docType",colors=scoreColors, 63 | recolor = TRUE) 64 | 65 | } 66 | \seealso{ 67 | \code{\link{makeColors}}, \code{\link{capWord}} 68 | } 69 | -------------------------------------------------------------------------------- /R/explorePatentData.R: -------------------------------------------------------------------------------- 1 | # explore patent data, goes hand-in-hand with visualization 2 | ## yang yao start 3 | 4 | #' Summarize columns of a data frame 5 | #' 6 | #' @description Summarize columns of a data frame. 7 | #' 8 | #' Summarize a data frame \code{df} by a \code{names} character vector of 9 | #' header names. 10 | #' 11 | #' @param df A data frame of patent data. 12 | #' @param names a character vector of header names that you want to summarize. 13 | #' @param naOmit Logical. 
Optionally, remove NA values at the end of the summary. 14 | #' Useful when comparing fields that have NA values, such as features. 15 | #' 16 | #' @return A data frame of summarized values. 17 | #' 18 | #' @examples 19 | #' sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 20 | #' cleanNames = sumobrainNames, 21 | #' dateFields = sumobrainDateFields, 22 | #' dateOrders = sumobrainDateOrder, 23 | #' deduplicate = TRUE, 24 | #' cakcDict = patentr::cakcDict, 25 | #' docLengthTypesDict = patentr::docLengthTypesDict, 26 | #' keepType = "grant", 27 | #' firstAssigneeOnly = TRUE, 28 | #' assigneeSep = ";", 29 | #' stopWords = patentr::assigneeStopWords) 30 | #' 31 | #' # note that in reality, you need a patent analyst to carefully score 32 | #' # these patents, the score here is for demonstrational purposes 33 | #' score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 34 | #' score[score>3] <- 3 35 | #' score[score<0] <- 0 36 | #' sumo$score <- score 37 | #' scoreSum <- summarizeColumns(sumo, "score") 38 | #' scoreSum 39 | #' # load library(ggplot2) for the below part to run 40 | #' # ggplot(scoreSum, aes(x=score, y = total, fill=factor(score) )) + geom_bar(stat="identity") 41 | #' nameAndScore <- summarizeColumns(sumo, c("assigneeClean","score")) 42 | #' # tail(nameAndScore) 43 | #' 44 | #' @export 45 | #' 46 | #' @importFrom dplyr group_by across all_of 47 | #' @importFrom dplyr summarize 48 | #' @importFrom dplyr arrange 49 | #' @importFrom dplyr n 50 | #' @importFrom magrittr %>% 51 | #' 52 | summarizeColumns <- function(df, names, naOmit = FALSE){ 53 | 54 | # group_by_() is defunct in current dplyr; group_by(across(all_of(names))) 55 | # accepts the character vector of column names directly, so the old 56 | # lapply(names, as.name) conversion is no longer needed 57 | 58 | # for an error fix 59 | # http://stackoverflow.com/questions/9439256/how-can-i- 60 | # handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when 61 | total <- NULL 62 | # group by names, count the rows per group, and arrange smallest to largest; 63 | # .groups = "drop_last" keeps the historical summarize() grouping behavior 64 | df <- df %>% 65 | dplyr::group_by(dplyr::across(dplyr::all_of(names))) %>% 66 | dplyr::summarize(total = dplyr::n(), .groups = "drop_last") %>% 67 | dplyr::arrange(total) 68 | 69 | if(naOmit){ 70 | df <- df %>% 71 | stats::na.omit() 72 | } 73 | return(df) 74 | } 75 | 76 | 77 | 78 | ## yang yao end -------------------------------------------------------------------------------- /man/cleanHeaderNames.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cleanPatentData.R 3 | \name{cleanHeaderNames} 4 | \alias{cleanHeaderNames} 5 | \title{Generate a standard set of header names for import data} 6 | \usage{ 7 | cleanHeaderNames(patentData = NA, 8 | columnsExpected = patentr::sumobrainColumns, 9 | cleanNames = patentr::sumobrainNames) 10 | } 11 | \arguments{ 12 | \item{patentData}{A data frame. Default is NA.} 13 | 14 | \item{columnsExpected}{An expected number of columns. 15 | Default is Sumobrain \code{\link{sumobrainColumns}} data.} 16 | 17 | \item{cleanNames}{A standard list of clean names. Default is Sumobrain 18 | \code{\link{sumobrainNames}} data.} 19 | } 20 | \value{ 21 | A data frame 11 columns wide, with standard column names used in other 22 | functions. 23 | } 24 | \description{ 25 | Create a standard nameset from Sumobrain import data. 26 | See \code{\link{acars}} for the name set. 27 | 28 | There are three main sources of free and exportable patent data from the internet: 29 | \enumerate{ 30 | \item{\href{www.sumobrain.com}{Sumobrain}} 31 | \item{\href{www.lens.org}{The Lens}} 32 | \item{\href{www.patents.google.com}{Google}} 33 | } 34 | 35 | These three popular sites have varying levels of exportable data available. 36 | Sumobrain tends to be the most comprehensive, followed by Lens, and finally 37 | by Google. Thus, all three have hardcoded data available in the \code{patentr} 38 | package. 39 | 40 | To download Sumobrain data, go to \url{http://www.sumobrain.com} and create a free 41 | account.
Then run your search, export the data (250 max at a time), and use the 42 | \code{\link{chooseFiles}} and \code{\link{importPatentData}} functions to load 43 | the data into R. 44 | 45 | To download Lens data, go to \url{www.lens.org}. You do not need to create an 46 | account. Run your search, and in the header section, look for the cloud icon 47 | with a downward arrow. Choose the CSV option. 48 | 49 | To download Google patent data, visit \url{www.patents.google.com}, run 50 | your search, and click "Download (CSV)" in the upper left-hand corner. 51 | } 52 | \examples{ 53 | cleanData <- cleanHeaderNames(patentData = acars) 54 | cleanDataLens <- cleanHeaderNames(patentData = acarsLens, 55 | columnsExpected = lensColumns, cleanNames = lensNames) 56 | 57 | } 58 | \seealso{ 59 | \enumerate{ 60 | \item{\code{\link{sumobrainColumns}}} 61 | \item{\code{\link{sumobrainNames}}} 62 | \item{\code{\link{skipSumobrain}}} 63 | \item{\code{\link{googleColumns}}} 64 | \item{\code{\link{googleNames}}} 65 | \item{\code{\link{skipGoogle}}} 66 | \item{\code{\link{lensColumns}}} 67 | \item{\code{\link{lensNames}}} 68 | \item{\code{\link{skipLens}}} 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /man/generateDocType.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cleanPatentData.R 3 | \name{generateDocType} 4 | \alias{generateDocType} 5 | \title{Determine the patent document type} 6 | \usage{ 7 | generateDocType(officeDocLength, countryAndKindCode, 8 | cakcDict = patentr::cakcDict, 9 | docLengthTypesDict = patentr::docLengthTypesDict) 10 | } 11 | \arguments{ 12 | \item{officeDocLength}{The concat value of country code and number of numerical digits. 13 | Extracted using the \code{\link{extractDocLength}} function.} 14 | 15 | \item{countryAndKindCode}{The concat value of the country code and kind code. 
16 | Extracted using the \code{\link{extractCountryCode}} and \code{\link{extractKindCode}} 17 | functions.} 18 | 19 | \item{cakcDict}{A country and kind code dictionary. Default is \code{\link{cakcDict}}.} 20 | 21 | \item{docLengthTypesDict}{A document length and type dictionary. Default is \code{\link{docLengthTypesDict}}.} 22 | } 23 | \value{ 24 | A vector of characters labeling the document type, with NA for when 25 | no match was found. 26 | } 27 | \description{ 28 | Determine the type of document from the patent publication data. 29 | 30 | Often times, data exports from publicly available sources do not provide the 31 | type of patent document, or, if provided, still requires standardization. By 32 | using the kind code, country code, and pre-developed dictionaries for doc length 33 | and country code, you can get a great approximation of the types of documents. 34 | 35 | Note that you can use View(lens[lens$docType=="NA",]) to view the not-found 36 | document types. Often times, these are small countries. You can add to the 37 | \code{\link{cakcDict}} to fix these. They are also useful to ignore if you 38 | only want to focus on the larger countries, which are all covered.
39 | } 40 | \examples{ 41 | 42 | acars <- acars 43 | acars$pubNum <- extractPubNumber(acars$docNum) # pubnum, ex #### 44 | acars$countryCode <- extractCountryCode(acars$docNum) # country code, ex USAPP, USD 45 | acars$officeDocLength <- extractDocLength(countryCode = acars$countryCode, 46 | pubNum = acars$pubNum) # cc + pub num length concat 47 | acars$kindCode <- extractKindCode(acars$docNum) 48 | acars$countryAndKindCode <- with(acars, paste0(countryCode, kindCode)) 49 | 50 | acars$docType <- generateDocType(officeDocLength = acars$officeDocLength, 51 | countryAndKindCode = acars$countryAndKindCode, 52 | cakcDict = cakcDict, 53 | docLengthTypesDict = docLengthTypesDict) 54 | table(acars$docType) 55 | 56 | 57 | 58 | } 59 | \seealso{ 60 | \code{\link{cakcDict}}, \code{\link{docLengthTypesDict}} 61 | } 62 | -------------------------------------------------------------------------------- /inst/shiny/app/server.R: -------------------------------------------------------------------------------- 1 | ## yang yao 2 | ## motivation: R Shiny gallery and look at urls in ui.R 3 | library(shiny) 4 | 5 | 6 | ## kamil bojanczyk start 7 | function(input, output) { 8 | #read in file as rawdata 9 | rawData <- reactive({ 10 | inFile <- input$file1 11 | if (is.null(inFile)) 12 | return(NULL) 13 | if("csv" %in% unlist(strsplit(inFile$type,"[/]"))){ 14 | # print("reading file") 15 | read.csv(inFile$datapath, header=input$header, sep=input$sep, 16 | quote=input$quote) 17 | } else{ 18 | ext <- tools::file_ext(inFile$name) 19 | # print(paste("extention is",ext)) 20 | file.rename(inFile$datapath, 21 | paste(inFile$datapath, ext, sep=".")) 22 | readxl::read_excel(paste(inFile$datapath, ext, sep="."), 1) 23 | } 24 | }) 25 | ## kamil bojanczyk end 26 | #show raw data in a table 27 | output$contents <- renderTable({rawData()}) 28 | 29 | # clean the raw data 30 | cleanData <- eventReactive(input$cleanDataButton, { 31 | df <- rawData() 32 | if(is.null(df)) return(NULL) 33 | cleanPatentData(df, 
columnsExpected = sumobrainColumns, 34 | cleanNames = sumobrainNames, 35 | dateFields = sumobrainDateFields, 36 | dateOrders = sumobrainDateOrder, 37 | deduplicate = TRUE, 38 | cakcDict = patentr::cakcDict, 39 | docLengthTypesDict = patentr::docLengthTypesDict, 40 | keepType = "grant", 41 | firstAssigneeOnly = TRUE, 42 | assigneeSep = ";", 43 | stopWords = patentr::assigneeStopWords) 44 | }) 45 | 46 | #show the clean data in the tab 47 | output$cleanContents <- renderTable({ 48 | cleanData() 49 | }) 50 | 51 | #show the first plot 52 | output$outplot1 <- renderPlot({ 53 | df2<-cleanData() 54 | df2$assigneeSmall <- strtrim(df2$assigneeClean,12) 55 | score <- round(rnorm(dim(df2)[1],mean=1.4,sd=0.9)) 56 | score[score>3] <- 3 57 | score[score<0] <- 0 58 | df2$score <- score 59 | scoreSum <- summarizeColumns(df2, "score") 60 | ggplot(scoreSum, aes(x=score, y = total, fill=factor(score) )) + geom_bar(stat="identity") 61 | # score[score>3] <- 3 62 | # score[score<0] <- 0 63 | # df2$score <- score 64 | # df2 <- df2[score >2,] 65 | # flippedHistogram(df2, "assigneeSmall","score",colors=scoreColors) 66 | 67 | }) 68 | 69 | #download the clean data 70 | output$downloadData <- downloadHandler( 71 | filename = ('clean.csv'), 72 | content = function(file) { 73 | write.csv(cleanData(), file) 74 | } 75 | ) 76 | } 77 | 78 | 79 | 80 | ## yang yao 81 | ## motivation: R Shiny gallery and look at urls in ui.R -------------------------------------------------------------------------------- /man/facetPlot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualizePatentData.R 3 | \name{facetPlot} 4 | \alias{facetPlot} 5 | \title{Make a tiled plot} 6 | \usage{ 7 | facetPlot(df, xVal, fillVal, facetVal, colors = patentr::scoreColors, 8 | recolor = FALSE) 9 | } 10 | \arguments{ 11 | \item{df}{A data frame of the cleaned data you want to plot.} 12 | 13 | \item{xVal}{A 
character string of the x value you want for your plot, must be a 14 | name of the header in \code{df}.} 15 | 16 | \item{fillVal}{A character string of the fill value you want for your plot, must be a 17 | name of the header in \code{df}.} 18 | 19 | \item{facetVal}{A character string of the facet you want for your plot, must be a 20 | name of the header in \code{df}.} 21 | 22 | \item{colors}{A character vector of colors, the same length as the number of 23 | unique values in the column of \code{xVal[,fillVal]}. Default set to 24 | \code{scoreColors}.} 25 | 26 | \item{recolor}{A logical allowing you to choose to recolor the plot if the 27 | colors vector is not applicable to you. Default set to \code{FALSE}. Uses 28 | the helper function \code{\link{makeColors}} to generate colors. Note that your 29 | plot may fail if \code{colors} is not the same length as the number of unique 30 | values in fillVal and \code{recolor} is set to \code{FALSE}.} 31 | } 32 | \value{ 33 | A ggplot2 plot object. 34 | } 35 | \description{ 36 | Tile plot an x and y variable by facet z. 37 | 38 | Tile plots are a great way to show a dense amount of information in one 39 | plot sequence. Plotting document count by category, and plotting by assignee, 40 | is one example.
41 | } 42 | \examples{ 43 | 44 | \dontrun{ 45 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 46 | cleanNames = sumobrainNames, 47 | dateFields = sumobrainDateFields, 48 | dateOrders = sumobrainDateOrder, 49 | deduplicate = TRUE, 50 | cakcDict = patentr::cakcDict, 51 | docLengthTypesDict = patentr::docLengthTypesDict, 52 | keepType = "grant", 53 | firstAssigneeOnly = TRUE, 54 | assigneeSep = ";", 55 | stopWords = patentr::assigneeStopWords) 56 | 57 | # note that in reality, you need a patent analyst to carefully score 58 | # these patents, the score here is for demonstrational purposes 59 | score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 60 | score[score>3] <- 3; score[score<0] <- 0 61 | sumo$score <- score 62 | sumo$assigneeSmall <- strtrim(sumo$assigneeClean,12) 63 | category <- c("system","control algorithm","product","control system", "communication") 64 | c <- round(rnorm(dim(sumo)[1],mean=2.5,sd=1.5)) 65 | c[c>5] <- 5; c[c<1] <- 1 66 | sumo$category <- category[c] 67 | 68 | xVal = "category" 69 | fillVal = "score" 70 | facetVal = "assigneeSmall" 71 | 72 | facetPlot(subset(sumo, score > 0), xVal, fillVal, facetVal, colors = patentr::scoreColors, 73 | recolor = FALSE) 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /man/removeDups.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cleanPatentData.R 3 | \name{removeDups} 4 | \alias{removeDups} 5 | \title{Remove duplicate entries in a patent data set} 6 | \usage{ 7 | removeDups(input, hasDup = NA, docType = NA, keepType = "grant") 8 | } 9 | \arguments{ 10 | \item{input}{A vector or a data frame which you wish to remove duplicate values. 11 | When choosing a data frame, you are more selective. 
For example, you may want to 12 | remove a patent document only if it has the same docNum and country code.} 13 | 14 | \item{hasDup}{A logical vector noting if a duplicate exists. If NA, ignore. The 15 | \code{\link{showDups}} function helps with this input.} 16 | 17 | \item{docType}{A character vector of the type of patent document (app, grant, etc.). 18 | If NA, ignore.} 19 | 20 | \item{keepType}{A character variable denoting which document type to keep. Default is "grant". 21 | If NA, ignore.} 22 | } 23 | \value{ 24 | A logical vector used to remove duplicate documents not fitting the one 25 | chosen. TRUE is for the document to keep. 26 | } 27 | \description{ 28 | Remove duplicate values in the patent data. Typically you will 29 | want to check if you have repeat document numbers. A document number should be 30 | a unique number in your dataset, thus, having a duplicate document number in your 31 | data set should be avoided. You can optionally specify which document type to keep. 32 | 33 | Often times, your data sets contain duplicate patent entries. This function is 34 | a wrapper function of the \code{\link[base]{duplicated}} function, 35 | applied to a dataframe or vector. 36 | 37 | For example, if you have the vector [US123, US123, US456], you will get the value 38 | TRUE FALSE TRUE and the duplicate value is removed. 39 | 40 | You can go deeper with the optional variables. For many analyses, we want to exclude the 41 | second document, typically the application. This function allows you to choose 42 | which document type to keep and the rest get thrown out.
43 | } 44 | \examples{ 45 | 46 | # simple removal: see how many rows were removed 47 | dim(acars) - dim(acars[removeDups(acars$appNum),]) 48 | 49 | # specific removal: keep the grant docs 50 | hasDup <- showDups(acars$appNum) 51 | pubNum <- extractPubNumber(acars$docNum) 52 | countryCode <- extractCountryCode(acars$docNum) 53 | officeDocLength <- extractDocLength(countryCode = countryCode, pubNum = pubNum) 54 | kindCode <- extractKindCode(acars$docNum) 55 | countryAndKindCode <- paste0(countryCode, kindCode) 56 | docType <- generateDocType(officeDocLength = officeDocLength, 57 | countryAndKindCode = countryAndKindCode, 58 | cakcDict = patentr::cakcDict, 59 | docLengthTypesDict = patentr::docLengthTypesDict) 60 | keepType <- "grant" 61 | toKeep <- removeDups(acars$appNum, hasDup = hasDup, docType = docType, keepType = keepType) 62 | table(toKeep) 63 | acarsDedup <- acars[toKeep, ] 64 | 65 | 66 | } 67 | \seealso{ 68 | \code{\link[base]{duplicated}}, \code{\link{showDups}} 69 | } 70 | -------------------------------------------------------------------------------- /man/addFullImagePptx.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/reportPatentData.R 3 | \name{addFullImagePptx} 4 | \alias{addFullImagePptx} 5 | \title{Add a full-sized plot image to a pptx} 6 | \usage{ 7 | addFullImagePptx(ppt, plot, slide_layout = "Title and Content", w = 13.3, 8 | h = 7.5) 9 | } 10 | \arguments{ 11 | \item{ppt}{A ppt object to add a slide to.} 12 | 13 | \item{plot}{A plot output object from ggplto2.} 14 | 15 | \item{slide_layout}{A character value, slide layout, default value is 16 | \code{"Title and Content"}.} 17 | 18 | \item{w}{Width in inches, default set to max width 13.3} 19 | 20 | \item{h}{Height in inches, default set to max height 7.5} 21 | } 22 | \value{ 23 | a pptx object. 
24 | } 25 | \description{ 26 | Take a plot image from ggplot2 and size it to fit an entire 27 | slide. 28 | } 29 | \examples{ 30 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 31 | cleanNames = sumobrainNames, 32 | dateFields = sumobrainDateFields, 33 | dateOrders = sumobrainDateOrder, 34 | deduplicate = TRUE, 35 | cakcDict = patentr::cakcDict, 36 | docLengthTypesDict = patentr::docLengthTypesDict, 37 | keepType = "grant", 38 | firstAssigneeOnly = TRUE, 39 | assigneeSep = ";", 40 | stopWords = patentr::assigneeStopWords) 41 | 42 | # note that in reality, you need a patent analyst to carefully score 43 | # these patents, the score here is for demonstrational purposes 44 | score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 45 | score[score>3] <- 3; score[score<0] <- 0 46 | sumo$score <- score 47 | sumo$assigneeSmall <- strtrim(sumo$assigneeClean,12) 48 | category <- c("system","control algorithm","product","control system", "communication") 49 | c <- round(rnorm(dim(sumo)[1],mean=2.5,sd=1.5)) 50 | c[c>5] <- 5; c[c<1] <- 1 51 | sumo$category <- category[c] 52 | 53 | xVal = "category" 54 | fillVal = "score" 55 | facetVal = "assigneeSmall" 56 | 57 | fp <- facetPlot(subset(sumo, score > 0), xVal, fillVal, facetVal, colors = patentr::scoreColors, 58 | recolor = FALSE) 59 | 60 | 61 | 62 | # create a ppt 63 | ppt <- ReporteRs::pptx(title="IP Update") 64 | # view the types of layouts available by default 65 | # slide.layouts(ppt) 66 | layoutTitleContent = "Title and Content" 67 | 68 | fp <- facetPlot(subset(sumo, score > 0), xVal, fillVal, facetVal, colors = patentr::scoreColors, 69 | recolor = FALSE) 70 | ppt <- addFullImagePptx(ppt, plot = fp, slide_layout = layoutTitleContent) 71 | fp <- facetPlot(subset(sumo, score > 1), xVal, fillVal, facetVal, colors = patentr::scoreColors, 72 | recolor = FALSE) 73 | ppt <- addFullImagePptx(ppt, plot = fp, slide_layout = layoutTitleContent) 74 | fp <- facetPlot(subset(sumo, score > 2), xVal, 
fillVal, facetVal, colors = patentr::scoreColors, 75 | recolor = FALSE) 76 | ppt <- addFullImagePptx(ppt, plot = fp, slide_layout = layoutTitleContent) 77 | 78 | 79 | # find a data folder and write it out to your folder 80 | # out <- paste("data/",Sys.Date(),"_exampleChartRightTextLeft.pptx",sep='') 81 | # ReporteRs::writeDoc(ppt, out) 82 | 83 | 84 | } 85 | \seealso{ 86 | \code{\link{addChartRightTextLeftPptx}} 87 | } 88 | -------------------------------------------------------------------------------- /R/importPatentData.R: -------------------------------------------------------------------------------- 1 | ## kamil bojanczyk start 2 | #' Read in a data file or list of files from excel spreadsheets. 3 | #' 4 | #' @description Import, read, and connect patent data files. Currently: xls files 5 | #' from a filepath. 6 | #' Future use: can read from a URL, an xlsx file, google doc, and a csv. 7 | #' 8 | #' @param rawDataFilePath A filepath, or list of filespaths, for xls files. 9 | #' @param skipLines Number of lines to skip before reading in your data file. 10 | #' @return A single data frame of all data. NULL if no data. 11 | #' @examples \dontrun{ 12 | #' 13 | #' # access the files here and put them in a data/ folder of your working directory. 14 | #' file1 <- system.file("extdata/", "sumobrain_autonomous_search1.xlsx", package="patentr") 15 | #' file2 <- system.file("extdata/", "sumobrain_autonomous_search2.xlsx", package="patentr") 16 | #' files <- list(file1, file2) 17 | #' ipData <- importPatentData(rawDataFilePath = files, skipLines = 1) 18 | #' 19 | #' # example 2 20 | #' # assume csv files are in the data folder 21 | #' ipData <- importPatentData(rawDataFilePath = list.files('data/', full.names=T), skipLines = 1) 22 | #' } 23 | #' 24 | #' 25 | #' @export 26 | #' 27 | #' @importFrom readxl read_excel 28 | #' @importFrom plyr ldply 29 | #' 30 | importPatentData <- function(rawDataFilePath = NA, skipLines = 1){ 31 | 32 | # grep all files that end in "xls". 
This is a lazy-mans error-check. 33 | filePaths <- rawDataFilePath[grep(".*.xls",rawDataFilePath,ignore.case=T)] 34 | if (length(filePaths) == 0){ 35 | warning("Inputted filepath list: ",rawDataFilePath,"\ndoes not contain any xls files.") 36 | # exit 37 | return(NULL) 38 | } 39 | else { 40 | # use read_excel from the readxl package 41 | # note: on xls files the last column might get dropped 42 | # a fix was supposed to have worked in Feb 2017 43 | # https://github.com/tidyverse/readxl/issues/152 44 | rawData <- lapply(rawDataFilePath, readxl::read_excel, skip = skipLines) 45 | 46 | # clean the data with ldply, unlists data and creates single data frame 47 | cleanData <- plyr::ldply(rawData) 48 | print(paste("Successfull loaded in a file with",dim(cleanData)[1], "rows and",dim(cleanData)[2],"columns.")) 49 | return(cleanData) 50 | } 51 | 52 | } 53 | 54 | 55 | 56 | #' Allow the user to navigate to files manually. 57 | #' 58 | #' @description Uses a popup window (Tk file dialog) to allow the user to choose a list of zero or more files interactively. 59 | #' 60 | #' @return A list of character vectors with absolute pathnames to files. 
61 | #' 62 | #' @examples \dontrun{ 63 | #' filePaths <- chooseFiles() 64 | #' allData <- importPatentData(filePaths) 65 | #' } 66 | #' @export 67 | #' 68 | chooseFiles <- function() { 69 | ## Note: adding in @importFrom tcltk tk_choose.files breaks the build 70 | # on linux machines, it may be for the reason below (tcltk may not be installed 71 | # on some R builds) 72 | # http://r.789695.n4.nabble.com/Where-is-the-tcltk-package-td3434915.html 73 | # apparently do not need to necessarily include tcltk in Depends 74 | 75 | files <- tcltk::tk_choose.files(caption = "Select the file(s) you wish to read") 76 | files 77 | } 78 | 79 | ## kamil bojanczyk end -------------------------------------------------------------------------------- /inst/examples/edaPatentGuide.R: -------------------------------------------------------------------------------- 1 | ### simple exploratory data anaylsis guide 2 | 3 | # 1 read in data 4 | # access the files here and put them in a data/ folder of your working directory. 
5 | file1 <- system.file("extdata/", "sumobrain_autonomous_search1.xls", package="patentr") 6 | file2 <- system.file("extdata/", "sumobrain_autonomous_search2.xls", package="patentr") 7 | files <- list(file1, file2) 8 | ipData <- importPatentData(rawDataFilePath = files, skipLines = 1) 9 | 10 | 11 | # 2 clean data that was read in 12 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 13 | cleanNames = sumobrainNames, 14 | dateFields = sumobrainDateFields, 15 | dateOrders = sumobrainDateOrder, 16 | deduplicate = TRUE, 17 | cakcDict = patentr::cakcDict, 18 | docLengthTypesDict = patentr::docLengthTypesDict, 19 | keepType = "grant", 20 | firstAssigneeOnly = TRUE, 21 | assigneeSep = ";", 22 | stopWords = patentr::assigneeStopWords) 23 | View(sumo) 24 | 25 | 26 | # 3 explore data 27 | # note that in reality, you need a patent analyst to carefully score 28 | # these patents, the score here is for demonstrational purposes 29 | score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 30 | score[score>3] <- 3 31 | score[score<0] <- 0 32 | sumo$score <- score 33 | scoreSum <- summarizeColumns(sumo, "score") 34 | scoreSum 35 | # load library(ggplot2) for the below part to run 36 | # ggplot(scoreSum, aes(x=score, y = total, fill=factor(score) )) + geom_bar(stat="identity") 37 | nameAndScore <- summarizeColumns(sumo, c("assigneeClean","score")) 38 | tail(nameAndScore) 39 | names(sumo) 40 | tail(summarizeColumns(sumo, c("docType","score","countryCode"))) 41 | 42 | # 4 visualize 43 | ## 4-1 histogram 44 | sumo$assigneeSmall <- strtrim(sumo$assigneeClean,12) 45 | flippedHistogram(sumo, "assigneeSmall","score",colors=scoreColors) 46 | 47 | flippedHistogram(subset(sumo, score > 0), "assigneeSmall","score",colors=scoreColors) 48 | 49 | flippedHistogram(subset(sumo, score > 2) ,"assigneeSmall","docType",colors=scoreColors, 50 | recolor = TRUE) 51 | 52 | ## 4-2 facet plot 53 | category <- c("system","control algorithm","product","control system", 
"communication") 54 | c <- round(rnorm(dim(sumo)[1],mean=2.5,sd=1.5)) 55 | c[c>5] <- 5; c[c<1] <- 1 56 | sumo$category <- category[c] 57 | 58 | xVal = "category" 59 | fillVal = "score" 60 | facetVal = "assigneeSmall" 61 | 62 | # warning, if xVal has more than 10 unique vals, it is hard to see 63 | facetPlot(subset(sumo, score > 1), xVal, fillVal, facetVal, colors = patentr::scoreColors, 64 | recolor = FALSE) 65 | 66 | 67 | ## 4-3 tile plots 68 | feature1 <- c("adaptive", "park", "lane", NA,NA,NA,NA,NA, 69 | "brake", "steer","accelerate","deactivate") 70 | f <- round(rnorm(dim(sumo)[1],mean=5,sd=1)) 71 | l <- length(feature1) 72 | f[f>l] <- l; f[f<1] <- 1 73 | sumo$feature1 <- c(feature1,feature1[f])[1:dim(sumo)[1]] 74 | 75 | tilePlot(sumo, "category", "feature1") 76 | 77 | tilePlot(sumo, xVal = "assigneeSmall", tileVal = "feature1", fillVal = "category", 78 | xangle=90, xhjust=0, showLegend = TRUE) 79 | 80 | tilePlot(sumo, xVal = "assigneeSmall", tileVal = "feature1", fillVal = "category", 81 | xangle=90, xhjust=0, showLegend = TRUE, facetVal = "docType", fscale = "fixed") 82 | 83 | tilePlot(sumo, xVal = "assigneeSmall", tileVal = "feature1", fillVal = "category", 84 | xangle=90, xhjust=0, showLegend = TRUE, facetVal = "docType", fscale = "free") 85 | 86 | tilePlot(sumo, xVal = "assigneeSmall", tileVal = "feature1", fillVal = "category", 87 | xangle=90, xhjust=0, showLegend = TRUE, facetVal = "score", fscale = "free") 88 | -------------------------------------------------------------------------------- /man/tilePlot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualizePatentData.R 3 | \name{tilePlot} 4 | \alias{tilePlot} 5 | \title{Make a facet tile plot to view two features.} 6 | \usage{ 7 | tilePlot(df, xVal, tileVal, fillVal = NA, xangle = 0, xhjust = 0.5, 8 | showLegend = FALSE, facetVal = NA, fscale = c("free", "fixed")) 9 | } 10 | 
\arguments{ 11 | \item{df}{The patent data frame you want to graph.} 12 | 13 | \item{xVal}{The x value you will be plotting, a character value that is a 14 | name of \code{df}.} 15 | 16 | \item{tileVal}{The tile value you will be plotting, a character value that is a 17 | name of \code{df}.} 18 | 19 | \item{fillVal}{An optional value for filling the color of the tiles on a third 20 | variable. Default set to \code{NA} and evaluates to \code{xVal}.} 21 | 22 | \item{xangle}{A numeric 0 to 360 value for the angle of the x axis text} 23 | 24 | \item{xhjust}{Double value between 0 and 1. 0 Means left justified, 1 means right justified, 25 | default set to 0.5 (middle), for the x axis text.} 26 | 27 | \item{showLegend}{A logical to allow you to show or hide the legend, which is 28 | mapped to the fillVal} 29 | 30 | \item{facetVal}{Optional faceting. 31 | A character string of the facet you want for your plot, must be a 32 | name of the header in \code{df}. Default set to \code{NA}.} 33 | 34 | \item{fscale}{Facet scale, a character value chosen from \code{c("free","fixed")}. 35 | Default set to \code{fixed}. It changes the y axis to adjust to each facet 36 | and drop unused y (tile) values or keeps them all constant.} 37 | } 38 | \value{ 39 | A ggplot2 facet plot object. 40 | } 41 | \description{ 42 | Scan for patent market gaps. 43 | Visualize the features of a set of patents by a category. Can view up to 44 | four dimensions of data with this plot (x, y, and optionals fill and facet). 45 | 46 | Quickly scan this chart to look for gaps in the feature sets. 
47 | } 48 | \examples{ 49 | 50 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 51 | cleanNames = sumobrainNames, 52 | dateFields = sumobrainDateFields, 53 | dateOrders = sumobrainDateOrder, 54 | deduplicate = TRUE, 55 | cakcDict = patentr::cakcDict, 56 | docLengthTypesDict = patentr::docLengthTypesDict, 57 | keepType = "grant", 58 | firstAssigneeOnly = TRUE, 59 | assigneeSep = ";", 60 | stopWords = patentr::assigneeStopWords) 61 | 62 | # note that in reality, you need a patent analyst to carefully score 63 | # these patents, the score here is for demonstrational purposes 64 | score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 65 | score[score>3] <- 3; score[score<0] <- 0 66 | sumo$score <- score 67 | sumo$assigneeSmall <- strtrim(sumo$assigneeClean,12) 68 | category <- c("system","control algorithm","product","control system", "communication") 69 | c <- round(rnorm(dim(sumo)[1],mean=2.5,sd=1.5)) 70 | c[c>5] <- 5; c[c<1] <- 1 71 | sumo$category <- category[c] 72 | feature1 <- c("adaptive", "park", "lane", NA,NA,NA,NA,NA, 73 | "brake", "steer","accelerate","deactivate") 74 | f <- round(rnorm(dim(sumo)[1],mean=5,sd=1)) 75 | l <- length(feature1) 76 | f[f>l] <- l; f[f<1] <- 1 77 | sumo$feature1 <- c(feature1,feature1[f])[1:dim(sumo)[1]] 78 | 79 | tilePlot(sumo, "category", "feature1") 80 | 81 | tilePlot(sumo, xVal = "assigneeSmall", tileVal = "feature1", fillVal = "category", 82 | xangle=90, xhjust=0, showLegend = TRUE) 83 | 84 | tilePlot(sumo, xVal = "assigneeSmall", tileVal = "feature1", fillVal = "category", 85 | xangle=90, xhjust=0, showLegend = TRUE, facetVal = "docType", fscale = "fixed") 86 | 87 | tilePlot(sumo, xVal = "assigneeSmall", tileVal = "feature1", fillVal = "category", 88 | xangle=90, xhjust=0, showLegend = TRUE, facetVal = "docType", fscale = "free") 89 | 90 | tilePlot(sumo, xVal = "assigneeSmall", tileVal = "feature1", fillVal = "category", 91 | xangle=90, xhjust=0, showLegend = TRUE, facetVal = "score", 
fscale = "free") 92 | 93 | } 94 | -------------------------------------------------------------------------------- /man/kindCodes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{kindCodes} 5 | \alias{kindCodes} 6 | \title{A kind codes database to show the type of document for each patent document.} 7 | \format{A data frame. 8 | 9 | \describe{ 10 | \item{\code{countryCode}}{The country code for the originating office where the application 11 | was filed.} 12 | \item{kindCode}{The letter/number code to signify the type of document. Codes may 13 | change after a certain date, so pay attention to \code{dateStarted} and \code{dateDeprecated}} 14 | \item{isDeprecated}{Logical TRUE/FALSE if the kind code for the country is no longer in use.} 15 | \item{dateDeprecated}{The date the kind code stopped being in use.} 16 | \item{isNew}{If the kind code is a replacement for a former kind code, TRUE, else FALSE.} 17 | \item{dateStarted}{If isNew == TRUE, the date the new kind code began being used.} 18 | \item{comment}{Additional information explaining the details of the kind code.} 19 | \item{docTypeLong}{The long version of the document type.} 20 | \item{docType}{A shorter, standardized version of \code{docTypeLong}.} 21 | \item{expectDuplicate}{A logical TRUE/FALSE to help the analyst understand if the 22 | published document is expected to have a duplicate publication. For example, USB2 is 23 | a granted patent that has an application that was also published, whereas USB1 has no 24 | previous documents published. This helps speed up the deduplication process. } 25 | \item{countryAndKindCode}{A concatenation of country code and kind code. 
Used in 26 | the deduplication process and to determine the type of document.} 27 | }} 28 | \usage{ 29 | kindCodes 30 | } 31 | \description{ 32 | Patent documents have associated kind codes, which are letter/number code 33 | combinations that signify the type of document, such as application, granted 34 | patent, utility patent, etc. These kind codes vary by country and are a useful 35 | approach to classifying patent document types. Most, however, not all, downloaded 36 | data from free services such as sumobrain.com or lens.org includes the kind code 37 | at the end of the patent document number. 38 | } 39 | \details{ 40 | For example, from the sumobrain.com download from the \code{\link{acars}} data set, 41 | here are three documents: 42 | \enumerate{ 43 | \item{US6523912} 44 | \item{US20030060197} 45 | \item{EP1310400A1} 46 | } 47 | 48 | The first two items are missing kind codes. The third item has kind code A1 49 | and the country code is EP. 50 | 51 | To clean the data yourself: 52 | 53 | \code{temp <- readxl::read_excel(system.file("extdata", "kindCodes.xlsx", package = "patentr"))} 54 | 55 | \code{temp <- replace(temp, is.na(temp), "NA")} 56 | 57 | \code{temp$dateDeprecated <- as.numeric(temp$dateDeprecated)} 58 | 59 | \code{temp$dateDeprecated <- as.Date(temp$dateDeprecated, origin = "1899-12-30")} 60 | 61 | \code{temp$dateStarted <- as.numeric(temp$dateStarted)} 62 | 63 | \code{temp$dateStarted <- as.Date(temp$dateStarted, origin = "1899-12-30")} 64 | 65 | \code{temp$countryAndKindCode <- with(temp,paste0(countryCode, kindCode))} 66 | 67 | See \url{https://www.r-bloggers.com/date-formats-in-r/} for excel mac/windows 68 | and confirm this origin works for you by reviewing the source file. 
69 | 70 | View the data sources: 71 | \enumerate{ 72 | \item{\href{https://www.uspto.gov/learning-and-resources/support-centers/electronic-business-center/kind-codes-included-uspto-patent}{USPTO kind codes}} 73 | \item{\href{https://www.cas.org/content/references/patkind}{CAS list of kind codes}} 74 | \item{\href{http://ipbookcompanion.org/links/pk_codes.pdf}{IP Book kind codes}} 75 | \item{\href{http://www.thomsonfilehistories.com/docs/RESOURCES_Kind\%20Codes\%20by\%20Country.pdf}{Thomson File Histories}} 76 | } 77 | For esp@cenet at the EPO, try the following link: 78 | \code{https://worldwide.espacenet.com/help?locale=en_EP&method=} 79 | \code{handleHelpTopic&topic=kindcodes\%5C} 80 | } 81 | \keyword{data} 82 | -------------------------------------------------------------------------------- /man/acarsLens.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/acars.R 3 | \docType{data} 4 | \name{acarsLens} 5 | \alias{acarsLens} 6 | \title{Autonomous Vehicle Patent Data from Lens Patent Search} 7 | \format{A data frame with 863 observations on 26 variables. 8 | \describe{ 9 | \item{resultNum}{The search result number.} 10 | \item{countryCode}{The jurisdiction of the patent document.} 11 | \item{\code{kindCode}}{The kind code.} 12 | \item{docNum}{The published document number with country code and kind code included.} 13 | \item{lensID}{The unique identification number of the document on lens.org} 14 | \item{pubDate}{Date the document was published.} 15 | \item{pubYear}{Year the document published.} 16 | \item{appNum}{The filing number of the application (country code, number, and abridged kind code, typically 'A')} 17 | \item{dateFiled}{Date the application for the patent document was filed.} 18 | \item{priorityApps}{Applications this patent document claims priority. 
19 | Format: Country code, application number, A = application or P = provisional, YYYYMMDD of priority. 20 | Multiple applications separated by a double semi-colon.} 21 | \item{title}{The title of the document.} 22 | \item{assignee}{The name of the applicant(s) at the time of filing.} 23 | \item{inventors}{The inventor(s).} 24 | \item{lensURL}{The lens.org URL for the document.} 25 | \item{docTypeLens}{A lens.org mapping of the doc type. 26 | Granted, application, ambiguous, unknown, search report, and possibly more values.} 27 | \item{hasFullText}{A logical value to show if there is a full text available from lens.org} 28 | \item{citeCount}{The number of times this document is cited, also known as forward citations.} 29 | \item{familySimpleCount}{The number of unique documents in the immediate patent family.} 30 | \item{familyExtendedCount}{The number of unique documents sharing a priority application in the extended family.} 31 | \item{seqCount}{Used in biological applications -- the number of sequences in the application.} 32 | \item{cpcClasses}{The CPC classification codes, separated by a double semi-colon.} 33 | \item{ipcrClasses}{The IPCR classification codes, separated by a double semi-colon.} 34 | \item{usClasses}{The US classification codes, separated by a double semi-colon.} 35 | \item{pubmedID}{A pubmed ID to any related research.} 36 | \item{DOI}{A digital object identifier. 37 | Go to doi.org and paste the value to get the associated research paper.} 38 | \item{npl}{Non-patent literature, or citations of non-patent sources. 39 | Separated with double semi-colons.} 40 | 41 | 42 | }} 43 | \usage{ 44 | acarsLens 45 | } 46 | \description{ 47 | An example data set of autonomous vehicle IP from major assignees.
48 | } 49 | \details{ 50 | The data search was performed on Saturday, March 18, 2017 from lens.org, and the exact 51 | search: 52 | 53 | \href{https://www.lens.org/lens/search?q=abstract\%3Aautonomous+\%26\%26+applicant\%3A\%28Apple*+OR+Google*+OR+Waymo*+OR+Tesla*+OR+Ford*+OR+General*\%29&predicate=\%26\%26&l=en}{Lens Patents Search} 54 | 55 | For all countries available on Lens. 56 | 57 | Can get raw data with the following commands: 58 | 59 | \code{temp <- system.file("extdata", "lens_autonomous_search.csv", package = "patentr")} 60 | 61 | \code{temp <- read.csv(temp, stringsAsFactors = FALSE)} 62 | 63 | \code{temp <- data.frame(lapply(temp, function(x){iconv(x,to="ASCII")}),stringsAsFactors = FALSE)} 64 | 65 | \code{names(temp) <- lensNames} 66 | 67 | \code{temp$dateFiled <- as.Date(temp$dateFiled, format = '\%m/\%d/\%y')} 68 | 69 | \code{temp$pubDate <- as.Date(temp$pubDate, format='\%m/\%d/\%y')} # note that \%y is system-specific and may not work everywhere. 70 | 71 | \code{colsNum <- c("resultNum","citeCount","familySimpleCount","familyExtendedCount", "seqCount")} 72 | 73 | \code{temp[colsNum] <- sapply(temp[colsNum], as.numeric)} 74 | 75 | \code{temp$hasFullText <- sapply(temp$hasFullText, function(x) ifelse(x=="yes",TRUE,FALSE))} 76 | } 77 | \seealso{ 78 | \url{www.lens.org} You can export without an account, or can create 79 | an account to save your searches. 80 | 81 | \code{\link{acarsGoogle}} provides a similar search from Google. 82 | \code{\link{acars}} provides a similar search from sumobrain.
83 | } 84 | \keyword{data} 85 | -------------------------------------------------------------------------------- /man/cleanPatentData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cleanPatentData.R 3 | \name{cleanPatentData} 4 | \alias{cleanPatentData} 5 | \title{Generate a clean data set from the imported raw data.} 6 | \usage{ 7 | cleanPatentData(patentData = NULL, columnsExpected, cleanNames, 8 | dateFields = NA, dateOrders, deduplicate = TRUE, 9 | cakcDict = patentr::cakcDict, 10 | docLengthTypesDict = patentr::docLengthTypesDict, keepType = "grant", 11 | firstAssigneeOnly = TRUE, assigneeSep = ";", 12 | stopWords = patentr::assigneeStopWords) 13 | } 14 | \arguments{ 15 | \item{patentData}{The data frame of initial raw patent data.} 16 | 17 | \item{columnsExpected}{The expected width of the data frame, numeric.} 18 | 19 | \item{cleanNames}{A character vector of length columnsExpected to rename the 20 | data frame with.} 21 | 22 | \item{dateFields}{A character vector of the date column names which will be 23 | converted to `Date` format.} 24 | 25 | \item{dateOrders}{A character string of the format required to convert string 26 | data into `Date` data. Sumobrain is "ymd" and lens and Google data are "mdy". 27 | Hardcoded values include \code{\link{googleDateOrder}},\code{\link{lensDateOrder}}, 28 | and \code{\link{sumobrainDateOrder}}.} 29 | 30 | \item{deduplicate}{A logical, default set to TRUE, if you want to deduplicate 31 | any patent documents that have both an app and a grant.} 32 | 33 | \item{cakcDict}{A country and kind code dictionary. Default is \code{\link{cakcDict}}.} 34 | 35 | \item{docLengthTypesDict}{A document length and type dictionary. Default is \code{\link{docLengthTypesDict}}.} 36 | 37 | \item{keepType}{A character variable denoting which document type to keep. Default is "grant".
38 | If NA, ignore.} 39 | 40 | \item{firstAssigneeOnly}{For cleaning names, use the first assignee only, default TRUE.} 41 | 42 | \item{assigneeSep}{The separation character if there is more than one assignee. Default 43 | is ";" semicolon.} 44 | 45 | \item{stopWords}{The stopword list to remove from assignee names. Default is 46 | \code{\link{assigneeStopWords}}.} 47 | } 48 | \value{ 49 | A data frame of tidy patent data. 50 | } 51 | \description{ 52 | Generate a clean data set from the imported raw data set. The 53 | data available dictates the number of columns of attributes that can be 54 | generated. 55 | 56 | Sumobrain, Lens.org, and Google Patents have varying levels of data available. 57 | 58 | If you import your own data, be sure to adhere to the template format, or 59 | read carefully to create your own. 60 | } 61 | \examples{ 62 | 63 | 64 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 65 | cleanNames = sumobrainNames, 66 | dateFields = sumobrainDateFields, 67 | dateOrders = sumobrainDateOrder, 68 | deduplicate = TRUE, 69 | cakcDict = patentr::cakcDict, 70 | docLengthTypesDict = patentr::docLengthTypesDict, 71 | keepType = "grant", 72 | firstAssigneeOnly = TRUE, 73 | assigneeSep = ";", 74 | stopWords = patentr::assigneeStopWords) 75 | 76 | # use a fresh Google export csv 77 | # in a new csv download, however, it would not be the case 78 | 79 | 80 | rawGoogleData <- system.file("extdata", "google_autonomous_search.csv", 81 | package = "patentr") 82 | rawGoogleData <- read.csv(rawGoogleData, 83 | skip = skipGoogle, stringsAsFactors = FALSE) 84 | rawGoogleData <- data.frame(lapply(rawGoogleData, 85 | function(x){iconv(x, to = "ASCII")}), stringsAsFactors = FALSE) 86 | google <- cleanPatentData(patentData = rawGoogleData, columnsExpected = googleColumns, 87 | cleanNames = googleNames, 88 | dateFields = googleDateFields, 89 | dateOrders = googleDateOrder, 90 | deduplicate = TRUE, 91 | cakcDict = patentr::cakcDict, 92 
| docLengthTypesDict = patentr::docLengthTypesDict, 93 | keepType = "grant", 94 | firstAssigneeOnly = TRUE, 95 | assigneeSep = ",", 96 | stopWords = patentr::assigneeStopWords) 97 | 98 | 99 | lensRawData <- system.file("extdata", "lens_autonomous_search.csv", 100 | package = "patentr") 101 | lensRawData <- read.csv(lensRawData, stringsAsFactors = FALSE, skip = skipLens) 102 | lensRawData <- data.frame(lapply(lensRawData, 103 | function(x){iconv(x, to = "ASCII")}), stringsAsFactors = FALSE) 104 | lens <- cleanPatentData(patentData = lensRawData, columnsExpected = lensColumns, 105 | cleanNames = lensNames, 106 | dateFields = lensDateFields, 107 | dateOrders = lensDateOrder, 108 | deduplicate = TRUE, 109 | cakcDict = patentr::cakcDict, 110 | docLengthTypesDict = patentr::docLengthTypesDict, 111 | keepType = "grant", 112 | firstAssigneeOnly = TRUE, 113 | assigneeSep = ";;", 114 | stopWords = patentr::assigneeStopWords) 115 | 116 | } 117 | \seealso{ 118 | For data formats: \code{\link{acars}} for Sumobrain, 119 | \code{\link{acarsGoogle}} for Google Patents data, and \code{\link{acarsLens}} 120 | for Lens.org data. 
121 | } 122 | -------------------------------------------------------------------------------- /tests/testthat/test-process.R: -------------------------------------------------------------------------------- 1 | # used with processPatentData.R 2 | 3 | 4 | 5 | 6 | # sumobrain data 7 | test_that("Sumobrain data has clean google URLS.",{ 8 | df <- importPatentData(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), skipLines = skipSumobrain) 9 | df <- cleanPatentData(patentData = df, columnsExpected = sumobrainColumns, 10 | cleanNames = sumobrainNames, dateFields = sumobrainDateFields, 11 | dateOrders = sumobrainDateOrder, deduplicate = TRUE, 12 | cakcDict = cakcDict, docLengthTypesDict = docLengthTypesDict, 13 | keepType = "grant",firstAssigneeOnly = TRUE, assigneeSep = ";", 14 | stopWords = assigneeStopWords) 15 | kc <- extractKindCode(df$docNum) 16 | pn <- extractPubNumber(df$docNum) 17 | cc <- extractCountryCode(df$docNum) 18 | gurl <- createGoogleURL(countryCode = cc, pubNum = pn, kindCode = kc) 19 | expect_equal(length(gurl) ,dim(df)[1]) 20 | }) 21 | 22 | 23 | 24 | test_that("Google data has clean google URLS.",{ 25 | df <- read.csv(rprojroot::find_testthat_root_file("testData","google_autonomous_search.csv"), 26 | skip = skipGoogle, stringsAsFactors = FALSE) 27 | df <- data.frame(lapply(df,function(x){iconv(x, to = "ASCII")}), stringsAsFactors = FALSE) 28 | 29 | df <- cleanPatentData(patentData = df, columnsExpected = googleColumns, 30 | cleanNames = googleNames, dateFields = googleDateFields, 31 | dateOrders = googleDateOrder, deduplicate = TRUE, 32 | cakcDict = cakcDict, docLengthTypesDict = docLengthTypesDict, 33 | keepType = "grant",firstAssigneeOnly = TRUE, assigneeSep = ",", 34 | stopWords = assigneeStopWords) 35 | kc <- extractKindCode(df$docNum) 36 | pn <- extractPubNumber(df$docNum) 37 | cc <- extractCountryCode(df$docNum) 38 | gurl <- createGoogleURL(countryCode = cc, pubNum = pn, kindCode = kc) 39 | 
expect_equal(length(gurl) ,dim(df)[1]) 40 | }) 41 | 42 | 43 | test_that("Lens.org patent data has clean google URLS.",{ 44 | df <- read.csv(rprojroot::find_testthat_root_file("testData","lens_autonomous_search.csv"), 45 | skip = skipLens, stringsAsFactors = FALSE) 46 | df <- data.frame(lapply(df,function(x){iconv(x, to = "ASCII")}), stringsAsFactors = FALSE) 47 | 48 | df <- cleanPatentData(patentData = df, columnsExpected = lensColumns, 49 | cleanNames = lensNames, dateFields = lensDateFields, 50 | dateOrders = lensDateOrder, deduplicate = TRUE, 51 | cakcDict = cakcDict, docLengthTypesDict = docLengthTypesDict, 52 | keepType = "grant",firstAssigneeOnly = TRUE, assigneeSep = ";;", 53 | stopWords = assigneeStopWords) 54 | kc <- extractKindCode(df$docNum) 55 | pn <- extractPubNumber(df$docNum) 56 | cc <- extractCountryCode(df$docNum) 57 | gurl <- createGoogleURL(countryCode = cc, pubNum = pn, kindCode = kc) 58 | expect_equal(length(gurl) ,dim(df)[1]) 59 | }) 60 | 61 | 62 | test_that("getClaimFromURL returns character value.",{ 63 | aclaim <- getClaimFromURL("https://patents.google.com/patent/US8818682B1/en") 64 | expect_is(aclaim ,"character") 65 | }) 66 | 67 | test_that("getClaimFromURL for an old patent should return blank.",{ 68 | anOldClaim <- getClaimFromURL("https://patents.google.com/patent/US881/en") 69 | expect_is(anOldClaim ,"character") 70 | }) 71 | 72 | test_that("getClaimFromURL from a bad (well-formatted, 404 error) URL should return blank.",{ 73 | aBadURLClaim <- getClaimFromURL("https://patents.google.com/patent/USsss881/en") 74 | expect_is(aBadURLClaim ,"character") 75 | }) 76 | 77 | 78 | test_that("cleanGoogleURL from /mx returns character.",{ 79 | expect_is(cleanGoogleURL("https://patents.google.com/patent/US8818682B1/mx") ,"character") 80 | }) 81 | 82 | test_that("cleanGoogleURL from / returns character.",{ 83 | expect_is(cleanGoogleURL("https://patents.google.com/patent/US8818682B1/") ,"character") 84 | }) 85 | 86 | 87 | test_that("cleanGoogleURL 
from no backslash returns character.",{ 88 | expect_is(cleanGoogleURL("https://patents.google.com/patent/US8818682B1") ,"character") 89 | }) 90 | 91 | test_that("cleanGoogleURL from /en returns character.",{ 92 | expect_is(cleanGoogleURL("https://patents.google.com/patent/US8818682B1/en") ,"character") 93 | }) 94 | 95 | 96 | 97 | test_that("getClaimFromURL should return a character of length 1.",{ 98 | krclaim <- getClaimFromURL("https://patents.google.com/patent/KR20150127745A/en") 99 | expect_equal(length(krclaim), 1) 100 | expect_is(krclaim, "character") 101 | }) 102 | 103 | 104 | test_that("getClaimsText reads in 3 urls and returns a character vector of length 3.",{ 105 | df <- importPatentData(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), skipLines = skipSumobrain) 106 | df <- cleanPatentData(patentData = df, columnsExpected = sumobrainColumns, 107 | cleanNames = sumobrainNames, dateFields = sumobrainDateFields, 108 | dateOrders = sumobrainDateOrder, deduplicate = TRUE, 109 | cakcDict = cakcDict, docLengthTypesDict = docLengthTypesDict, 110 | keepType = "grant",firstAssigneeOnly = TRUE, assigneeSep = ";", 111 | stopWords = assigneeStopWords) 112 | urls <- df$googleURL[1:3] 113 | clms <- getClaimsText(urls) 114 | expect_equal(length(clms), 3) 115 | expect_is(urls, "character") 116 | }) 117 | 118 | -------------------------------------------------------------------------------- /tests/testthat/test-graphics.R: -------------------------------------------------------------------------------- 1 | # test graphics 2 | ## kamil bojanczyk start 3 | # svg("tests/testthat/testData/sb0.svg") 4 | # flippedHistogram(subset(sumo, score > 0), "assigneeSmall","score",colors=scoreColors) 5 | # dev.off() 6 | # graphics are the same 7 | test_that("Sumobrain flipped histogram outputs a plot.",{ 8 | 9 | file1 <- rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx") 10 | file2 <- 
rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search2.xlsx") 11 | files <- list(file1, file2) 12 | df <- importPatentData(rawDataFilePath = files, skipLines = skipSumobrain) 13 | df <- cleanPatentData(patentData = df, columnsExpected = sumobrainColumns, 14 | cleanNames = sumobrainNames, dateFields = sumobrainDateFields, 15 | dateOrders = sumobrainDateOrder, deduplicate = TRUE, 16 | cakcDict = cakcDict, docLengthTypesDict = docLengthTypesDict, 17 | keepType = "grant",firstAssigneeOnly = TRUE, assigneeSep = ";", 18 | stopWords = assigneeStopWords) 19 | 20 | 21 | # https://github.com/hadley/evaluate/blob/master/tests/testthat/ggplot-loop.r 22 | df$assigneeSmall <- strtrim(df$assigneeClean,12) 23 | score <- round(rnorm(dim(df)[1],mean=1.4,sd=0.9)) 24 | score[score>3] <- 3 25 | score[score<0] <- 0 26 | df$score <- score 27 | aPlot <- flippedHistogram(df, "assigneeSmall","score") 28 | 29 | # taken from hadley's ggplot2 tests 30 | # https://github.com/tidyverse/ggplot2/blob/master/tests/testthat/test-geom-hex.R 31 | out <- layer_data(aPlot) 32 | 33 | temp <- summarizeColumns(df, c("assigneeSmall","score")) 34 | expect_equal(nrow(out), nrow(temp)) 35 | expect_equal(sort(out$count), temp$total) 36 | expect_is(aPlot, c("gg","ggplot")) 37 | 38 | # Note: I read a SO post saying it is not wise to test svg outputs against 39 | # current plots, thus, the exact plot is not compared to an svg file 40 | }) 41 | 42 | 43 | test_that("facetPlot plots makes a plot object",{ 44 | 45 | file1 <- rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx") 46 | file2 <- rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search2.xlsx") 47 | files <- list(file1, file2) 48 | df <- importPatentData(rawDataFilePath = files, skipLines = skipSumobrain) 49 | df <- cleanPatentData(patentData = df, columnsExpected = sumobrainColumns, 50 | cleanNames = sumobrainNames, dateFields = sumobrainDateFields, 51 | dateOrders = sumobrainDateOrder, 
deduplicate = TRUE, 52 | cakcDict = cakcDict, docLengthTypesDict = docLengthTypesDict, 53 | keepType = "grant",firstAssigneeOnly = TRUE, assigneeSep = ";", 54 | stopWords = assigneeStopWords) 55 | 56 | 57 | # https://github.com/hadley/evaluate/blob/master/tests/testthat/ggplot-loop.r 58 | df$assigneeSmall <- strtrim(df$assigneeClean,12) 59 | score <- round(rnorm(dim(df)[1],mean=1.4,sd=0.9)) 60 | score[score>3] <- 3 61 | score[score<0] <- 0 62 | df$score <- score 63 | category <- c("system","control algorithm","product","control system", "communication") 64 | c <- round(rnorm(dim(df)[1],mean=2.5,sd=1.5)) 65 | c[c>5] <- 5; c[c<1] <- 1 66 | df$category <- category[c] 67 | 68 | xVal = "category" 69 | fillVal = "score" 70 | facetVal = "assigneeSmall" 71 | 72 | aPlot <- facetPlot(subset(df, score > 0), xVal, fillVal, facetVal, colors = patentr::scoreColors, 73 | recolor = FALSE) 74 | 75 | 76 | # taken from hadley's ggplot2 tests 77 | # https://github.com/tidyverse/ggplot2/blob/master/tests/testthat/test-geom-hex.R 78 | out <- layer_data(aPlot) 79 | expect_is(aPlot, c("gg","ggplot")) 80 | 81 | }) 82 | 83 | 84 | 85 | 86 | test_that("tilePlot plots makes a plot object",{ 87 | 88 | file1 <- rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx") 89 | file2 <- rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search2.xlsx") 90 | files <- list(file1, file2) 91 | df <- importPatentData(rawDataFilePath = files, skipLines = skipSumobrain) 92 | df <- cleanPatentData(patentData = df, columnsExpected = sumobrainColumns, 93 | cleanNames = sumobrainNames, dateFields = sumobrainDateFields, 94 | dateOrders = sumobrainDateOrder, deduplicate = TRUE, 95 | cakcDict = cakcDict, docLengthTypesDict = docLengthTypesDict, 96 | keepType = "grant",firstAssigneeOnly = TRUE, assigneeSep = ";", 97 | stopWords = assigneeStopWords) 98 | 99 | 100 | # https://github.com/hadley/evaluate/blob/master/tests/testthat/ggplot-loop.r 101 | df$assigneeSmall <- 
strtrim(df$assigneeClean,12) 102 | score <- round(rnorm(dim(df)[1],mean=1.4,sd=0.9)) 103 | score[score>3] <- 3 104 | score[score<0] <- 0 105 | df$score <- score 106 | category <- c("system","control algorithm","product","control system", "communication") 107 | c <- round(rnorm(dim(df)[1],mean=2.5,sd=1.5)) 108 | c[c>5] <- 5; c[c<1] <- 1 109 | df$category <- category[c] 110 | 111 | category <- c("system","control algorithm","product","control system", "communication") 112 | c <- round(rnorm(dim(df)[1],mean=2.5,sd=1.5)) 113 | c[c>5] <- 5; c[c<1] <- 1 114 | df$category <- category[c] 115 | feature1 <- c("adaptive", "park", "lane", NA,NA,NA,NA,NA, "brake", "steer","accelerate","deactivate") 116 | f <- round(rnorm(dim(df)[1],mean=5,sd=1)) 117 | l <- length(feature1) 118 | f[f>l] <- l; f[f<1] <- 1 119 | df$feature1 <- c(feature1,feature1[f])[1:dim(df)[1]] 120 | 121 | aPlot <- tilePlot(df, "category", "feature1") 122 | 123 | # taken from hadley's ggplot2 tests 124 | # https://github.com/tidyverse/ggplot2/blob/master/tests/testthat/test-geom-hex.R 125 | out <- layer_data(aPlot) 126 | expect_is(aPlot, c("gg","ggplot")) 127 | 128 | }) 129 | 130 | 131 | 132 | ## kamil bojanczyk end -------------------------------------------------------------------------------- /man/addChartRightTextLeftPptx.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/reportPatentData.R 3 | \name{addChartRightTextLeftPptx} 4 | \alias{addChartRightTextLeftPptx} 5 | \title{Add a PPTX slide with chart on the right and text on the left} 6 | \usage{ 7 | addChartRightTextLeftPptx(ppt, plot, text, title, 8 | slide_layout = "Title and Content", Poffx = 5.3, Poffy = 0, 9 | Pwidth = 8, Pheight = 7.5, Toffx = 1, Toffy = 2, Twidth = 5, 10 | Theight = 5.5) 11 | } 12 | \arguments{ 13 | \item{ppt}{A ppt object.} 14 | 15 | \item{plot}{A plot object from ggplot2.} 16 | 17 | \item{text}{A character 
vector of text, typically less than one paragraph 18 | in size.} 19 | 20 | \item{title}{A character title for a page. Default is NULL} 21 | 22 | \item{slide_layout}{The name of a slide layout, the same name as the names in a .potx 23 | powerpoint template file. Default is a Title and Content blank layout.} 24 | 25 | \item{Poffx}{Plot image x position from left top, inches. 26 | See \code{\link[ReporteRs]{addPlot}}. Default is 5.3.} 27 | 28 | \item{Poffy}{Plot image y position from left top, inches. 29 | See \code{\link[ReporteRs]{addPlot}}. Default is 0.} 30 | 31 | \item{Pwidth}{Plot image width, inches. 32 | See \code{\link[ReporteRs]{addPlot}}. Default is 8.} 33 | 34 | \item{Pheight}{Plot image height, inches. 35 | See \code{\link[ReporteRs]{addPlot}}. Default is 7.5} 36 | 37 | \item{Toffx}{Text image x position from left top, inches. 38 | See \code{\link[ReporteRs]{addPlot}}. Default is 1.} 39 | 40 | \item{Toffy}{Text image y position from left top, inches. 41 | See \code{\link[ReporteRs]{addPlot}}. Default is 2.} 42 | 43 | \item{Twidth}{Text image width, inches. 44 | See \code{\link[ReporteRs]{addPlot}}. Default is 5.} 45 | 46 | \item{Theight}{Text image height, inches. 47 | See \code{\link[ReporteRs]{addPlot}}. Default is 5.5.} 48 | } 49 | \description{ 50 | Generate a commonly-used PPTX slide format where the patent 51 | chart is on the right and some text is on the left. 52 | 53 | This function automates a number of steps used in formatting a pptx slide. 54 | It returns the ppt object with the new slide included. 
55 | } 56 | \examples{ 57 | 58 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 59 | cleanNames = sumobrainNames, 60 | dateFields = sumobrainDateFields, 61 | dateOrders = sumobrainDateOrder, 62 | deduplicate = TRUE, 63 | cakcDict = patentr::cakcDict, 64 | docLengthTypesDict = patentr::docLengthTypesDict, 65 | keepType = "grant", 66 | firstAssigneeOnly = TRUE, 67 | assigneeSep = ";", 68 | stopWords = patentr::assigneeStopWords) 69 | 70 | # note that in reality, you need a patent analyst to carefully score 71 | # these patents, the score here is for demonstrational purposes 72 | score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 73 | score[score>3] <- 3 74 | score[score<0] <- 0 75 | sumo$score <- score 76 | sumo$assigneeSmall <- strtrim(sumo$assigneeClean,12) 77 | category <- c("system","control algorithm","product","control system", "communication") 78 | c <- round(rnorm(dim(sumo)[1],mean=2.5,sd=1.5)) 79 | c[c>5] <- 5; c[c<1] <- 1 80 | sumo$category <- category[c] 81 | feature1 <- c("adaptive", "park", "lane", NA,NA,NA,NA,NA, 82 | "brake", "steer","accelerate","deactivate") 83 | f <- round(rnorm(dim(sumo)[1],mean=5,sd=1)) 84 | l <- length(feature1) 85 | f[f>l] <- l; f[f<1] <- 1 86 | sumo$feature1 <- c(feature1,feature1[f])[1:dim(sumo)[1]] 87 | 88 | flippedHistogram(sumo, "assigneeSmall","score",colors=scoreColors) 89 | flippedHistogram(subset(sumo, score > 0), "assigneeSmall","score",colors=scoreColors) 90 | 91 | flippedHistogram(subset(sumo, score > 2) ,"assigneeSmall","docType",colors=scoreColors, 92 | recolor = TRUE) 93 | 94 | 95 | 96 | 97 | # create a ppt 98 | ppt <- ReporteRs::pptx(title="IP Update") 99 | # view the types of layouts available by default 100 | # slide.layouts(ppt) 101 | layoutTitleContent = "Title and Content" 102 | 103 | # first plot of top score (3) 104 | asdt <- summarizeColumns(subset(sumo,score > 2),'docType') 105 | ppt <- 106 | addChartRightTextLeftPptx(ppt = ppt, 107 | plot = 
flippedHistogram(subset(sumo, score > 2) , 108 | "assigneeSmall","docType", 109 | colors=scoreColors, 110 | recolor = TRUE), 111 | text = summaryText(asdt, "doc type", "doc types", 112 | subset(sumo,score>2)$docType), 113 | title = "Doc Types for Top Score Docs", 114 | slide_layout = layoutTitleContent) 115 | 116 | # top scores by assignee 117 | ascore <- summarizeColumns(subset(sumo,score > 2),'assigneeSmall') 118 | ppt <- 119 | addChartRightTextLeftPptx(ppt = ppt, 120 | plot = flippedHistogram(subset(sumo, score > 2) , 121 | "assigneeSmall","score", 122 | colors=scoreColors, 123 | recolor = FALSE), 124 | text = summaryText(ascore, "assignee", "assignees", 125 | subset(sumo,score>2)$assigneeSmall), 126 | title = "Assignees with Top Scores", 127 | slide_layout = layoutTitleContent) 128 | 129 | 130 | # last plot is category 131 | sc <- summarizeColumns(sumo,'category') 132 | ppt <- 133 | addChartRightTextLeftPptx(ppt = ppt, 134 | plot = flippedHistogram(sumo ,"category", 135 | "score", colors = scoreColors, 136 | recolor = TRUE), 137 | text = summaryText(sc, "category", "categories", sumo$category), 138 | title = "Categories and Scores", 139 | slide_layout = layoutTitleContent) 140 | 141 | # find a data folder and write it out to your folder 142 | # out <- paste("data/",Sys.Date(),"_exampleChartRightTextLeft.pptx",sep='') 143 | # ReporteRs::writeDoc(ppt, out) 144 | 145 | 146 | } 147 | \seealso{ 148 | \code{\link[ReporteRs]{pptx}}, \code{\link{addFullImagePptx}} 149 | } 150 | -------------------------------------------------------------------------------- /tests/testthat/test-cleaning.R: -------------------------------------------------------------------------------- 1 | # test cleaning data 2 | 3 | 4 | # clean names 5 | test_that("Imported Sumobrain csv to data frame has names standardized",{ 6 | df <- importPatentData(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), skipLines = 1) 7 | df <- cleanHeaderNames(patentData = df) 8 | 
expect_identical(names(df),names(acars)) 9 | 10 | }) 11 | 12 | 13 | # same length when extracting country code 14 | test_that("Country code extracted from document number, and all country codes are chars of length 2-4",{ 15 | df <- importPatentData(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), skipLines = 1) 16 | df <- cleanHeaderNames(patentData = df) 17 | expect_length(extractCountryCode(df$docNum),dim(df)[1]) 18 | }) 19 | 20 | # same length when extracting publication number 21 | test_that("Publication number, numeric portion extracted from document number properly",{ 22 | df <- importPatentData(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), skipLines = 1) 23 | df <- cleanHeaderNames(patentData = df) 24 | # should return the same length 25 | expect_length(extractPubNumber(df$docNum),dim(df)[1]) 26 | }) 27 | 28 | 29 | # same length when extracting kind code 30 | test_that("Kind code extracted returns same length as number of rows of data frame",{ 31 | df <- importPatentData(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), skipLines = 1) 32 | df <- cleanHeaderNames(patentData = df) 33 | # should return the same length 34 | expect_length(extractKindCode(df$docNum),dim(df)[1]) 35 | }) 36 | 37 | 38 | # same length when extracting office doc length 39 | test_that("Office doc length extracted returns same length as number of rows of data frame",{ 40 | df <- importPatentData(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), skipLines = 1) 41 | df <- cleanHeaderNames(patentData = df) 42 | df$pubNum <- extractPubNumber(df$docNum) 43 | df$countryCode <- extractCountryCode(df$docNum) 44 | df$officeDocLength <- extractDocLength(countryCode = df$countryCode, pubNum = df$pubNum) 45 | # should return the same length 46 | expect_length(df$officeDocLength ,dim(df)[1]) 47 | }) 48 | 49 | 50 | # Dates converted properly 51 | test_that("Dates
converted properly from characters",{ 52 | df <- importPatentData(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), skipLines = 1) 53 | df <- cleanHeaderNames(patentData = df) 54 | df$pubDate <- extractCleanDate(df$pubDate) 55 | # should return the same length 56 | expect_equal(inherits(df$pubDate, "Date") ,TRUE) 57 | }) 58 | 59 | # same length when extracting kind code 60 | test_that("Google URL vector returns same length as number of rows of data frame",{ 61 | df <- importPatentData(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), skipLines = 1) 62 | df <- cleanHeaderNames(patentData = df) 63 | df$pubNum <- extractPubNumber(df$docNum) 64 | df$countryCode <- extractCountryCode(df$docNum) 65 | df$kindCode <- extractKindCode(df$docNum) 66 | # should return the same length 67 | expect_length(createGoogleURL(countryCode = df$countryCode, 68 | pubNum = df$pubNum, 69 | kindCode =df$kindCode) ,dim(df)[1]) 70 | }) 71 | 72 | 73 | # duplicates are removed if exist 74 | test_that("Removing dups is a logical vector",{ 75 | df <- importPatentData(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), skipLines = 1) 76 | df <- cleanHeaderNames(patentData = df) 77 | # should be of type logical 78 | expect_type(removeDups(df$docNum) ,"logical") 79 | }) 80 | 81 | 82 | # duplicates are shown 83 | test_that("Showing all duplicates and showDups is a logical vector",{ 84 | df <- importPatentData(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), skipLines = 1) 85 | df <- cleanHeaderNames(patentData = df) 86 | # should be of type logical 87 | expect_type(showDups(df$appNum) ,"logical") 88 | }) 89 | 90 | 91 | # same length when generating the type of document 92 | test_that("generateDocType returns same length as number of rows of data frame",{ 93 | df <- importPatentData(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), 
skipLines = 1) 94 | df <- cleanHeaderNames(patentData = df) 95 | df$pubNum <- extractPubNumber(df$docNum) 96 | df$countryCode <- extractCountryCode(df$docNum) 97 | df$kindCode <- extractKindCode(df$docNum) 98 | df$officeDocLength <- extractDocLength(countryCode = df$countryCode, pubNum = df$pubNum) 99 | df$countryAndKindCode <- with(df, paste0(countryCode, kindCode)) 100 | # should return the same length 101 | temp <- generateDocType(officeDocLength = df$officeDocLength, 102 | countryAndKindCode = df$countryAndKindCode, 103 | cakcDict = patentr::cakcDict, 104 | docLengthTypesDict = patentr::docLengthTypesDict) 105 | expect_length(temp ,dim(df)[1]) 106 | }) 107 | 108 | 109 | 110 | # names returns the same length 111 | test_that("Google URL vector returns same length as number of rows of data frame",{ 112 | df <- importPatentData(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), skipLines = 1) 113 | df <- cleanHeaderNames(patentData = df) 114 | expect_length(cleanNames(df$assignee), dim(df)[1]) 115 | }) 116 | 117 | 118 | # sumobrain full clean returns data frame 119 | test_that("Sumobrain data cleanPatentData returns a data frame.",{ 120 | df <- importPatentData(rprojroot::find_testthat_root_file("testData","sumobrain_autonomous_search1.xlsx"), skipLines = skipSumobrain) 121 | df <- cleanPatentData(patentData = df, columnsExpected = sumobrainColumns, 122 | cleanNames = sumobrainNames, dateFields = sumobrainDateFields, 123 | dateOrders = sumobrainDateOrder, deduplicate = TRUE, 124 | cakcDict = cakcDict, docLengthTypesDict = docLengthTypesDict, 125 | keepType = "grant",firstAssigneeOnly = TRUE, assigneeSep = ";", 126 | stopWords = assigneeStopWords) 127 | # should be of type logical 128 | expect_is(df ,"data.frame") 129 | }) 130 | 131 | # google patent data full clean returns data frame 132 | test_that("Google patent data cleanPatentData returns a data frame.",{ 133 | df <- 
read.csv(rprojroot::find_testthat_root_file("testData","google_autonomous_search.csv"), 134 | skip = skipGoogle, stringsAsFactors = FALSE) 135 | df <- data.frame(lapply(df,function(x){iconv(x, to = "ASCII")}), stringsAsFactors = FALSE) 136 | 137 | df <- cleanPatentData(patentData = df, columnsExpected = googleColumns, 138 | cleanNames = googleNames, dateFields = googleDateFields, 139 | dateOrders = googleDateOrder, deduplicate = TRUE, 140 | cakcDict = cakcDict, docLengthTypesDict = docLengthTypesDict, 141 | keepType = "grant",firstAssigneeOnly = TRUE, assigneeSep = ",", 142 | stopWords = assigneeStopWords) 143 | # should be of type logical 144 | expect_is(df ,"data.frame") 145 | }) 146 | 147 | 148 | # lens.org data file 149 | test_that("Lens.org patent data cleanPatentData returns a data frame.",{ 150 | df <- read.csv(rprojroot::find_testthat_root_file("testData","lens_autonomous_search.csv"), 151 | skip = skipLens, stringsAsFactors = FALSE) 152 | df <- data.frame(lapply(df,function(x){iconv(x, to = "ASCII")}), stringsAsFactors = FALSE) 153 | 154 | df <- cleanPatentData(patentData = df, columnsExpected = lensColumns, 155 | cleanNames = lensNames, dateFields = lensDateFields, 156 | dateOrders = lensDateOrder, deduplicate = TRUE, 157 | cakcDict = cakcDict, docLengthTypesDict = docLengthTypesDict, 158 | keepType = "grant",firstAssigneeOnly = TRUE, assigneeSep = ";;", 159 | stopWords = assigneeStopWords) 160 | # should be of type logical 161 | expect_is(df ,"data.frame") 162 | }) 163 | 164 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | patentr 2 | ================ 3 | Kamil Bojanczyk 4 | 2017-03-22 5 | 6 | 7 | Introducing `patentr`, the toolkit for patent data analysis in R. The summary md file currently holds all documentation. 
8 | 9 | The package is aimed at patent agents, lawyers, managers, analysts, and academics who are working on patents. This may be used in a patent landscape analysis, company IP portfolio analysis, or a freedom to operate search. 10 | 11 | This is a data processing and reporting tool on patent data sets for patent analysts. The motivation comes from a lack of useful, exportable patent data. `patentr` builds upon the free data available from Sumobrain.com, Lens.org, and Google Patents, leveraging their data to summarize and analyze patents. 12 | 13 | `patentr` performs four key functions: 14 | 15 | 1. *Data input:* Easily **import** patent excel and csv files from the top patent websites 16 | 17 | - `CSV` from Google Patents and lens.org 18 | - `xlsx` from sumobrain.com 19 | 20 | 1. *Data cleaning:* **Sanitize** patent data and extract useful metadata for custom analyses 21 | 22 | - Clean up important fields such as names, dates, country codes, and kind codes 23 | - Infer the document type so that you don't analyze the same patent twice 24 | - Deduplicate data sets and prioritize grants over applications 25 | - Use the generated Google URL to jump to the patent document or to download claim data using the included `httr` and `XML` functions 26 | 27 | 1. *Exploratory data analysis:* **Explore** patent data and quickly **visualize** important attributes 28 | 29 | - Quickly summarize patent data by relevant columns to get document count 30 | - View standard histogram, tile, and facet plots of important information 31 | - Extract claim information for **wordcloud** analysis 32 | - Interact with your data on the **Shiny** user interface 33 | 34 | 1. *Reporting:* Export your data as **powerpoints** and **PDFs** 35 | 36 | - Browse through the **many example plots** 37 | - Download your charts locally as a **PDF** or make your own **PPTX** 38 | 39 | **Note:** The Shiny app works only with `xlsx` data. 
Simply upload the data file, click "clean", and then you can view a straightforward graph and the raw data. 40 | 41 | There are three core date sets available, all based on autonomous car patent sets: `acars` (from Sumobrain.com), `acarsLens`, and `acarsGoogle`. All data sets are reproducible and their sources can be found in their documentation. 42 | 43 | Data Input and Data Sources 44 | --------------------------- 45 | 46 | Choose your data from Sumobrain.com for excel files, or Lens.org and Google Patents for `csv` files. 47 | 48 | You can read in patent data files from publicly available sources and clean the data into a more useful, usable format for further analysis. `patentr` has an **interactive** browser that allows you to choose a **list** of files of xlsx format. Alternatively, you can read in your own `csv` files. 49 | 50 | ``` r 51 | # read in xlsx files 52 | file1 <- system.file("extdata/", "sumobrain_autonomous_search1.xlsx", package="patentr") 53 | file2 <- system.file("extdata/", "sumobrain_autonomous_search2.xlsx", package="patentr") 54 | files <- list(file1, file2) 55 | ipData <- importPatentData(rawDataFilePath = files, skipLines = 1) 56 | # example 2 -- a popup window appears for you to choose xlsx files 57 | filePaths <- chooseFiles() 58 | allData <- importPatentData(filePaths) 59 | # example 3 -- read in csv files 60 | google <- read.csv(system.file("testData/","google_autonomous_search.csv", package ="patentr") 61 | skip = skipGoogle, stringsAsFactors = FALSE) 62 | google <- data.frame(lapply(lens,function(x){iconv(x, to = "ASCII")}), stringsAsFactors = FALSE) 63 | 64 | ``` 65 | 66 | Clean Data 67 | ---------- 68 | 69 | There are ten different cleaning functions available, all wrapped up nicely into the `cleanPatentData` function. This single function can save you hours of work cleaning and processing your data. 
Read the documentation carefully, as there are a number of time-saving preloaded variables to name the columns, process the dates, clean up the assignee names, and much more. 70 | 71 | For excel files, use the `cleanPatentData` function directly. For csv files, use the pre-processing lines below. 72 | 73 | Clean data uses `extract` functions that take in character vectors and return extracted metadata useful in patent data analysis. A master cleaner function bundles all these functions together. The user also has the ability to use the functions one-by-one for custom analysis. 74 | 75 | ``` r 76 | lensRawData <- system.file("extdata", "lens_autonomous_search.csv", package = "patentr") 77 | lensRawData <- read.csv(lensRawData, stringsAsFactors = FALSE, skip = skipLens) 78 | lensRawData <- data.frame(lapply(lensRawData, 79 | function(x){iconv(x, to = "ASCII")}), stringsAsFactors = FALSE) 80 | lens <- cleanPatentData(patentData = lensRawData, columnsExpected = lensColumns, 81 | cleanNames = lensNames, dateFields = lensDateFields, dateOrders = lensDateOrder, 82 | deduplicate = TRUE, cakcDict = patentr::cakcDict, docLengthTypesDict = patentr::docLengthTypesDict, 83 | keepType = "grant", firstAssigneeOnly = TRUE, assigneeSep = ";;", stopWords = patentr::assigneeStopWords) 84 | ``` 85 | 86 | Exploratory Analysis 87 | -------------------- 88 | 89 | The exploratory analysis includes simple summaries and numerous graphings. Ideally, a patent analyst needs to add the following columns to the cleaned data to make full use of the package: \* score \* category \* feature 1 (main feature) \* feature 2 (secondary feature) 90 | 91 | For the purpose of this first package, all examples come with a pre-built 0 to 3 score, 3 being the highest. Categories are also predefined, as is feature 1. These are important variables that require days to weeks of a patent analysts time, thus, in future realeases an expert-tagged data set will be available. 
92 | 93 | A simple example is the word cloud. We load a file, deduplicate it, and then quickly view the top phrases. Another example is a simple facet that shows the category of a patent technology, along with the major feature of that patent. For example, every autonomous car technology category has a lane feature, as staying in the lane for a car is important. 94 | 95 | ![Wordcloud](vignettes/Rplot01.png) ![Tile plot](vignettes/Rplot.png) 96 | 97 | Reporting 98 | --------- 99 | 100 | The package allows the user to output a set of pre-defined plots and summary information. There are pdf and pptx options. 101 | 102 | ``` r 103 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 104 | cleanNames = sumobrainNames, dateFields = sumobrainDateFields, dateOrders = sumobrainDateOrder, 105 | deduplicate = TRUE, cakcDict = patentr::cakcDict, docLengthTypesDict = patentr::docLengthTypesDict, 106 | keepType = "grant", firstAssigneeOnly = TRUE, assigneeSep = ";", stopWords = patentr::assigneeStopWords) 107 | 108 | score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 109 | score[score>3] <- 3; score[score<0] <- 0; sumo$score <- score 110 | sumo$assigneeSmall <- strtrim(sumo$assigneeClean,12) 111 | category <- c("system","control algorithm","product","control system", "communication") 112 | c <- round(rnorm(dim(sumo)[1],mean=2.5,sd=1.5)); c[c>5] <- 5; c[c<1] <- 1; sumo$category <- category[c] 113 | xVal = "category"; fillVal = "score"; facetVal = "assigneeSmall" 114 | # create a ppt 115 | ppt <- ReporteRs::pptx(title="IP Update") 116 | layoutTitleContent = "Title and Content" 117 | fp <- facetPlot(subset(sumo, score > 0), xVal, fillVal, facetVal, colors = patentr::scoreColors, recolor = FALSE) 118 | ppt <- addFullImagePptx(ppt, plot = fp, slide_layout = layoutTitleContent) 119 | # find a data folder and write it out to your folder 120 | out <- paste("data/",Sys.Date(),"_exampleChartRightTextLeft.pptx",sep='') 121 | ReporteRs::writeDoc(ppt, 
out) 122 | ``` 123 | 124 | Upcoming Features in the Second Release 125 | --------------------------------------- 126 | 127 | The next round of patent features will include: 128 | 129 | - Directly edit patent data in Shiny, or upload and redownload the data to Google Spreadsheets or excel 130 | - Utilize a custom template set to auto-generate a powerpoint presentation 131 | - Use supervised learning to semi-automate the classification of: 132 | - score 133 | - category 134 | - feature 1 (main feature) 135 | - feature 2 (secondary feature) 136 | -------------------------------------------------------------------------------- /R/acars.R: -------------------------------------------------------------------------------- 1 | ## kamil bojanczyk start 2 | #' Autonomous Vehicle Patent Data from Sumobrain.com 3 | #' 4 | #' An example data set of autonomous vehicle IP from major assignees. 5 | #' 6 | #' The data search was performd on Monday, March 13, 2017 from sumobrain.com, and the exact 7 | #' search term was: 8 | #' 9 | #' \code{ABST/"autonomous" AN/(Apple* OR Google* OR Waymo* OR Tesla*} 10 | #' 11 | #' \code{OR Ford* OR General*) PD/12/13/1790->3/13/2017} 12 | #' 13 | #' View the search \href{http://www.sumobrain.com/result.html?p=1&stemming=on&sort=chron&uspat=on&usapp=on&eupat=on&jp=on&pct=on&collections=&srch=xprtsrch&date_range=all&hits=502&from_ss=&srch_id=&srch_name=&search_name=&selected_doc_flag=&selected_newdoc_flag=&selected_portfolio=&portfolio_name=&query_txt=ABST\%2F\%22autonomous\%22+AN\%2F\%28Apple*+OR+Google*+OR+Waymo*+OR+Tesla*+OR+Ford*+OR+General*\%29+PD\%2F12\%2F13\%2F1790-\%3E3\%2F13\%2F2017&search.x=0&search.y=0&search=search_ezy}{here}. 14 | #' 15 | #' 16 | #' For all collections (US patents, applications, EP documents, abstracts of Japan, and WIPO). 
17 | #' 18 | #' Can get raw data with the following commands: 19 | #' 20 | #' \code{system.file("extdata", "sumobrain_autonomous_search1.xls", package = "patentr")} 21 | #' 22 | #' \code{system.file("extdata", "sumobrain_autonomous_search2.xls", package = "patentr")} 23 | #' 24 | #' 25 | #' @name acars 26 | #' @docType data 27 | #' @keywords data 28 | #' 29 | #' 30 | #' @format 31 | #' A data frame with 499 observations on 10 variables. 32 | #' \describe{ 33 | #' \item{docNum}{A published document number including the kind code, publication number, 34 | #' and kind code for the patent document.} 35 | #' \item{docTypeSumobrain}{Very similar to the country code, with minor additions, USAPP being the 36 | #' most noticable difference. } 37 | #' \item{pubDate}{Publication Date} 38 | #' \item{title}{Title} 39 | #' \item{abstract}{Abstract} 40 | #' \item{inventors}{Inventor Name} 41 | #' \item{assignee}{Assignee} 42 | #' \item{appNum}{Application Number} 43 | #' \item{dateFiled}{Filing Date} 44 | #' \item{classPrimary}{Primary Class} 45 | #' \item{classOthers}{Other Classes} 46 | #' } 47 | #' 48 | #' @seealso \url{http://www.sumobrain.com} You will need to create a free account to export data. 49 | #' 50 | #' \code{\link{acarsGoogle}} provides a similar search from Google. 51 | #' \code{\link{acarsLens}} provides a simialr search from Lens.org. 52 | #' 53 | "acars" 54 | 55 | 56 | #' Autonomous Vehicle Patent Data from Google Patents 57 | #' 58 | #' An example data set of autonomous vehicle IP from major assignees. 59 | #' 60 | #' The first row in the raw CSV export contains the search URL and is skipped. 61 | #' 62 | #' The data search was performd on Saturday, March 18, 2017 from patents.google.com, and the exact 63 | #' search: \href{https://patents.google.com/?q=AB\%3dautonomous&assignee=Apple*,Google*,Waymo*,Tesla*,Ford*,General*&before=filing:20170318}{Google Patents Search} 64 | #' For all countries available on Google. 
#'
#' You process the raw data with the following commands:
#'
#' \code{temp <- system.file("extdata", "google_autonomous_search.csv", package = "patentr")}
#'
#' \code{# from the source package you can navigate to }
#'
#' \code{temp <- read.csv("inst/extdata/google_autonomous_search.csv", skip = 1, stringsAsFactors = FALSE)}
#'
#' \code{names(temp) <- googleNames}
#'
#' \code{temp <- data.frame(lapply(temp, function(x){iconv(x,to="ASCII")}),stringsAsFactors = FALSE)}
#'
#' \code{dateFields <- c("priorityDate","dateFiled","pubDate","grantDate")}
#'
#' \code{temp[dateFields] <- as.data.frame(lapply(temp[dateFields], as.Date, format="\%m/\%d/\%y"))}
#'
#'
#' @name acarsGoogle
#' @docType data
#' @keywords data
#'
#'
#' @format
#' A data frame with 316 observations on 9 variables.
#' \describe{
#' \item{\code{docNum}}{A published document number including the country code, publication number,
#' and kind code for the patent document.}
#' \item{\code{title}}{The title of the invention.}
#' \item{\code{assignee}}{The owner of the document.}
#' \item{\code{inventors}}{The name(s) of the inventor(s), separated by commas.}
#' \item{\code{priorityDate}}{The earliest priority date on the application.}
#' \item{\code{dateFiled}}{Date the document was filed. They call it filing/creation date.}
#' \item{\code{pubDate}}{Date document became publicly available.}
#' \item{\code{grantDate}}{Date the application became a grant. NA if there is no associated grant.}
#' \item{\code{googleURL}}{The link to the Google Patents page for the document.}
#' }
#'
#' @seealso \url{https://patents.google.com/}
#'
#' \code{\link{acars}} provides a similar search from Sumobrain.
#' \code{\link{acarsLens}} provides a similar search from Lens.org.
107 | #' 108 | "acarsGoogle" 109 | 110 | ## kamil bojanczyk end 111 | ## yang yao start 112 | #' Autonomous Vehicle Patent Data from Lens Patent Search 113 | #' 114 | #' An example data set of autonomous vehicle IP from major assignees. 115 | #' 116 | #' The data search was performd on Saturday, March 18, 2017 from lens.org, and the exact 117 | #' search: 118 | #' 119 | #' \href{https://www.lens.org/lens/search?q=abstract\%3Aautonomous+\%26\%26+applicant\%3A\%28Apple*+OR+Google*+OR+Waymo*+OR+Tesla*+OR+Ford*+OR+General*\%29&predicate=\%26\%26&l=en}{Lens Patents Search} 120 | #' 121 | #' For all countries available on Lens. 122 | #' 123 | #' Can get raw data with the following commands: 124 | #' 125 | #' \code{temp <- system.file("extdata", "lens_autonomous_search.csv", package = "patentr")} 126 | #' 127 | #' \code{temp <- read.csv(temp, stringsAsFactors = FALSE)} 128 | #' 129 | #' \code{temp <- data.frame(lapply(temp, function(x){iconv(x,to="ASCII")}),stringsAsFactors = FALSE)} 130 | #' 131 | #' \code{names(temp) <- lensNames} 132 | #' 133 | #' \code{temp$dateFiled <- as.Date(temp$dateFiled, format = '\%m/\%d/\%y')} 134 | #' 135 | #' \code{temp$pubDate <- as.Date(temp$pubDate, format='\%m/\%d/\%y')} # note that % y is system-specific and may not work everywhere. 136 | #' 137 | #' \code{colsNum <- c("resultNum","citeCount","familySimpleCount","familyExtendedCount", "seqCount")} 138 | #' 139 | #' \code{temp[colsNum] <- sapply(temp[colsNum], as.numeric)} 140 | #' 141 | #' \code{temp$hasFullText <- sapply(temp$hasFullText, function(x) ifelse(x=="yes",TRUE,FALSE))} 142 | #' 143 | #' @name acarsLens 144 | #' @docType data 145 | #' @keywords data 146 | #' 147 | #' 148 | #' @format 149 | #' A data frame with 863 observations on 26 variables. 
150 | #' \describe{ 151 | #' \item{resultNum}{The search result number.} 152 | #' \item{countryCode}{The jurisdiction of the patent document.} 153 | #' \item{\code{kindCode}}{The kind code.} 154 | #' \item{docNum}{The published document number with country code and kind code included.} 155 | #' \item{lensID}{The unique identification number of the document on lens.org} 156 | #' \item{pubDate}{Date the document was published.} 157 | #' \item{pubYear}{Year the document published.} 158 | #' \item{appNum}{The filing number of the application (country code, number, and abridged kind code, typically 'A')} 159 | #' \item{dateFiled}{Date the application for the patent document was filed.} 160 | #' \item{priorityApps}{Applications this patent document claims priority. 161 | #' Format: Country code, application number, A = application or P = provisional, YYYYMMDD of priority. 162 | #' Multiple application separated by a double semi-colon.} 163 | #' \item{title}{The title of the document.} 164 | #' \item{assignee}{The name of the applicant(s) at the time of filing.} 165 | #' \item{inventors}{The inventor(s).} 166 | #' \item{lensURL}{The lens.org URL for the document.} 167 | #' \item{docTypeLens}{A lens.org mapping of the doc type. 
168 | #' Granted, application, ambiguous, unknown, search report, and possibly more values.} 169 | #' \item{hasFullText}{A logical value to show if there is a full text available from lens.org} 170 | #' \item{citeCount}{The number of times this document is cited, also known as forward citations.} 171 | #' \item{familySimpleCount}{The number of unique documents in the immediate patent family.} 172 | #' \item{familyExtendedCount}{The number of unique documents sharing a priority applicaiton in the extended family.} 173 | #' \item{seqCount}{Used in biological applications -- the number of sequences in the application.} 174 | #' \item{cpcClasses}{The CPC classification codes, separated by a double semi-colon.} 175 | #' \item{ipcrClasses}{The IPCR classification codes, separated by a double semi-colon.} 176 | #' \item{usClasses}{The US classification codes, separated by a double semi-colon.} 177 | #' \item{pubmedID}{A pubmed ID to any related research.} 178 | #' \item{DOI}{A digital object identifier. 179 | #' Go to doi.org and paste the value to get the associated research paper.} 180 | #' \item{npl}{Non-patent literature, or citations of non-patent sources. 181 | #' Separated with double semi-colons.} 182 | #' 183 | #' 184 | #' } 185 | #' 186 | #' @seealso \url{www.lens.org} You can export without an account, or can create 187 | #' an account to save your searches. 188 | #' 189 | #' \code{\link{acarsGoogle}} provides a similar search from Google. 190 | #' \code{\link{acars}} provides a similar search from sumobrain. 
191 | #' 192 | "acarsLens" 193 | ## yang yao end -------------------------------------------------------------------------------- /vignettes/summary.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "patentr" 3 | author: "Kamil Bojanczyk; Yao Yang" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{patentr} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | The package is aimed at patent agents, lawyers, managers, analysts, and 13 | academics who are working on patents. This may be used in a patent landscape 14 | analysis, company IP portfolio analysis, or a freedom to operate search. 15 | 16 | This is a data processing and reporting tool on patent data sets for 17 | patent analysts. The motivation comes from a lack of useful, exportable 18 | patent data. `patentr` builds upon the free data available from Sumobrain.com, 19 | Lens.org, and Google Patents, leveraging their data to summarize and analyze 20 | patents. 21 | 22 | `patentr` performs four key functions: 23 | 24 | 1. *Data input:* Easily **import** patent excel and csv files from the top patent websites 25 | + `CSV` from Google Patents and lens.org 26 | + `xlsx` from sumobrain.com 27 | 2. *Data cleaning:* **Sanitize** patent data and extract useful metadata for custom analyses 28 | + Clean up important fields such as names, dates, country codes, and kind codes 29 | + Infer the document type so that you don't analyze the same patent twice 30 | + Deduplicate data sets and prioritize grants over applications 31 | + Use the generated Google URL to jump to the patent document or to download 32 | claim data using the included `httr` and `XML` functions 33 | 3. 
*Exploratory data analysis:* **Explore** patent data and quickly **visualize** important attributes 34 | + Quickly summarize patent data by relevant columns to get document count 35 | + View standard histogram, tile, and facet plots of important information 36 | + Extract claim information for **wordcloud** analysis 37 | + Interact with your data on the **Shiny** user interface 38 | 4. *Reporting:* Export your data as **powerpoints** and **PDFs** 39 | + Browse through the **many example plots** 40 | + Download your charts locally as a **PDF** or make your own **PPTX** 41 | 42 | **Note:** The Shiny app works only with `xlsx` data. Simply upload the data file, 43 | click "clean", and then you can view a straightforward graph and the raw data. 44 | 45 | There are three core date sets available, all based on autonomous car patent sets: 46 | `acars` (from Sumobrain.com), `acarsLens`, and `acarsGoogle`. All data sets are 47 | reproducible and their sources can be found in their documentation. 48 | 49 | ## Data Input and Data Sources 50 | 51 | Choose your data from Sumobrain.com for excel files, or Lens.org and Google Patents 52 | for `csv` files. 53 | 54 | You can read in patent data files from publicly available sources and clean the 55 | data into a more useful, usable format for further analysis. `patentr` has an 56 | **interactive** browser that allows you to choose a **list** of files of xlsx 57 | format. Alternatively, you can read in your own `csv` files. 
58 | ```{r, eval=FALSE} 59 | # read in xlsx files 60 | file1 <- system.file("extdata/", "sumobrain_autonomous_search1.xlsx", package="patentr") 61 | file2 <- system.file("extdata/", "sumobrain_autonomous_search2.xlsx", package="patentr") 62 | files <- list(file1, file2) 63 | ipData <- importPatentData(rawDataFilePath = files, skipLines = 1) 64 | # example 2 -- a popup window appears for you to choose xlsx files 65 | filePaths <- chooseFiles() 66 | allData <- importPatentData(filePaths) 67 | # example 3 -- read in csv files 68 | google <- read.csv(system.file("testData/","google_autonomous_search.csv", package ="patentr") 69 | skip = skipGoogle, stringsAsFactors = FALSE) 70 | google <- data.frame(lapply(lens,function(x){iconv(x, to = "ASCII")}), stringsAsFactors = FALSE) 71 | 72 | 73 | ``` 74 | 75 | 76 | ## Clean Data 77 | 78 | There are ten different cleaning functions available, all wrapped up nicely into 79 | the `cleanPatentData` function. This single function can save you hours of work 80 | cleaning and processing your data. Read the documentation carefully, as there are 81 | a number of time-saving preloaded variables to name the columns, process the 82 | dates, clean up the assignee names, and much more. 83 | 84 | For excel files, use the `cleanPatentData` function directly. For csv files, 85 | use the pre-processing lines below. 86 | 87 | Clean data uses `extract` functions that take in character vectors and return 88 | extracted metadata useful in patent data analysis. A master cleaner function 89 | bundles all these functions together. The user also has the ability to use the 90 | functions one-by-one for custom analysis. 
91 | 92 | ```{r, eval=FALSE} 93 | lensRawData <- system.file("extdata", "lens_autonomous_search.csv", package = "patentr") 94 | lensRawData <- read.csv(lensRawData, stringsAsFactors = FALSE, skip = skipLens) 95 | lensRawData <- data.frame(lapply(lensRawData, 96 | function(x){iconv(x, to = "ASCII")}), stringsAsFactors = FALSE) 97 | lens <- cleanPatentData(patentData = lensRawData, columnsExpected = lensColumns, 98 | cleanNames = lensNames, dateFields = lensDateFields, dateOrders = lensDateOrder, 99 | deduplicate = TRUE, cakcDict = patentr::cakcDict, docLengthTypesDict = patentr::docLengthTypesDict, 100 | keepType = "grant", firstAssigneeOnly = TRUE, assigneeSep = ";;", stopWords = patentr::assigneeStopWords) 101 | ``` 102 | 103 | 104 | ## Exploratory Analysis 105 | 106 | The exploratory analysis includes simple summaries and numerous graphings. Ideally, 107 | a patent analyst needs to add the following columns to the cleaned data to make 108 | full use of the package: 109 | * score 110 | * category 111 | * feature 1 (main feature) 112 | * feature 2 (secondary feature) 113 | 114 | For the purpose of this first package, all examples come with a pre-built 0 to 3 115 | score, 3 being the highest. Categories are also predefined, as is feature 1. These 116 | are important variables that require days to weeks of a patent analysts time, thus, 117 | in future realeases an expert-tagged data set will be available. 118 | 119 | A simple example is the word cloud. We load a file, deduplicate it, and then 120 | quickly view the top phrases. Another example is a simple facet that shows 121 | the category of a patent technology, along with the major feature of that patent. 122 | For example, every autonomous car technology category has a lane feature, as 123 | staying in the lane for a car is important. 
124 | 125 | ![Wordcloud](Rplot01.png) ![Tile plot](Rplot.png) 126 | ```{r warning=FALSE, echo = FALSE, message=FALSE, eval=FALSE} 127 | devtools::load_all() 128 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 129 | cleanNames = sumobrainNames, 130 | dateFields = sumobrainDateFields, 131 | dateOrders = sumobrainDateOrder, 132 | deduplicate = TRUE, 133 | cakcDict = patentr::cakcDict, 134 | docLengthTypesDict = patentr::docLengthTypesDict, 135 | keepType = "grant", 136 | firstAssigneeOnly = TRUE, 137 | assigneeSep = ";", 138 | stopWords = patentr::assigneeStopWords) 139 | 140 | # df <- dplyr::select(sumo, title, abstract) 141 | df <- sumo[,c("title","abstract")] 142 | wordCloudIt(df, excludeWords, minfreq = 20, 143 | random.order = FALSE, rot.per = 0.25) 144 | 145 | 146 | # note that in reality, you need a patent analyst to carefully score 147 | # these patents, the score here is for demonstrational purposes 148 | score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 149 | score[score>3] <- 3; score[score<0] <- 0 150 | sumo$score <- score 151 | sumo$assigneeSmall <- strtrim(sumo$assigneeClean,12) 152 | category <- c("system","control algorithm","product","control system", "communication") 153 | c <- round(rnorm(dim(sumo)[1],mean=2.5,sd=1.5)) 154 | c[c>5] <- 5; c[c<1] <- 1 155 | sumo$category <- category[c] 156 | feature1 <- c("adaptive", "park", "lane", NA,NA,NA,NA,NA, 157 | "brake", "steer","accelerate","deactivate") 158 | f <- round(rnorm(dim(sumo)[1],mean=5,sd=1)) 159 | l <- length(feature1) 160 | f[f>l] <- l; f[f<1] <- 1 161 | sumo$feature1 <- c(feature1,feature1[f])[1:dim(sumo)[1]] 162 | 163 | tilePlot(sumo, "category", "feature1", xangle = 90, xhjust=0) 164 | 165 | 166 | ``` 167 | 168 | 169 | 170 | ## Reporting 171 | The package allows the user to output a set of pre-defined plots and 172 | summary information. There are pdf and pptx options. 
173 | 174 | ```{r, eval = FALSE} 175 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 176 | cleanNames = sumobrainNames, dateFields = sumobrainDateFields, dateOrders = sumobrainDateOrder, 177 | deduplicate = TRUE, cakcDict = patentr::cakcDict, docLengthTypesDict = patentr::docLengthTypesDict, 178 | keepType = "grant", firstAssigneeOnly = TRUE, assigneeSep = ";", stopWords = patentr::assigneeStopWords) 179 | 180 | score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 181 | score[score>3] <- 3; score[score<0] <- 0; sumo$score <- score 182 | sumo$assigneeSmall <- strtrim(sumo$assigneeClean,12) 183 | category <- c("system","control algorithm","product","control system", "communication") 184 | c <- round(rnorm(dim(sumo)[1],mean=2.5,sd=1.5)); c[c>5] <- 5; c[c<1] <- 1; sumo$category <- category[c] 185 | xVal = "category"; fillVal = "score"; facetVal = "assigneeSmall" 186 | # create a ppt 187 | ppt <- ReporteRs::pptx(title="IP Update") 188 | layoutTitleContent = "Title and Content" 189 | fp <- facetPlot(subset(sumo, score > 0), xVal, fillVal, facetVal, colors = patentr::scoreColors, recolor = FALSE) 190 | ppt <- addFullImagePptx(ppt, plot = fp, slide_layout = layoutTitleContent) 191 | # find a data folder and write it out to your folder 192 | out <- paste("data/",Sys.Date(),"_exampleChartRightTextLeft.pptx",sep='') 193 | ReporteRs::writeDoc(ppt, out) 194 | ``` 195 | 196 | 197 | ## Upcoming Features in the Second Release 198 | 199 | The next round of patent features will include: 200 | 201 | * Directly edit patent data in Shiny, or upload and redownload the data to 202 | Google Spreadsheets or excel 203 | * Utilize a custom template set to auto-generate a powerpoint presentation 204 | * Use supervised learning to semi-automate the classification of: 205 | + score 206 | + category 207 | + feature 1 (main feature) 208 | + feature 2 (secondary feature) -------------------------------------------------------------------------------- 
/R/processPatentData.R: -------------------------------------------------------------------------------- 1 | ## kamil bojanczk start 2 | #' Create a URL link to Google patents 3 | #' 4 | #' @description Create a URL string to link you to Google Patents. 5 | #' 6 | #' By concatenating the country code, publication number, and kind code, you can 7 | #' generate a URL to link you to google patents for further exploration. This 8 | #' feature is especially useful when browsing the data in a spreadsheet or in 9 | #' a Shiny app. It is also useful for extracting data from the HTML content. 10 | #' 11 | #' As each website (Google, lens.org, sumobrain.com, etc..) has a different 12 | #' method for generating patent URLs, these functions are website-specific. 13 | #' 14 | #' The original Google patents version still works as of March 2017 and the 15 | #' \code{googleURL} value is \code{https://www.google.com/patents/}. This older 16 | #' version may be easier to extract data. 17 | #' 18 | #' @param countryCode A character vector of the country code of the document. 19 | #' Typically a two-letter character. 20 | #' @param pubNum A character vector of the numeric portion of a publication number. 21 | #' @param kindCode character vector of the kind code of a document. If not available, 22 | #' enter a blank string "". 23 | #' @param googleURL A character string of the URL to Google Patents, with working 24 | #' default value. 25 | #' @param lang The language you want to read the patent, default set to "en" english. 26 | #' 27 | #' @return A character vector of properly formatted URL strings. 
#'
#' @examples
#' acars$countryCode <- extractCountryCode(acars$docNum)
#' acars$kindCode <- extractKindCode(acars$docNum)
#' acars$pubNum <- extractPubNumber(acars$docNum)
#' acars$googleURL <- createGoogleURL(countryCode = acars$countryCode,
#'   pubNum = acars$pubNum, kindCode = acars$kindCode)
#' head(acars$googleURL)
#'
#' @export
createGoogleURL <- function(countryCode, pubNum, kindCode,
                            googleURL = "https://patents.google.com/patent/",
                            lang = "en") {
  # Concatenate base URL + country code + publication number + kind code +
  # "/<lang>". paste0 is vectorized, so whole data-frame columns can be passed
  # in at once; it is the idiomatic form of paste(..., sep = '').
  paste0(googleURL, countryCode, pubNum, kindCode, "/", lang)
  # TODO: validate the URL
  # http://stackoverflow.com/questions/28527100/check-if-https-hypertext-transfer-protocol-secure-url-is-valid
}



#' Get a claim from a Google Patents URL
#'
#' @description Input a valid Google Patents URL of the form given below and
#' then get back a claim from the index of your choosing. If no claim exists or
#' if your index is out of bounds, an empty character string returns.
#'
#' The function works on strings that begin with the following sequence:
#' \code{https://patents.google.com/patent/}. If the string sequence afterwards
#' is invalid, a 404 status returns from the GET command and eventually an empty
#' string returns.
#'
#'
#'
#'
#' @return A character vector of the claim from each Google URL. If no claim exists,
#' or if the country code is not on the inclusion list, an empty character value is returned
#' for that index.
#'
#' @param googleURL The well-formatted google URL built from \code{\link{createGoogleURL}}.
#' It is a character value.
#' @param langCode The language code, used to check for non-english text.
#' @param whichClaim Default set to 1, a numeric determining which claim to get. Note
#' if claim is out of bounds, the return claim will be a blank character.
#'
#'
#' @seealso \code{\link{createGoogleURL}}, \code{\link{extractCountryCode}},
#' \code{\link{cleanGoogleURL}}
#'
#' @examples
#'
#' \dontrun{
#' # works for USA
#' aclaim <- getClaimFromURL("https://patents.google.com/patent/US8818682B1/en")
#' print(aclaim)
#' # test WO, EP
#' woclaim <- getClaimFromURL("https://patents.google.com/patent/WO2015134152A1/en")
#' print(woclaim)
#' epclaim <- getClaimFromURL("https://patents.google.com/patent/EP2991875A1/en")
#' print(epclaim)
#' # test KR, JP, CN
#' krclaim <- getClaimFromURL("https://patents.google.com/patent/KR20150127745A/en")
#' cnclaim <- getClaimFromURL("https://patents.google.com/patent/CN104786953A/en")
#' jpclaim <- getClaimFromURL("https://patents.google.com/patent/JP2016173842A/en")
#' declaim <- getClaimFromURL("https://patents.google.com/patent/DE102014219223A1/en")
#' }
#'
#' @export
#'
#' @importFrom XML xpathSApply
#' @importFrom XML xmlValue
#' @importFrom XML getNodeSet
#' @importFrom XML htmlParse
#' @importFrom httr GET
#'
getClaimFromURL <- function(googleURL, langCode="en", whichClaim = 1){

  # Normalize the URL so it ends in "/<langCode>" before fetching; see
  # cleanGoogleURL() for the repair rules.
  googleURL <- cleanGoogleURL(googleURL = googleURL, langCode = langCode)

  # pd = patent data: fetch the page and parse the HTML into a node tree.
  pd1 <- httr::GET(url = googleURL)
  pd2 <- XML::htmlParse(pd1)
  # US-style grant pages mark each claim with <div class="claim">.
  # future mode will have an input vector of options to choose from
  pd3 <- XML::getNodeSet(pd2, "//div[@class='claim']")

  # pc = patent claim: extract the whichClaim-th claim node if it exists.
  if(length(pd3)>=whichClaim){

    # US-style markup matched: take the text of the requested claim div.
    pc <- XML::xmlValue(pd3[[whichClaim]])

  }
  else{
    # WO/EP-style pages use <claim> elements instead of divs.
    pd3 <- XML::getNodeSet(pd2, "//claim")
    if(length(pd3)>=whichClaim){
      pc <- XML::xmlValue(pd3[[whichClaim]])


    } else{
      # catch all: neither markup style yielded enough claims.
      pc <- ""
    }

  }

  # For English pages, try to keep only the untranslated ("notranslate") spans
  # of the claim, dropping machine-translated duplicate text.
  # NOTE(review): at this point pd3 may hold the "//claim" node set assigned in
  # the else-branch above, so this length check runs against whichever query
  # executed last -- confirm that is intended.
  if(langCode == "en" && (length(pd3) >= whichClaim) ){

    # NOTE(review): this first assignment is dead code -- it is immediately
    # overwritten by the broader contains() query on the next line.
    pd3 <- XML::getNodeSet(pd2, "//div[@class='claim']")
    pd3 <- XML::getNodeSet(pd2, "//div[contains(@class,'claim')]")
    # Collect the text of the notranslate spans inside the numbered claim div.
    pd4 <- XML::getNodeSet(pd3[[whichClaim]],
                           paste0("//div[@num=",whichClaim,"]//span[@class='notranslate']/text()"))
    pd5 <- paste((sapply(pd4, XML::xmlValue)), collapse = "")
    # if returns a value and less than the original printout
    # replace pc with the new "cleaner" version
    # note this may have issues and "too much" may be returned
    # requires further testing, (nchar(pd5) < nchar(pc)) may need to
    if( nchar(pd5) > 1 ){
      pc <- pd5
    }
    # Strip everything except letters, digits, and spaces.
    pc <- gsub("[^[:alnum:] ]","",pc)
  }


  # trim to remove new lines and numbering with a period
  # NOTE(review): the dot in "[0-9]." is unescaped, so this deletes each digit
  # plus whatever character follows it (not just "digit-period"); likely meant
  # to be "[0-9]\\." -- confirm before changing, as tests may depend on it.
  pc <- trimws(gsub("\\n|[0-9].", "", pc))
  # Collapse runs of whitespace into single spaces.
  pc <- gsub("\\s+"," ", pc)
  # return a trimmed version of the claim
  return(pc)

}
# later want to get the # of claims
# http://stackoverflow.com/questions/8702039/how-to-find-the-max-attribute-from-an-xml-document-using-xpath-1-0
# do something like this /library/book[@id = max(/library/book/@id)]
# may need to sleep to not call too many
# https://stat.ethz.ch/R-manual/R-devel/library/base/html/Sys.sleep.html




#' Sanitize a Google URL before attempting to extract data
#'
#' @description Clean up the google URL to make sure it will be read properly.
#'
#' If you use the \code{\link{createGoogleURL}} function, you won't have to use this function.
#' However, if you use your own generator or want to change the language, use this
#' function to do so.
180 | #' 181 | #' @param googleURL A character value of a google URL. 182 | #' @param langCode A language code, default set to "en" English. 183 | #' 184 | #' @return A clean character vector of a Google Patents URL. 185 | #' 186 | #' @export 187 | #' 188 | #' @examples 189 | #' 190 | #' cleanGoogleURL("https://patents.google.com/patent/US8818682B1/mx") 191 | #' cleanGoogleURL("https://patents.google.com/patent/US8818682B1/") 192 | #' cleanGoogleURL("https://patents.google.com/patent/US8818682B1") 193 | #' cleanGoogleURL("https://patents.google.com/patent/US8818682B1/en") 194 | #' 195 | #' @seealso \code{\link{createGoogleURL}} 196 | #' 197 | cleanGoogleURL <- function(googleURL, langCode="en"){ 198 | 199 | expr <- paste0("\\/",langCode) 200 | # if the last two digist are not the language code, attempt to fix it 201 | if(regexpr(expr,googleURL)==-1L){ 202 | 203 | # 3 types of errors 204 | # 1 /en <--> /mx replace lang code 205 | # 2 /en <--> / add lang code 206 | # 3 /en <--> '' doesn't exist, add lang code and backslash 207 | 208 | if(regexpr("\\/[A-Za-z]{2}$",googleURL)>-1L){ 209 | googleURL <- gsub("\\/[A-Za-z]{2}$",paste0("/",langCode),googleURL) 210 | } else if(regexpr("\\/$",googleURL)>-1L){ 211 | googleURL <- gsub("\\/$",paste0("/",langCode),googleURL) 212 | } else{ 213 | # warning, attempting to generate URL, this may fail 214 | googleURL <- gsub("$",paste0("/",langCode),googleURL) 215 | } 216 | 217 | } 218 | return(googleURL) 219 | } 220 | 221 | #' Get claims data for all rows in a data frame 222 | #' 223 | #' @description Generate claims data for all rows in a data frame. 224 | #' 225 | #' This is a wrapper function for the \code{\link{getClaimFromURL}} function. 226 | #' 227 | #' @param googleURLs A character vector of Google URLs 228 | #' @param langCode A language code, default set to "en" 229 | #' @param whichClaim Which claim (if available) to return. Default set to 1st. 
230 | #' 231 | #' @export 232 | #' 233 | #' @examples 234 | #' 235 | #' \dontrun{ 236 | #' cc <- extractCountryCode(acars$docNum) 237 | #' pn <- extractPubNumber(acars$docNum) 238 | #' kc <- extractKindCode(acars$docNum) 239 | #' urls <- createGoogleURL(countryCode = cc, pubNum = pn ,kindCode = kc) 240 | #' urls <- urls[1:4] 241 | #' clms <- getClaimsText(urls) 242 | #' clms[1] 243 | #' } 244 | #' @seealso \code{\link{createGoogleURL}}, \code{\link{cleanGoogleURL}}, 245 | #' \code{\link{getClaimFromURL}} 246 | #' 247 | getClaimsText <- function(googleURLs, langCode="en",whichClaim=1){ 248 | sapply(googleURLs, function(x){ 249 | getClaimFromURL(googleURL = x, langCode = langCode, whichClaim = whichClaim) 250 | }) 251 | } 252 | 253 | 254 | 255 | ## kamil bojanczk end -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | title: "patentr" 4 | date: "`r Sys.Date()`" 5 | author: "Kamil Bojanczyk, Yao Yang" 6 | --- 7 | 8 | 9 | 10 | ```{r, echo = FALSE} 11 | knitr::opts_chunk$set( 12 | collapse = TRUE, 13 | comment = "#>", 14 | fig.path = "README-" 15 | ) 16 | ``` 17 | 18 | Introducing `patentr`, the toolkit for patent data analysis in R. The summary md 19 | file currently holds all documentation. 20 | 21 | 22 | The package is aimed at patent agents, lawyers, managers, analysts, and 23 | academics who are working on patents. This may be used in a patent landscape 24 | analysis, company IP portfolio analysis, or a freedom to operate search. 25 | 26 | This is a data processing and reporting tool on patent data sets for 27 | patent analysts. The motivation comes from a lack of useful, exportable 28 | patent data. `patentr` builds upon the free data available from Sumobrain.com, 29 | Lens.org, and Google Patents, leveraging their data to summarize and analyze 30 | patents. 
`patentr` performs four key functions:

1. *Data input:* Easily **import** patent excel and csv files from the top patent websites
    + `CSV` from Google Patents and lens.org
    + `xlsx` from sumobrain.com
2. *Data cleaning:* **Sanitize** patent data and extract useful metadata for custom analyses
    + Clean up important fields such as names, dates, country codes, and kind codes
    + Infer the document type so that you don't analyze the same patent twice
    + Deduplicate data sets and prioritize grants over applications
    + Use the generated Google URL to jump to the patent document or to download
    claim data using the included `httr` and `XML` functions
3. *Exploratory data analysis:* **Explore** patent data and quickly **visualize** important attributes
    + Quickly summarize patent data by relevant columns to get document count
    + View standard histogram, tile, and facet plots of important information
    + Extract claim information for **wordcloud** analysis
    + Interact with your data on the **Shiny** user interface
4. *Reporting:* Export your data as **powerpoints** and **PDFs**
    + Browse through the **many example plots**
    + Download your charts locally as a **PDF** or make your own **PPTX**

**Note:** The Shiny app works only with `xlsx` data. Simply upload the data file,
click "clean", and then you can view a straightforward graph and the raw data.

There are three core data sets available, all based on autonomous car patent sets:
`acars` (from Sumobrain.com), `acarsLens`, and `acarsGoogle`. All data sets are
reproducible and their sources can be found in their documentation.

## Data Input and Data Sources

Choose your data from Sumobrain.com for excel files, or Lens.org and Google Patents
for `csv` files.
You can read in patent data files from publicly available sources and clean the
data into a more useful, usable format for further analysis. `patentr` has an
**interactive** browser that allows you to choose a **list** of files of xlsx
format. Alternatively, you can read in your own `csv` files.

```{r, eval=FALSE}
# read in xlsx files
file1 <- system.file("extdata/", "sumobrain_autonomous_search1.xlsx", package="patentr")
file2 <- system.file("extdata/", "sumobrain_autonomous_search2.xlsx", package="patentr")
files <- list(file1, file2)
ipData <- importPatentData(rawDataFilePath = files, skipLines = 1)
# example 2 -- a popup window appears for you to choose xlsx files
filePaths <- chooseFiles()
allData <- importPatentData(filePaths)
# example 3 -- read in csv files
google <- read.csv(system.file("testData/", "google_autonomous_search.csv", package = "patentr"),
                   skip = skipGoogle, stringsAsFactors = FALSE)
google <- data.frame(lapply(google, function(x){iconv(x, to = "ASCII")}), stringsAsFactors = FALSE)
```


## Clean Data

There are ten different cleaning functions available, all wrapped up nicely into
the `cleanPatentData` function. This single function can save you hours of work
cleaning and processing your data. Read the documentation carefully, as there are
a number of time-saving preloaded variables to name the columns, process the
dates, clean up the assignee names, and much more.

For excel files, use the `cleanPatentData` function directly. For csv files,
use the pre-processing lines below.

Clean data uses `extract` functions that take in character vectors and return
extracted metadata useful in patent data analysis. A master cleaner function
bundles all these functions together. The user also has the ability to use the
functions one-by-one for custom analysis.
```{r, eval=FALSE}
lensRawData <- system.file("extdata", "lens_autonomous_search.csv", package = "patentr")
lensRawData <- read.csv(lensRawData, stringsAsFactors = FALSE, skip = skipLens)
lensRawData <- data.frame(lapply(lensRawData,
  function(x){iconv(x, to = "ASCII")}), stringsAsFactors = FALSE)
lens <- cleanPatentData(patentData = lensRawData, columnsExpected = lensColumns,
  cleanNames = lensNames, dateFields = lensDateFields, dateOrders = lensDateOrder,
  deduplicate = TRUE, cakcDict = patentr::cakcDict, docLengthTypesDict = patentr::docLengthTypesDict,
  keepType = "grant", firstAssigneeOnly = TRUE, assigneeSep = ";;", stopWords = patentr::assigneeStopWords)
```


## Exploratory Analysis

The exploratory analysis includes simple summaries and numerous graphs. Ideally,
a patent analyst needs to add the following columns to the cleaned data to make
full use of the package:

* score
* category
* feature 1 (main feature)
* feature 2 (secondary feature)

For the purpose of this first package, all examples come with a pre-built 0 to 3
score, 3 being the highest. Categories are also predefined, as is feature 1. These
are important variables that require days to weeks of a patent analyst's time, thus,
in future releases an expert-tagged data set will be available.

A simple example is the word cloud. We load a file, deduplicate it, and then
quickly view the top phrases. Another example is a simple facet that shows
the category of a patent technology, along with the major feature of that patent.
For example, every autonomous car technology category has a lane feature, as
staying in the lane for a car is important.
134 | 135 | ![Wordcloud](vignettes/Rplot01.png) ![Tile plot](vignettes/Rplot.png) 136 | ```{r warning=FALSE, echo = FALSE, message=FALSE, eval=FALSE} 137 | devtools::load_all() 138 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 139 | cleanNames = sumobrainNames, 140 | dateFields = sumobrainDateFields, 141 | dateOrders = sumobrainDateOrder, 142 | deduplicate = TRUE, 143 | cakcDict = patentr::cakcDict, 144 | docLengthTypesDict = patentr::docLengthTypesDict, 145 | keepType = "grant", 146 | firstAssigneeOnly = TRUE, 147 | assigneeSep = ";", 148 | stopWords = patentr::assigneeStopWords) 149 | 150 | # df <- dplyr::select(sumo, title, abstract) 151 | df <- sumo[,c("title","abstract")] 152 | wordCloudIt(df, excludeWords, minfreq = 20, 153 | random.order = FALSE, rot.per = 0.25) 154 | 155 | 156 | # note that in reality, you need a patent analyst to carefully score 157 | # these patents, the score here is for demonstrational purposes 158 | score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 159 | score[score>3] <- 3; score[score<0] <- 0 160 | sumo$score <- score 161 | sumo$assigneeSmall <- strtrim(sumo$assigneeClean,12) 162 | category <- c("system","control algorithm","product","control system", "communication") 163 | c <- round(rnorm(dim(sumo)[1],mean=2.5,sd=1.5)) 164 | c[c>5] <- 5; c[c<1] <- 1 165 | sumo$category <- category[c] 166 | feature1 <- c("adaptive", "park", "lane", NA,NA,NA,NA,NA, 167 | "brake", "steer","accelerate","deactivate") 168 | f <- round(rnorm(dim(sumo)[1],mean=5,sd=1)) 169 | l <- length(feature1) 170 | f[f>l] <- l; f[f<1] <- 1 171 | sumo$feature1 <- c(feature1,feature1[f])[1:dim(sumo)[1]] 172 | 173 | tilePlot(sumo, "category", "feature1", xangle = 90, xhjust=0) 174 | 175 | 176 | ``` 177 | 178 | 179 | 180 | ## Reporting 181 | The package allows the user to output a set of pre-defined plots and 182 | summary information. There are pdf and pptx options. 
183 | 184 | ```{r, eval = FALSE} 185 | sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns, 186 | cleanNames = sumobrainNames, dateFields = sumobrainDateFields, dateOrders = sumobrainDateOrder, 187 | deduplicate = TRUE, cakcDict = patentr::cakcDict, docLengthTypesDict = patentr::docLengthTypesDict, 188 | keepType = "grant", firstAssigneeOnly = TRUE, assigneeSep = ";", stopWords = patentr::assigneeStopWords) 189 | 190 | score <- round(rnorm(dim(sumo)[1],mean=1.4,sd=0.9)) 191 | score[score>3] <- 3; score[score<0] <- 0; sumo$score <- score 192 | sumo$assigneeSmall <- strtrim(sumo$assigneeClean,12) 193 | category <- c("system","control algorithm","product","control system", "communication") 194 | c <- round(rnorm(dim(sumo)[1],mean=2.5,sd=1.5)); c[c>5] <- 5; c[c<1] <- 1; sumo$category <- category[c] 195 | xVal = "category"; fillVal = "score"; facetVal = "assigneeSmall" 196 | # create a ppt 197 | ppt <- ReporteRs::pptx(title="IP Update") 198 | layoutTitleContent = "Title and Content" 199 | fp <- facetPlot(subset(sumo, score > 0), xVal, fillVal, facetVal, colors = patentr::scoreColors, recolor = FALSE) 200 | ppt <- addFullImagePptx(ppt, plot = fp, slide_layout = layoutTitleContent) 201 | # find a data folder and write it out to your folder 202 | out <- paste("data/",Sys.Date(),"_exampleChartRightTextLeft.pptx",sep='') 203 | ReporteRs::writeDoc(ppt, out) 204 | ``` 205 | 206 | 207 | ## Upcoming Features in the Second Release 208 | 209 | The next round of patent features will include: 210 | 211 | * Directly edit patent data in Shiny, or upload and redownload the data to 212 | Google Spreadsheets or excel 213 | * Utilize a custom template set to auto-generate a powerpoint presentation 214 | * Use supervised learning to semi-automate the classification of: 215 | + score 216 | + category 217 | + feature 1 (main feature) 218 | + feature 2 (secondary feature) -------------------------------------------------------------------------------- 
/R/reportPatentData.R:
--------------------------------------------------------------------------------
# reporting-related functions to generate ppt slides
## yang yao start



#' Add summary text to be used in a pptx slide
#'
#' @description Add a standard summarized text that will be used in
#' association with a plot.
#'
#' @param df A summarized patent data frame, summarized by one variable.
#' See \code{\link{summarizeColumns}}.
#' @param singular The name of the variable, singular version. A character string.
#' For example: assignee.
#' @param plural The name of the variable, plural version. A character string.
#' For example: assignees, with an 's'.
#' @param sumVar The vector of the variable to summarize, taken from the original
#' patent data set. For example \code{sumo$score} to summarize the score range.
#'
#' @return A length four character vector.
#'
#' @examples
#' sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns,
#'                         cleanNames = sumobrainNames,
#'                         dateFields = sumobrainDateFields,
#'                         dateOrders = sumobrainDateOrder,
#'                         deduplicate = TRUE,
#'                         cakcDict = patentr::cakcDict,
#'                         docLengthTypesDict = patentr::docLengthTypesDict,
#'                         keepType = "grant",
#'                         firstAssigneeOnly = TRUE,
#'                         assigneeSep = ";",
#'                         stopWords = patentr::assigneeStopWords)
#'
#' # note that in reality, you need a patent analyst to carefully score
#' # these patents, the score here is for demonstrational purposes
#' score <- round(rnorm(dim(sumo)[1], mean = 1.4, sd = 0.9))
#' score[score > 3] <- 3; score[score < 0] <- 0
#' sumo$score <- score
#' sumo$assigneeSmall <- strtrim(sumo$assigneeClean, 12)
#' category <- c("system","control algorithm","product","control system", "communication")
#' c <- round(rnorm(dim(sumo)[1], mean = 2.5, sd = 1.5))
#' c[c > 5] <- 5; c[c < 1] <- 1
#' sumo$category <- category[c]
#' feature1 <- c("adaptive", "park", "lane", NA,NA,NA,NA,NA,
#'               "brake", "steer","accelerate","deactivate")
#' f <- round(rnorm(dim(sumo)[1], mean = 5, sd = 1))
#' l <- length(feature1)
#' f[f > l] <- l; f[f < 1] <- 1
#' sumo$feature1 <- c(feature1, feature1[f])[1:dim(sumo)[1]]
#'
#' # Summarize the assignees
#' as <- summarizeColumns(sumo, 'assigneeSmall')
#' summaryText(as, 'assignee','assignees', sumo$score)
#' # summarize the number of features
#' f <- summarizeColumns(sumo, 'feature1', naOmit = TRUE)
#' summaryText(f, 'feature','features', sumo$feature1)
#'
#' @export
#'
summaryText <- function(df, singular, plural, sumVar){

  # range of the raw variable, e.g. "For entry range 0 to 3..."
  m1range <- paste0("For entry range ", capWord(min(sumVar, na.rm = TRUE)), " to ",
                    capWord(max(sumVar, na.rm = TRUE)), "...")

  # number of distinct summarized entries in the summary table
  m2size <- paste0("There are ", nrow(df), " ", plural, ".")

  # takes the LAST row of df as the top entry -- assumes summarizeColumns
  # returns rows sorted ascending by count (TODO confirm against that function)
  m3top <- paste0("Top ", singular, " is ", capWord(as.character(utils::tail(unlist(df[, 1]), 1))),
                  ", with ", as.character(utils::tail(unlist(df[, 2]), 1)), " documents.")

  # total document count across all summarized entries
  m4total <- paste0("Total IP count is ", sum(as.numeric(unlist(df[, 2]))), " documents.")

  c(m1range, m2size, m3top, m4total)
}




#' Add a PPTX slide with chart on the right and text on the left
#'
#' @description Generate a commonly-used PPTX slide format where the patent
#' chart is on the right and some text is on the left.
#'
#' This function automates a number of steps used in formatting a pptx slide.
#' It returns the ppt object with the new slide included.
#'
#' @param ppt A ppt object.
#' @param plot A plot object from ggplot2.
#' @param text A character vector of text, typically less than one paragraph
#' in size.
#' @param title A character title for a page.
#' @param slide_layout The name of a slide layout, the same name as the names in a .potx
#' powerpoint template file. Default is a Title and Content blank layout.
#' @param Poffx Plot image x position from left top, inches.
#' See \code{\link[ReporteRs]{addPlot}}. Default is 5.3.
#' @param Poffy Plot image y position from left top, inches.
#' See \code{\link[ReporteRs]{addPlot}}. Default is 0.
#' @param Pwidth Plot image width, inches.
#' See \code{\link[ReporteRs]{addPlot}}. Default is 8.
#' @param Pheight Plot image height, inches.
#' See \code{\link[ReporteRs]{addPlot}}. Default is 7.5
#' @param Toffx Text block x position from left top, inches.
#' See \code{\link[ReporteRs]{addParagraph}}. Default is 1.
#' @param Toffy Text block y position from left top, inches.
#' See \code{\link[ReporteRs]{addParagraph}}. Default is 2.
#' @param Twidth Text block width, inches.
#' See \code{\link[ReporteRs]{addParagraph}}. Default is 5.
#' @param Theight Text block height, inches.
#' See \code{\link[ReporteRs]{addParagraph}}. Default is 5.5.
#'
#' @return The pptx object with the new slide appended.
#'
#' @examples
#'
#' sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns,
#'                         cleanNames = sumobrainNames,
#'                         dateFields = sumobrainDateFields,
#'                         dateOrders = sumobrainDateOrder,
#'                         deduplicate = TRUE,
#'                         cakcDict = patentr::cakcDict,
#'                         docLengthTypesDict = patentr::docLengthTypesDict,
#'                         keepType = "grant",
#'                         firstAssigneeOnly = TRUE,
#'                         assigneeSep = ";",
#'                         stopWords = patentr::assigneeStopWords)
#'
#' # note that in reality, you need a patent analyst to carefully score
#' # these patents, the score here is for demonstrational purposes
#' score <- round(rnorm(dim(sumo)[1], mean = 1.4, sd = 0.9))
#' score[score > 3] <- 3
#' score[score < 0] <- 0
#' sumo$score <- score
#' sumo$assigneeSmall <- strtrim(sumo$assigneeClean, 12)
#' category <- c("system","control algorithm","product","control system", "communication")
#' c <- round(rnorm(dim(sumo)[1], mean = 2.5, sd = 1.5))
#' c[c > 5] <- 5; c[c < 1] <- 1
#' sumo$category <- category[c]
#' feature1 <- c("adaptive", "park", "lane", NA,NA,NA,NA,NA,
#'               "brake", "steer","accelerate","deactivate")
#' f <- round(rnorm(dim(sumo)[1], mean = 5, sd = 1))
#' l <- length(feature1)
#' f[f > l] <- l; f[f < 1] <- 1
#' sumo$feature1 <- c(feature1, feature1[f])[1:dim(sumo)[1]]
#'
#' flippedHistogram(sumo, "assigneeSmall","score", colors = scoreColors)
#' flippedHistogram(subset(sumo, score > 0), "assigneeSmall","score", colors = scoreColors)
#'
#' flippedHistogram(subset(sumo, score > 2), "assigneeSmall","docType", colors = scoreColors,
#'                  recolor = TRUE)
#'
#'
#' # create a ppt
#' ppt <- ReporteRs::pptx(title="IP Update")
#' # view the types of layouts available by default
#' # slide.layouts(ppt)
#' layoutTitleContent = "Title and Content"
#'
#' # first plot of top score (3)
#' asdt <- summarizeColumns(subset(sumo, score > 2), 'docType')
#' ppt <-
#'   addChartRightTextLeftPptx(ppt = ppt,
#'                             plot = flippedHistogram(subset(sumo, score > 2),
#'                                                     "assigneeSmall","docType",
#'                                                     colors = scoreColors,
#'                                                     recolor = TRUE),
#'                             text = summaryText(asdt, "doc type", "doc types",
#'                                                subset(sumo, score > 2)$docType),
#'                             title = "Doc Types for Top Score Docs",
#'                             slide_layout = layoutTitleContent)
#'
#' # top scores by assignee
#' ascore <- summarizeColumns(subset(sumo, score > 2), 'assigneeSmall')
#' ppt <-
#'   addChartRightTextLeftPptx(ppt = ppt,
#'                             plot = flippedHistogram(subset(sumo, score > 2),
#'                                                     "assigneeSmall","score",
#'                                                     colors = scoreColors,
#'                                                     recolor = FALSE),
#'                             text = summaryText(ascore, "assignee", "assignees",
#'                                                subset(sumo, score > 2)$assigneeSmall),
#'                             title = "Assignees with Top Scores",
#'                             slide_layout = layoutTitleContent)
#'
#'
#' # last plot is category
#' sc <- summarizeColumns(sumo, 'category')
#' ppt <-
#'   addChartRightTextLeftPptx(ppt = ppt,
#'                             plot = flippedHistogram(sumo, "category",
#'                                                     "score", colors = scoreColors,
#'                                                     recolor = TRUE),
#'                             text = summaryText(sc, "category", "categories", sumo$category),
#'                             title = "Categories and Scores",
#'                             slide_layout = layoutTitleContent)
#'
#' # find a data folder and write it out to your folder
#' # out <- paste("data/",Sys.Date(),"_exampleChartRightTextLeft.pptx",sep='')
#' # ReporteRs::writeDoc(ppt, out)
#'
#'
#' @seealso \code{\link[ReporteRs]{pptx}}, \code{\link{addFullImagePptx}}
#'
#'
#' @export
#'
#' @import ReporteRs
#'
addChartRightTextLeftPptx <- function(ppt, plot, text, title, slide_layout = "Title and Content",
                                      Poffx = 5.3, Poffy = 0, Pwidth = 8, Pheight = 7.5,
                                      Toffx = 1, Toffy = 2, Twidth = 5, Theight = 5.5){
  # add a new slide
  ppt <- ReporteRs::addSlide(ppt, slide.layout = slide_layout)
  # add the plot, it takes up slightly more than half (13.3in by 7.5in per slide)
  ppt <- ReporteRs::addPlot(ppt, print, x = plot,
                            offx = Poffx, offy = Poffy,
                            width = Pwidth, height = Pheight)
  # add in bullet point text
  ppt <- ReporteRs::addParagraph(ppt, text,
                                 par.properties = ReporteRs::parProperties(list.style = 'unordered', level = 1),
                                 offx = Toffx, offy = Toffy,
                                 width = Twidth, height = Theight)
  # add in title overlaid last
  ppt <- ReporteRs::addTitle(ppt, title)
  ppt
}




#' Add a full-sized plot image to a pptx
#'
#' @description Take a plot image from ggplot2 and size it to fit an entire
#' slide.
#'
#' @param ppt A ppt object to add a slide to.
#' @param plot A plot output object from ggplot2.
#' @param slide_layout A character value, slide layout, default value is
#' \code{"Title and Content"}.
#' @param w Width in inches, default set to max width 13.3
#' @param h Height in inches, default set to max height 7.5
#'
#'
#' @return a pptx object.
#'
#' @importFrom ReporteRs addSlide
#' @importFrom ReporteRs addPlot
#'
#' @examples
#' sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns,
#'                         cleanNames = sumobrainNames,
#'                         dateFields = sumobrainDateFields,
#'                         dateOrders = sumobrainDateOrder,
#'                         deduplicate = TRUE,
#'                         cakcDict = patentr::cakcDict,
#'                         docLengthTypesDict = patentr::docLengthTypesDict,
#'                         keepType = "grant",
#'                         firstAssigneeOnly = TRUE,
#'                         assigneeSep = ";",
#'                         stopWords = patentr::assigneeStopWords)
#'
#' # note that in reality, you need a patent analyst to carefully score
#' # these patents, the score here is for demonstrational purposes
#' score <- round(rnorm(dim(sumo)[1], mean = 1.4, sd = 0.9))
#' score[score > 3] <- 3; score[score < 0] <- 0
#' sumo$score <- score
#' sumo$assigneeSmall <- strtrim(sumo$assigneeClean, 12)
#' category <- c("system","control algorithm","product","control system", "communication")
#' c <- round(rnorm(dim(sumo)[1], mean = 2.5, sd = 1.5))
#' c[c > 5] <- 5; c[c < 1] <- 1
#' sumo$category <- category[c]
#'
#' xVal = "category"
#' fillVal = "score"
#' facetVal = "assigneeSmall"
#'
#' fp <- facetPlot(subset(sumo, score > 0), xVal, fillVal, facetVal, colors = patentr::scoreColors,
#'                 recolor = FALSE)
#'
#'
#' # create a ppt
#' ppt <- ReporteRs::pptx(title="IP Update")
#' # view the types of layouts available by default
#' # slide.layouts(ppt)
#' layoutTitleContent = "Title and Content"
#'
#' fp <- facetPlot(subset(sumo, score > 0), xVal, fillVal, facetVal, colors = patentr::scoreColors,
#'                 recolor = FALSE)
#' ppt <- addFullImagePptx(ppt, plot = fp, slide_layout = layoutTitleContent)
#' fp <- facetPlot(subset(sumo, score > 1), xVal, fillVal, facetVal, colors = patentr::scoreColors,
#'                 recolor = FALSE)
#' ppt <- addFullImagePptx(ppt, plot = fp, slide_layout = layoutTitleContent)
#' fp <- facetPlot(subset(sumo, score > 2), xVal, fillVal, facetVal, colors = patentr::scoreColors,
#'                 recolor = FALSE)
#' ppt <- addFullImagePptx(ppt, plot = fp, slide_layout = layoutTitleContent)
#'
#'
#' # find a data folder and write it out to your folder
#' # out <- paste("data/",Sys.Date(),"_exampleChartRightTextLeft.pptx",sep='')
#' # ReporteRs::writeDoc(ppt, out)
#'
#'
#' @export
#'
#' @seealso \code{\link{addChartRightTextLeftPptx}}
#'
addFullImagePptx <- function(ppt, plot, slide_layout = "Title and Content",
                             w = 13.3, h = 7.5){
  # new slide, then stretch the plot across the full 13.3in x 7.5in canvas
  ppt <- ReporteRs::addSlide(ppt, slide.layout = slide_layout)
  ppt <- ReporteRs::addPlot(ppt, print, x = plot, offx = 0, offy = 0,
                            width = w, height = h)

  return(ppt)

}


#' Make a PDF output of a plot
#'
#' @description Make a PDF output of a plot.
#'
#' @param graph The graph object to input
#' @param name A character name to name your file, without the ".pdf" extension.
#' It can have a filepath as well.
#' @param w The width, in inches, of your image, default set to 12.
#' @param h The height, in inches, of your image, default set to 12.
#'
#'
#' @return Invisibly returns the path of the written PDF file.
#'
#' @examples
#'
#' sumo <- cleanPatentData(patentData = patentr::acars, columnsExpected = sumobrainColumns,
#'                         cleanNames = sumobrainNames,
#'                         dateFields = sumobrainDateFields,
#'                         dateOrders = sumobrainDateOrder,
#'                         deduplicate = TRUE,
#'                         cakcDict = patentr::cakcDict,
#'                         docLengthTypesDict = patentr::docLengthTypesDict,
#'                         keepType = "grant",
#'                         firstAssigneeOnly = TRUE,
#'                         assigneeSep = ";",
#'                         stopWords = patentr::assigneeStopWords)
#'
#' # df <- dplyr::select(sumo, title, abstract)
#' df <- sumo[, c("title","abstract")]
#' addPdfImage(wordCloudIt(df, excludeWords, minfreq = 20,
#'                         random.order = FALSE, rot.per = 0.25), "wordCloud")
#'
#' @export
#'
#'
addPdfImage <- function(graph, name = "image", w = 12, h = 12){
  name <- paste0(name, ".pdf")
  grDevices::pdf(name, width = w, height = h)
  # close the device even if print() errors, so no dangling graphics
  # device is left open (the original leaked the device on error)
  on.exit(grDevices::dev.off(), add = TRUE)
  print(graph)
  invisible(name)
}


## yang yao end
--------------------------------------------------------------------------------