├── DESCRIPTION
├── LICENSE
├── NAMESPACE
├── R
│   ├── accuracy.R
│   ├── asDocumentTermMatrix.R
│   ├── calcAccurateAmongTopK.R
│   ├── cosineSimilarity.R
│   ├── createDescriptives.R
│   ├── createSimilarityTableStringdist.R
│   ├── createSimilarityTableSubstring.R
│   ├── createSimilarityTableWordwiseStringdist.R
│   ├── data.R
│   ├── expandPredictionResults.R
│   ├── logLoss.R
│   ├── plotAgreementRateVsProductionRate.R
│   ├── plotReliabilityDiagram.R
│   ├── plotTruePredictionsVsFalsePredictions.R
│   ├── predictCreecysMemoryBasedReasoning.R
│   ├── predictGweonsNearestNeighbor.R
│   ├── predictLogisticRegressionWithPenalization.R
│   ├── predictSimilarityBasedReasoning.R
│   ├── predictWithCodingIndex.R
│   ├── predictXgboost.R
│   ├── prepare_German_coding_index_Gesamtberufsliste_der_BA.R
│   ├── produceResults.R
│   ├── removeFaultyAndUncodableAnswers_And_PrepareForAnalysis.R
│   ├── selectMaxProbMethod.R
│   ├── sharpness.R
│   ├── stringPreprocessing.R
│   ├── trainCreecysMemoryBasedReasoning.R
│   ├── trainGweonsNearestNeighbor.R
│   ├── trainLogisticRegressionWithPenalization.R
│   ├── trainSimilarityBasedReasoning.R
│   ├── trainSimilarityBasedReasoning2.R
│   └── trainXgboost.R
├── README.md
├── data
│   ├── coding_index_excerpt.RData
│   ├── frequent_phrases.RData
│   ├── kldb2010PlusFive.RData
│   ├── occupations.RData
│   ├── surveyCountsSubstringSimilarity.RData
│   └── surveyCountsWordwiseSimilarity.RData
└── man
    ├── accuracy.Rd
    ├── asDocumentTermMatrix.Rd
    ├── calcAccurateAmongTopK.Rd
    ├── coding_index_excerpt.Rd
    ├── cosineSimilarity.Rd
    ├── createDescriptives.Rd
    ├── createSimilarityTableStringdist.Rd
    ├── createSimilarityTableSubstring.Rd
    ├── createSimilarityTableWordwiseStringdist.Rd
    ├── expandPredictionResults.Rd
    ├── frequent_phrases.Rd
    ├── kldb2010PlusFive.Rd
    ├── logLoss.Rd
    ├── occupations.Rd
    ├── plotAgreementRateVsProductionRate.Rd
    ├── plotReliabilityDiagram.Rd
    ├── plotTruePredictionsVsFalsePredictions.Rd
    ├── predictCreecysMemoryBasedReasoning.Rd
    ├── predictGweonsNearestNeighbor.Rd
    ├── predictLogisticRegressionWithPenalization.Rd
    ├── predictSimilarityBasedReasoning.Rd
    ├── predictWithCodingIndex.Rd
    ├── predictXgboost.Rd
    ├── prepare_German_coding_index_Gesamtberufsliste_der_BA.Rd
    ├── produceResults.Rd
    ├── removeFaultyAndUncodableAnswers_And_PrepareForAnalysis.Rd
    ├── selectMaxProbMethod.Rd
    ├── sharpness.Rd
    ├── stringPreprocessing.Rd
    ├── surveyCountsSubstringSimilarity.Rd
    ├── surveyCountsWordwiseSimilarity.Rd
    ├── trainCreecysMemoryBasedReasoning.Rd
    ├── trainGweonsNearestNeighbor.Rd
    ├── trainLogisticRegressionWithPenalization.Rd
    ├── trainSimilarityBasedReasoning.Rd
    ├── trainSimilarityBasedReasoning2.Rd
    └── trainXgboost.Rd
/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: occupationCoding 2 | Type: Package 3 | Title: Supervised Learning for Occupation Coding 4 | Version: 0.1.0 5 | Author: Malte Schierholz 6 | Maintainer: Malte Schierholz 7 | Description: Occupation Coding refers to coding short verbal texts (survey answers) 8 | into an occupational classification. This package implements several supervised learning 9 | techniques to simplify its automation. More details can be found in an upcoming publication. 10 | See predict-functions for some examples of what this package can do. 
11 | Depends: R (>= 3.5.0) 12 | License: GPL-3 13 | RoxygenNote: 7.1.1 14 | Encoding: UTF-8 15 | LazyData: true 16 | Imports: 17 | text2vec (== 0.4.0), 18 | tm (== 0.6-2), 19 | stringr, 20 | ggplot2, 21 | data.table 22 | Suggests: 23 | glmnet, 24 | xgboost, 25 | SnowballC, 26 | mvtnorm, 27 | stringdist, 28 | Matrix, 29 | parallel, 30 | readxl 31 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(accuracy) 4 | export(asDocumentTermMatrix) 5 | export(calcAccurateAmongTopK) 6 | export(cosineSimilarity) 7 | export(createDescriptives) 8 | export(createSimilarityTableStringdist) 9 | export(createSimilarityTableSubstring) 10 | export(createSimilarityTableWordwiseStringdist) 11 | export(expandPredictionResults) 12 | export(logLoss) 13 | export(plotAgreementRateVsProductionRate) 14 | export(plotReliabilityDiagram) 15 | export(plotTruePredictionsVsFalsePredictions) 16 | export(predictCreecysMemoryBasedReasoning) 17 | export(predictGweonsNearestNeighbor) 18 | export(predictLogisticRegressionWithPenalization) 19 | export(predictSimilarityBasedReasoning) 20 | export(predictWithCodingIndex) 21 | export(predictXgboost) 22 | export(prepare_German_coding_index_Gesamtberufsliste_der_BA) 23 | export(produceResults) 24 | export(removeFaultyAndUncodableAnswers_And_PrepareForAnalysis) 25 | export(selectMaxProbMethod) 26 | export(sharpness) 27 | export(stringPreprocessing) 28 | export(trainCreecysMemoryBasedReasoning) 29 | export(trainGweonsNearestNeighbor) 30 | export(trainLogisticRegressionWithPenalization) 31 | export(trainSimilarityBasedReasoning) 32 | export(trainSimilarityBasedReasoning2) 33 | export(trainXgboost) 34 | import(data.table) 35 | import(ggplot2) 36 | import(text2vec) 37 | -------------------------------------------------------------------------------- /R/accuracy.R: -------------------------------------------------------------------------------- 1 | #' Accuracy 2 | #' 3 | #' Calculate accuracy \eqn{p = \frac{1}{n} \sum acc} and standard errors \eqn{\sqrt{\frac{1}{n} p (1-p)}}. 4 | #' 5 | #' Note that this function also works if \code{occupationalPredictionsAmongTopK} contains fewer than \code{n} individuals. 6 | #' 7 | #' @param occupationalPredictionsAmongTopK a data table created with \code{\link{calcAccurateAmongTopK}}. 
8 | #' @param n Number of unique persons in test data (may be larger than the number of persons in occupationalPredictionsAmongTopK if for some persons no codes were suggested) 9 | #' 10 | #' @return a data.table with columns N (number of unique IDs in occupationalPredictionsAmongTopK), acc (\code{sum(acc) / n}), se (standard error of acc), mean.in.subset.N (\code{mean(acc)}), and count.pred.prob0 (\code{sum(pred.prob == 0)}) 11 | #' @import data.table 12 | #' @export 13 | #' 14 | #' @examples 15 | #' # set up data 16 | #' data(occupations) 17 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 18 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 19 | #' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 20 | #' "Not precise enough for coding", "Student assistants") 21 | #' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 22 | #' 23 | #' ## split sample 24 | #' set.seed(3451345) 25 | #' n.test <- 50 26 | #' group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 27 | #' splitted.data <- split(proc.occupations, group) 28 | #' 29 | #' # train model and make predictions 30 | #' model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 31 | #' res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 32 | #' 33 | #' # expand to contain more categories than the initial ones 34 | #' res.proc1 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1") 35 | #' 36 | #' # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once. 
37 | #' res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2") 38 | #' res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete") 39 | #' 40 | #' calcAccurateAmongTopK(res.proc, k = 5)[,mean(acc), by = method.name] 41 | #' accuracy(calcAccurateAmongTopK(res.proc, k = 5), n = 50) 42 | #' accuracy(calcAccurateAmongTopK(res.proc, k = 1), n = 50) 43 | accuracy <- function(occupationalPredictionsAmongTopK, n) { 44 | 45 | occupationalPredictionsAmongTopK[, list(.N, acc = sum(acc) / n, se = sqrt(sum(acc) * (n - sum(acc)) / n^3), mean.in.subset.N = mean(acc), count.pred.prob0 = sum(pred.prob == 0)), by = method.name] # se = sqrt(1/n * p * (1-p)) = sqrt(n.true * n.false / n^3) 46 | 47 | } 48 | -------------------------------------------------------------------------------- /R/asDocumentTermMatrix.R: -------------------------------------------------------------------------------- 1 | #' Document-Term Matrix 2 | #' 3 | #' Constructs a document-term matrix. 4 | #' 5 | #' @param input a character vector. 6 | #' @param vect.vocab a vocabulary created with \code{\link[text2vec:vectorizers]{vocab_vectorizer}}. If \code{NULL}, the vocabulary is created from the input. See example for a typical use case. 7 | #' @param stopwords character vector of stopwords to exclude when creating the vocabulary. \code{tm::stopwords("de")} provides German stopwords. 8 | #' @param stemming \code{NULL} for no stemming and \code{"de"} for stemming using the German porter stemmer. 9 | #' @param type character, one of c("dgCMatrix", "dgTMatrix", "lda_c") taken from \code{\link[text2vec]{create_dtm}}. \code{dgCMatrix} are useful for glmnet; \code{dgTMatrix} matrix refers to sparse matrices in triplet form, i.e. positions of all non-zero values are stored (easier to work with, but non-unique). 10 | #' 11 | #' @return A list with two elements 12 | #' \describe{ 13 | #' \item{dtm}{a sparse document-term-matrix, depending on the \code{type}-parameter} 14 | #' \item{vect.vocab}{a vocabulary that can be inserted as \code{vect.vocab} to build a document term matrix on new data with the same vocabulary.} 15 | #' } 16 | #' 17 | #' @seealso 18 | #' \url{http://text2vec.org/vectorization.html} for details on the implementation used here, 19 | #' another implementation \code{\link[tm]{TermDocumentMatrix}} is slower 20 | #' 21 | #' @export 22 | #' 23 | #' @examples 24 | #' x <- c("Verkauf von Schreibwaren", "Verkauf", "Schreibwaren") 25 | #' asDocumentTermMatrix(x) 26 | #' asDocumentTermMatrix(x, type = "dgTMatrix") 27 | #' asDocumentTermMatrix(x, stopwords = tm::stopwords("de")) 28 | #' 29 | #' (x <- c("Verkauf von B\xfcchern, Schreibwaren", "Fach\xe4rzin f\xfcr Kinder- und Jugendmedizin im \xf6ffentlichen Gesundheitswesen", "Industriemechaniker", "Dipl.-Ing. 
- Agrarwirtschaft (Landwirtschaft)")) 30 | #' x <- stringPreprocessing(x) 31 | #' dtm <- asDocumentTermMatrix(x, stemming = "de") 32 | #' print(dtm$dtm) 33 | #' dimnames(dtm$dtm)[[2]] 34 | #' 35 | #' # use the newly created vocab_vectorizer 36 | #'(x <- stringPreprocessing(c("WILL NOT SHOW UP", "Verkauf von B\xfcchern, Schreibwaren", "Fach\xe4rzin f\xfcr Kinder- und Jugendmedizin"))) 37 | #' asDocumentTermMatrix(x, vect.vocab = dtm$vect.vocab, stopwords = character(0), stemming = "de")$dtm 38 | asDocumentTermMatrix <- function(input, vect.vocab = NULL, 39 | stopwords = character(0), 40 | stemming = NULL, 41 | type = c("dgCMatrix", "dgTMatrix", "lda_c")) { 42 | 43 | # tokenize with or without stemming? 44 | if (!is.null(stemming)) { 45 | if (!requireNamespace("SnowballC", quietly = TRUE)) { 46 | stop("Package \"SnowballC\" needed if you want to do stemming. Please install it.", 47 | call. = FALSE) 48 | } 49 | tok_fun <-function(x) { 50 | tokens <- text2vec::word_tokenizer(x) 51 | lapply(tokens, SnowballC::wordStem, language = stemming) 52 | } 53 | } else { 54 | tok_fun <- text2vec::word_tokenizer 55 | } 56 | 57 | # prep_fun = toupper 58 | it_train = text2vec::itoken(tolower(input), 59 | # preprocessor = prep_fun, 60 | tokenizer = tok_fun, 61 | progressbar = FALSE) 62 | if (is.null(vect.vocab)) { 63 | vocab = text2vec::create_vocabulary(it_train, stopwords = tolower(stopwords)) 64 | vect.vocab = text2vec::vocab_vectorizer(vocab) 65 | } 66 | input.dtm = text2vec::create_dtm(it_train, vect.vocab, type = type) 67 | return(list(dtm = input.dtm, vect.vocab = vect.vocab)) 68 | } 69 | -------------------------------------------------------------------------------- /R/calcAccurateAmongTopK.R: -------------------------------------------------------------------------------- 1 | #' Calculate aggregate properties for top k predicted categories 2 | #' 3 | #' Start with a data.table of class 'occupationalPredictions' (for each combination of pred.code and answer one prediction) and calulate if one of the top k entries is accurate. 4 | #' 5 | #' \code{num.suggested} and \code{general.among.top5} is currently not used. Relates to situations if the prediction algorithm does not provide all codes. 6 | #' 7 | #' @param occupationalPredictions a data.table created with a \code{\link{expandPredictionResults}}-function from this package. 8 | #' @param k how many top k categories to aggregate over? 9 | #' 10 | #' @seealso ... 
11 | #' 12 | #' @return a data.table 13 | #' @import data.table 14 | #' @export 15 | #' 16 | #' @examples 17 | #' # set up data 18 | #' data(occupations) 19 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 20 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 21 | #' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 22 | #' "Not precise enough for coding", "Student assistants") 23 | #' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 24 | #' 25 | #' ## split sample 26 | #' set.seed(3451345) 27 | #' n.test <- 50 28 | #' group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 29 | #' splitted.data <- split(proc.occupations, group) 30 | #' 31 | #' # train model and make predictions 32 | #' model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 33 | #' res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 34 | #' 35 | #' # expand to contain more categories than the initial ones 36 | #' res.proc1 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1") 37 | #' 38 | #' # aggregate over top k categories 39 | #' calcAccurateAmongTopK(res.proc1, k = 1)[,mean(acc)] 40 | #' calcAccurateAmongTopK(res.proc1, k = 5)[,mean(acc)] 41 | #' 42 | #' # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once. 
43 | #' res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2") 44 | #' res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete") 45 | #' 46 | #' calcAccurateAmongTopK(res.proc, k = 5)[,mean(acc), by = method.name] 47 | #' # res[, calcAccurateAmongTopK(.SD, k = 5), by = method][,mean(acc), by = method] 48 | calcAccurateAmongTopK <- function(occupationalPredictions, k = 1) { 49 | 50 | if (!("occupationalPredictionsComplete" %in% class(occupationalPredictions))){ 51 | stop("'occupationalPredictions' needs to have class 'occupationalPredictionsComplete' (= constructed with an expandPredictionResults method).") 52 | } 53 | 54 | # determine if a pred.code is accurate (acc) and get the (at most) k most probable pred.codes per id 55 | # only codes if they were probable in the first place (among.suggested.code == TRUE) 56 | occupationalPredictions2 <- occupationalPredictions[among.suggested.code == TRUE, head(.SD[order(sample(1:.N, .N))][order(pred.prob, decreasing = TRUE), list(ans, pred.code, pred.prob, code, acc = code == pred.code, among.suggested.code)], k), by = list(id, method.name)] 57 | 58 | if (k == 1) { 59 | randomChoices <- merge(occupationalPredictions, occupationalPredictions2, by = c("pred.prob", "id", "method.name"))[, .N, by = list(method.name, id)][N > 2] 60 | 61 | if (nrow(randomChoices) > 0) { 62 | warning("Some ids have more than one most probable code (see above). Random choice made (or lowest pred.code selected?)") 63 | print(randomChoices) 64 | } 65 | } 66 | 67 | # calculate key statistics (pred.code: set of predicted codes among top k, pred.prob: sum of probabilities, acc: number of correct predictions, num.suggested: number (1-k), general.among.top5: indicator for any non-included code), 68 | return(occupationalPredictions2[, list(pred.code = paste(pred.code, collapse = ", "), pred.prob = sum(pred.prob), acc = sum(acc), num.suggested.codes = sum(!among.suggested.code), num.suggested = .N, general.among.top5 = "-9999" %in% pred.code), by = list(id, ans, code, method.name)]) 69 | } 70 | -------------------------------------------------------------------------------- /R/cosineSimilarity.R: -------------------------------------------------------------------------------- 1 | #' Cosine Similarity 2 | #' 3 | #' Calculate cosine similarity between every row in \code{matrix1} and every row in \code{matrix2}. 4 | #' 5 | #' Cosine similarity is a measure of similarity between two vectors \eqn{x} and \eqn{y} that measures the cosine of the angle between them. Since we consider positive vectors, its maximal value is 1 if both vectors are identical and its minimal value is 0 if \eqn{x \times y = 0}. 6 | #' 7 | #' The definition is: \eqn{similarity = (x \times y) / (||x|| \times ||y||) = (\sum_i x_i \times y_i) / (\sqrt{(\sum_i x_i^2)} \times \sqrt{(\sum_i y_i^2)})} 8 | #' 9 | #' @param matrix1 a matrix of type \code{dgCMatrix}. 10 | #' @param matrix2 a matrix of type \code{dgCMatrix}. 11 | #' 12 | #' @seealso \code{\link[Matrix]{Matrix}} 13 | #' 14 | #' @return A \code{dgCMatrix} where element \code{A[index1, index2]} is the cosine similarity between \code{matrix1[index1,]} and \code{matrix2[index2,]}. 
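# [Editor's aside, not package source] A quick numeric check of the cosine-similarity
# formula quoted above, on two made-up dense vectors.
x <- c(1, 0, 2)
y <- c(1, 1, 1)
sum(x * y) / (sqrt(sum(x^2)) * sqrt(sum(y^2)))  # 3 / sqrt(15) = 0.7745967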
15 | #' @export 16 | #' 17 | #' @examples 18 | #' x <- c("Verkauf von Schreibwaren", "Verkauf", "Schreibwaren", "Industriemechaniker", "NOTINDOCUMENTTERMMATRIX") 19 | #' (y <- c("Verkauf von B\xfcchern, Schreibwaren", "Fach\xe4rzin f\xfcr Kinder- und Jugendmedizin im \xf6ffentlichen Gesundheitswesen", "Industriemechaniker", "Dipl.-Ing. - Agrarwirtschaft (Landwirtschaft)")) 20 | #' 21 | #' tok_fun = text2vec::word_tokenizer 22 | #' it_train = text2vec::itoken(tolower(y), tokenizer = tok_fun, progressbar = FALSE) 23 | #' vocab = text2vec::create_vocabulary(it_train) 24 | #' vect.vocab = text2vec::vocab_vectorizer(vocab) 25 | #' 26 | #' matrix1 <- asDocumentTermMatrix(x, vect.vocab = vect.vocab)$dtm 27 | #' matrix2 <- asDocumentTermMatrix(y, vect.vocab = vect.vocab)$dtm 28 | #' 29 | #' cosineSimilarity(matrix1, matrix1) 30 | #' cosineSimilarity(matrix1, matrix2) 31 | cosineSimilarity <- function(matrix1, matrix2) { 32 | 33 | if (!inherits(matrix1, "dgCMatrix")) stop("matrix1 needs to have class dgCMatrix") 34 | if (!inherits(matrix2, "dgCMatrix")) stop("matrix2 needs to have class dgCMatrix") 35 | 36 | if (matrix1@Dim[2] != matrix2@Dim[2]) stop("matrix1 and matrix2 need to have identical numbers of columns.") 37 | 38 | similarity <- Matrix::tcrossprod(matrix1, matrix2) / outer(sqrt(Matrix::rowSums(matrix1^2)), sqrt(Matrix::rowSums(matrix2^2))) # row norms are square roots of summed squares; rowSums() without squaring is only equivalent for 0/1 matrices 39 | similarity@x[is.na(similarity@x)] <- 0 # if a row in either matrix1 or matrix2 is all zero, the denominator is 0 and similarity is NA -> replace with zero. 40 | 41 | return(similarity) 42 | } 43 | -------------------------------------------------------------------------------- /R/createDescriptives.R: -------------------------------------------------------------------------------- 1 | #' Describe occupational data 2 | #' 3 | #' Outputs the following descriptives: frequencies, mean, median, and max of the number of words; the 
number of codes used; and frequencies of special codes. 4 | #' 5 | #' @param data either a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}} or a character vector 6 | #' @param noninformative a character vector: if an answer is in \code{noninformative}, it is excluded 7 | #' 8 | #' @return Side effects only: a plot and text 9 | #' @import data.table 10 | #' @export 11 | #' 12 | #' @examples 13 | #' data <- data.table(answers = c("LEITER VERTRIEB", "Kfz-Schlossermeister", "Aushilfe im Hotel(Bereich Housekeeping)", "Studentische Hilfskraft"), 14 | #' codes = c("61194", "25213", "63221", "-0001")) 15 | #' (allowed.codes <- c("11101", "61194", "25213", "63221", "-0001")) 16 | #' (allowed.codes.titles <- c("Berufe in der Landwirtschaft (ohne Spezialisierung) - Helfer-/Anlernt\xe4tigkeiten", "Berufe in der Kraftfahrzeugtechnik - komplexe Spezialistent\xe4tigkeiten", "F\xfchrungskr\xe4fte - Einkauf und Vertrieb", "Berufe im Hotelservice - Helfer-/Anlernt\xe4tigkeiten", "a negative label (used for categories like 'student assistant' that are not in the official classification)")) 17 | #' data <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(data, colNames = c("answers", "codes"), allowed.codes, allowed.codes.titles) 18 | #' createDescriptives(data) 19 | #' 20 | #' (answer <- c("LEITER VERTRIEB", "Kfz-Schlossermeister", "Aushilfe im Hotel(Bereich Housekeeping)")) 21 | #' createDescriptives(answer) 22 | #' 23 | #' data(occupations) 24 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 25 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", 26 | #' "Gastronomy occupations (without specialisation)-skilled tasks", "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 27 | #' "Not precise enough for coding", "Student assistants") 28 | #' occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 29 | #' createDescriptives(occupations) 30 | createDescriptives <- function (data, noninformative = c("")) { 31 | if ("occupationData" %in% class(data)) { 32 | ans <- data[, gsub(" {2,}", " ", ans)] # collapse 2+ consecutive spaces into one 33 | } 34 | if (is.character(data)) { 35 | ans <- gsub(" {2,}", " ", data) 36 | } 37 | 38 | ans <- ans[!(ans %in% noninformative)] 39 | cat("Number of informative answers:", length(ans), "\n") 40 | 41 | # answer length 42 | ans_freq <- sapply(strsplit(ans, " "), length) 43 | plot(table(ans_freq)) 44 | cat("Frequency of Number of Words:\n") 45 | print(table(ans_freq)) 46 | cat("\n") 47 | cat("Mean Number of Words: ", mean(ans_freq), "\n") 48 | cat("Median Number of Words: ", median(ans_freq), "\n") 49 | cat("Maximal Number of Words: ", max(ans_freq), "\n") 50 | 51 | if (is.data.table(data)) { 52 | cat("Number of categories (with code > 0) from classification used: ", data[code > 0 & code %in% 
attr(data, "classification")[, code], .N, by = code][, .N], "\n") 53 | cat("Frequency of categories (wih code < 0) from classification:\n") 54 | print(data[code < 0 & code %in% attr(data, "classification")[, code], table(code)]) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /R/createSimilarityTableStringdist.R: -------------------------------------------------------------------------------- 1 | #' Similarity Table with Coding index 2 | #' 3 | #' Calculate string similarity between \code{unique.string} and \code{(coding_index_w_codes, coding_index_without_codes)}. 4 | #' 5 | #' Special function for similarity-based reasoning: creates distance data with osa-method c(d = 1, i = 1, s = 1, t = 1) 6 | #' dist == 0: strings in dict and data are identical 7 | # dist == 1: exactly one deletion, insertion, substitution, or transposition 8 | # dist == 2: exactly two deletion, insertion, substitution, or transposition, the majority is a mismatch with dist == 2 9 | #' 10 | #' @param unique.string a character vector (usually unique(answer)) 11 | #' @param coding_index_w_codes a data.table with columns "title" and "Code". 12 | #' @param coding_index_without_codes a character vector of additional titles 13 | #' @param dist.control a list that will be passed to \code{\link[stringdist:stringdist]{stringdistmatrix}}. Currently only two elements are implemented: 14 | #' \describe{ 15 | #' \item{method}{Method for distance calculation.} 16 | #' \item{weight}{For method='osa' or 'dl'.} 17 | #' } 18 | #' @param threshold All entries with distance above this threshold will be removed from the result 19 | #' 20 | #' @seealso \code{\link{trainSimilarityBasedReasoning}}, \code{\link{createSimilarityTableWordwiseStringdist}}, \code{\link{createSimilarityTableSubstring}} 21 | #' 22 | #' @return a list with elements 23 | #' \describe{ 24 | #' \item{dist_table_w_code}{a data.table with colummns \code{intString}, \code{dictString.title}, \code{dictString.Code}, \code{dist}} 25 | #' \item{dist_table_without_code}{\code{NULL} or a data.table with colummns \code{intString}, \code{dictString}, \code{dist}} 26 | #' \item{vect_vocab}{see \code{link{asDocumentTermMatrix}}} 27 | #' } 28 | #' 29 | #' @export 30 | #' @examples 31 | #' ## Prepare coding index 32 | #' # write female titles beneath the male title 33 | #' coding_index <- rbind(coding_index_excerpt[, list(title = bezMale, Code)], 34 | #' coding_index_excerpt[, list(title = bezFemale, Code)]) 35 | #' # standardize titles from the coding index 36 | #' coding_index <- coding_index[,title := stringPreprocessing(title)] 37 | #' # drop duplicate lines, might be suboptimal because we keep each title and its associated code only a single time. This means we delete duplicates and the associated, possibly relevant codes. 38 | #' coding_index <- coding_index[!duplicated(title)] 39 | #' 40 | #' (x <- c("Abgeordneter", "Abgeordneter", "Abgeordnete", "abgeordnet", "FSJ", "FSJ2", "Industriemechaniker", "Dipl.-Ing. 
- Agrarwirtschaft (Landwirtschaft)")) 41 | #' createSimilarityTableStringdist(unique.string = stringPreprocessing(x), 42 | #' coding_index_w_codes = coding_index, 43 | #' coding_index_without_codes = frequent_phrases, 44 | #' dist.control = list(method = "osa", weight = c(d = 1, i = 1, s = 1, t = 1)), 45 | #' threshold = 3) 46 | createSimilarityTableStringdist <- function(unique.string, 47 | coding_index_w_codes, 48 | coding_index_without_codes, 49 | dist.control = list(method = "osa", weight = c(d = 1, i = 1, s = 1, t = 1)), 50 | threshold = 3) { 51 | 52 | distmat <- stringdist::stringdistmatrix(coding_index_w_codes[, title], unique.string, method = dist.control$method, weight = dist.control$weight) 53 | dist_table_w_code <- data.table(intString = unique.string[which(distmat <= threshold, arr.ind = TRUE)[, 2]], dictString = coding_index_w_codes[, list(title, Code)][which(distmat <= threshold, arr.ind = TRUE)[, 1]], dist = distmat[which(distmat <= threshold, arr.ind = TRUE)]) 54 | 55 | dist_table_without_code <- NULL 56 | # do the same for coding_index_without_codes 57 | if (length(coding_index_without_codes) > 2) { 58 | distmat <- stringdist::stringdistmatrix(coding_index_without_codes, unique.string, method = dist.control$method, weight = dist.control$weight) 59 | dist_table_without_code <- data.table(intString = unique.string[which(distmat <= threshold, arr.ind = TRUE)[, 2]], dictString = coding_index_without_codes[which(distmat <= threshold, arr.ind = TRUE)[, 1]], dist = distmat[which(distmat <= threshold, arr.ind = TRUE)]) 60 | } 61 | 62 | return(list(dist_table_w_code = dist_table_w_code, dist_table_without_code = dist_table_without_code, vect_vocab = NULL)) 63 | 64 | } 65 | -------------------------------------------------------------------------------- /R/createSimilarityTableSubstring.R: -------------------------------------------------------------------------------- 1 | #' Similarity Table with Coding index 2 | #' 3 | #' Calculate SUBSTRING similarity between \code{unique.string} and \code{(coding_index_w_codes, coding_index_without_codes)}. unique.string and coding_index are similar if coding_index is a substring of unique.string. 4 | #' 5 | #' Special function for similarity-based reasoning: creates distance data with substring-method 6 | #' 7 | #' @param unique.string a character vector (usually unique(answer)) 8 | #' @param coding_index_w_codes a data.table with columns "title" and "Code". 
9 | #' @param coding_index_without_codes a character vector of additional titles 10 | #' 11 | #' @seealso \code{\link{trainSimilarityBasedReasoning}}, \code{\link{createSimilarityTableWordwiseStringdist}}, \code{\link{createSimilarityTableStringdist}} 12 | #' 13 | #' @return a list with elements 14 | #' \describe{ 15 | #' \item{dist_table_w_code}{a data.table with columns \code{intString}, \code{dictString.title}, \code{dictString.Code}, \code{dist}} 16 | #' \item{dist_table_without_code}{\code{NULL} or a data.table with columns \code{intString}, \code{dictString}, \code{dist}} 17 | #' \item{vect_vocab}{see \code{\link{asDocumentTermMatrix}}} 18 | #' } 19 | #' 20 | #' @export 21 | #' 22 | #' @examples 23 | #' ## Prepare coding index 24 | #' # write female titles beneath the male title 25 | #' coding_index <- rbind(coding_index_excerpt[, list(title = bezMale, Code)], 26 | #' coding_index_excerpt[, list(title = bezFemale, Code)]) 27 | #' # standardize titles from the coding index 28 | #' coding_index <- coding_index[,title := stringPreprocessing(title)] 29 | #' # drop duplicate lines, might be suboptimal because we keep each title and its associated code only a single time. This means we delete duplicates and the associated, possibly relevant codes. 30 | #' coding_index <- coding_index[!duplicated(title)] 31 | #' 32 | #' (x <- c("Abgeordneter", "Abgeordneter", "Abgeordnete", "abgeordnet", "FSJ", "FSJ2", "Industriemechaniker", "Dipl.-Ing. - Agrarwirtschaft (Landwirtschaft)")) 33 | #' createSimilarityTableSubstring(unique.string = stringPreprocessing(x), 34 | #' coding_index_w_codes = coding_index, 35 | #' coding_index_without_codes = frequent_phrases) 36 | createSimilarityTableSubstring <- function(unique.string, 37 | coding_index_w_codes, 38 | coding_index_without_codes) { 39 | 40 | 41 | # create distance data where the dictionary string is a substring of the survey string 42 | ##################################################### 43 | # find all indices of survey strings where coding_index[i,] is a substring 44 | distmat.ind <- lapply(1:nrow(coding_index_w_codes), FUN = function(i) grep(coding_index_w_codes[i, title], unique.string)) 45 | # bring them into data.table format 46 | length.ind <- sapply(distmat.ind, length) 47 | distmat.ind <- data.table(dict = rep(1:nrow(coding_index_w_codes), times = length.ind), u.string = unlist(distmat.ind)) 48 | # get strings instead of indices 49 | dist_table_w_code <- data.table(intString = unique.string[distmat.ind[, u.string]], dictString = coding_index_w_codes[, list(title, Code)][distmat.ind[, dict]], dist = 0L) 50 | 51 | dist_table_without_code <- NULL 52 | # do the same for coding_index_without_codes 53 | if (length(coding_index_without_codes) > 2) { 54 | distmat.ind <- lapply(1:length(coding_index_without_codes), FUN = function(i) grep(coding_index_without_codes[i], unique.string)) 55 | # bring them into data.table format 56 | length.ind <- sapply(distmat.ind, length) 57 | distmat.ind <- data.table(dict = rep(1:length(coding_index_without_codes), times = length.ind), u.string = unlist(distmat.ind)) 58 | # get strings instead of indices 59 | dist_table_without_code <- data.table(intString = unique.string[distmat.ind[, u.string]], dictString = coding_index_without_codes[distmat.ind[, dict]], dist = 0L) 60 | } 61 | 62 | return(list(dist_table_w_code = dist_table_w_code, dist_table_without_code = dist_table_without_code, vect_vocab = NULL)) 63 | } 64 | 
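# [Editor's aside, not package source] The substring logic above reduces to grep():
# a coding-index title matches an answer whenever it occurs verbatim inside it.
# 'titles' and 'answers' are invented illustrations, not data from the package.
titles  <- c("KOCH", "MECHANIKER")
answers <- c("BEIKOCH", "INDUSTRIEMECHANIKER", "LEHRER")
lapply(titles, function(title) grep(title, answers))  # KOCH matches answer 1, MECHANIKER matches answer 2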
-------------------------------------------------------------------------------- /R/createSimilarityTableWordwiseStringdist.R: -------------------------------------------------------------------------------- 1 | #' Wordwise Similarity Table with Coding index 2 | #' 3 | #' Calculate string similarity on a word basis between \code{unique.string} and \code{(coding_index_w_codes, coding_index_without_codes)}. 4 | #' 5 | #' Special function for similarity-based reasoning: creates WORDWISE distance data with osa-method c(d = 1, i = 1, s = 1, t = 1) 6 | #' --> allows correcting one letter in a single word and matching this word with the dictionary. 7 | #' This means: unique.string is split into words, and the word with the lowest osa-distance is used for matching (all such words in case of a tie) 8 | #' example: 9 | #' "KUESTER and HAUSMEISTER" has distance 0 to both dictString.title HAUSMEISTER and KUESTER. Because the word HAUSMEISTER has minimal distance, another dictString.title HAUMEISTER, which has dist = 1, is not included. 10 | #' 11 | #' @param unique.string a character vector (usually unique(answer)) 12 | #' @param coding_index_w_codes a data.table with columns "title" and "Code". 13 | #' @param coding_index_without_codes a character vector of additional titles 14 | #' @param preprocessing a list with elements 15 | #' \describe{ 16 | #' \item{stopwords}{a character vector, use \code{tm::stopwords("de")} for German stopwords.} 17 | #' \item{stemming}{\code{NULL} for no stemming and \code{"de"} for stemming using the German porter stemmer.} 18 | #' \item{strPreprocessing}{\code{TRUE} if \code{\link{stringPreprocessing}} shall be used.} 19 | #' \item{removePunct}{\code{TRUE} if \code{\link[tm]{removePunctuation}} shall be used.} 20 | #' } 21 | #' @param dist.control a list that will be passed to \code{\link[stringdist:stringdist]{stringdistmatrix}}. Currently only two elements are implemented: 22 | #' \describe{ 23 | #' \item{method}{Method for distance calculation.} 24 | #' \item{weight}{For method='osa' or 'dl'.} 25 | #' } 26 | #' @param threshold All entries with distance above this threshold will be removed from the result 27 | #' 28 | #' @seealso \code{\link{trainSimilarityBasedReasoning}}, \code{\link{createSimilarityTableStringdist}}, \code{\link{createSimilarityTableSubstring}} 29 | #' 30 | #' @return a list with elements 31 | #' \describe{ 32 | #' \item{dist_table_w_code}{a data.table with columns \code{intString}, \code{dictString.title}, \code{dictString.Code}, \code{dist}} 33 | #' \item{dist_table_without_code}{\code{NULL} or a data.table with columns \code{intString}, \code{dictString}, \code{dist}} 34 | #' \item{vect_vocab}{see \code{\link{asDocumentTermMatrix}}} 35 | #' } 36 | #' 37 | #' @export 38 | #' @examples 39 | #' ## Prepare coding index 40 | #' # write female titles beneath the male title 41 | #' coding_index <- rbind(coding_index_excerpt[, list(title = bezMale, Code)], 42 | #' coding_index_excerpt[, list(title = bezFemale, Code)]) 43 | #' # standardize titles from the coding index 44 | #' coding_index <- coding_index[,title := stringPreprocessing(title)] 45 | #' # drop duplicate lines, might be suboptimal because we keep each title and its associated code only a single time. This means we delete duplicates and the associated, possibly relevant codes. 46 | #' coding_index <- coding_index[!duplicated(title)] 47 | #' 48 | #' (x <- c("Abgeordneter", "Abgeordneter", "Abgeordnete", "abgeordnet", "abgeordnet zu xxx", "FSJ", "FSJ2", "Industriemechaniker", "Dipl.-Ing. 
- Agrarwirtschaft (Landwirtschaft)")) 49 | #' createSimilarityTableWordwiseStringdist(unique.string = stringPreprocessing(x), 50 | #' coding_index_w_codes = coding_index, 51 | #' coding_index_without_codes = frequent_phrases, 52 | #' preprocessing = list(stopwords = tm::stopwords("de"), stemming = NULL, strPreprocessing = TRUE, removePunct = FALSE), 53 | #' dist.control = list(method = "osa", weight = c(d = 1, i = 1, s = 1, t = 1)), 54 | #' threshold = 1) 55 | createSimilarityTableWordwiseStringdist <- function(unique.string, 56 | coding_index_w_codes, 57 | coding_index_without_codes, 58 | preprocessing, 59 | dist.control = list(method = "osa", weight = c(d = 1, i = 1, s = 1, t = 1)), 60 | threshold = 1) { 61 | 62 | # prepare text for efficient computation -> transform to sparse matrix 63 | matrix <- asDocumentTermMatrix(unique.string, vect.vocab = NULL, 64 | stopwords = preprocessing$stopwords, 65 | stemming = preprocessing$stemming, 66 | type = "dgTMatrix") 67 | 68 | distmat <- stringdist::stringdistmatrix(coding_index_w_codes[, title], toupper(matrix$dtm@Dimnames[[2]]), method = dist.control$method, weight = dist.control$weight) # calculate distances between each word in matrix1 and the coding_index_w_codes 69 | distmat.table <- data.table(word.id = which(distmat <= threshold, arr.ind = TRUE)[, 2], dictString = coding_index_w_codes[, list(title, Code)][which(distmat <= threshold, arr.ind = TRUE)[, 1]], dist = distmat[which(distmat <= threshold, arr.ind = TRUE)]) 70 | distmat.table <- merge(distmat.table, data.table(intString = unique.string[matrix$dtm@i + 1], word.id = matrix$dtm@j + 1), by = "word.id", allow.cartesian=TRUE) # we have now columns for each unique.string, word, dictString, dist 71 | # now keep all words that have at least one dictString with minimal distance 72 | distmat.table[, dist2 := min(dist), by = list(intString, word.id)] 73 | dist_table_w_code <- distmat.table[, .SD[dist == dist2, list(dictString.title, dictString.Code, dist)], by = intString] 74 | 75 | dist_table_without_code <- NULL 76 | # do the same for coding_index_without_codes 77 | if (length(coding_index_without_codes) > 2) { 78 | distmat <- stringdist::stringdistmatrix(coding_index_without_codes, toupper(matrix$dtm@Dimnames[[2]]), method = dist.control$method, weight = dist.control$weight) # calculate distances between each word in matrix1 and the coding_index 79 | distmat.table <- data.table(word.id = which(distmat <= threshold, arr.ind = TRUE)[, 2], dictString = coding_index_without_codes[which(distmat <= threshold, arr.ind = TRUE)[, 1]], dist = distmat[which(distmat <= threshold, arr.ind = TRUE)]) 80 | distmat.table <- merge(distmat.table, data.table(intString = unique.string[matrix$dtm@i + 1], word.id = matrix$dtm@j + 1), by = "word.id", allow.cartesian=TRUE) # we have now columns for each unique.string, word, dictString, dist 81 | # now keep all words that have at least one dictString with minimal distance 82 | distmat.table[, dist2 := min(dist), by = list(intString, word.id)] 83 | dist_table_without_code <- distmat.table[, .SD[dist == dist2, list(dictString, dist)], by = intString] 84 | } 85 | 86 | return(list(dist_table_w_code = dist_table_w_code, dist_table_without_code = dist_table_without_code, vect_vocab = matrix$vect.vocab)) 87 | 88 | } 89 | -------------------------------------------------------------------------------- /R/expandPredictionResults.R: -------------------------------------------------------------------------------- 1 | #' Expands predicted datasets to contain all allowed codes 2 
| #' 3 | #' Start with a data.table of class 'occupationalPredictions' (for each combination of pred.code and answer one prediction) and expand it to contain all allowed codes. 4 | #' 5 | #' The problem solved here is this: Most algorithms do not provide codes for all categories from the classification, because this would require that the categories are in the training data. This function expands the dataset and inserts very small probabilities (or 0) for classification codes that the training algorithm found impossible to predict. 6 | #' 7 | #' @param occupationalPredictions a data.table created with a \code{predict}-function from this package. 8 | #' @param allowed.codes a character vector of all allowed codes. 9 | #' @param method.name the name under which this method's results shall be reported. 10 | #' 11 | #' @seealso \code{\link{produceResults}} 12 | #' 13 | #' @return a data.table 14 | #' @import data.table 15 | #' @export 16 | #' 17 | #' @examples 18 | #' # set up data 19 | #' data(occupations) 20 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 21 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 22 | #' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 23 | #' "Not precise enough for coding", "Student assistants") 24 | #' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 25 | #' 26 | #' ## split sample 27 | #' set.seed(3451345) 28 | #' n.test <- 50 29 | #' group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 30 | #' splitted.data <- split(proc.occupations, group) 31 | #' 32 | #' # train model and make predictions 33 | #' model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 34 | #' res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 35 | #' 36 | #' expandPredictionResults(res, allowed.codes, method.name = "Logistic Regression") 37 | expandPredictionResults <- function(occupationalPredictions, allowed.codes, method.name) { 38 | 39 | if (!("occupationalPredictions" %in% class(occupationalPredictions))){ 40 | stop("'occupationalPredictions' needs to have class 'occupationalPredictions' (= constructed with a predict method).") 41 | } 42 | 43 | # 1. construct with CJ an expanded data.table that contains one row for every combination of id and allowed.code. 2. 
Merge with original data to expand that dataset 44 | res.complete <- merge(occupationalPredictions[, list(id, pred.code, ans, pred.prob, code, among.suggested.code = TRUE)], CJ(id = unique(occupationalPredictions[, id]), pred.code = allowed.codes), by = c("id", "pred.code"), all.y = TRUE) 45 | 46 | # special case: in some prediction datasets a code "-9999" was predicted that is a placeholder for all other codes. Use this probability 47 | if (occupationalPredictions[, any(pred.code == "-9999")]) { 48 | res.complete <- merge(res.complete, occupationalPredictions[pred.code == "-9999", list(pred.prob.insert = pred.prob, id, true.code.insert = code)], by = "id", all.x = TRUE) 49 | } else { 50 | res.complete <- merge(res.complete, occupationalPredictions[pred.code == min(pred.code), list(pred.prob.insert = 0, id, true.code.insert = code)], by = "id", all.x = TRUE) 51 | } 52 | 53 | # fill new rows with appropriate values 54 | res.complete[is.na(pred.prob), pred.prob := pred.prob.insert] 55 | res.complete[is.na(code), code := true.code.insert] 56 | res.complete[, true.code.insert := NULL] 57 | res.complete[, pred.prob.insert := NULL] 58 | 59 | # an indicator that tells us if the original algorithm came up with this code (among.suggested.code = FALSE if not) 60 | res.complete[is.na(among.suggested.code), among.suggested.code := FALSE] 61 | 62 | res.complete[, method.name := method.name] 63 | 64 | class(res.complete) <- c(class(res.complete), "occupationalPredictionsComplete") 65 | return(res.complete) 66 | } 67 | -------------------------------------------------------------------------------- /R/logLoss.R: -------------------------------------------------------------------------------- 1 | #' Log loss 2 | #' 3 | #' Calculate log loss \eqn{logloss = \frac{1}{N} \sum_n loss_n} and its standard error \eqn{\sqrt{\frac{1}{N(N-1)} \sum_n (loss_n - logloss)^2}} with \eqn{loss_n = -\sum_k y_{nk} \log_2 p_{nk}} 4 | #' 5 | #' Log loss is the negative mean log2-probability of the categories that actually realized; smaller values are better. 6 | #' 7 | #' @param occupationalPredictions a data.table created with an \code{\link{expandPredictionResults}}-function from this package. 
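# [Editor's aside, not package source] The log loss defined above, computed on made-up
# predicted probabilities of the categories that actually realized; mirrors the
# -mean(log2(pred.prob)) and sqrt(var(...)/N) expressions used in logLoss() below.
p.true <- c(0.8, 0.5, 0.25)                        # hypothetical pred.prob of the true codes
c(logscore = -mean(log2(p.true)),                  # (0.32 + 1 + 2) / 3 = 1.107
  se = sqrt(var(log2(p.true)) / length(p.true)))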
8 | #' 9 | #' @seealso \code{\link{sharpness}} 10 | #' 11 | #' @return a data.table 12 | #' @import data.table 13 | #' @export 14 | #' 15 | #' @examples 16 | #' # set up data 17 | #' data(occupations) 18 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 19 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 20 | #' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 21 | #' "Not precise enough for coding", "Student assistants") 22 | #' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 23 | #' 24 | #' ## split sample 25 | #' set.seed(3451345) 26 | #' n.test <- 50 27 | #' group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 28 | #' splitted.data <- split(proc.occupations, group) 29 | #' 30 | #' # train model and make predictions 31 | #' model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 32 | #' res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 33 | #' 34 | #' # expand to contain more categories than the initial ones 35 | #' res.proc1 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1") 36 | #' 37 | #' # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once. 
38 | #' res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2") 39 | #' res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete") 40 | #' 41 | #' logLoss(res.proc) 42 | logLoss <- function(occupationalPredictions) { 43 | 44 | if (!("occupationalPredictionsComplete" %in% class(occupationalPredictions))){ 45 | stop("'occupationalPredictions' needs to have class 'occupationalPredictionsComplete' (= constructed with an expandPredictionResults method).") 46 | } 47 | 48 | # calculate mean log loss +- standard errors 49 | res <- occupationalPredictions[code == pred.code, list(.N, logscore = -mean(log2(pred.prob)), se = sqrt(var(log2(pred.prob))/.N)), by = method.name] 50 | 51 | if (any(res[, is.infinite(logscore)])) { 52 | res[, type := "all observations (true log score)"] 53 | 54 | res2 <- occupationalPredictions[code == pred.code & pred.prob > 0, list(.N, logscore = -mean(log2(pred.prob)), se = sqrt(var(log2(pred.prob))/.N), type = "observations with pred.prob = 0 of true category excluded"), by = method.name] 55 | return(rbind(res, res2)) 56 | } else { 57 | return(res) 58 | } 59 | 60 | # calculate mean log loss 61 | # check that no other category is predicted with maximal probability that is lower than pred.prob (result should all be -9999) 62 | # res[, .SD[acc == TRUE | pred.code == "-9999"][which.min(pred.prob)], by = list(id, sim.name)][, .N, by = pred.code] 63 | # an example of how the probabilities found this way look 64 | # summary(res[, .SD[acc == TRUE | pred.code == "-9999"][which.max(pred.prob)], by = list(id, sim.name)][sim.name == "substring", list(pred.prob, log(pred.prob), log2(pred.prob))]) 65 | # if so, we may calculate the mean log loss (+- standard errors) 66 | # res[, .SD[acc == TRUE | pred.code == "-9999"][which.max(pred.prob)], by = list(id, sim.name)][, list(.N, logscore = -mean(log2(pred.prob)), se = sqrt(var(log2(pred.prob))/.N)), by = sim.name] 67 | 68 | } 69 | -------------------------------------------------------------------------------- /R/plotAgreementRateVsProductionRate.R: -------------------------------------------------------------------------------- 1 | #' Plot agreement rate vs. production rate 2 | #' 3 | #' Plot the proportion of answers coded correctly for any given production rate (proportion of \code{n} that is coded automatically). 4 | #' 5 | #' @param occupationalPredictionsAmongTopK a data table created with \code{\link{calcAccurateAmongTopK}}. 6 | #' @param n Number of unique persons in test data (may be larger than the number of persons in occupationalPredictionsAmongTopK if for some persons no codes were suggested) 7 | #' @param yintercept Where to draw a horizontal line? 8 | #' @param filename If a \code{filename} is specified the diagram will be saved under this name. 
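# [Editor's aside, not package source] The curve plotted by this function orders answers
# by decreasing pred.prob and traces cumulative accuracy against the share coded so far.
# 'acc' below is an invented 0/1 vector, already sorted by decreasing pred.prob.
acc <- c(1, 1, 0, 1, 0)
n <- 5
data.frame(production.rate = seq_len(n) / n, agreement.rate = cumsum(acc) / seq_len(n))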
9 | #' 10 | #' @seealso \code{\link{plotTruePredictionsVsFalsePredictions}}, \code{\link{calcAccurateAmongTopK}} 11 | #' 12 | #' @return a ggplot 13 | #' @import data.table 14 | #' @import ggplot2 15 | #' @export 16 | #' 17 | #' @examples 18 | #' # set up data 19 | #' data(occupations) 20 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 21 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 22 | #' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 23 | #' "Not precise enough for coding", "Student assistants") 24 | #' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 25 | #' 26 | #' ## split sample 27 | #' set.seed(3451345) 28 | #' n.test <- 50 29 | #' group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 30 | #' splitted.data <- split(proc.occupations, group) 31 | #' 32 | #' # train model and make predictions 33 | #' model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 34 | #' res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 35 | #' 36 | #' # expand to contain more categories than the initial ones 37 | #' res.proc1 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1") 38 | #' 39 | #' # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once. 40 | #' res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2") 41 | #' res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete") 42 | #' 43 | #' calcAccurateAmongTopK(res.proc, k = 5)[,mean(acc), by = method.name] 44 | #' plotAgreementRateVsProductionRate(calcAccurateAmongTopK(res.proc, k = 5), n = n.test, yintercept = 0.85) 45 | #' plotAgreementRateVsProductionRate(calcAccurateAmongTopK(res.proc, k = 1), n = n.test, yintercept = 0.85, filename = "test.pdf") 46 | plotAgreementRateVsProductionRate <- function(occupationalPredictionsAmongTopK, n, yintercept, filename = NULL) { 47 | 48 | print(paste0("If these numbers are different than the size of the test data ", n, ", make sure to understand what this means. 
(x-axis will end before 1)")) 49 | print(occupationalPredictionsAmongTopK[, .N, by = method.name]) 50 | 51 | # calculate coordinates to plot 52 | agreementProductionGraph <- occupationalPredictionsAmongTopK[, .SD[order(pred.prob, acc, decreasing = TRUE), list(y = cumsum(acc)/ 1:.N, x = 1:.N/n)], by = method.name] 53 | 54 | p <- ggplot(agreementProductionGraph, aes(x = x, y = y, colour = method.name, group = paste(method.name))) + 55 | geom_line() + 56 | geom_hline(yintercept = yintercept) + 57 | # geom_line(data = agreementProductionGraph[method == "dw averaged"], aes(x = x, y = y, colour = method, size = method)) 58 | scale_colour_manual(values = c("chocolate", "blue4", "red", c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"), c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"), c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"), c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")), # 59 | name = "Prediction Method", 60 | guide = guide_legend(direction = "horizontal", 61 | nrow = 2, ncol=4, 62 | title.position = "top", 63 | title.hjust = 0.5, 64 | label.position="right", 65 | label.hjust = 0, 66 | label.vjust = 0.5, 67 | reverse = TRUE 68 | # label.theme = element_text(angle = 90) 69 | )) + 70 | labs(x = "Production Rate", y = "Agreement Rate") + # , title = "Agreement Rates for most probable category at various production rates") + 71 | theme(legend.position="bottom") 72 | 73 | if (!is.null(filename)) ggsave(filename, plot = p, width = 7, height = 7) 74 | 75 | return(p) 76 | } 77 | 78 | -------------------------------------------------------------------------------- /R/plotReliabilityDiagram.R: -------------------------------------------------------------------------------- 1 | #' Reliability Diagram 2 | #' 3 | #' Plots the observed relative frequency of correctness against the forecasted probability. 4 | #' 5 | #' @param occupationalPredictions a data.table created with an \code{\link{expandPredictionResults}}-function from this package. 6 | #' @param k how many top k categories to aggregate over? 7 | #' @param num.codes Number of allowed categories in classification 8 | #' @param filename If a \code{filename} is specified the diagram will be saved under this name. 
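# [Editor's aside, not package source] The reliability diagram bins forecasts into deciles
# (cut(pred.prob, breaks = seq(0, 1, 0.1)), as in the function below) and compares the mean
# forecast probability with the observed relative frequency per bin. Invented data:
pred.prob <- c(0.05, 0.12, 0.18, 0.55, 0.61, 0.93)
acc       <- c(0, 0, 1, 1, 0, 1)
bin <- cut(pred.prob, breaks = seq(0, 1, 0.1))
aggregate(cbind(forecast.probability = pred.prob, observed.frequency = acc),
          by = list(bin = bin), FUN = mean)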
9 | #' 10 | #' @seealso \code{\link{sharpness}} 11 | #' 12 | #' @return a ggplot 13 | #' @import data.table 14 | #' @import ggplot2 15 | #' @export 16 | #' 17 | #' @examples 18 | #' # set up data 19 | #' data(occupations) 20 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 21 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 22 | #' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 23 | #' "Not precise enough for coding", "Student assistants") 24 | #' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 25 | #' 26 | #' ## split sample 27 | #' set.seed(3451345) 28 | #' n.test <- 50 29 | #' group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 30 | #' splitted.data <- split(proc.occupations, group) 31 | #' 32 | #' # train model and make predictions 33 | #' model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 34 | #' res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 35 | #' 36 | #' # expand to contain more categories than the initial ones 37 | #' res.proc1 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1") 38 | #' 39 | #' # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once. 
40 | #' res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2")
41 | #' res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete")
42 | #' 
43 | #' plotReliabilityDiagram(res.proc, k = 5, num.codes = length(allowed.codes) + 1) # + 1 because we introduced the code "12345" later
44 | #' plotReliabilityDiagram(res.proc, k = 1, num.codes = length(allowed.codes) + 1, filename = "test.pdf")
45 | plotReliabilityDiagram <- function(occupationalPredictions, k, num.codes, filename = NULL) {
46 | 
47 |   # same as calcAccurateAmongTopK but we also keep among.suggested.code == FALSE
48 |   occupationalPredictionsAmongTopK <- occupationalPredictions[, head(.SD[order(pred.prob, decreasing = TRUE), list(pred.prob, acc = code == pred.code)], k), by = list(id, method.name)][, list(pred.prob = sum(pred.prob), acc = sum(acc)), by = list(id, method.name)]
49 | 
50 |   print("Make sure the following matches the size of your test data (otherwise something went wrong).")
51 |   print(occupationalPredictionsAmongTopK[, .N, by = method.name])
52 | 
53 |   # calculate coordinates to plot: bin predictions into deciles of predicted probability
54 |   reliabilityDiagram <- occupationalPredictionsAmongTopK[, list(.N, forecast.probability = mean(pred.prob), observed.frequency = mean(acc)), by = list(cut(pred.prob, breaks = seq(0, 1, 0.1)), method.name)]
55 | 
56 |   p <- ggplot(reliabilityDiagram, aes(x = forecast.probability, y = observed.frequency, colour = method.name, group = paste(method.name))) +
57 |     geom_point(aes(size = N), shape = 15) + scale_size(range = c(0.01, 4)) + guides(size = "none") +
58 |     geom_line(linetype = "dotdash", size = 0.5) + # http://ggplot2.tidyverse.org/articles/ggplot2-specs.html
59 |     geom_abline(intercept = 0, slope = 1, size = 1) +
60 |     geom_hline(yintercept = 1/num.codes) +
61 |     scale_x_continuous(breaks = seq(0, 1, 0.1), limits = c(0, 1)) +
62 |     scale_y_continuous(breaks = seq(0, 1, 0.1), limits = c(0, 1)) +
63 |     scale_colour_manual(values = c("chocolate", "blue4", "red", rep(c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"), 4)),
64 |                         name = "Prediction Method",
65 |                         guide = guide_legend(direction = "horizontal",
66 |                                              nrow = 2, ncol = 4,
67 |                                              title.position = "top",
68 |                                              title.hjust = 0.5,
69 |                                              label.position = "right",
70 |                                              label.hjust = 0,
71 |                                              label.vjust = 0.5,
72 |                                              reverse = TRUE
73 |                                              # label.theme = element_text(angle = 90)
74 |                         )) +
75 |     labs(x = "Mean Forecast Probability within Decile", y = "Observed Relative Frequency of Agreement") +
76 |     theme(legend.position = "bottom")
77 | 
78 |   if (!is.null(filename)) ggsave(filename, plot = p, width = 7, height = 7)
79 | 
80 |   return(p)
81 | }
82 | 
--------------------------------------------------------------------------------
/R/plotTruePredictionsVsFalsePredictions.R:
--------------------------------------------------------------------------------
1 | #' Plot true predictions versus false predictions
2 | #' 
3 | #' Show how many predictions would be correct as a function of how many would be incorrect.
4 | #' 
5 | #' @param occupationalPredictionsAmongTopK a data.table created with \code{\link{calcAccurateAmongTopK}}.
6 | #' @param filename If a \code{filename} is specified, the diagram will be saved under this name.
7 | #' 
8 | #' @seealso \code{\link{plotAgreementRateVsProductionRate}}, \code{\link{calcAccurateAmongTopK}}
9 | #' 
10 | #' @return a ggplot
11 | #' @import data.table
12 | #' @import ggplot2
13 | #' @export
14 | #' 
15 | #' @examples
16 | #' # set up data
17 | #' data(occupations)
18 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030")
19 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks",
20 | #' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work",
21 | #' "Not precise enough for coding", "Student assistants")
22 | #' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles)
23 | #' 
24 | #' ## split sample
25 | #' set.seed(3451345)
26 | #' n.test <- 50
27 | #' group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test)))
28 | #' splitted.data <- split(proc.occupations, group)
29 | #' 
30 | #' # train model and make predictions
31 | #' model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5))
32 | #' res <- predictLogisticRegressionWithPenalization(model, splitted.data$test)
33 | #' 
34 | #' # expand to contain more categories than the initial ones
35 | #' res.proc1 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1")
36 | #' 
37 | #' # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once.
38 | #' res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2")
39 | #' res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete")
40 | #' 
41 | #' calcAccurateAmongTopK(res.proc, k = 5)[,mean(acc), by = method.name]
42 | #' plotTruePredictionsVsFalsePredictions(calcAccurateAmongTopK(res.proc, k = 5))
43 | #' plotTruePredictionsVsFalsePredictions(calcAccurateAmongTopK(res.proc, k = 1), filename = "test.pdf")
44 | plotTruePredictionsVsFalsePredictions <- function(occupationalPredictionsAmongTopK, filename = NULL) {
45 | 
46 |   print("Make sure this is the size of your test data.
(or count the missing ones as additional false predictions in the upper right)")
47 |   print(occupationalPredictionsAmongTopK[, .N, by = method.name])
48 | 
49 |   # calculate coordinates to plot
50 |   n.true.predictions.vs.n.false.predictions <- occupationalPredictionsAmongTopK[, .SD[order(pred.prob, decreasing = TRUE), list(y = cumsum(acc), x = cumsum(1 - acc))], by = method.name]
51 | 
52 |   p <- ggplot(n.true.predictions.vs.n.false.predictions, aes(x = x, y = y, colour = method.name, group = paste(method.name))) +
53 |     geom_line(size = 0.1, linetype = "solid") + scale_size(range = c(0, 1)) +
54 |     # geom_line(data = agreementProductionGraph[method == "dw averaged"], aes(x = x, y = y, colour = method, size = method))
55 |     scale_colour_manual(values = c("chocolate", "blue4", "red", rep(c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"), 4)),
56 |                         name = "Prediction Method",
57 |                         guide = guide_legend(direction = "horizontal",
58 |                                              nrow = 2, ncol = 4,
59 |                                              title.position = "top",
60 |                                              title.hjust = 0.5,
61 |                                              label.position = "right",
62 |                                              label.hjust = 0,
63 |                                              label.vjust = 0.5,
64 |                                              reverse = TRUE
65 |                                              # label.theme = element_text(angle = 90)
66 |                         )) +
67 |     labs(x = "No. of False Predictions", y = "No. of True Predictions") +
68 |     theme(legend.position = "bottom")
69 | 
70 |   if (!is.null(filename)) ggsave(filename, plot = p, width = 7, height = 7)
71 | 
72 |   return(p)
73 | }
74 | 
75 | 
--------------------------------------------------------------------------------
/R/predictWithCodingIndex.R:
--------------------------------------------------------------------------------
1 | #' Code answers with a coding index
2 | #' 
3 | #' Look up the correct code in a coding index. Depending on the answer, this look-up may return no code, exactly one code, or several possible codes.
4 | #' 
5 | #' @param newdata either a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}} or a character vector.
6 | #' @param coding_index a data.table as created with function \code{\link{prepare_German_coding_index_Gesamtberufsliste_der_BA}}
7 | #' @param include.substrings (default: \code{FALSE}). If \code{FALSE}, a match is found if, after preprocessing, the entry from the coding index and the string-element are exactly identical. If \code{TRUE} (attention: this is slow!), a match is found if, after preprocessing, the entry from the coding index is a substring of the string-element.
8 | #' @param max.count.categories (default: \code{Inf}). Should we search the whole coding index (default) or should we exclude entries with large \code{count_categories}, an indicator of job title ambiguity? Only entries in the coding index with \code{count_categories \eqn{\le} max.count.categories} are searched.
9 | #' 
10 | #' @seealso
11 | #' \code{\link{predictSimilarityBasedReasoning}}
12 | #' 
13 | #' @return a data.table with columns id, ans, and pred.code (the format is not comparable to the other prediction formats in this package; multiple codes are comma-separated).
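#' 
#' # Shape of the returned table, with hypothetical answers and placeholder codes
#' # (real codes depend on the coding index used):
#' #  id        ans   pred.code
#' #   1    KELLNER        <NA>     # no match in the coding index
#' #   2 VERKAEUFER       12345     # exactly one code found
#' #   3 ERZIEHERIN 12345,67890     # several codes found, comma-separated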
14 | #' @export 15 | #' 16 | #' @examples 17 | #' # set up data 18 | #' data(occupations) 19 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 20 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 21 | #' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 22 | #' "Not precise enough for coding", "Student assistants") 23 | #' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 24 | #' 25 | #' # recommended default 26 | #' res <- predictWithCodingIndex(proc.occupations, 27 | #' coding_index = coding_index_excerpt, 28 | #' include.substrings = FALSE, 29 | #' max.count.categories = Inf) 30 | #' 31 | #' # playing around with the parameters to obtain other results 32 | #' res <- predictWithCodingIndex(proc.occupations, 33 | #' coding_index = coding_index_excerpt, 34 | #' include.substrings = TRUE, 35 | #' max.count.categories = 15) 36 | #' 37 | #' ################################# 38 | #' # Analysis: Standard functions from this package won't work here. 
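#' # (The reason: predictWithCodingIndex returns a single pred.code column without
#' # predicted probabilities, so the 'occupationalPredictions' analysis functions,
#' # e.g. expandPredictionResults or calcAccurateAmongTopK, do not apply.)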
39 | #' # Absolute numbers: either nothing is predicted (nPredictedCodes = NA), or one or more codes are predicted (each code has 5 characters and codes are comma-separated, hence the division by 6)
40 | #' res[ , .N, by = list(nPredictedCodes = 1 + nchar(pred.code) %/% 6 )]
41 | #' # Relative numbers
42 | #' res[ , .N / res[, .N], by = list(nPredictedCodes = 1 + nchar(pred.code) %/% 6 )]
43 | #' # Agreement rate among answers where only a single code was predicted
44 | #' res[nchar(pred.code) == 5, mean(pred.code == code)]
45 | predictWithCodingIndex <- function(newdata, coding_index,
46 |                                    include.substrings = FALSE, max.count.categories = Inf) {
47 | 
48 |   # get character vector depending on type of input
49 |   if ("occupationData" %in% class(newdata)) {
50 |     ans <- newdata[, ans]
51 |     if (exists("id", newdata)) {
52 |       id <- newdata[, id]
53 |     } else {
54 |       id <- 1:length(ans)
55 |     }
56 |   }
57 |   if (is.character(newdata)) {
58 |     ans <- newdata
59 |     id <- 1:length(ans)
60 |   }
61 | 
62 |   ###########################################
63 |   ######    prepare coding index       ######
64 | 
65 |   # write female titles beneath the male titles
66 |   coding_index <- rbind(coding_index[, list(title = bezMale, Code, count_categories)],
67 |                         coding_index[, list(title = bezFemale, Code, count_categories)])
68 |   # drop duplicate lines
69 |   coding_index <- coding_index[,.N, by = list(title, Code, count_categories)
70 |                                ][,list(title, Code, count_categories)]
71 | 
72 |   # standardize titles from the coding index
73 |   coding_index <- coding_index[,title := stringPreprocessing(title)]
74 | 
75 | 
76 |   ###########################################
77 |   ####### do coding with the dictionary #####
78 |   string.proc <- stringPreprocessing(ans)
79 |   if (!include.substrings) {
80 |     # substrings are not considered as matches
81 |     res <- sapply(string.proc, function(str) {
82 |       kldb_code <- coding_index[title == str & count_categories <= max.count.categories,
83 |                                 paste(unique(Code), collapse = ",")]
84 |       if (length(kldb_code) == 0 || kldb_code == "") kldb_code <- NA # check length first: comparing a zero-length vector inside if() would fail
85 |       return(kldb_code)
86 |     })
87 |   } else {
88 |     # substrings are considered as matches
89 | 
90 |     # loop through coding_index$titles
91 |     coding_index_matches <- sapply(coding_index$title, grepl, string.proc)
92 |     # loop through string.proc
93 |     res <- apply(coding_index_matches, 1, function(vec) coding_index[vec & count_categories <= max.count.categories, paste(unique(Code), collapse = ",")])
94 |     res[res == ""] <- NA # an empty string means no code was found
95 |   }
96 | 
97 |   res <- data.table(id, ans, pred.code = res)
98 | 
99 |   # add additional columns from new data to the result
100 |   if ("occupationData" %in% class(newdata)) {
101 |     for (i in seq_along(names(newdata))) {
102 |       if (names(newdata)[i] != "ans") {
103 |         set(res, i = NULL, j = names(newdata)[i], value = newdata[, i, with = FALSE])
104 |       }
105 |     }
106 | 
107 |     # class(res) <- c(class(res), "occupationalPredictions") Only if we had probabilities for more than one category. One could adapt the above function to deliver this.
108 |   }
109 | 
110 |   return(res)
111 | }
112 | 
--------------------------------------------------------------------------------
/R/produceResults.R:
--------------------------------------------------------------------------------
1 | #' Produces summaries of predictive performance
2 | #' 
3 | #' Produces \code{\link{accuracy}}, \code{\link{plotReliabilityDiagram}}, \code{\link{sharpness}}, \code{\link{logLoss}}, \code{\link{plotTruePredictionsVsFalsePredictions}}, and \code{\link{plotAgreementRateVsProductionRate}}.
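#' 
#' # Typical call pattern, condensed from the example below:
#' # res.proc <- expandPredictionResults(res, allowed.codes, method.name = "glmnet")
#' # produceResults(res.proc, k = 1, n = n.test, num.codes = length(allowed.codes))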
4 | #' 
5 | #' @param occupationalPredictions a data.table created with an \code{\link{expandPredictionResults}}-function from this package.
6 | #' @param k the number of top categories to aggregate over.
7 | #' @param n the number of unique persons in the test data (may be larger than the number of persons in \code{occupationalPredictions} if no codes were suggested for some persons)
8 | #' @param num.codes the number of allowed categories in the classification.
9 | #' 
10 | #' @seealso \code{\link{calcAccurateAmongTopK}}
11 | #' 
12 | #' @return a data.table
13 | #' @import data.table
14 | #' @export
15 | #' 
16 | #' @examples
17 | #' # set up data
18 | #' data(occupations)
19 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030")
20 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks",
21 | #' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work",
22 | #' "Not precise enough for coding", "Student assistants")
23 | #' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles)
24 | #' 
25 | #' ## split sample
26 | #' set.seed(3451345)
27 | #' n.test <- 50
28 | #' group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test)))
29 | #' splitted.data <- split(proc.occupations, group)
30 | #' 
31 | #' # train model and make predictions
32 | #' model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5))
33 | #' res <- predictLogisticRegressionWithPenalization(model, splitted.data$test)
34 | #' 
35 | #' # expand to contain more categories than the initial ones
36 | #' res.proc1 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1")
37 | #' 
38 | #' # aggregate over top k categories
39 | #' calcAccurateAmongTopK(res.proc1, k = 1)[,mean(acc)]
40 | #' calcAccurateAmongTopK(res.proc1, k = 5)[,mean(acc)]
41 | #' 
42 | #' # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once.
43 | #' res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2")
44 | #' res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete")
45 | #' 
46 | #' produceResults(res.proc, k = 1, n = n.test, num.codes = length(allowed.codes) + 1)
47 | produceResults <- function(occupationalPredictions, k = 1, n, num.codes) {
48 | 
49 |   if (!("occupationalPredictionsComplete" %in% class(occupationalPredictions))){
50 |     stop("'occupationalPredictions' needs to have class 'occupationalPredictionsComplete' (i.e., constructed with an expandPredictionResults method).")
51 |   }
52 | 
53 |   no.individuals <- occupationalPredictions[, length(unique(id))]
54 |   print(paste("Number of individuals:", no.individuals))
55 |   print(paste("Expected Number of rows in dataset:", no.individuals*num.codes, "=", no.individuals, "Individuals *", num.codes, "Codes"))
56 |   print("Observed Number of rows in dataset:")
57 | 
58 |   print(occupationalPredictions[, .N, by = method.name])
59 | 
60 |   accurateAmong <- calcAccurateAmongTopK(occupationalPredictions, k = k)
61 | 
62 |   print("## Agreement Rate (at 100% production rate):")
63 |   print(accuracy(accurateAmong, n = n))
64 | 
65 |   print("## Plot Reliability Diagram")
66 |   print(plotReliabilityDiagram(occupationalPredictions, k = k, num.codes = num.codes))
67 | 
68 |   print("## Sharpness:")
69 |   print(sharpness(occupationalPredictions))
70 | 
71 |   print("## Log2 loss:")
72 |   print(logLoss(occupationalPredictions))
73 | 
74 |   print("## Plot True Predictions vs False Predictions")
75 |   print(plotTruePredictionsVsFalsePredictions(accurateAmong))
76 | 
77 |   print("## Plot Agreement Rate vs Production Rate")
78 |   print(plotAgreementRateVsProductionRate(accurateAmong, n = n, yintercept = 0.85))
79 | 
80 |   # calculate AUC (I don't like ROC because the number of positive conditions (used in the tpr/recall) is not meaningful as a denominator here)
81 |   # library(verification)
82 |   # calcAccurateAmongTopK(occupationalPredictions, k = k)[, verification::roc.area(acc, pred.prob), by = sim.name] # used in the paper
83 | 
84 |   # ###########################
85 |   # # idea: the absolute number in a category should be similar to the predicted number (aggregates per category). But the results are a bit strange... should this be included in the paper?
86 |   # 
87 |   # # calculate residuals
88 |   # res.complete[, acc := pred.code == true.code]
89 |   # res.complete[, residual := acc - pred.prob] # y_{n_fk} - \hat{p}_{n_fk}
90 |   # 
91 |   # # this should be exactly 1 for all ids
92 |   # summary(res.complete[sim.name == "osa1111 = 0" & pred.code != "-9999", sum(pred.prob), by = id])
93 |   # res.complete[sim.name == "osa1111 = 0" & pred.code != "-9999", sum(residual), by = list(pred.code, sim.name)][order(V1), sum(V1)]
94 |   # # this should be 1064 -> our numbers are up to 1.8 off.
95 |   # res.complete[sim.name == "osa1111 = 0" & pred.code != "-9999", sum(pred.prob), by = id][,sum(V1)]
96 |   # 
97 |   # pdf("Z:/Eigene Dateien/in work/Algorithms for Occupation Coding/paper/graphs/category_residuals.pdf")
98 |   # res.complete[sim.name == "osa1111 = 0" & pred.code != "-9999", sum(residual), by = list(pred.code, sim.name)][order(V1), plot(V1, ylab = "Residual")]
99 |   # dev.off()
100 |   # 
101 |   # pdf("Z:/Eigene Dateien/in work/Algorithms for Occupation Coding/paper/graphs/category_residuals2.pdf")
102 |   # plot(res.complete[sim.name == "osa1111 = 0" & pred.code != "-9999", list(residual = sum(residual)), by = list(pred.code, sim.name)][order(residual), ][residual < -5 | residual > 5, list(pred.code = as.factor(pred.code), by = residual)], las = 3, ylab = "Residual")
103 |   # abline(h = 0)
104 |   # dev.off()
105 |   # res[sim.name == "osa1111 = 0", list(sum(pred.prob)), by = pred.code][order(V1)]
106 |   # 
107 |   # res.complete[sim.name == "osa1111 = 0" & pred.code != "-9999", list(residual = sum(residual)), by = list(pred.code, sim.name)][residual > -0.5 & residual < 0.5][, .N, by = residual]
108 |   # 
109 |   # res.complete[sim.name == "osa1111 = 0" & pred.code != "-9999", list(residual = sum(residual)), by = list(pred.code, sim.name)][residual > -0.5 & residual < 0.5, list(.N, mean(residual))]
110 |   # turtle[code == 71402, .N]
111 |   # res.complete[sim.name == "osa1111 = 0" & pred.code == "71402", list(sum(pred.prob), sum(residual))]
112 |   # 
113 | }
--------------------------------------------------------------------------------
/R/removeFaultyAndUncodableAnswers_And_PrepareForAnalysis.R:
--------------------------------------------------------------------------------
1 | #' Data Preparation
2 | #' 
3 | #' Prepare data (i.e. columns 'id', 'ans', and 'code' are appended to the dataset - only these columns will be used later) and remove answers that we cannot use (i.e. answers that have non-ASCII characters after preprocessing and answers that are at most one character long). During data preparation you should make sure that nothing important is lost here.
4 | #' 
5 | #' The 2010 German classification is available at \url{https://www.klassifikationsserver.de/}.
6 | #' 
7 | #' @param data a data.table or data.frame containing the verbal answers and their codes.
8 | #' @param colNames a character vector of length 2: the names of the columns in \code{data} that hold the verbal answers and the codes. Codes will be transformed to character.
9 | #' @param allowed.codes a vector of allowed codes from the classification.
10 | #' @param allowed.codes.titles Labels for \code{allowed.codes}. Should have the same length as \code{allowed.codes}.
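#' 
#' # The returned data.table carries two attributes, set at the end of the function body:
#' # attr(proc.occupations, "classification")  # data.table of allowed codes and their titles
#' # attr(proc.occupations, "overview_tally")  # counts of removed answers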
11 | #' 
12 | #' @seealso \code{\link{createDescriptives}}
13 | #' 
14 | #' @return a data.table with attributes \code{classification} and \code{overview_tally}
15 | #' @import data.table
16 | #' @export
17 | #' 
18 | #' @examples
19 | #' occupations <- data.table(answers = c("LEITER VERTRIEB", "Kfz-Schlossermeister", "Aushilfe im Hotel(Bereich Housekeeping)"),
20 | #'                           codes = c("61194", "25213", "63221"))
21 | #' (allowed.codes <- c("11101", "61194", "25213", "63221", "..."))
22 | #' (allowed.codes.titles <- c("Berufe in der Landwirtschaft (ohne Spezialisierung) - Helfer-/Anlernt\xe4tigkeiten", "Berufe in der Kraftfahrzeugtechnik - komplexe Spezialistent\xe4tigkeiten", "F\xfchrungskr\xe4fte - Einkauf und Vertrieb", "Berufe im Hotelservice - Helfer-/Anlernt\xe4tigkeiten", "many more category labels from the classification"))
23 | #' removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("answers", "codes"), allowed.codes, allowed.codes.titles)
24 | #' 
25 | #' data(occupations)
26 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030")
27 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks",
28 | #' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work",
29 | #' "Not precise enough for coding", "Student assistants")
30 | #' removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles)
31 | #' 
32 | #' ## we could also paste both answers together
33 | #' occupations[, answer_combined := paste(orig_answer, orig_answer2)]
34 | #' removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("answer_combined", "orig_code"), allowed.codes, allowed.codes.titles)
35 | removeFaultyAndUncodableAnswers_And_PrepareForAnalysis <- function(data, colNames = c("answer", "code"), allowed.codes, allowed.codes.titles = 1:length(allowed.codes)) {
36 | 
37 |   # the classification contains all allowed codes
38 |   classification <- data.table(code = allowed.codes, title = allowed.codes.titles)
39 | 
40 |   # insert two new columns into data, based on colNames
41 |   res <- copy(data)
42 |   res[, "ans" := as.character(get(colNames[1]))]
43 |   res[, "code" := as.character(get(colNames[2]))]
44 | 
45 |   # provide some additional info about removed answers
46 |   overview_tally_preparation <- list(N_verbatims_start = nrow(res))
47 | 
48 |   ## remove non-ASCII answers
49 |   overview_tally_preparation$N_ASCII_removed <- res[,sum(!tau::is.ascii(stringPreprocessing(ans)))]
50 |   res <- res[tau::is.ascii(stringPreprocessing(ans)),]
51 | 
52 |   ## remove answers with at most one character
53 |   overview_tally_preparation$N_short_verbatims <- res[nchar(stringPreprocessing(ans)) <= 1, .N]
54 |   res <- res[nchar(stringPreprocessing(ans)) > 1, ]
55 | 
56 |   ## remove answers and codes if the code is not allowed
57 | 
overview_tally_preparation$N_Codes_removed <- res[!(code %in% allowed.codes), .N]
58 |   res <- res[(code %in% allowed.codes)]
59 | 
60 |   # set ids
61 |   res[, id := 1:.N]
62 |   setkey(res, id)
63 | 
64 |   attr(res, "classification") <- classification
65 |   attr(res, "overview_tally") <- overview_tally_preparation
66 |   class(res) <- c(class(res), "occupationData")
67 | 
68 |   cat("Number of cases at start:", overview_tally_preparation$N_verbatims_start, "\n")
69 |   cat("Number of cases with non-ASCII characters (removed):", overview_tally_preparation$N_ASCII_removed, "\n")
70 |   cat("Number of cases with codes that are not allowed (removed):", overview_tally_preparation$N_Codes_removed, " (check with 'data[!(code %in% allowed.codes), table(code)]')\n")
71 |   cat("Number of cases with at most one character (removed):", overview_tally_preparation$N_short_verbatims, "\n")
72 |   cat("Number of remaining cases:", res[, .N], "\n")
73 | 
74 |   return(res)
75 | }
76 | 
--------------------------------------------------------------------------------
/R/sharpness.R:
--------------------------------------------------------------------------------
1 | #' Sharpness
2 | #' 
3 | #' Calculate sharpness \eqn{sharpness = \frac{1}{N} \sum_n entropy_n} and its standard error \eqn{\sqrt{\frac{1}{N(N-1)} \sum_n (entropy_n - sharpness)^2}} with \eqn{entropy_n = - \sum_k p_{nk} \log_2 p_{nk}}
4 | #' 
5 | #' Sharpness is zero (optimal) if a single category is predicted with probability 1. It is maximal if all categories have equal probability \eqn{p = \frac{1}{K}}.
6 | #' 
7 | #' Note: What should be done if a predicted probability is zero? \eqn{0 \times \log_2 0} is not defined but necessary to calculate sharpness. We set \eqn{0 \times \log_2 0 = 0}. This also means we exclude observations from our analysis if all probabilities are predicted as zero. An alternative could be to set such zeros to \eqn{1/K}, which would lead to very different sharpness values.
8 | #' 
9 | #' @param occupationalPredictions a data.table created with an \code{\link{expandPredictionResults}}-function from this package.
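#' 
#' # A worked example of the entropy term: an answer whose probability is spread evenly
#' # over two codes has entropy -2 * 0.5 * log2(0.5) = 1 bit; a prediction that puts
#' # probability 1 on a single code has entropy 0, the sharpest possible forecast.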
10 | #' 
11 | #' @seealso \code{\link{plotReliabilityDiagram}}, \code{\link{logLoss}}
12 | #' 
13 | #' @return a data.table
14 | #' @import data.table
15 | #' @export
16 | #' 
17 | #' @examples
18 | #' # set up data
19 | #' data(occupations)
20 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030")
21 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks",
22 | #' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work",
23 | #' "Not precise enough for coding", "Student assistants")
24 | #' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles)
25 | #' 
26 | #' ## split sample
27 | #' set.seed(3451345)
28 | #' n.test <- 50
29 | #' group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test)))
30 | #' splitted.data <- split(proc.occupations, group)
31 | #' 
32 | #' # train model and make predictions
33 | #' model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5))
34 | #' res <- predictLogisticRegressionWithPenalization(model, splitted.data$test)
35 | #' 
36 | #' # expand to contain more categories than the initial ones
37 | #' res.proc1 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1")
38 | #' 
39 | #' # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once.
40 | #' res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2")
41 | #' res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete")
42 | #' 
43 | #' sharpness(res.proc)
44 | sharpness <- function(occupationalPredictions) {
45 | 
46 |   if (!("occupationalPredictionsComplete" %in% class(occupationalPredictions))){
47 |     stop("'occupationalPredictions' needs to have class 'occupationalPredictionsComplete' (i.e., constructed with an expandPredictionResults method).")
48 |   }
49 | 
50 |   # we set 0 * log_2 0 = 0 (implemented as pred.prob > 0)
51 |   occupationalPredictions[pred.prob > 0, list(entropy = sum(-pred.prob * log2(pred.prob))), by = list(id, method.name)][, list(.N, sharpness = mean(entropy), se = sqrt(var(entropy) / .N)), by = method.name]
52 |   # res.complete[, list(entropy = sum(-pred.prob * log2(pred.prob))), by = list(id, sim.name)][, list(sharpness = mean(entropy), se = sqrt(var(entropy) / .N)), by = sim.name]
53 | 
54 | }
55 | 
--------------------------------------------------------------------------------
/R/stringPreprocessing.R:
--------------------------------------------------------------------------------
1 | #' Preprocess German occupational text
2 | #' 
3 | #' The function converts text to upper case, transliterates German umlauts and some accented characters (e.g., Ä -> AE, Ü -> UE), expands common abbreviations (e.g., "DIPL.-ING." -> "DIPLOMINGENIEUR"), and removes punctuation, superfluous spaces, and the word "Diplom".
4 | #' 
5 | #' \code{\link{charToRaw}} helps to find UTF-8 characters.
6 | #' 
7 | #' @encoding UTF-8
8 | #' @param verbatim a character vector.
9 | #' @param lang (default: \code{"de"}). Any other value will throw an error.
10 | #' 
11 | #' @return the same character vector after processing
12 | #' @export
13 | #' 
14 | #' @examples
15 | #' (x <- c("Verkauf von B\xfcchern, Schreibwaren", "Fach\xe4rzin f\xfcr Kinder- und Jugendmedizin im \xf6ffentlichen Gesundheitswesen", "Industriemechaniker", "Dipl.-Ing.
- Agrarwirtschaft (Landwirtschaft)"))
16 | #' stringPreprocessing(x)
17 | stringPreprocessing <- function(verbatim, lang = "de") {
18 |   if(!is.character(verbatim))
19 |     stop("Character input required for function stringPreprocessing")
20 |   if (lang != "de")
21 |     stop("This function was programmed for German characters and German occupational titles")
22 |   verbatim <- toupper(verbatim)
23 |   verbatim <- gsub(intToUtf8(0xc4), "AE", verbatim, fixed=TRUE)
24 |   verbatim <- gsub(intToUtf8(0xd6), "OE", verbatim, fixed=TRUE)
25 |   verbatim <- gsub(intToUtf8(0xdc), "UE", verbatim, fixed=TRUE)
26 |   verbatim <- gsub(intToUtf8(0xdf), "SS", verbatim, fixed=TRUE)
27 |   verbatim <- gsub(intToUtf8(0xc2), "A", verbatim, fixed=TRUE)
28 |   verbatim <- gsub(intToUtf8(0xc9), "E", verbatim, fixed=TRUE)
29 |   verbatim <- gsub(intToUtf8(0xca), "E", verbatim, fixed=TRUE)
30 |   verbatim <- gsub(intToUtf8(0xcd), "I", verbatim, fixed=TRUE)
31 |   verbatim <- gsub(intToUtf8(0xce), "I", verbatim, fixed=TRUE)
32 |   verbatim <- gsub(intToUtf8(0xd4), "O", verbatim, fixed=TRUE)
33 |   verbatim <- gsub("/", " ", verbatim , fixed=TRUE)
34 |   verbatim <- gsub("+", " ", verbatim , fixed=TRUE)
35 |   verbatim <- gsub("-", " ", verbatim , fixed=TRUE)
36 |   verbatim <- gsub(")", " ", verbatim , fixed=TRUE)
37 |   verbatim <- gsub("(", " ", verbatim , fixed=TRUE)
38 |   verbatim <- gsub("\u20ac", "EURO", verbatim, fixed=TRUE) # no solution with intToUtf8
39 |   verbatim <- gsub("\u0080", "EURO", verbatim, fixed = TRUE) # there must be a solution with intToUtf8
40 |   # verbatim <- gsub(" [A-Za-z]{1,3}\\.", "", verbatim) # abbreviations of at most 3 letters before a period would be deleted
41 |   verbatim <- gsub("DIPL.-ING.", "DIPLOMINGENIEUR", verbatim , fixed=TRUE) # this abbreviation is often used in the coding index
42 |   verbatim <- gsub("ING.", "INGENIEUR", verbatim , fixed=TRUE) # also sometimes used
43 |   verbatim <- gsub("DIPL.", "DIPLOM", verbatim , fixed=TRUE) # this abbreviation is often used in the coding index
44 |   verbatim <- gsub("DIPLOM", "DIPLOM ", verbatim , fixed=TRUE) # better to think of DIPLOM as a separate word
45 |   verbatim <- tm::removePunctuation(verbatim)
46 |   verbatim <- stringr::str_trim(verbatim)
47 |   verbatim <- gsub(" {2,}", " ", verbatim) # replace runs of two or more spaces with a single space
48 |   # verbatim <- gsub("  ", " ", verbatim , fixed=TRUE) # in case we have multiple double spaces
49 |   # verbatim <- gsub("  ", " ", verbatim , fixed=TRUE) # in case we have multiple double spaces
50 |   # verbatim <- gsub("  ", " ", verbatim , fixed=TRUE) # in case we have multiple double spaces
51 |   # verbatim <- gsub( " *\\(.*?\\) *", "", verbatim) # would it make sense to remove text in parenthesis completely?
52 |   verbatim <- gsub("DIPLOM ","",verbatim)
53 |   verbatim <- gsub("DIPL ","",verbatim)
54 |   return(verbatim)
55 | }
56 | 
--------------------------------------------------------------------------------
/R/trainCreecysMemoryBasedReasoning.R:
--------------------------------------------------------------------------------
1 | #' Train Creecy's memory-based reasoning model
2 | #' 
3 | #' The function does some preprocessing and calculates the importance of various features.
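#' 
#' # Sketch of the feature construction used in the function body (internal logic,
#' # not a separate API): for an answer with the alphabetically ordered words
#' # c("HOTEL", "KOCH"), the features are the single words plus their pairwise
#' # conjunction: c("HOTEL", "KOCH", "HOTEL.KOCH")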
4 | #' 5 | #' @param data a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}} 6 | #' @param preprocessing a list with elements 7 | #' \describe{ 8 | #' \item{stopwords}{a character vector, use \code{tm::stopwords("de")} for German stopwords.} 9 | #' \item{stemming}{\code{NULL} for no stemming and \code{"de"} for stemming using the German porter stemmer.} 10 | #' \item{strPreprocessing}{\code{TRUE} if \code{\link{stringPreprocessing}} shall be used.} 11 | #' \item{removePunct}{\code{TRUE} if \code{\link[tm]{removePunctuation}} shall be used.} 12 | #' } 13 | #' 14 | #' @seealso 15 | #' \code{\link{predictCreecysMemoryBasedReasoning}} 16 | #' 17 | #' Creecy, R. H., Masand, B. M., Smith, S. J., Waltz, D. L. (1992). Trading MIPS and Memory for Knowledge Engineering. Comm. ACM 35(8). pp. 48--65. 18 | #' 19 | #' @return a processed feature matrix to be used in \code{\link{predictCreecysMemoryBasedReasoning}} 20 | #' @import data.table 21 | #' @export 22 | #' 23 | #' @examples 24 | #' # set up data 25 | #' data(occupations) 26 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 27 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 28 | #' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 29 | #' "Not precise enough for coding", "Student assistants") 30 | #' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 31 | #' 32 | #' # Recommended configuration (and commonly used in this package) 33 | #' memModel <- trainCreecysMemoryBasedReasoning(proc.occupations, 34 | #' preprocessing = list(stopwords = character(0), stemming = NULL, strPreprocessing = TRUE, removePunct = FALSE)) 35 | trainCreecysMemoryBasedReasoning <- function(data, 36 | preprocessing = list(stopwords = character(0), stemming = NULL, strPreprocessing = TRUE, removePunct = TRUE)) { 37 | 38 | codes <- data[,code] 39 | 40 | # preprocessing 41 | if (preprocessing$removePunct) { 42 | ans <- data[, tm::removePunctuation(ans)] 43 | } else { 44 | ans <- data[, ans] 45 | } 46 | 47 | if (preprocessing$strPreprocessing) { 48 | ans <- stringPreprocessing(ans) 49 | } 50 | 51 | # prepare text for efficient computation -> transform to sparse matrix 52 | matrix <- asDocumentTermMatrix(ans, vect.vocab = NULL, 53 | stopwords = preprocessing$stopwords, 54 | stemming = preprocessing$stemming, 55 | type = "dgTMatrix") 56 | 57 | # create features (every word and every combination of 2 words is a feature, neglecting the word order: "single word fields along with all pairwise conjunctions") 58 | DT.wordwise <- data.table(id.training = matrix$dtm@i + 1, words = matrix$dtm@Dimnames[[2]][matrix$dtm@j + 1])[, .SD[order(words)], by = id.training] # [, .SD[order(words)], by = id.training] creates 
alphabetical ordering, possibly not needed
59 |   create_features <- function(words) {
60 |     combined_words <- outer(words, words, FUN = paste, sep = ".")
61 |     c(words, combined_words[upper.tri(combined_words)]) # exploiting alphabetic ordering
62 |   }
63 |   DT.wordwise <- DT.wordwise[,.SD[,list(feature = create_features(words))], keyby = id.training]
64 |   # DT.wordwise[id.training == 8]
65 |   # save for each feature the assigned code
66 |   training.prepared <- data.table(id.training = 1:length(ans), code = codes, key = "id.training")[DT.wordwise]
67 | 
68 |   # calculate feature importance
69 |   training.prepared[, feature_freq := .N, by = feature]
70 |   # "per category" feature importance: P(C_i | f_k)
71 |   training.prepared[, p.code.given.feature := .N / feature_freq, by = list(feature, code)]
72 |   # "cross category" feature importance: Weight(f_k) = sum_i P(C_i | f_k)^2
73 |   training.prepared[, cross.category.feature.weight := .SD[!duplicated(code), sum(p.code.given.feature^2)], by = feature]
74 | 
75 |   return(list(training.prepared = training.prepared, vect.vocab = matrix$vect.vocab, preprocessing = preprocessing))
76 | }
--------------------------------------------------------------------------------
/R/trainGweonsNearestNeighbor.R:
--------------------------------------------------------------------------------
1 | #' Train Gweon's Nearest Neighbor model
2 | #' 
3 | #' The function does some preprocessing and creates a document-term matrix to be used in the nearest neighbor model.
4 | #' 
5 | #' @param data a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}}
6 | #' @param preprocessing a list with elements
7 | #' \describe{
8 | #'   \item{stopwords}{a character vector, use \code{tm::stopwords("de")} for German stopwords.}
9 | #'   \item{stemming}{\code{NULL} for no stemming and \code{"de"} for stemming using the German Porter stemmer.}
10 | #'   \item{strPreprocessing}{\code{TRUE} if \code{\link{stringPreprocessing}} shall be used.}
11 | #'   \item{removePunct}{\code{TRUE} if \code{\link[tm]{removePunctuation}} shall be used.}
12 | #' }
13 | #' 
14 | #' @seealso
15 | #' \code{\link{predictGweonsNearestNeighbor}}
16 | #' 
17 | #' Gweon, H.; Schonlau, M., Kaczmirek, L., Blohm, M., Steiner, S. (2017). Three Methods for Occupation Coding Based on Statistical Learning. Journal of Official Statistics 33(1), pp.
101--122 18 | #' 19 | #' @return a document term matrix with some additional attributes 20 | #' @import data.table 21 | #' @import text2vec 22 | #' @export 23 | #' 24 | #' @examples 25 | #' # set up data 26 | #' data(occupations) 27 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 28 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 29 | #' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 30 | #' "Not precise enough for coding", "Student assistants") 31 | #' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 32 | #' 33 | #' # Recommended configuration 34 | #' dtmModel <- trainGweonsNearestNeighbor(proc.occupations, 35 | #' preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", strPreprocessing = TRUE, removePunct = FALSE)) 36 | #' # Configuration used by Gweon et al. (2017) 37 | #' dtmModel <- trainGweonsNearestNeighbor(proc.occupations, 38 | #' preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", strPreprocessing = FALSE, removePunct = TRUE)) 39 | #' # Configuration used for most other approaches in this package 40 | #' dtmModel <- trainGweonsNearestNeighbor(proc.occupations, 41 | #' preprocessing = list(stopwords = character(0), stemming = NULL, strPreprocessing = TRUE, removePunct = FALSE)) 42 | #' 43 | #' ####################################################### 44 | #' ## RUN A GRID SEARCH (takes some time) 45 | #' \donttest{ 46 | #' # create a grid of all combinations to be tried 47 | #' model.grid <- data.table(expand.grid(stopwords = c(TRUE, FALSE), stemming = c(FALSE, "de"), strPreprocessing = c(TRUE, FALSE), nearest.neighbors.multiplier = c(0.05, 0.1, 0.2))) 48 | #' 49 | #' # Do grid search 50 | #' for (i in 1:nrow(model.grid)) { 51 | #' res.model <- trainGweonsNearestNeighbor(splitted.data$training, preprocessing = list(stopwords = if (model.grid[i, stopwords]) tm::stopwords("de") else character(0), 52 | #' stemming = if (model.grid[i, stemming == "de"]) "de" else NULL, 53 | #' strPreprocessing = model.grid[i, strPreprocessing], 54 | #' removePunct = !model.grid[i, strPreprocessing])) 55 | #' 56 | #' res.proc <- predictGweonsNearestNeighbor(res.model, splitted.data$test, 57 | #' tuning = list(nearest.neighbors.multiplier = model.grid[i, nearest.neighbors.multiplier])) 58 | #' res.proc <- expandPredictionResults(res.proc, allowed.codes = allowed.codes, method.name = "NearestNeighbor_Gweon") 59 | #' 60 | #' ac <- accuracy(calcAccurateAmongTopK(res.proc, k = 1), n = nrow(splitted.data$test)) 61 | #' ll <- logLoss(res.proc) 62 | #' sh <- sharpness(res.proc) 63 | #' 64 | #' model.grid[i, acc := ac[, acc]] 65 | #' model.grid[i, acc.se := ac[, se]] 66 | #' model.grid[i, acc.N := ac[, N]] 67 | #' model.grid[i, acc.prob0 := ac[, 
count.pred.prob0]]
68 | #'   model.grid[i, loss.full := ll[1, logscore]]
69 | #'   model.grid[i, loss.full.se := ll[1, se]]
70 | #'   model.grid[i, loss.full.N := ll[1, N]]
71 | #'   model.grid[i, loss.sub := ll[2, logscore]]
72 | #'   model.grid[i, loss.sub.se := ll[2, se]]
73 | #'   model.grid[i, loss.sub.N := ll[2, N]]
74 | #'   model.grid[i, sharp := sh[, sharpness]]
75 | #'   model.grid[i, sharp.se := sh[, se]]
76 | #'   model.grid[i, sharp.N := sh[, N]]
77 | #' }
78 | #' 
79 | #' model.grid[order(stopwords, stemming, strPreprocessing, nearest.neighbors.multiplier)]
80 | #' 
81 | #' 
82 | #' }
83 | trainGweonsNearestNeighbor <- function(data,
84 |                                        preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", strPreprocessing = FALSE, removePunct = TRUE)) {
85 | 
86 |   # preprocessing
87 |   if (preprocessing$removePunct) {
88 |     ans <- data[, tm::removePunctuation(ans)]
89 |   } else {
90 |     ans <- data[, ans]
91 |   }
92 | 
93 |   if (preprocessing$strPreprocessing) {
94 |     ans <- stringPreprocessing(ans)
95 |   }
96 | 
97 |   # prepare text for efficient computation -> transform to sparse matrix
98 |   matrix <- asDocumentTermMatrix(ans, vect.vocab = NULL,
99 |                                  stopwords = preprocessing$stopwords,
100 |                                  stemming = preprocessing$stemming,
101 |                                  type = "dgCMatrix")
102 | 
103 |   return(list(matrix = matrix$dtm, vect.vocab = matrix$vect.vocab, preprocessing = preprocessing, code = data[,code],
104 |               num.allowed.codes = length(attr(data, "classification")$code)))
105 | }
106 | 
--------------------------------------------------------------------------------
/R/trainLogisticRegressionWithPenalization.R:
--------------------------------------------------------------------------------
1 | #' Train a logistic regression model with penalization
2 | #' 
3 | #' The function does some preprocessing and calls \code{glmnet} to fit a multinomial logistic regression model with penalization.
4 | #' 
5 | #' Setting \code{tuning$alpha = 0} (ridge penalty) seems to be most stable.
6 | #' 
7 | #' In our experience, \code{glmnet} often returns a warning like \code{3: from glmnet Fortran code (error code -72); Convergence for 72th lambda value not reached after maxit=100000 iterations; solutions for larger lambdas returned}. To solve this issue, we can increase \code{maxit} to try more iterations or we can decrease the threshold \code{thresh}.
8 | #' 
9 | #' @param data a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}}
10 | #' @param preprocessing a list with elements
11 | #' \describe{
12 | #'   \item{stopwords}{a character vector, use \code{tm::stopwords("de")} for German stopwords.}
13 | #'   \item{stemming}{\code{NULL} for no stemming and \code{"de"} for stemming using the German Porter stemmer.}
14 | #'   \item{countWords}{Set to TRUE if the predictor matrix should contain a column for answer length.}
15 | #' }
16 | #' @param tuning a list with elements that will be passed to \code{\link[glmnet]{glmnet}}
17 | #' 
18 | #' @seealso \code{\link{predictLogisticRegressionWithPenalization}}, \code{\link[glmnet]{glmnet}}
19 | #' 
20 | #' @return a logistic regression model. Commands from \code{\link[glmnet]{glmnet}} should work.
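#' 
#' # For instance, standard glmnet accessors should work on the returned fit
#' # (a sketch; 0.01 is an arbitrary penalty value):
#' # fit <- trainLogisticRegressionWithPenalization(proc.occupations)
#' # plot(fit)            # coefficient paths across the lambda sequence
#' # coef(fit, s = 0.01)  # coefficients at penalty lambda = 0.01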
21 | #' @import data.table 22 | #' @import text2vec 23 | #' @export 24 | #' 25 | #' @examples 26 | #' # set up data 27 | #' data(occupations) 28 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 29 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 30 | #' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 31 | #' "Not precise enough for coding", "Student assistants") 32 | #' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 33 | #' 34 | #' # Recommended configuration 35 | #' trainLogisticRegressionWithPenalization(proc.occupations, 36 | #' preprocessing = list(stopwords = character(0), stemming = "de", countWords = FALSE), 37 | #' tuning = list(alpha = 0.05, maxit = 10^6, nlambda = 100, thresh = 1e-7)) 38 | #' 39 | #' # Other possibility 40 | #' trainLogisticRegressionWithPenalization(proc.occupations, 41 | #' preprocessing = list(stopwords = tm::stopwords("de"), stemming = NULL, countWords = TRUE), 42 | #' tuning = list(alpha = 0.05, maxit = 10^6, nlambda = 100, thresh = 1e-7)) 43 | trainLogisticRegressionWithPenalization <- function(data, 44 | preprocessing = list(stopwords = character(0), stemming = NULL, countWords = FALSE), 45 | tuning = list(alpha = 0.05, maxit = 10^5, nlambda = 100, thresh = 1e-07)) { 46 | 47 | if (!requireNamespace("glmnet", quietly = TRUE)) { 48 | stop("Package \"glmnet\" needed if you want to use this function. Please install it.", 49 | call. = FALSE) 50 | } 51 | 52 | # remove seldom codes 53 | seldom.codes <- data[, .N, by = code][N <= 1, code] 54 | if (length(seldom.codes) > 0) { 55 | warning(paste("This algorithm requires that every outcome code appears at least twice in the training data. 
The following codes were removed:", paste(seldom.codes, collapse = ", "))) 56 | 57 | data <- data[!(code %in% seldom.codes)] 58 | } 59 | 60 | # and prepare codes 61 | outcome <- as.factor(data[,code]) 62 | 63 | # preprocessing 64 | ans <- data[, stringPreprocessing(ans)] 65 | 66 | # prepare text for efficient computation -> transform to sparse matrix 67 | matrix <- asDocumentTermMatrix(ans, vect.vocab = NULL, 68 | stopwords = preprocessing$stopwords, 69 | stemming = preprocessing$stemming, 70 | type = "dgCMatrix") 71 | 72 | # include feature for number of words 73 | if (preprocessing$countWords) { 74 | ans_freq <- sapply(strsplit(ans, " "), length) 75 | matrix$dtm <- cbind(matrix$dtm, ans_freq) 76 | } 77 | 78 | # estimate model 79 | fit <- glmnet::glmnet(matrix$dtm, outcome, family="multinomial", alpha = tuning$alpha, maxit = tuning$maxit, nlambda = tuning$nlambda, thresh = tuning$thresh) 80 | 81 | # and save preprocessing 82 | fit$vect.vocab <- matrix$vect.vocab 83 | fit$preprocessing <- preprocessing 84 | fit$seldom.codes <- seldom.codes 85 | return(fit) 86 | } 87 | -------------------------------------------------------------------------------- /data/coding_index_excerpt.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malsch/occupationCoding/940a6f6117f264e093246ad1868ec9bde45ba40c/data/coding_index_excerpt.RData -------------------------------------------------------------------------------- /data/frequent_phrases.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malsch/occupationCoding/940a6f6117f264e093246ad1868ec9bde45ba40c/data/frequent_phrases.RData -------------------------------------------------------------------------------- /data/kldb2010PlusFive.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malsch/occupationCoding/940a6f6117f264e093246ad1868ec9bde45ba40c/data/kldb2010PlusFive.RData -------------------------------------------------------------------------------- /data/occupations.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malsch/occupationCoding/940a6f6117f264e093246ad1868ec9bde45ba40c/data/occupations.RData -------------------------------------------------------------------------------- /data/surveyCountsSubstringSimilarity.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malsch/occupationCoding/940a6f6117f264e093246ad1868ec9bde45ba40c/data/surveyCountsSubstringSimilarity.RData -------------------------------------------------------------------------------- /data/surveyCountsWordwiseSimilarity.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malsch/occupationCoding/940a6f6117f264e093246ad1868ec9bde45ba40c/data/surveyCountsWordwiseSimilarity.RData -------------------------------------------------------------------------------- /man/accuracy.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/accuracy.R 3 | \name{accuracy} 4 | \alias{accuracy} 5 | \title{Accuracy} 6 | \usage{ 7 | accuracy(occupationalPredictionsAmongTopK, n) 8 | } 9 | \arguments{ 10 | \item{occupationalPredictionsAmongTopK}{a data table created with 
\code{\link{calcAccurateAmongTopK}}.} 11 | 12 | \item{n}{Number of unique persons in test data (may be larger than the number of persons in occupationalPredictionsAmongTopK if for some persons no codes were suggested)} 13 | } 14 | \value{ 15 | a data.table with columns N (number of unique IDs in occupationalPredictionsAmongTopK), acc (\code{sum(acc) / n}), se (standard error of acc), mean.in.subset.N (\code{mean(acc)}), and count.pred.prob0 (\code{sum(pred.prob == 0)}) 16 | } 17 | \description{ 18 | Calculate accuracy \eqn{p = \frac{1}{n} \sum acc} and standard errors \eqn{\sqrt{\frac{1}{n} * p * (1-p)}}. 19 | } 20 | \details{ 21 | Note that this function also works if \code{occupationalPredictionsAmongTopK} contains less than \code{n} individuals. 22 | } 23 | \examples{ 24 | # set up data 25 | data(occupations) 26 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 27 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 28 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 29 | "Not precise enough for coding", "Student assistants") 30 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 31 | 32 | ## split sample 33 | set.seed(3451345) 34 | n.test <- 50 35 | group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 36 | splitted.data <- split(proc.occupations, group) 37 | 38 | # train model and make predictions 39 | model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 40 | res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 41 | 42 | # expand to contain more categories than the initial ones 43 | res.proc1 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1") 44 | 45 | # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once. 
46 | res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2") 47 | res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete") 48 | 49 | calcAccurateAmongTopK(res.proc, k = 5)[,mean(acc), by = method.name] 50 | accuracy(calcAccurateAmongTopK(res.proc, k = 5), n = 50) 51 | accuracy(calcAccurateAmongTopK(res.proc, k = 1), n = 50) 52 | } 53 | -------------------------------------------------------------------------------- /man/asDocumentTermMatrix.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/asDocumentTermMatrix.R 3 | \name{asDocumentTermMatrix} 4 | \alias{asDocumentTermMatrix} 5 | \title{Document-Term Matrix} 6 | \usage{ 7 | asDocumentTermMatrix( 8 | input, 9 | vect.vocab = NULL, 10 | stopwords = character(0), 11 | stemming = NULL, 12 | type = c("dgCMatrix", "dgTMatrix", "lda_c") 13 | ) 14 | } 15 | \arguments{ 16 | \item{input}{a character vector.} 17 | 18 | \item{vect.vocab}{a vocabulary created with \code{\link[text2vec:vectorizers]{vocab_vectorizer}}. If \code{NULL}, the vocabulary is created from the input. See example for a typical use case.} 19 | 20 | \item{stopwords}{character vector of stopwords to exclude when creating the vocabulary. \code{tm::stopwords("de")} provides German stopwords.} 21 | 22 | \item{stemming}{\code{NULL} for no stemming and \code{"de"} for stemming using the German porter stemmer.} 23 | 24 | \item{type}{character, one of c("dgCMatrix", "dgTMatrix", "lda_c") taken from \code{\link[text2vec]{create_dtm}}. \code{dgCMatrix} are useful for glmnet; \code{dgTMatrix} matrix refers to sparse matrices in triplet form, i.e. positions of all non-zero values are stored (easier to work with, but non-unique).} 25 | } 26 | \value{ 27 | A list with two elements 28 | \describe{ 29 | \item{dtm}{a sparse document-term-matrix, depending on the \code{type}-parameter} 30 | \item{vect.vocab}{a vocabulary that can be inserted as \code{vect.vocab} to build a document term matrix on new data with the same vocabulary.} 31 | } 32 | } 33 | \description{ 34 | Constructs a document-term matrix. 35 | } 36 | \examples{ 37 | x <- c("Verkauf von Schreibwaren", "Verkauf", "Schreibwaren") 38 | asDocumentTermMatrix(x) 39 | asDocumentTermMatrix(x, type = "dgTMatrix") 40 | asDocumentTermMatrix(x, stopwords = tm::stopwords("de")) 41 | 42 | (x <- c("Verkauf von B\xfcchern, Schreibwaren", "Fach\xe4rzin f\xfcr Kinder- und Jugendmedizin im \xf6ffentlichen Gesundheitswesen", "Industriemechaniker", "Dipl.-Ing. 
- Agrarwirtschaft (Landwirtschaft)")) 43 | x <- stringPreprocessing(x) 44 | dtm <- asDocumentTermMatrix(x, stemming = "de") 45 | print(dtm$dtm) 46 | dimnames(dtm$dtm)[[2]] 47 | 48 | # use the newly created vocab_vectorizer 49 | (x <- stringPreprocessing(c("WILL NOT SHOW UP", "Verkauf von B\xfcchern, Schreibwaren", "Fach\xe4rzin f\xfcr Kinder- und Jugendmedizin"))) 50 | asDocumentTermMatrix(x, vect.vocab = dtm$vect.vocab, stopwords = character(0), stemming = "de")$dtm 51 | } 52 | \seealso{ 53 | \url{http://text2vec.org/vectorization.html} for details on the implementation used here; 54 | \code{\link[tm]{TermDocumentMatrix}} is a slower alternative implementation 55 | } 56 | -------------------------------------------------------------------------------- /man/calcAccurateAmongTopK.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/calcAccurateAmongTopK.R 3 | \name{calcAccurateAmongTopK} 4 | \alias{calcAccurateAmongTopK} 5 | \title{Calculate aggregate properties for top k predicted categories} 6 | \usage{ 7 | calcAccurateAmongTopK(occupationalPredictions, k = 1) 8 | } 9 | \arguments{ 10 | \item{occupationalPredictions}{a data.table created with the \code{\link{expandPredictionResults}} function from this package.} 11 | 12 | \item{k}{how many top k categories to aggregate over?} 13 | } 14 | \value{ 15 | a data.table 16 | } 17 | \description{ 18 | Start with a data.table of class 'occupationalPredictions' (one prediction for each combination of pred.code and answer) and calculate whether one of the top k entries is accurate. 19 | } 20 | \details{ 21 | \code{num.suggested} and \code{general.among.top5} are currently not used. They relate to situations where the prediction algorithm does not provide all codes.
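To illustrate the aggregation, here is a minimal sketch of the idea (toy data only; not the function's actual implementation, but the columns mimic the output of \code{\link{expandPredictionResults}}):
\preformatted{
library(data.table)
toy <- data.table(id = rep(1:2, each = 3),
                  code = rep(c("71402", "83112"), each = 3),  # true codes
                  pred.code = rep(c("71402", "83112", "63302"), 2),
                  pred.prob = c(0.6, 0.3, 0.1, 0.2, 0.5, 0.3))
k <- 2
# is the true code among the k most probable suggestions?
toy[order(-pred.prob), .(acc = any(head(pred.code, k) == code[1])), by = id]
}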
22 | } 23 | \examples{ 24 | # set up data 25 | data(occupations) 26 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 27 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 28 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 29 | "Not precise enough for coding", "Student assistants") 30 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 31 | 32 | ## split sample 33 | set.seed(3451345) 34 | n.test <- 50 35 | group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 36 | splitted.data <- split(proc.occupations, group) 37 | 38 | # train model and make predictions 39 | model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 40 | res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 41 | 42 | # expand to contain more categories than the initial ones 43 | res.proc1 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1") 44 | 45 | # aggregate over top k categories 46 | calcAccurateAmongTopK(res.proc1, k = 1)[,mean(acc)] 47 | calcAccurateAmongTopK(res.proc1, k = 5)[,mean(acc)] 48 | 49 | # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once. 50 | res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2") 51 | res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete") 52 | 53 | calcAccurateAmongTopK(res.proc, k = 5)[,mean(acc), by = method.name] 54 | # res[, calcAccurateAmongTopK(.SD, k = 5), by = method][,mean(acc), by = method] 55 | } 56 | \seealso{ 57 | ... 58 | } 59 | -------------------------------------------------------------------------------- /man/coding_index_excerpt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \encoding{UTF-8} 5 | \name{coding_index_excerpt} 6 | \alias{coding_index_excerpt} 7 | \title{An excerpt from the Gesamtberufsliste der BA} 8 | \format{ 9 | A data frame with 90 rows and 5 variables: 10 | \describe{ 11 | \item{Berufsbenennungen}{neutral short title} 12 | \item{bezMale}{male long title} 13 | \item{bezFemale}{female long title} 14 | \item{Code}{Code from the German Classification of Occupations (KldB 2010)} 15 | \item{count_categories}{An indicator of job title ambiguity. 
Only used within function \code{\link{predictWithCodingIndex}}} 16 | } 17 | } 18 | \source{ 19 | Bundesagentur für Arbeit (2019). Gesamtberufsliste der Bundesagentur für Arbeit. Stand: 03.01.2019. The \code{Gesamtberufsliste der BA} is available at \url{https://download-portal.arbeitsagentur.de/files/}. 20 | 21 | The function \code{\link{prepare_German_coding_index_Gesamtberufsliste_der_BA}} was used to process the downloaded \code{.xlsx}-file and prepare this coding index. The resulting coding index has 27853 rows. We selected 90 rows that are related to the \code{\link{occupations}} dataset. 22 | } 23 | \usage{ 24 | coding_index_excerpt 25 | } 26 | \description{ 27 | 90 selected job titles from the \code{Gesamtberufsliste der BA}. This dataset is not complete and is only used to demonstrate the functionality of the package. We recommend that users build their own coding index. \code{\link{prepare_German_coding_index_Gesamtberufsliste_der_BA}} can be used to build a German coding index. 28 | } 29 | \keyword{datasets} 30 | -------------------------------------------------------------------------------- /man/cosineSimilarity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cosineSimilarity.R 3 | \name{cosineSimilarity} 4 | \alias{cosineSimilarity} 5 | \title{Cosine Similarity} 6 | \usage{ 7 | cosineSimilarity(matrix1, matrix2) 8 | } 9 | \arguments{ 10 | \item{matrix1}{a matrix of type \code{dgCMatrix}.} 11 | 12 | \item{matrix2}{a matrix of type \code{dgCMatrix}.} 13 | } 14 | \value{ 15 | A \code{dgCMatrix} where element \code{A[index1, index2]} is the cosine similarity between \code{matrix1[index1,]} and \code{matrix2[index2,]}. 16 | } 17 | \description{ 18 | Calculate cosine similarity between every row in \code{matrix1} and every row in \code{matrix2}. 19 | } 20 | \details{ 21 | Cosine similarity between two vectors \eqn{x} and \eqn{y} is the cosine of the angle between them. Since we consider vectors with non-negative entries, its maximal value is 1, reached whenever both vectors point in the same direction (e.g. if they are identical), and its minimal value is 0 if the vectors are orthogonal, i.e. \eqn{x \times y = 0}. 22 | 23 | The definition is: \eqn{similarity = (x \times y) / (||x|| \times ||y||) = (\sum_i x_i \times y_i) / (\sqrt{(\sum_i x_i^2)} \times \sqrt{(\sum_i y_i^2)})} 24 | } 25 | \examples{ 26 | x <- c("Verkauf von Schreibwaren", "Verkauf", "Schreibwaren", "Industriemechaniker", "NOTINDOCUMENTTERMMATRIX") 27 | (y <- c("Verkauf von B\xfcchern, Schreibwaren", "Fach\xe4rzin f\xfcr Kinder- und Jugendmedizin im \xf6ffentlichen Gesundheitswesen", "Industriemechaniker", "Dipl.-Ing.
- Agrarwirtschaft (Landwirtschaft)")) 28 | 29 | tok_fun = text2vec::word_tokenizer 30 | it_train = text2vec::itoken(tolower(y), tokenizer = tok_fun, progressbar = FALSE) 31 | vocab = text2vec::create_vocabulary(it_train) 32 | vect.vocab = text2vec::vocab_vectorizer(vocab) 33 | 34 | matrix1 <- asDocumentTermMatrix(x, vect.vocab = vect.vocab)$dtm 35 | matrix2 <- asDocumentTermMatrix(y, vect.vocab = vect.vocab)$dtm 36 | 37 | cosineSimilarity(matrix1, matrix1) 38 | cosineSimilarity(matrix1, matrix2) 39 | } 40 | \seealso{ 41 | \code{\link[Matrix]{Matrix}} 42 | } 43 | -------------------------------------------------------------------------------- /man/createDescriptives.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/createDescriptives.R 3 | \name{createDescriptives} 4 | \alias{createDescriptives} 5 | \title{Describe occupational data} 6 | \usage{ 7 | createDescriptives(data, noninformative = c("")) 8 | } 9 | \arguments{ 10 | \item{data}{either a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}} or a character vector} 11 | 12 | \item{noninformative}{a character vector: if an answer is in \code{noninformative}, it is excluded} 13 | } 14 | \value{ 15 | Side effects only: a plot and text 16 | } 17 | \description{ 18 | Outputs the following descriptives: frequencies; mean, median, and maximum of the number of words; the number of codes used; and frequencies of special codes. 19 | } 20 | \examples{ 21 | data <- data.table(answers = c("LEITER VERTRIEB", "Kfz-Schlossermeister", "Aushilfe im Hotel(Bereich Housekeeping)", "Studentische Hilfskraft"), 22 | codes = c("61194", "25213", "63221", "-0001")) 23 | (allowed.codes <- c("11101", "61194", "25213", "63221", "-0001")) 24 | (allowed.codes.titles <- c("Berufe in der Landwirtschaft (ohne Spezialisierung) - Helfer-/Anlernt\xe4tigkeiten", "Berufe in der Kraftfahrzeugtechnik - komplexe Spezialistent\xe4tigkeiten", "F\xfchrungskräfte - Einkauf und Vertrieb", "Berufe im Hotelservice - Helfer-/Anlernt\xe4tigkeiten", "a negative label (used for categories like 'student assistant' that are not in the official classification)")) 25 | data <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(data, colNames = c("answers", "codes"), allowed.codes, allowed.codes.titles) 26 | createDescriptives(data) 27 | 28 | (answer <- c("LEITER VERTRIEB", "Kfz-Schlossermeister", "Aushilfe im Hotel(Bereich Housekeeping)")) 29 | createDescriptives(answer) 30 | 31 | data(occupations) 32 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 33 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", 34 | "Gastronomy occupations (without specialisation)-skilled tasks", "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 35 | "Not precise enough for coding", "Student assistants") 36 | occupations <-
removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 37 | createDescriptives(occupations) 38 | } 39 | -------------------------------------------------------------------------------- /man/createSimilarityTableStringdist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/createSimilarityTableStringdist.R 3 | \name{createSimilarityTableStringdist} 4 | \alias{createSimilarityTableStringdist} 5 | \title{Similarity Table with Coding index} 6 | \usage{ 7 | createSimilarityTableStringdist( 8 | unique.string, 9 | coding_index_w_codes, 10 | coding_index_without_codes, 11 | dist.control = list(method = "osa", weight = c(d = 1, i = 1, s = 1, t = 1)), 12 | threshold = 3 13 | ) 14 | } 15 | \arguments{ 16 | \item{unique.string}{a character vector (usually unique(answer))} 17 | 18 | \item{coding_index_w_codes}{a data.table with columns "title" and "Code".} 19 | 20 | \item{coding_index_without_codes}{a character vector of additional titles} 21 | 22 | \item{dist.control}{a list that will be passed to \code{\link[stringdist:stringdist]{stringdistmatrix}}. Currently only two elements are implemented: 23 | \describe{ 24 | \item{method}{Method for distance calculation.} 25 | \item{weight}{For method='osa' or 'dl'.} 26 | }} 27 | 28 | \item{threshold}{All entries with distance above this threshold will be removed from the result} 29 | } 30 | \value{ 31 | a list with elements 32 | \describe{ 33 | \item{dist_table_w_code}{a data.table with columns \code{intString}, \code{dictString.title}, \code{dictString.Code}, \code{dist}} 34 | \item{dist_table_without_code}{\code{NULL} or a data.table with columns \code{intString}, \code{dictString}, \code{dist}} 35 | \item{vect_vocab}{see \code{\link{asDocumentTermMatrix}}} 36 | } 37 | } 38 | \description{ 39 | Calculate string similarity between \code{unique.string} and \code{(coding_index_w_codes, coding_index_without_codes)}. 40 | } 41 | \details{ 42 | Special function for similarity-based reasoning: creates distance data using the osa method with weights c(d = 1, i = 1, s = 1, t = 1). 43 | A distance of 0 means that the strings in the dictionary and in the data are identical. 44 | } 45 | \examples{ 46 | ## Prepare coding index 47 | # write female titles beneath the male title 48 | coding_index <- rbind(coding_index_excerpt[, list(title = bezMale, Code)], 49 | coding_index_excerpt[, list(title = bezFemale, Code)]) 50 | # standardize titles from the coding index 51 | coding_index <- coding_index[,title := stringPreprocessing(title)] 52 | # drop duplicate lines; this might be suboptimal because we keep each title and its associated code only a single time, i.e. we delete duplicates together with their possibly relevant codes. 53 | coding_index <- coding_index[!duplicated(title)] 54 | 55 | (x <- c("Abgeordneter", "Abgeordneter", "Abgeordnete", "abgeordnet", "FSJ", "FSJ2", "Industriemechaniker", "Dipl.-Ing.
- Agrarwirtschaft (Landwirtschaft)")) 56 | createSimilarityTableStringdist(unique.string = stringPreprocessing(x), 57 | coding_index_w_codes = coding_index, 58 | coding_index_without_codes = frequent_phrases, 59 | dist.control = list(method = "osa", weight = c(d = 1, i = 1, s = 1, t = 1)), 60 | threshold = 3) 61 | } 62 | \seealso{ 63 | \code{\link{trainSimilarityBasedReasoning}}, \code{\link{createSimilarityTableWordwiseStringdist}}, \code{\link{createSimilarityTableSubstring}} 64 | } 65 | -------------------------------------------------------------------------------- /man/createSimilarityTableSubstring.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/createSimilarityTableSubstring.R 3 | \name{createSimilarityTableSubstring} 4 | \alias{createSimilarityTableSubstring} 5 | \title{Similarity Table with Coding index} 6 | \usage{ 7 | createSimilarityTableSubstring( 8 | unique.string, 9 | coding_index_w_codes, 10 | coding_index_without_codes 11 | ) 12 | } 13 | \arguments{ 14 | \item{unique.string}{a character vector (usually unique(answer))} 15 | 16 | \item{coding_index_w_codes}{a data.table with columns "title" and "Code".} 17 | 18 | \item{coding_index_without_codes}{a character vector of additional titles} 19 | } 20 | \value{ 21 | a list with elements 22 | \describe{ 23 | \item{dist_table_w_code}{a data.table with columns \code{intString}, \code{dictString.title}, \code{dictString.Code}, \code{dist}} 24 | \item{dist_table_without_code}{\code{NULL} or a data.table with columns \code{intString}, \code{dictString}, \code{dist}} 25 | \item{vect_vocab}{see \code{\link{asDocumentTermMatrix}}} 26 | } 27 | } 28 | \description{ 29 | Calculate SUBSTRING similarity between \code{unique.string} and \code{(coding_index_w_codes, coding_index_without_codes)}. A string in \code{unique.string} and a coding index entry are considered similar if the coding index entry is a substring of the string. 30 | } 31 | \details{ 32 | Special function for similarity-based reasoning: creates distance data using the substring method. 33 | } 34 | \examples{ 35 | ## Prepare coding index 36 | # write female titles beneath the male title 37 | coding_index <- rbind(coding_index_excerpt[, list(title = bezMale, Code)], 38 | coding_index_excerpt[, list(title = bezFemale, Code)]) 39 | # standardize titles from the coding index 40 | coding_index <- coding_index[,title := stringPreprocessing(title)] 41 | # drop duplicate lines; this might be suboptimal because we keep each title and its associated code only a single time, i.e. we delete duplicates together with their possibly relevant codes. 42 | coding_index <- coding_index[!duplicated(title)] 43 | 44 | (x <- c("Abgeordneter", "Abgeordneter", "Abgeordnete", "abgeordnet", "FSJ", "FSJ2", "Industriemechaniker", "Dipl.-Ing.
- Agrarwirtschaft (Landwirtschaft)")) 45 | createSimilarityTableSubstring(unique.string = stringPreprocessing(x), 46 | coding_index_w_codes = coding_index, 47 | coding_index_without_codes = frequent_phrases) 48 | } 49 | \seealso{ 50 | \code{\link{trainSimilarityBasedReasoning}}, \code{\link{createSimilarityTableWordwiseStringdist}}, \code{\link{createSimilarityTableStringdist}} 51 | } 52 | -------------------------------------------------------------------------------- /man/createSimilarityTableWordwiseStringdist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/createSimilarityTableWordwiseStringdist.R 3 | \name{createSimilarityTableWordwiseStringdist} 4 | \alias{createSimilarityTableWordwiseStringdist} 5 | \title{Wordwise Similarity Table with Coding index} 6 | \usage{ 7 | createSimilarityTableWordwiseStringdist( 8 | unique.string, 9 | coding_index_w_codes, 10 | coding_index_without_codes, 11 | preprocessing, 12 | dist.control = list(method = "osa", weight = c(d = 1, i = 1, s = 1, t = 1)), 13 | threshold = 1 14 | ) 15 | } 16 | \arguments{ 17 | \item{unique.string}{a character vector (usually unique(answer))} 18 | 19 | \item{coding_index_w_codes}{a data.table with columns "title" and "Code".} 20 | 21 | \item{coding_index_without_codes}{a character vector of additional titles} 22 | 23 | \item{preprocessing}{a list with elements 24 | \describe{ 25 | \item{stopwords}{a character vector, use \code{tm::stopwords("de")} for German stopwords.} 26 | \item{stemming}{\code{NULL} for no stemming and \code{"de"} for stemming using the German porter stemmer.} 27 | \item{strPreprocessing}{\code{TRUE} if \code{\link{stringPreprocessing}} shall be used.} 28 | \item{removePunct}{\code{TRUE} if \code{\link[tm]{removePunctuation}} shall be used.} 29 | }} 30 | 31 | \item{dist.control}{a list that will be passed to \code{\link[stringdist:stringdist]{stringdistmatrix}}. Currently only two elements are implemented: 32 | \describe{ 33 | \item{method}{Method for distance calculation.} 34 | \item{weight}{For method='osa' or 'dl'.} 35 | }} 36 | 37 | \item{threshold}{All entries with distance above this threshold will be removed from the result} 38 | } 39 | \value{ 40 | a list with elements 41 | \describe{ 42 | \item{dist_table_w_code}{a data.table with columns \code{intString}, \code{dictString.title}, \code{dictString.Code}, \code{dist}} 43 | \item{dist_table_without_code}{\code{NULL} or a data.table with columns \code{intString}, \code{dictString}, \code{dist}} 44 | \item{vect_vocab}{see \code{\link{asDocumentTermMatrix}}} 45 | } 46 | } 47 | \description{ 48 | Calculate string similarity on a word basis between \code{unique.string} and \code{(coding_index_w_codes, coding_index_without_codes)}. 49 | } 50 | \details{ 51 | Special function for similarity-based reasoning: creates WORDWISE distance data using the osa method with weights c(d = 1, i = 1, s = 1, t = 1). 52 | This allows correcting one letter in a single word and matching that word with the dictionary. 53 | That is, unique.string is split into words, and the word with the lowest osa-distance to a dictionary entry is used for matching (all such words in case of a tie). 54 | Example: 55 | "KUESTER and HAUSMEISTER" has distance 0 to both dictString.title HAUSMEISTER and KUESTER. Because the word HAUSMEISTER has minimal distance, another dictString.title HAUMEISTER, which has dist = 1, is not included.
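The flavor of this wordwise computation can be sketched with \code{stringdist} directly (an illustration with assumed toy inputs, not the package's exact code):
\preformatted{
words <- strsplit("KUESTER AND HAUSMEISTER", " ")[[1]]  # answer split into words
dict <- c("HAUSMEISTER", "KUESTER", "HAUMEISTER")       # dictionary titles
stringdist::stringdistmatrix(words, dict, method = "osa",
                             weight = c(d = 1, i = 1, s = 1, t = 1))
# the word with the minimal distance determines which dictionary titles are kept
}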
56 | } 57 | \examples{ 58 | ## Prepare coding index 59 | # write female titles beneath the male title 60 | coding_index <- rbind(coding_index_excerpt[, list(title = bezMale, Code)], 61 | coding_index_excerpt[, list(title = bezFemale, Code)]) 62 | # standardize titles from the coding index 63 | coding_index <- coding_index[,title := stringPreprocessing(title)] 64 | # drop duplicate lines; this might be suboptimal because we keep each title and its associated code only a single time, i.e. we delete duplicates together with their possibly relevant codes. 65 | coding_index <- coding_index[!duplicated(title)] 66 | 67 | (x <- c("Abgeordneter", "Abgeordneter", "Abgeordnete", "abgeordnet", "abgeordnet zu xxx", "FSJ", "FSJ2", "Industriemechaniker", "Dipl.-Ing. - Agrarwirtschaft (Landwirtschaft)")) 68 | createSimilarityTableWordwiseStringdist(unique.string = stringPreprocessing(x), 69 | coding_index_w_codes = coding_index, 70 | coding_index_without_codes = frequent_phrases, 71 | preprocessing = list(stopwords = tm::stopwords("de"), stemming = NULL, strPreprocessing = TRUE, removePunct = FALSE), 72 | dist.control = list(method = "osa", weight = c(d = 1, i = 1, s = 1, t = 1)), 73 | threshold = 1) 74 | } 75 | \seealso{ 76 | \code{\link{trainSimilarityBasedReasoning}}, \code{\link{createSimilarityTableStringdist}}, \code{\link{createSimilarityTableSubstring}} 77 | } 78 | -------------------------------------------------------------------------------- /man/expandPredictionResults.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/expandPredictionResults.R 3 | \name{expandPredictionResults} 4 | \alias{expandPredictionResults} 5 | \title{Expands predicted datasets to contain all allowed codes} 6 | \usage{ 7 | expandPredictionResults(occupationalPredictions, allowed.codes, method.name) 8 | } 9 | \arguments{ 10 | \item{allowed.codes}{a character vector of all allowed codes.} 11 | 12 | \item{method.name}{the name under which the method shall be reported.} 13 | 14 | \item{occupationalPredictions}{a data.table created with a \code{predict}-function from this package.} 15 | } 16 | \value{ 17 | a data.table 18 | } 19 | \description{ 20 | Start with a data.table of class 'occupationalPredictions' (one prediction for each combination of pred.code and answer) and expand it to contain all allowed codes. 21 | } 22 | \details{ 23 | The problem solved here is this: most algorithms do not provide codes for all categories of the classification, because this would require that those categories appear in the training data. This function expands the dataset and assigns some very small probability (or 0) to classification codes that the training algorithm found impossible to predict.
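A minimal sketch of the expansion idea (toy data with assumed names; the actual function additionally keeps ids, answers and the method name):
\preformatted{
pred <- data.table::data.table(pred.code = c("71402", "83112"),
                               pred.prob = c(0.7, 0.3))
allowed.codes <- c("71402", "83112", "63302", "-0004")
expanded <- merge(data.table::data.table(pred.code = allowed.codes),
                  pred, by = "pred.code", all.x = TRUE)
expanded[is.na(pred.prob), pred.prob := 0]  # codes never suggested get probability 0
}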
24 | } 25 | \examples{ 26 | # set up data 27 | data(occupations) 28 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 29 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 30 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 31 | "Not precise enough for coding", "Student assistants") 32 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 33 | 34 | ## split sample 35 | set.seed(3451345) 36 | n.test <- 50 37 | group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 38 | splitted.data <- split(proc.occupations, group) 39 | 40 | # train model and make predictions 41 | model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 42 | res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 43 | 44 | expandPredictionResults(res, allowed.codes, method.name = "Logistic Regression") 45 | } 46 | \seealso{ 47 | \code{\link{produceResults}} 48 | } 49 | -------------------------------------------------------------------------------- /man/frequent_phrases.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \encoding{UTF-8} 5 | \name{frequent_phrases} 6 | \alias{frequent_phrases} 7 | \title{Some job titles and job descriptions} 8 | \format{ 9 | A character vector with 701 elements 10 | } 11 | \usage{ 12 | frequent_phrases 13 | } 14 | \description{ 15 | There exist some job titles/descriptions that are not part of the \code{Gesamtberufsliste_der_BA} (see \code{\link{prepare_German_coding_index_Gesamtberufsliste_der_BA}}), possibly because they are too general or because they are misspelled. They are still common answers that should not be ignored. We use these answers within the function \code{\link{trainSimilarityBasedReasoning}}. 16 | } 17 | \details{ 18 | As a general rule, we included some of the most frequent verbal answers (after \code{\link{stringPreprocessing}}) if they have more than three characters and contain no spaces. Creating predictions for these answers would be impossible if only the job titles from the Gesamtberufsliste were used.
19 | } 20 | \keyword{datasets} 21 | -------------------------------------------------------------------------------- /man/kldb2010PlusFive.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \encoding{UTF-8} 5 | \name{kldb2010PlusFive} 6 | \alias{kldb2010PlusFive} 7 | \title{2010 German Classification of Occupations (KldB 2010)} 8 | \format{ 9 | A data frame with 2 variables: 10 | \describe{ 11 | \item{code}{5-digit number (Berufsgattung)} 12 | \item{title}{Category label} 13 | } 14 | } 15 | \source{ 16 | Federal Employment Agency (2011). Klassifikation der Berufe 2010, Bundesagentur für Arbeit, Nuremberg. 17 | } 18 | \usage{ 19 | kldb2010PlusFive 20 | } 21 | \description{ 22 | This file contains the labels for all 1286 five-digit categories (Berufsgattungen) from the 2010 German Classification of Occupations. 23 | } 24 | \details{ 25 | Five additional categories were added for coding purposes: -0030 = Studentische Hilfskraft, -0004 = Berufsbeschreibung zu unpräzise/nicht kodierbar, -0006 = Mehrere Jobs, -0012 = Arbeiter/in (nicht näher spezifiziert), -0019 = Freiwilligendienst/FSJ/Zivildienst. 26 | } 27 | \keyword{datasets} 28 | -------------------------------------------------------------------------------- /man/logLoss.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/logLoss.R 3 | \name{logLoss} 4 | \alias{logLoss} 5 | \title{Log loss} 6 | \usage{ 7 | logLoss(occupationalPredictions) 8 | } 9 | \arguments{ 10 | \item{occupationalPredictions}{a data.table created with the \code{\link{expandPredictionResults}} function from this package.} 11 | } 12 | \value{ 13 | a data.table 14 | } 15 | \description{ 16 | Calculate log loss \eqn{\log_2 loss = \frac{1}{N} \sum_n \log_2 loss_n} and standard error \eqn{\sqrt{\frac{1}{N(N-1)} \sum_n (\log_2 loss_n - \log_2 loss)^2}} with \eqn{\log_2 loss_n = \sum_k -y_{nk} \log_2 p_{nk}} 17 | } 18 | \details{ 19 | The log loss is the average negative \eqn{\log_2}-probability assigned to the categories that actually realized; smaller values are better.
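For intuition, the per-observation loss is simply the negative \eqn{\log_2}-probability that was predicted for the category that realized (hand-computed toy numbers, not package code):
\preformatted{
p <- c(0.9, 0.25, 0.5)  # predicted probabilities of the realized codes
-log2(p)                # per-observation losses: 0.152, 2, 1
mean(-log2(p))          # log2 loss, approx. 1.05
}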
20 | } 21 | \examples{ 22 | # set up data 23 | data(occupations) 24 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 25 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 26 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 27 | "Not precise enough for coding", "Student assistants") 28 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 29 | 30 | ## split sample 31 | set.seed(3451345) 32 | n.test <- 50 33 | group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 34 | splitted.data <- split(proc.occupations, group) 35 | 36 | # train model and make predictions 37 | model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 38 | res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 39 | 40 | # expand to contain more categories than the initial ones 41 | res.proc1 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1") 42 | 43 | # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once. 44 | res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2") 45 | res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete") 46 | 47 | logLoss(res.proc) 48 | } 49 | \seealso{ 50 | \code{\link{sharpness}} 51 | } 52 | -------------------------------------------------------------------------------- /man/occupations.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \encoding{UTF-8} 5 | \name{occupations} 6 | \alias{occupations} 7 | \title{A selection of 250 coded occupational answers} 8 | \format{ 9 | A data frame with 250 rows and 3 variables: 10 | \describe{ 11 | \item{orig_answer}{verbal answer given to the first question about occupation} 12 | \item{orig_answer2}{verbal answer given to the second question about occupation; not used much in this package, but it illustrates that this and several other variables are used by human coders} 13 | \item{orig_code}{5-digit codes from the German Classification of Occupations. Negative codes are not part of the official classification but were used for coding.
-0030 refers to student assistants, -0004 means that the job description was not precise enough for coding.} 14 | } 15 | } 16 | \source{ 17 | The following codes were hand-selected and double-checked to ensure anonymity: 71402 (50 cases), 71403 (20 cases), 63302 (20 cases), 83112 (25 cases), 83124 (25 cases), 83131 (20 cases), 83132 (20 cases), 83193 (15 cases), 83194 (5 cases), -0004 (25 cases), -0030 (25 cases) 18 | 19 | Antoni, M., Drasch, K., Kleinert, C., Matthes, B., Ruland, M. and Trahms, A. (2010). Arbeiten und Lernen im Wandel * Teil 1: Überblick über die Studie, FDZ-Methodenreport 05/2010, Forschungsdatenzentrum der Bundesagentur für Arbeit im Institut für Arbeitsmarkt- und Berufsforschung, Nuremberg. 20 | 21 | Drasch, K., Matthes, B., Munz, M., Paulus, W. and Valentin, M.-A. (2012). Arbeiten und Lernen im Wandel * Teil V: Die Codierung der offenen Angaben zur beruflichen Tätigkeit, Ausbildung und Branche, FDZ-Methodenreport 04/2012, Forschungsdatenzentrum der Bundesagentur für Arbeit im Institut für Arbeitsmarkt- und Berufsforschung, Nuremberg. 22 | } 23 | \usage{ 24 | occupations 25 | } 26 | \description{ 27 | A dataset containing 250 selected verbal answers and their associated codes from the 2010 German Classification of Occupations. This dataset is not meant to be representative, but is only used to demonstrate the functionality of the package. We anticipate that accurate occupation coding will be more difficult with real data compared to this toy example. 28 | } 29 | \keyword{datasets} 30 | -------------------------------------------------------------------------------- /man/plotAgreementRateVsProductionRate.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plotAgreementRateVsProductionRate.R 3 | \name{plotAgreementRateVsProductionRate} 4 | \alias{plotAgreementRateVsProductionRate} 5 | \title{Plot agreement rate vs. production rate} 6 | \usage{ 7 | plotAgreementRateVsProductionRate( 8 | occupationalPredictionsAmongTopK, 9 | n, 10 | yintercept, 11 | filename = NULL 12 | ) 13 | } 14 | \arguments{ 15 | \item{occupationalPredictionsAmongTopK}{a data table created with \code{\link{calcAccurateAmongTopK}}.} 16 | 17 | \item{n}{Number of unique persons in test data (may be larger than the number of persons in occupationalPredictionsAmongTopK if for some persons no codes were suggested)} 18 | 19 | \item{yintercept}{Where to draw a horizontal line?} 20 | 21 | \item{filename}{If a \code{filename} is specified the diagram will be saved under this name.} 22 | } 23 | \value{ 24 | a ggplot 25 | } 26 | \description{ 27 | Plot the proportion of answers coded correctly for any given production rate (the proportion of \code{n} that is coded automatically).
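The curve is constructed roughly as follows (a sketch under assumed names: \code{dt} stands for the output of \code{\link{calcAccurateAmongTopK}} with columns \code{pred.prob} and \code{acc}; this is not the function's exact code):
\preformatted{
dt <- dt[order(-pred.prob)]               # most confident predictions first
dt[, production.rate := .I / n]           # share of answers coded automatically
dt[, agreement.rate := cumsum(acc) / .I]  # share of those coded correctly
}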
28 | } 29 | \examples{ 30 | # set up data 31 | data(occupations) 32 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 33 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 34 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 35 | "Not precise enough for coding", "Student assistants") 36 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 37 | 38 | ## split sample 39 | set.seed(3451345) 40 | n.test <- 50 41 | group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 42 | splitted.data <- split(proc.occupations, group) 43 | 44 | # train model and make predictions 45 | model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 46 | res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 47 | 48 | # expand to contain more categories than the initial ones 49 | res.proc1 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1") 50 | 51 | # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once. 
52 | res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2") 53 | res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete") 54 | 55 | calcAccurateAmongTopK(res.proc, k = 5)[,mean(acc), by = method.name] 56 | plotAgreementRateVsProductionRate(calcAccurateAmongTopK(res.proc, k = 5), n = n.test, yintercept = 0.85) 57 | plotAgreementRateVsProductionRate(calcAccurateAmongTopK(res.proc, k = 1), n = n.test, yintercept = 0.85, filename = "test.pdf") 58 | } 59 | \seealso{ 60 | \code{\link{plotTruePredictionsVsFalsePredictions}}, \code{\link{calcAccurateAmongTopK}} 61 | } 62 | -------------------------------------------------------------------------------- /man/plotReliabilityDiagram.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plotReliabilityDiagram.R 3 | \name{plotReliabilityDiagram} 4 | \alias{plotReliabilityDiagram} 5 | \title{Reliability Diagram} 6 | \usage{ 7 | plotReliabilityDiagram(occupationalPredictions, k, num.codes, filename = NULL) 8 | } 9 | \arguments{ 10 | \item{k}{how many top k categories to aggregate over?} 11 | 12 | \item{num.codes}{Number of allowed categories in the classification} 13 | 14 | \item{filename}{If a \code{filename} is specified the diagram will be saved under this name.} 15 | 16 | \item{occupationalPredictions}{a data.table created with \code{\link{expandPredictionResults}} (see the examples below).} 17 | } 18 | \value{ 19 | a ggplot 20 | } 21 | \description{ 22 | Plots the observed relative frequency of correctness against the forecasted probability. 23 | } 24 | \examples{ 25 | # set up data 26 | data(occupations) 27 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 28 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 29 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 30 | "Not precise enough for coding", "Student assistants") 31 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 32 | 33 | ## split sample 34 | set.seed(3451345) 35 | n.test <- 50 36 | group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 37 | splitted.data <- split(proc.occupations, group) 38 | 39 | # train model and make predictions 40 | model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 41 | res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 42 | 43 | # expand to contain more categories than the initial ones 44 | res.proc1 <-
expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1") 45 | 46 | # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once. 47 | res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2") 48 | res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete") 49 | 50 | plotReliabilityDiagram(res.proc, k = 5, num.codes = length(allowed.codes) + 1) # + 1 because we introduced the code "12345" later 51 | plotReliabilityDiagram(res.proc, k = 1, num.codes = length(allowed.codes) + 1, filename = "test.pdf") 52 | } 53 | \seealso{ 54 | \code{\link{sharpness}} 55 | } 56 | -------------------------------------------------------------------------------- /man/plotTruePredictionsVsFalsePredictions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plotTruePredictionsVsFalsePredictions.R 3 | \name{plotTruePredictionsVsFalsePredictions} 4 | \alias{plotTruePredictionsVsFalsePredictions} 5 | \title{Plot true predictions versus false predictions} 6 | \usage{ 7 | plotTruePredictionsVsFalsePredictions( 8 | occupationalPredictionsAmongTopK, 9 | filename = NULL 10 | ) 11 | } 12 | \arguments{ 13 | \item{occupationalPredictionsAmongTopK}{a data table created with \code{\link{calcAccurateAmongTopK}}.} 14 | 15 | \item{filename}{If a \code{filename} is specified the diagram will be saved under this name.} 16 | } 17 | \value{ 18 | a ggplot 19 | } 20 | \description{ 21 | Show how many predictions would be correct as a function of how many would be incorrect.
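Conceptually (a sketch with assumed names: \code{dt} stands for the output of \code{\link{calcAccurateAmongTopK}} with columns \code{pred.prob} and \code{acc}; this is not the function's exact code):
\preformatted{
dt <- dt[order(-pred.prob)]              # most confident predictions first
dt[, true.predictions := cumsum(acc)]
dt[, false.predictions := cumsum(!acc)]
# the plot traces true.predictions against false.predictions
}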
22 | } 23 | \examples{ 24 | # set up data 25 | data(occupations) 26 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 27 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 28 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 29 | "Not precise enough for coding", "Student assistants") 30 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 31 | 32 | ## split sample 33 | set.seed(3451345) 34 | n.test <- 50 35 | group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 36 | splitted.data <- split(proc.occupations, group) 37 | 38 | # train model and make predictions 39 | model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 40 | res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 41 | 42 | # expand to contain more categories than the initial ones 43 | res.proc1 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1") 44 | 45 | # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once. 
46 | res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2") 47 | res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete") 48 | 49 | calcAccurateAmongTopK(res.proc, k = 5)[,mean(acc), by = method.name] 50 | plotTruePredictionsVsFalsePredictions(calcAccurateAmongTopK(res.proc, k = 5)) 51 | plotTruePredictionsVsFalsePredictions(calcAccurateAmongTopK(res.proc, k = 1), filename = "test.pdf") 52 | } 53 | \seealso{ 54 | \code{\link{plotAgreementRateVsProductionRate}}, \code{\link{calcAccurateAmongTopK}} 55 | } 56 | -------------------------------------------------------------------------------- /man/predictCreecysMemoryBasedReasoning.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/predictCreecysMemoryBasedReasoning.R 3 | \name{predictCreecysMemoryBasedReasoning} 4 | \alias{predictCreecysMemoryBasedReasoning} 5 | \title{Predict codes with Creecy's Memory-Based Reasoning model} 6 | \usage{ 7 | predictCreecysMemoryBasedReasoning( 8 | model, 9 | newdata, 10 | tuning = list(k.neighbors = 12, metric = c("SUM", "ERROR", "MAX")) 11 | ) 12 | } 13 | \arguments{ 14 | \item{model}{the output created from \code{\link{trainCreecysMemoryBasedReasoning}}} 15 | 16 | \item{newdata}{either a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}} or a character vector} 17 | 18 | \item{tuning}{a list with elements 19 | \describe{ 20 | \item{k.neighbors}{Number of nearest neighbors to use.} 21 | \item{metric}{\code{metric} determines how to calculate 'nearness'. Setting \code{metric = "MAX"} is not recommended. See Creecy et al. for their reasoning and testing of different metrics.} 22 | }} 23 | } 24 | \value{ 25 | a data.table that provides a confidence score for the most likely category. Unlike other prediction functions in this package, no probabilities for all categories are provided, which makes post-processing a bit more difficult. See examples. 26 | } 27 | \description{ 28 | The function does the same preprocessing as in \code{\link{trainCreecysMemoryBasedReasoning}} and predicts codes with a modified \code{k}-nearest-neighbor approach.
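Because only the top category is returned, the output needs to be converted by hand into the format that the analysis functions expect, e.g. (mirroring the conversion used in the examples below):
\preformatted{
resMem[, list(pred.code, pred.prob = confidence.score,
              acc = pred.code == code, method.name = "Creecy.Sum.12")]
}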
29 | } 30 | \examples{ 31 | # set up data 32 | data(occupations) 33 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 34 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 35 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 36 | "Not precise enough for coding", "Student assistants") 37 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 38 | 39 | ## split sample 40 | set.seed(3451345) 41 | n.test <- 50 42 | group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 43 | splitted.data <- split(proc.occupations, group) 44 | 45 | # train model and make predictions 46 | memModel <- trainCreecysMemoryBasedReasoning(splitted.data$training, 47 | preprocessing = list(stopwords = character(0), stemming = NULL, strPreprocessing = TRUE, removePunct = FALSE)) 48 | testi <- predictCreecysMemoryBasedReasoning(memModel, c("test", "HIWI", "Hilfswissenschaftler"), tuning = list(k.neighbors = 12, metric = c("SUM"))) 49 | testi 50 | 51 | resMem <- predictCreecysMemoryBasedReasoning(memModel, splitted.data$test, tuning = list(k.neighbors = 12, metric = c("SUM"))) 52 | 53 | # Analyze predictions 54 | accuracy(resMem[, list(pred.code, pred.prob = confidence.score, acc = pred.code == code, num.suggested.codes = 1, num.suggested = 1, general.among.top5 = pred.code == "-9999", method.name = "Creecy.Sum.12")], n = n.test) 55 | plotTruePredictionsVsFalsePredictions(resMem[, list(pred.code, pred.prob = confidence.score, acc = pred.code == code, num.suggested.codes = 1, num.suggested = 1, general.among.top5 = pred.code == "-9999", method.name = "Creecy.Sum.12")]) 56 | plotAgreementRateVsProductionRate(resMem[, list(pred.code, pred.prob = confidence.score, acc = pred.code == code, num.suggested.codes = 1, num.suggested = 1, general.among.top5 = pred.code == "-9999", method.name = "Creecy.Sum.12")], n = n.test, yintercept = 0.85) 57 | 58 | ####################################################### 59 | ## RUN A GRID SEARCH (takes some time) 60 | \donttest{ 61 | # create a grid of all tuning combinations to try 62 | model.grid <- data.table(expand.grid(stopwords = c(TRUE, FALSE), stemming = c(FALSE, "de"), metric = c("SUM", "ERROR"), k.neighbors = c(2, 10, 17))) 63 | 64 | # Do grid search 65 | for (i in 1:nrow(model.grid)) { 66 | 67 | res.model <- trainCreecysMemoryBasedReasoning(splitted.data$training, preprocessing = list(stopwords = if (model.grid[i, stopwords]) tm::stopwords("de") else character(0), 68 | stemming = if (model.grid[i, stemming == "de"]) "de" else NULL, 69 | strPreprocessing = TRUE, 70 | removePunct = FALSE)) 71 | 72 | res.proc <- predictCreecysMemoryBasedReasoning(res.model, splitted.data$test, 73 | tuning = list(k.neighbors = model.grid[i,
k.neighbors], 74 | metric = model.grid[i, metric])) 75 | 76 | ac <- accuracy(res.proc[, list(pred.code, pred.prob = confidence.score, acc = pred.code == code, num.suggested.codes = 1, num.suggested = 1, general.among.top5 = pred.code == "-9999", method.name = "Creecy.Sum.12")], n = nrow(splitted.data$test)) 77 | 78 | model.grid[i, acc := ac[, acc]] 79 | model.grid[i, acc.se := ac[, se]] 80 | model.grid[i, acc.N := ac[, N]] 81 | model.grid[i, acc.prob0 := ac[, count.pred.prob0]] 82 | } 83 | 84 | model.grid[order(metric, k.neighbors, stemming)] 85 | } 86 | } 87 | \seealso{ 88 | \code{\link{trainCreecysMemoryBasedReasoning}} 89 | 90 | Creecy, R. H., Masand, B. M., Smith, S. J., Waltz, D. L. (1992). Trading MIPS and Memory for Knowledge Engineering. Comm. ACM 35(8), pp. 48--65. 91 | } 92 | -------------------------------------------------------------------------------- /man/predictGweonsNearestNeighbor.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/predictGweonsNearestNeighbor.R 3 | \name{predictGweonsNearestNeighbor} 4 | \alias{predictGweonsNearestNeighbor} 5 | \title{Predict codes with Gweon's Nearest Neighbor Method} 6 | \usage{ 7 | predictGweonsNearestNeighbor( 8 | model, 9 | newdata, 10 | tuning = list(nearest.neighbors.multiplier = 0.1) 11 | ) 12 | } 13 | \arguments{ 14 | \item{model}{the output created from \code{\link{trainGweonsNearestNeighbor}}} 15 | 16 | \item{newdata}{either a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}} or a character vector} 17 | 18 | \item{tuning}{a list with element 19 | \describe{ 20 | \item{nearest.neighbors.multiplier}{defaults to 0.1. Gweon et al. (2017) show that 0.1 is a better choice than 0 but the exact value is a bit arbitrary.} 21 | }} 22 | } 23 | \value{ 24 | a data.table of class \code{occupationalPredictions} that contains predicted probabilities \code{pred.prob} for every combination of \code{ans} and \code{pred.code}. pred.code may not cover the full set of possible codes. If all predicted codes have probability 0, these predictions are removed and we instead insert \code{pred.code := "-9999"} with \code{pred.prob = 1/num.allowed.codes}. 25 | } 26 | \description{ 27 | The function does the same preprocessing as in \code{\link{trainGweonsNearestNeighbor}} and predicts codes with a modified 1-nearest-neighbor approach.
28 | } 29 | \examples{ 30 | # set up data 31 | data(occupations) 32 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 33 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 34 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 35 | "Not precise enough for coding", "Student assistants") 36 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 37 | 38 | ## split sample 39 | set.seed(3451345) 40 | n.test <- 50 41 | group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 42 | splitted.data <- split(proc.occupations, group) 43 | 44 | # train model and make predictions 45 | model <- trainGweonsNearestNeighbor(splitted.data$train, 46 | preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", strPreprocessing = TRUE, removePunct = FALSE)) 47 | predictGweonsNearestNeighbor(model, c("test", "HIWI", "Hilfswissenschaftler")) 48 | res <- predictGweonsNearestNeighbor(model, splitted.data$test) 49 | 50 | # look at most probable answer from each id 51 | res[, .SD[which.max(pred.prob), list(ans, true.code = code, pred.code, acc = code == pred.code)], by = id] 52 | res[, .SD[which.max(pred.prob), list(ans, true.code = code, pred.code, acc = code == pred.code)], by = id][, mean(acc)] # calculate accuracy of predictions 53 | 54 | # for further analysis we usually require further processing: 55 | produceResults(expandPredictionResults(res, allowed.codes, method.name = "GweonsNearestNeighbor"), k = 1, n = n.test, num.codes = length(allowed.codes)) 56 | } 57 | \seealso{ 58 | \code{\link{trainGweonsNearestNeighbor}} 59 | 60 | Gweon, H., Schonlau, M., Kaczmirek, L., Blohm, M., Steiner, S. (2017). Three Methods for Occupation Coding Based on Statistical Learning. Journal of Official Statistics 33(1), pp. 101--122. 61 | 62 | This function is based on \url{https://github.com/hgweon/occupation-coding/blob/master/Modified_NN.r}. Considerable speed improvements were implemented.
63 | } 64 | -------------------------------------------------------------------------------- /man/predictLogisticRegressionWithPenalization.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/predictLogisticRegressionWithPenalization.R 3 | \name{predictLogisticRegressionWithPenalization} 4 | \alias{predictLogisticRegressionWithPenalization} 5 | \title{Predict codes using a logistic regression model} 6 | \usage{ 7 | predictLogisticRegressionWithPenalization( 8 | model, 9 | newdata, 10 | lambda = min(model$lambda) 11 | ) 12 | } 13 | \arguments{ 14 | \item{model}{the output created from \code{\link{trainLogisticRegressionWithPenalization}}} 15 | 16 | \item{newdata}{either a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}} or a character vector} 17 | 18 | \item{lambda}{see \code{\link[glmnet]{glmnet}}} 19 | } 20 | \value{ 21 | a data.table of class \code{occupationalPredictions} that contains predicted probabilities \code{pred.prob} for every combination of \code{ans} and \code{pred.code}. \code{pred.code} may not cover the full set of possible codes. 22 | } 23 | \description{ 24 | Function does the same preprocessing as in \code{\link{trainLogisticRegressionWithPenalization}} and calls the glmnet predict-function. 25 | } 26 | \details{ 27 | (Not confirmed:) Since overfitting is not an issue with occupational data, our experience is that the smaller \code{lambda}, the better the predictions. Check whether this holds for your data using \code{deviance(model)}. 28 | } 29 | \examples{ 30 | # set up data 31 | data(occupations) 32 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 33 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 34 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 35 | "Not precise enough for coding", "Student assistants") 36 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 37 | 38 | ## split sample 39 | set.seed(3451345) 40 | n.test <- 50 41 | group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 42 | splitted.data <- split(proc.occupations, group) 43 | 44 | # train model and make predictions 45 | model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 46 | predictLogisticRegressionWithPenalization(model, c("test", "HIWI", "Hilfswissenschaftler")) 47 | res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 48 | 49 | # look at most probable answer from each id 50 | res[,
.SD[which.max(pred.prob), list(ans, true.code = code, pred.code, acc = code == pred.code)], by = id] 51 | res[, .SD[which.max(pred.prob), list(ans, true.code = code, pred.code, acc = code == pred.code)], by = id][, mean(acc)] # calculate accuracy of predictions 52 | 53 | # for further analysis we usually require further processing: 54 | produceResults(expandPredictionResults(res, allowed.codes, method.name = "LogisticRegression"), k = 1, n = n.test, num.codes = length(allowed.codes)) 55 | 56 | ####################################################### 57 | ## RUN A GRID SEARCH (takes some time) 58 | \donttest{ 59 | model.grid <- data.table(expand.grid(stopwords = c(TRUE, FALSE), stemming = c(FALSE, "de"), countWords = c(TRUE, FALSE), alpha = c(0, 0.05, 0.2), thresh = 1e-4, stringsAsFactors = FALSE)) 60 | 61 | # save results here 62 | model.grid2 <- rbind(model.grid, model.grid, model.grid, model.grid) 63 | model.grid2[, lambda_ind := rep(c(50, 70, 80, 100), each = nrow(model.grid))] 64 | 65 | # Do grid search 66 | for (i in 1:nrow(model.grid)) { 67 | res.model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = if (model.grid[i, stopwords]) tm::stopwords("de") else character(0), 68 | stemming = if (model.grid[i, stemming == "de"]) "de" else NULL, 69 | countWords = model.grid[i, countWords]), 70 | tuning = list(alpha = model.grid[i, alpha], 71 | maxit = 10^6, nlambda = 100, thresh = model.grid[i, thresh])) 72 | 73 | for (j in 1:4) { # loop over all lambda_ind-values (c(50, 70, 80, 100)) 74 | # if glmnet does not converge, we want to use lambda_ind = length(res.model$lambda) 75 | model.grid2[nrow(model.grid)*(j-1) + i, lambda_ind := min(lambda_ind, length(res.model$lambda))] 76 | lambdav <- res.model$lambda[model.grid2[nrow(model.grid)*(j-1) + i, lambda_ind]] 77 | res.proc <- expandPredictionResults(predictLogisticRegressionWithPenalization(res.model, splitted.data$test, lambda = lambdav), allowed.codes = allowed.codes, method.name = paste0("glmnet.elnet.Stopwords=", model.grid[i, stopwords], "Stemming=", model.grid[i, stemming], "Countwords=", model.grid[i, countWords], "Lambda=", lambdav)) 78 | 79 | ac <- accuracy(calcAccurateAmongTopK(res.proc, k = 1), n = nrow(splitted.data$test)) 80 | ll <- logLoss(res.proc) 81 | sh <- sharpness(res.proc) 82 | 83 | model.grid2[nrow(model.grid)*(j-1) + i, lambda := lambdav] 84 | model.grid2[nrow(model.grid)*(j-1) + i, acc := ac[, acc]] 85 | model.grid2[nrow(model.grid)*(j-1) + i, acc.se := ac[, se]] 86 | model.grid2[nrow(model.grid)*(j-1) + i, acc.N := ac[, N]] 87 | model.grid2[nrow(model.grid)*(j-1) + i, acc.prob0 := ac[, count.pred.prob0]] 88 | model.grid2[nrow(model.grid)*(j-1) + i, loss.full := ll[1, logscore]] 89 | model.grid2[nrow(model.grid)*(j-1) + i, loss.full.se := ll[1, se]] 90 | model.grid2[nrow(model.grid)*(j-1) + i, loss.full.N := ll[1, N]] 91 | model.grid2[nrow(model.grid)*(j-1) + i, loss.sub := ll[2, logscore]] 92 | model.grid2[nrow(model.grid)*(j-1) + i, loss.sub.se := ll[2, se]] 93 | model.grid2[nrow(model.grid)*(j-1) + i, loss.sub.N := ll[2, N]] 94 | model.grid2[nrow(model.grid)*(j-1) + i, sharp := sh[, sharpness]] 95 | model.grid2[nrow(model.grid)*(j-1) + i, sharp.se := sh[, se]] 96 | model.grid2[nrow(model.grid)*(j-1) + i, sharp.N := sh[, N]] 97 | } 98 | } 99 | 100 | # how do alpha and lambda behave?
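# (note: glmnet returns lambda as a decreasing sequence, so a larger lambda_ind picks a smaller lambda, i.e. a weaker penalty)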
101 | model.grid2[order(alpha, -lambda)][stopwords == FALSE & stemming == FALSE & countWords == FALSE,] 102 | # Pick one combination of alpha and lambda and explore the influence of stopwords, stemming, and countWords 103 | model.grid2[alpha == 0.05 & lambda < 0.03][order(-lambda, stemming)] 104 | 105 | } 106 | } 107 | \seealso{ 108 | \code{\link{trainLogisticRegressionWithPenalization}}, \code{\link[glmnet]{glmnet}} 109 | } 110 | -------------------------------------------------------------------------------- /man/predictWithCodingIndex.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/predictWithCodingIndex.R 3 | \name{predictWithCodingIndex} 4 | \alias{predictWithCodingIndex} 5 | \title{Code answers with a coding index} 6 | \usage{ 7 | predictWithCodingIndex( 8 | newdata, 9 | coding_index, 10 | include.substrings = FALSE, 11 | max.count.categories = Inf 12 | ) 13 | } 14 | \arguments{ 15 | \item{newdata}{either a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}} or a character vector.} 16 | 17 | \item{coding_index}{a data.table as created with function \code{\link{prepare_German_coding_index_Gesamtberufsliste_der_BA}}} 18 | 19 | \item{include.substrings}{(default: \code{FALSE}). If \code{FALSE}, a match is found if, after preprocessing, the entry from the coding index and the string-element are exactly identical. If \code{TRUE} (attention: this is slow!), a match is found if, after preprocessing, the entry from the coding index is a substring of the string-element.} 20 | 21 | \item{max.count.categories}{(default: \code{Inf}). Should we search the whole coding index (default) or should we exclude entries with large \code{count_categories}, an indicator of job title ambiguity? Only entries in the coding index with \code{count_categories \eqn{\le} max.count.categories} are searched.} 22 | } 23 | \value{ 24 | a data.table with columns id, ans, and pred.code (format is not comparable to other formats in this package.) 25 | } 26 | \description{ 27 | Look up the correct code in a coding index. This way we often find no code, exactly one code, or sometimes more than one possible code.
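When several entries match, the codes are returned concatenated within \code{pred.code} (the examples below count them via \code{nchar(pred.code)}). A minimal sketch of unnesting such multi-code predictions; the comma separator is an assumption inferred from the 6-character stride in the examples, not a documented guarantee:

res <- predictWithCodingIndex(proc.occupations, coding_index = coding_index_excerpt)
# one row per candidate code (separator assumed to be ",")
res[!is.na(pred.code), list(cand.code = unlist(strsplit(pred.code, ",", fixed = TRUE))), by = id]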
28 | } 29 | \examples{ 30 | # set up data 31 | data(occupations) 32 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 33 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 34 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 35 | "Not precise enough for coding", "Student assistants") 36 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 37 | 38 | # recommended default 39 | res <- predictWithCodingIndex(proc.occupations, 40 | coding_index = coding_index_excerpt, 41 | include.substrings = FALSE, 42 | max.count.categories = Inf) 43 | 44 | # playing around with the parameters to obtain other results 45 | res <- predictWithCodingIndex(proc.occupations, 46 | coding_index = coding_index_excerpt, 47 | include.substrings = TRUE, 48 | max.count.categories = 15) 49 | 50 | ################################# 51 | # Analysis: Standard functions from this package won't work here. 52 | # Absolute numbers: either nothing is predicted (nPredictedCodes = NA), or one or more codes are predicted 53 | res[ , .N, by = list(nPredictedCodes = 1 + nchar(pred.code) \%/\% 6 )] 54 | # Relative numbers 55 | res[ , .N / res[, .N], by = list(nPredictedCodes = 1 + nchar(pred.code) \%/\% 6 )] 56 | # Agreement rate among answers where only a single code was predicted 57 | res[nchar(pred.code) == 5, mean(pred.code == code)] 58 | } 59 | \seealso{ 60 | \code{\link{predictSimilarityBasedReasoning}} 61 | } 62 | -------------------------------------------------------------------------------- /man/prepare_German_coding_index_Gesamtberufsliste_der_BA.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in 3 | % R/prepare_German_coding_index_Gesamtberufsliste_der_BA.R 4 | \name{prepare_German_coding_index_Gesamtberufsliste_der_BA} 5 | \alias{prepare_German_coding_index_Gesamtberufsliste_der_BA} 6 | \title{Prepares the Gesamtberufsliste der BA to be used with this package} 7 | \usage{ 8 | prepare_German_coding_index_Gesamtberufsliste_der_BA( 9 | path_to_file, 10 | count.categories = FALSE 11 | ) 12 | } 13 | \arguments{ 14 | \item{path_to_file}{path to downloaded file} 15 | 16 | \item{count.categories}{(default: \code{FALSE}). The default sets the column \code{count_categories} to 0, avoiding lengthy computations. With \code{count.categories = TRUE} an indicator of job title ambiguity is calculated. See below.} 17 | } 18 | \value{ 19 | a data.table with columns Berufsbenennungen, bezMale, bezFemale, Code, and count_categories. 20 | } 21 | \description{ 22 | To use this function go to \url{https://download-portal.arbeitsagentur.de/files/} and download the file \code{Gesamtberufsliste_der_BA.xlsx}.
This function prepares a coding index from the Excel file, useful for other functions in this package. 23 | } 24 | \details{ 25 | Part of the German Classification of Occupations (KldB 2010) is an alphabetic directory of job titles, available from \url{https://statistik.arbeitsagentur.de/Navigation/Statistik/Grundlagen/Klassifikation-der-Berufe/KldB2010/Systematik-Verzeichnisse/Systematik-Verzeichnisse-Nav.html}. Almost 28,000 job titles are listed in the alphabetic directory along with their associated codes. However, using the alphabetic directory for automatic coding is difficult because it consists only of neutral job titles (e.g. Digital-Media-Designer/in). They are neutral because the '/' divides the male and female spelling. However, people will never use neutral titles that contain a '/' when they describe their own job. An alternative is the \code{Gesamtberufsliste_der_BA.xlsx}, which contains separate male and female job titles next to the neutral title. The Gesamtberufsliste is used for yearly updates of the alphabetic directory, entailing that both documents are mostly identical. 26 | 27 | This function loads the \code{Gesamtberufsliste_der_BA.xlsx} into memory. It corrects some errors in the Gesamtberufsliste, may remove a few entries, renames some column names, calculates an indicator of job title ambiguity, and, most importantly, it removes parentheses and the text between them. The resulting data.table can be used with the function \code{\link{predictWithCodingIndex}} for coding. If coded training data from earlier studies is available, \code{\link{trainSimilarityBasedReasoning}} is another option for using this coding index. 28 | 29 | With \code{count.categories = TRUE} an indicator of job title ambiguity is calculated. See the source code for examples of what exactly is calculated. This indicator is only needed if one wishes to change the default behaviour in \code{\link{predictWithCodingIndex}} by setting \code{max.count.categories}. This parameter allows excluding ambiguous job titles (those with large values of \code{count_categories}) from the coding index. 30 | } 31 | \examples{ 32 | \dontrun{ 33 | # Illustrative results from this function (if you have not downloaded the file yet).
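# coding_index_excerpt ships with this package and illustrates the structure returned by this function: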
34 | View(coding_index_excerpt) 35 | 36 | # point path_to_file to your local file 37 | path_to_file <- ".../Gesamtberufsliste_der_BA.xlsx" 38 | coding_index <- prepare_German_coding_index_Gesamtberufsliste_der_BA(path_to_file, count.categories = FALSE) 39 | } 40 | } 41 | \seealso{ 42 | \code{\link{coding_index_excerpt}}, \code{\link{predictWithCodingIndex}}, \code{\link{trainSimilarityBasedReasoning}} 43 | } 44 | -------------------------------------------------------------------------------- /man/produceResults.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/produceResults.R 3 | \name{produceResults} 4 | \alias{produceResults} 5 | \title{Produces summaries of predictive performance} 6 | \usage{ 7 | produceResults(occupationalPredictions, k = 1, n, num.codes) 8 | } 9 | \arguments{ 10 | \item{occupationalPredictions}{a data.table created with the \code{\link{expandPredictionResults}} function from this package.} 11 | 12 | \item{k}{how many of the top categories to aggregate over} 13 | 14 | \item{n}{Number of unique persons in test data (may be larger than the number of persons in occupationalPredictionsAmongTopK if for some persons no codes were suggested)} 15 | 16 | \item{num.codes}{Number of allowed categories in classification} 17 | } 18 | \value{ 19 | a data.table 20 | } 21 | \description{ 22 | Produces \code{\link{accuracy}}, \code{\link{plotReliabilityDiagram}}, \code{\link{sharpness}}, \code{\link{logLoss}}, \code{\link{plotTruePredictionsVsFalsePredictions}}, and \code{\link{plotAgreementRateVsProductionRate}}. 23 | } 24 | \examples{ 25 | # set up data 26 | data(occupations) 27 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 28 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 29 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 30 | "Not precise enough for coding", "Student assistants") 31 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 32 | 33 | ## split sample 34 | set.seed(3451345) 35 | n.test <- 50 36 | group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 37 | splitted.data <- split(proc.occupations, group) 38 | 39 | # train model and make predictions 40 | model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 41 | res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 42 | 43 | # expand to contain more categories than the initial ones 44 | res.proc1 <- expandPredictionResults(res, allowed.codes =
c("12345", allowed.codes), method.name = "glmnet1") 45 | 46 | # aggregate over top k categories 47 | calcAccurateAmongTopK(res.proc1, k = 1)[,mean(acc)] 48 | calcAccurateAmongTopK(res.proc1, k = 5)[,mean(acc)] 49 | 50 | # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once. 51 | res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2") 52 | res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete") 53 | 54 | produceResults(res.proc, k = 1, n = n.test, num.codes = length(allowed.codes) + 1) 55 | } 56 | \seealso{ 57 | \code{\link{calcAccurateAmongTopK}} 58 | } 59 | -------------------------------------------------------------------------------- /man/removeFaultyAndUncodableAnswers_And_PrepareForAnalysis.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in 3 | % R/removeFaultyAndUncodableAnswers_And_PrepareForAnalysis.R 4 | \name{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis} 5 | \alias{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis} 6 | \title{Data Preparation} 7 | \usage{ 8 | removeFaultyAndUncodableAnswers_And_PrepareForAnalysis( 9 | data, 10 | colNames = c("answer", "code"), 11 | allowed.codes, 12 | allowed.codes.titles = 1:length(allowed.codes) 13 | ) 14 | } 15 | \arguments{ 16 | \item{allowed.codes}{a vector of allowed codes from the classification.} 17 | 18 | \item{allowed.codes.titles}{Labels for \code{allowed.codes}. Should have the same length as \code{allowed.codes}.} 19 | 20 | \item{answers}{a character vector of answers} 21 | 22 | \item{codes}{a vector of classification codes having the same length as \code{answers}. Will be transformed to character.} 23 | } 24 | \value{ 25 | a data.table with attributes \code{classification} and \code{overview_tally} 26 | } 27 | \description{ 28 | Prepare data (i.e. columns 'id', 'ans', and 'code' are appended to the dataset - only these columns will be used later) and remove answer that we cannot use (i.e. answers that have non-ASCII characters after preprocessing and answers that are at most one character long). During data preparation you should make sure that nothing important is lost here. 29 | } 30 | \details{ 31 | The 2010 German classification is available at \url{https://www.klassifikationsserver.de/}. 
32 | } 33 | \examples{ 34 | occupations <- data.table(answers = c("LEITER VERTRIEB", "Kfz-Schlossermeister", "Aushilfe im Hotel(Bereich Housekeeping)"), 35 | codes = c("61194", "25213", "63221")) 36 | (allowed.codes <- c("11101", "61194", "25213", "63221", "...")) 37 | (allowed.codes.titles <- c("Berufe in der Landwirtschaft (ohne Spezialisierung) - Helfer-/Anlernt\xe4tigkeiten", "Berufe in der Kraftfahrzeugtechnik - komplexe Spezialistent\xe4tigkeiten", "F\xfchrungskr\xe4fte - Einkauf und Vertrieb", "Berufe im Hotelservice - Helfer-/Anlernt\xe4tigkeiten", "many more category labels from the classification")) 38 | removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("answers", "codes"), allowed.codes, allowed.codes.titles) 39 | 40 | data(occupations) 41 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 42 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 43 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 44 | "Not precise enough for coding", "Student assistants") 45 | removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 46 | 47 | ## we could also paste both answers together 48 | occupations[, answer_combined := paste(orig_answer, orig_answer2)] 49 | removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("answer_combined", "orig_code"), allowed.codes, allowed.codes.titles) 50 | } 51 | \seealso{ 52 | \code{\link{createDescriptives}} 53 | } 54 | -------------------------------------------------------------------------------- /man/selectMaxProbMethod.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/selectMaxProbMethod.R 3 | \name{selectMaxProbMethod} 4 | \alias{selectMaxProbMethod} 5 | \title{From multiple prediction methods, select for each id the prediction method that returns the highest probability} 6 | \usage{ 7 | selectMaxProbMethod( 8 | occupationalPredictions, 9 | combined.methods = c("xgboost", "SimilarityBasedSubstring", 10 | "SimilarityBasedWordwise"), 11 | k = 1, 12 | new.method.name = "maxProbAmong1" 13 | ) 14 | } 15 | \arguments{ 16 | \item{occupationalPredictions}{a data.table created with the \code{\link{expandPredictionResults}} function from this package. This function is only useful if we \code{rbind} several such data.tables together (see example).} 17 | 18 | \item{combined.methods}{a character vector of methods to select from.
We will only use the subset of rows from \code{occupationalPredictions} with \code{method.name \%in\% combined.methods} (same names as assigned in \code{\link{expandPredictionResults}}).} 19 | 20 | \item{k}{Calculate probability over \code{k} most probable categories.} 21 | 22 | \item{new.method.name}{the name to be assigned to the highest-probability method.} 23 | } 24 | \value{ 25 | a data.table 26 | } 27 | \description{ 28 | Start with a data.table of class 'occupationalPredictionsComplete' (one prediction for each combination of pred.code and answer, ideally from multiple prediction methods), calculate for each id the probability of the top k categories, and select for each id the prediction method that returns the highest probability. The method found this way is then used for this id and is called \code{new.method.name}. 29 | } 30 | \details{ 31 | The problem solved here is this: \code{\link{trainXgboost}} is good for most answers and for interactions. But xgboost fails if a keyword was misspelled or if a job title is in the alphabetic dictionary but not in the training data. In those cases we would like to use a prediction method from \code{\link{trainSimilarityBasedReasoning}}, which will return higher probabilities. 32 | } 33 | \examples{ 34 | # set up data 35 | data(occupations) 36 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 37 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 38 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 39 | "Not precise enough for coding", "Student assistants") 40 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 41 | 42 | ## split sample 43 | set.seed(3451345) 44 | n.test <- 50 45 | group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 46 | splitted.data <- split(proc.occupations, group) 47 | attr(splitted.data$training, "classification")$code <- attr(proc.occupations, "classification")$code 48 | 49 | ####### train models 50 | # first model uses dist.type = wordwise and some other recommended settings (n.draws could be higher) 51 | simBasedModel <- trainSimilarityBasedReasoning(data = splitted.data$training, 52 | coding_index_w_codes = coding_index_excerpt, 53 | coding_index_without_codes = frequent_phrases, 54 | preprocessing = list(stopwords = tm::stopwords("de"), stemming = NULL, strPreprocessing = TRUE, removePunct = FALSE), 55 | dist.type = "wordwise", 56 | dist.control = list(method = "osa", weight = c(d = 1, i = 1, s = 1, t = 1)), 57 | threshold = c(max = 3, use = 1), simulation.control = list(n.draws = 50, check.normality = FALSE), 58 | tmp_folder = "similarityTables") 59 | 60 | res1 <- expandPredictionResults(predictSimilarityBasedReasoning(simBasedModel, splitted.data$test),
allowed.codes, method.name = "WordwiseSimilarityOsa1111") 61 | 62 | # second model uses dist.type = substring and some other recommended settings (n.draws could be higher) 63 | simBasedModel <- trainSimilarityBasedReasoning(data = splitted.data$training, 64 | coding_index_w_codes = coding_index_excerpt, 65 | coding_index_without_codes = frequent_phrases, 66 | preprocessing = list(stopwords = NULL, stemming = NULL, strPreprocessing = TRUE, removePunct = FALSE), 67 | dist.type = "substring", 68 | dist.control = list(method = "substring", weight = numeric()), 69 | threshold = c(0, 0), simulation.control = list(n.draws = 50, check.normality = FALSE), 70 | tmp_folder = "similarityTables") 71 | 72 | res2 <- expandPredictionResults(predictSimilarityBasedReasoning(simBasedModel, splitted.data$test, parallel = TRUE), allowed.codes, method.name = "substringSimilarity") 73 | 74 | # third model uses dist.type = fulltext and some other recommended settings (n.draws could be higher) 75 | simBasedModel <- trainSimilarityBasedReasoning(data = proc.occupations, 76 | coding_index_w_codes = coding_index_excerpt, 77 | coding_index_without_codes = frequent_phrases, 78 | preprocessing = list(stopwords = NULL, stemming = NULL, strPreprocessing = TRUE, removePunct = FALSE), 79 | dist.type = "fulltext", 80 | dist.control = list(method = "osa", weight = c(d = 1, i = 1, s = 1, t = 1)), 81 | threshold = c(max = 3, use = 1), simulation.control = list(n.draws = 50, check.normality = FALSE), 82 | tmp_folder = "similarityTables") 83 | res3 <- expandPredictionResults(predictSimilarityBasedReasoning(simBasedModel, splitted.data$test), allowed.codes, method.name = "FulltextSimilarityOsa1111") 84 | 85 | res.combined <- rbind(res1, res2, res3); class(res.combined) <- class(res1) 86 | 87 | res.max <- selectMaxProbMethod(res.combined, combined.methods = c("WordwiseSimilarityOsa1111", "substringSimilarity"), k = 1, new.method.name = "maxProbAmong1") 88 | res.combined <- rbind(res.combined, res.max); class(res.combined) <- class(res1) 89 | produceResults(res.combined, k = 1, n = n.test, num.codes = length(allowed.codes)) 90 | } 91 | -------------------------------------------------------------------------------- /man/sharpness.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sharpness.R 3 | \name{sharpness} 4 | \alias{sharpness} 5 | \title{Sharpness} 6 | \usage{ 7 | sharpness(occupationalPredictions) 8 | } 9 | \arguments{ 10 | \item{occupationalPredictions}{a data.table created with the \code{\link{expandPredictionResults}} function from this package.} 11 | } 12 | \value{ 13 | a data.table 14 | } 15 | \description{ 16 | Calculate sharpness \eqn{sharpness = \frac{1}{N} \sum_n entropy_n} and standard error \eqn{\sqrt{\frac{1}{N(N-1)} \sum_n (entropy_n - sharpness)^2}} with \eqn{entropy_n = - \sum_k p_{nk} \log_2 p_{nk}} 17 | } 18 | \details{ 19 | Sharpness is zero (optimal) if a single category is predicted with probability 1. It is maximal if all categories have equal probability \eqn{p = \frac{1}{K}}. 20 | 21 | Note: What should be done if a predicted probability is zero? \eqn{0 \times log(0)} is not defined but necessary to calculate sharpness. We set \eqn{0 \times log(0) = 0}. This also means we exclude observations from our analysis if all probabilities are predicted as zero. An alternative could be to set such zeros to \eqn{1/K}, which would lead to very different sharpness.
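As a hand-computed illustration of this convention: with \eqn{K = 4} categories and predicted probabilities \eqn{(0.5, 0.5, 0, 0)}, the two zero terms drop out and \eqn{entropy = -(0.5 \log_2 0.5 + 0.5 \log_2 0.5) = 1} bit, whereas the uniform prediction \eqn{(0.25, 0.25, 0.25, 0.25)} attains the maximum \eqn{\log_2 4 = 2} bits.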
22 | } 23 | \examples{ 24 | # set up data 25 | data(occupations) 26 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 27 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 28 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 29 | "Not precise enough for coding", "Student assistants") 30 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 31 | 32 | ## split sample 33 | set.seed(3451345) 34 | n.test <- 50 35 | group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 36 | splitted.data <- split(proc.occupations, group) 37 | 38 | # train model and make predictions 39 | model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 40 | res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 41 | 42 | # expand to contain more categories than the initial ones 43 | res.proc1 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1") 44 | 45 | # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once. 46 | res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2") 47 | res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete") 48 | 49 | sharpness(res.proc) 50 | } 51 | \seealso{ 52 | \code{\link{plotReliabilityDiagram}}, \code{\link{logLoss}} 53 | } 54 | -------------------------------------------------------------------------------- /man/stringPreprocessing.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stringPreprocessing.R 3 | \encoding{UTF-8} 4 | \name{stringPreprocessing} 5 | \alias{stringPreprocessing} 6 | \title{Preprocess German occupational text} 7 | \usage{ 8 | stringPreprocessing(verbatim, lang = "de") 9 | } 10 | \arguments{ 11 | \item{verbatim}{a character vector.} 12 | 13 | \item{lang}{(default: \code{"de"}). Everything else will throw an error.} 14 | } 15 | \value{ 16 | the same character vector after processing 17 | } 18 | \description{ 19 | Function replaces some common characters / character sequences (e.g., Ä, Ü, "DIPL.-ING.") with standardized uppercase equivalents and removes punctuation, empty spaces, and the word "Diplom". 20 | } 21 | \details{ 22 | \code{\link{charToRaw}} helps to find UTF-8 characters.
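For instance, \code{charToRaw("B\xe4cker")} reveals the byte \code{e4} (the latin-1 encoding of 'ä'), which helps to spot characters that still need special handling.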
23 | } 24 | \examples{ 25 | (x <- c("Verkauf von B\xfcchern, Schreibwaren", "Fach\xe4rztin f\xfcr Kinder- und Jugendmedizin im \xf6ffentlichen Gesundheitswesen", "Industriemechaniker", "Dipl.-Ing. - Agrarwirtschaft (Landwirtschaft)")) 26 | stringPreprocessing(x) 27 | } 28 | -------------------------------------------------------------------------------- /man/surveyCountsSubstringSimilarity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \encoding{UTF-8} 5 | \name{surveyCountsSubstringSimilarity} 6 | \alias{surveyCountsSubstringSimilarity} 7 | \title{Anonymized training data (substring similarity) to be used with Similarity-based Reasoning} 8 | \format{ 9 | A data.table with 39242 rows and 3 variables: 10 | \describe{ 11 | \item{dictString}{Job titles (and similar terms). They were either taken from the \code{Gesamtberufsliste_der_BA} or from \code{\link{frequent_phrases}}} 12 | \item{survCode}{5-digit codes from the survey data} 13 | \item{N}{Frequency of how often a survey text identical or similar to \code{dictString} was coded as \code{survCode} (using substring-similarity)} 14 | } 15 | 16 | Substring-similarity means that, to be counted, the survey response must contain the text in dictString, i.e., dictString must be identical to, or a substring of, the original text provided by the respondent. 17 | } 18 | \source{ 19 | Data from the following surveys were pooled: 20 | 21 | Antoni, M., Drasch, K., Kleinert, C., Matthes, B., Ruland, M. and Trahms, A. (2010): Arbeiten und Lernen im Wandel * Teil 1: Überblick über die Studie, FDZ-Methodenreport 05/2010, Forschungsdatenzentrum der Bundesagentur für Arbeit im Institut für Arbeitsmarkt- und Berufsforschung, Nuremberg. 22 | 23 | Rohrbach-Schmidt, D., Hall, A. (2013): BIBB/BAuA Employment Survey 2012, BIBB-FDZ Data and Methodological Reports Nr. 1/2013. Version 4.1, Federal Institute for Vocational Education and Training (Research Data Centre), Bonn. 24 | 25 | Lange, C., Finger, J., Allen, J., Born, S., Hoebel, J., Kuhnert, R., Müters, S., Thelen, J., Schmich, P., Varga, M., von der Lippe, E., Wetzstein, M., Ziese, T. (2017): Implementation of the European Health Interview Survey (EHIS) into the German Health Update (GEDA), Archives of Public Health, 75, 1–14. 26 | 27 | Hoffmann, R., Lange, M., Butschalowsky, H., Houben, R., Schmich, P., Allen, J., Kuhnert, R., Schaffrath Rosario, A., Gößwald, A. (2018): KiGGS Wave 2 Cross-Sectional Study—Participant Acquisition, Response Rates and Representativeness, Journal of Health Monitoring, 3, 78–91. (only wave 2) 28 | 29 | Trappmann, M., Beste, J., Bethmann, A., Müller, G. (2013): The PASS Panel Survey after Six Waves, Journal for Labour Market Research, 46, 275–281. (only wave 10) 30 | 31 | Job titles were taken from the following publication: 32 | 33 | Bundesagentur für Arbeit (2019). Gesamtberufsliste der Bundesagentur für Arbeit. Stand: 03.01.2019. The \code{Gesamtberufsliste der BA} is available at \url{https://download-portal.arbeitsagentur.de/files/}. 34 | } 35 | \usage{ 36 | surveyCountsSubstringSimilarity 37 | } 38 | \description{ 39 | This aggregated (anonymized) training data is to be used within the \code{\link{trainSimilarityBasedReasoning2}}-function (\code{dist.type = "substring"}), see the documentation therein. It allows the coding of German-language occupations into the German Classification of Occupations (KldB 2010).
40 | } 41 | \examples{ 42 | ## what follows is the source code used to create this data set 43 | ## 44 | 45 | # load toy example data 46 | data(occupations) # toy example, the five data sets cited above were used instead 47 | # In addition to codes from the 2010 German Classification of Occupations, our data make use of the following special codes: 48 | (special_codes <- c("-0004" = "Precise coding not possible", "-0006" = "Multiple jobs", "-0012" = "Blue-collar worker", "-0030" = "Student employee/assistant, work placement student, research assistant", "-0019" = "Federal volunteer service, voluntary social year (FSJ), civil service")) 49 | data(coding_index_excerpt) # toy example, the Gesamtberufsliste was used instead. After running ?prepare_German_coding_index_Gesamtberufsliste_der_BA, our version of the coding index had 27853 entries. 50 | data(frequent_phrases) 51 | 52 | # prepare coding index for our purposes 53 | coding_index <- coding_index_excerpt[!(Berufsbenennungen \%in\% c("Bundeskanzler/in", "Bundespräsident/in", "Admiral", "General"))] # remove very rare occupations that might violate privacy regulations 54 | coding_index_w_codes <- rbind(coding_index[, list(title = bezMale, Code)], coding_index[, list(title = bezFemale, Code)]) 55 | coding_index_w_codes <- coding_index_w_codes[,title := stringPreprocessing(title)] 56 | coding_index_w_codes <- coding_index_w_codes[!duplicated(title)] 57 | 58 | # prepare the training data (special codes were harmonized in advance) 59 | training_data <- occupations[, .(answer = stringPreprocessing(orig_answer), code = orig_code)] 60 | 61 | # trick to save time: do this only once for each unique string and merge later 62 | similarityTableSubstring <- createSimilarityTableSubstring(unique.string = unique(training_data$answer), 63 | coding_index_w_codes = coding_index_w_codes, 64 | coding_index_without_codes = frequent_phrases) 65 | similarityTableSubstring2 <- rbind(similarityTableSubstring$dist_table_w_code[, .(intString, dictString = dictString.title)], 66 | similarityTableSubstring$dist_table_without_code[, .(intString, dictString)]) 67 | surveyCountsSubstringSimilarity_toyExample <- merge(training_data[, .(answer, survCode = code)], similarityTableSubstring2, by.x = "answer", by.y = "intString", allow.cartesian = TRUE)[, .N, by = list(dictString, survCode)][order(dictString)] 68 | 69 | } 70 | \seealso{ 71 | See \code{\link{trainSimilarityBasedReasoning2}}, for which this data set was created, and \code{\link{surveyCountsWordwiseSimilarity}}, which has been created the same way but uses a different metric to calculate string similarities. 72 | } 73 | \keyword{datasets} 74 | -------------------------------------------------------------------------------- /man/surveyCountsWordwiseSimilarity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \encoding{UTF-8} 5 | \name{surveyCountsWordwiseSimilarity} 6 | \alias{surveyCountsWordwiseSimilarity} 7 | \title{Anonymized training data (wordwise similarity) to be used with Similarity-based Reasoning} 8 | \format{ 9 | A data.table with 26710 rows and 3 variables: 10 | \describe{ 11 | \item{dictString}{Job titles (and similar terms).
They were either taken from the \code{Gesamtberufsliste_der_BA} or from \code{\link{frequent_phrases}}} 12 | \item{survCode}{5-digit codes from the survey data} 13 | \item{N}{Frequency of how often a survey text identical or similar to \code{dictString} was coded as \code{survCode} (using wordwise-similarity)} 14 | } 15 | 16 | Wordwise-similarity means that, to be counted, the verbal survey answer must be similar to dictString; more specifically, dictString must match at least one word in the survey response (a difference of at most one character is allowed to account for spelling errors). 17 | } 18 | \source{ 19 | Data from the following surveys were pooled: 20 | 21 | Antoni, M., Drasch, K., Kleinert, C., Matthes, B., Ruland, M. and Trahms, A. (2010): Arbeiten und Lernen im Wandel * Teil 1: Überblick über die Studie, FDZ-Methodenreport 05/2010, Forschungsdatenzentrum der Bundesagentur für Arbeit im Institut für Arbeitsmarkt- und Berufsforschung, Nuremberg. 22 | 23 | Rohrbach-Schmidt, D., Hall, A. (2013): BIBB/BAuA Employment Survey 2012, BIBB-FDZ Data and Methodological Reports Nr. 1/2013. Version 4.1, Federal Institute for Vocational Education and Training (Research Data Centre), Bonn. 24 | 25 | Lange, C., Finger, J., Allen, J., Born, S., Hoebel, J., Kuhnert, R., Müters, S., Thelen, J., Schmich, P., Varga, M., von der Lippe, E., Wetzstein, M., Ziese, T. (2017): Implementation of the European Health Interview Survey (EHIS) into the German Health Update (GEDA), Archives of Public Health, 75, 1–14. 26 | 27 | Hoffmann, R., Lange, M., Butschalowsky, H., Houben, R., Schmich, P., Allen, J., Kuhnert, R., Schaffrath Rosario, A., Gößwald, A. (2018): KiGGS Wave 2 Cross-Sectional Study—Participant Acquisition, Response Rates and Representativeness, Journal of Health Monitoring, 3, 78–91. (only wave 2) 28 | 29 | Trappmann, M., Beste, J., Bethmann, A., Müller, G. (2013): The PASS Panel Survey after Six Waves, Journal for Labour Market Research, 46, 275–281. (only wave 10) 30 | 31 | Job titles were taken from the following publication: 32 | 33 | Bundesagentur für Arbeit (2019). Gesamtberufsliste der Bundesagentur für Arbeit. Stand: 03.01.2019. The \code{Gesamtberufsliste der BA} is available at \url{https://download-portal.arbeitsagentur.de/files/}. 34 | } 35 | \usage{ 36 | surveyCountsWordwiseSimilarity 37 | } 38 | \description{ 39 | This aggregated (anonymized) training data is to be used within the \code{\link{trainSimilarityBasedReasoning2}}-function (\code{dist.type = "wordwise", dist.control = list(method = "osa", weight = c(d = 1, i = 1, s = 1, t = 1)), threshold = c(max = NA, use = 1))}, see the documentation therein. It allows the coding of German-language occupations into the German Classification of Occupations (KldB 2010). 40 | } 41 | \examples{ 42 | ## what follows is the source code used to create this data set 43 | ## 44 | 45 | # load toy example data 46 | data(occupations) # toy example, the five data sets cited above were used instead 47 | # In addition to codes from the 2010 German Classification of Occupations, our data make use of the following special codes: 48 | (special_codes <- c("-0004" = "Precise coding not possible", "-0006" = "Multiple jobs", "-0012" = "Blue-collar worker", "-0030" = "Student employee/assistant, work placement student, research assistant", "-0019" = "Federal volunteer service, voluntary social year (FSJ), civil service")) 49 | data(coding_index_excerpt) # toy example, the Gesamtberufsliste was used instead.
After running ?prepare_German_coding_index_Gesamtberufsliste_der_BA, our version of the coding index had 27853 entries. 50 | data(frequent_phrases) 51 | 52 | # prepare coding index for our purposes 53 | coding_index <- coding_index_excerpt[!(Berufsbenennungen \%in\% c("Bundeskanzler/in", "Bundespräsident/in", "Admiral", "General"))] # remove very rare occupations that might violate privacy regulations 54 | coding_index_w_codes <- rbind(coding_index[, list(title = bezMale, Code)], coding_index[, list(title = bezFemale, Code)]) 55 | coding_index_w_codes <- coding_index_w_codes[,title := stringPreprocessing(title)] 56 | coding_index_w_codes <- coding_index_w_codes[!duplicated(title)] 57 | 58 | # prepare the training data (special codes were harmonized in advance) 59 | training_data <- occupations[, .(answer = stringPreprocessing(orig_answer), code = orig_code)] 60 | 61 | # trick to save time: do this only once for each unique string and merge later 62 | similarityTableWordwise <- createSimilarityTableWordwiseStringdist(unique.string = unique(training_data$answer), 63 | coding_index_w_codes = coding_index_w_codes, 64 | coding_index_without_codes = occupationCoding::frequent_phrases, 65 | preprocessing = list(stopwords = NULL, stemming = NULL, strPreprocessing = TRUE, removePunct = FALSE), 66 | dist.control = list(method = "osa", weight = c(d = 1, i = 1, s = 1, t = 1)), 67 | threshold = 1) 68 | similarityTableWordwise2 <- rbind(similarityTableWordwise$dist_table_w_code[, .(intString, dictString = dictString.title)], 69 | similarityTableWordwise$dist_table_without_code[, .(intString, dictString)]) 70 | surveyCountsWordwiseSimilarity_toyExample <- merge(training_data[, .(answer, survCode = code)], similarityTableWordwise2, by.x = "answer", by.y = "intString", allow.cartesian = TRUE)[, .N, by = list(dictString, survCode)][order(dictString)] 71 | } 72 | \seealso{ 73 | See \code{\link{trainSimilarityBasedReasoning2}}, for which this data set was created, and \code{\link{surveyCountsSubstringSimilarity}}, which has been created the same way but uses a different metric to calculate string similarities. 
74 | } 75 | \keyword{datasets} 76 | -------------------------------------------------------------------------------- /man/trainCreecysMemoryBasedReasoning.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trainCreecysMemoryBasedReasoning.R 3 | \name{trainCreecysMemoryBasedReasoning} 4 | \alias{trainCreecysMemoryBasedReasoning} 5 | \title{Train Creecy's Memory-Based Reasoning model} 6 | \usage{ 7 | trainCreecysMemoryBasedReasoning( 8 | data, 9 | preprocessing = list(stopwords = character(0), stemming = NULL, strPreprocessing = 10 | TRUE, removePunct = TRUE) 11 | ) 12 | } 13 | \arguments{ 14 | \item{data}{a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}}} 15 | 16 | \item{preprocessing}{a list with elements 17 | \describe{ 18 | \item{stopwords}{a character vector, use \code{tm::stopwords("de")} for German stopwords.} 19 | \item{stemming}{\code{NULL} for no stemming and \code{"de"} for stemming using the German Porter stemmer.} 20 | \item{strPreprocessing}{\code{TRUE} if \code{\link{stringPreprocessing}} shall be used.} 21 | \item{removePunct}{\code{TRUE} if \code{\link[tm]{removePunctuation}} shall be used.} 22 | }} 23 | } 24 | \value{ 25 | a processed feature matrix to be used in \code{\link{predictCreecysMemoryBasedReasoning}} 26 | } 27 | \description{ 28 | The function does some preprocessing and calculates the importance of various features. 29 | } 30 | \examples{ 31 | # set up data 32 | data(occupations) 33 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 34 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 35 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 36 | "Not precise enough for coding", "Student assistants") 37 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 38 | 39 | # Recommended configuration (and commonly used in this package) 40 | memModel <- trainCreecysMemoryBasedReasoning(proc.occupations, 41 | preprocessing = list(stopwords = character(0), stemming = NULL, strPreprocessing = TRUE, removePunct = FALSE)) 42 | } 43 | \seealso{ 44 | \code{\link{predictCreecysMemoryBasedReasoning}} 45 | 46 | Creecy, R. H., Masand, B. M., Smith, S. J., Waltz, D. L. (1992). Trading MIPS and Memory for Knowledge Engineering. Comm. ACM 35(8). pp. 48--65.
47 | } 48 | -------------------------------------------------------------------------------- /man/trainGweonsNearestNeighbor.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trainGweonsNearestNeighbor.R 3 | \name{trainGweonsNearestNeighbor} 4 | \alias{trainGweonsNearestNeighbor} 5 | \title{Trains Gweon's Nearest Neighbor model} 6 | \usage{ 7 | trainGweonsNearestNeighbor( 8 | data, 9 | preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", 10 | strPreprocessing = FALSE, removePunct = TRUE) 11 | ) 12 | } 13 | \arguments{ 14 | \item{data}{a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}}} 15 | 16 | \item{preprocessing}{a list with elements 17 | \describe{ 18 | \item{stopwords}{a character vector, use \code{tm::stopwords("de")} for German stopwords.} 19 | \item{stemming}{\code{NULL} for no stemming and \code{"de"} for stemming using the German Porter stemmer.} 20 | \item{strPreprocessing}{\code{TRUE} if \code{\link{stringPreprocessing}} shall be used.} 21 | \item{removePunct}{\code{TRUE} if \code{\link[tm]{removePunctuation}} shall be used.} 22 | }} 23 | } 24 | \value{ 25 | a document term matrix with some additional attributes 26 | } 27 | \description{ 28 | Function does some preprocessing and creates a document term matrix to be used for the Nearest Neighbor model. 29 | } 30 | \examples{ 31 | # set up data 32 | data(occupations) 33 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 34 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 35 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 36 | "Not precise enough for coding", "Student assistants") 37 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 38 | 39 | # Recommended configuration 40 | dtmModel <- trainGweonsNearestNeighbor(proc.occupations, 41 | preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", strPreprocessing = TRUE, removePunct = FALSE)) 42 | # Configuration used by Gweon et al.
(2017) 43 | dtmModel <- trainGweonsNearestNeighbor(proc.occupations, 44 | preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", strPreprocessing = FALSE, removePunct = TRUE)) 45 | # Configuration used for most other approaches in this package 46 | dtmModel <- trainGweonsNearestNeighbor(proc.occupations, 47 | preprocessing = list(stopwords = character(0), stemming = NULL, strPreprocessing = TRUE, removePunct = FALSE)) 48 | 49 | ####################################################### 50 | ## RUN A GRID SEARCH (takes some time) 51 | \donttest{
## split sample (defines splitted.data used below)
set.seed(3451345)
n.test <- 50
group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test)))
splitted.data <- split(proc.occupations, group)
52 | # create a grid of all combinations to be tried 53 | model.grid <- data.table(expand.grid(stopwords = c(TRUE, FALSE), stemming = c(FALSE, "de"), strPreprocessing = c(TRUE, FALSE), nearest.neighbors.multiplier = c(0.05, 0.1, 0.2))) 54 | 55 | # Do grid search 56 | for (i in 1:nrow(model.grid)) { 57 | res.model <- trainGweonsNearestNeighbor(splitted.data$training, preprocessing = list(stopwords = if (model.grid[i, stopwords]) tm::stopwords("de") else character(0), 58 | stemming = if (model.grid[i, stemming == "de"]) "de" else NULL, 59 | strPreprocessing = model.grid[i, strPreprocessing], 60 | removePunct = !model.grid[i, strPreprocessing])) 61 | 62 | res.proc <- predictGweonsNearestNeighbor(res.model, splitted.data$test, 63 | tuning = list(nearest.neighbors.multiplier = model.grid[i, nearest.neighbors.multiplier])) 64 | res.proc <- expandPredictionResults(res.proc, allowed.codes = allowed.codes, method.name = "NearestNeighbor_Gweon") 65 | 66 | ac <- accuracy(calcAccurateAmongTopK(res.proc, k = 1), n = nrow(splitted.data$test)) 67 | ll <- logLoss(res.proc) 68 | sh <- sharpness(res.proc) 69 | 70 | model.grid[i, acc := ac[, acc]] 71 | model.grid[i, acc.se := ac[, se]] 72 | model.grid[i, acc.N := ac[, N]] 73 | model.grid[i, acc.prob0 := ac[, count.pred.prob0]] 74 | model.grid[i, loss.full := ll[1, logscore]] 75 | model.grid[i, loss.full.se := ll[1, se]] 76 | model.grid[i, loss.full.N := ll[1, N]] 77 | model.grid[i, loss.sub := ll[2, logscore]] 78 | model.grid[i, loss.sub.se := ll[2, se]] 79 | model.grid[i, loss.sub.N := ll[2, N]] 80 | model.grid[i, sharp := sh[, sharpness]] 81 | model.grid[i, sharp.se := sh[, se]] 82 | model.grid[i, sharp.N := sh[, N]] 83 | } 84 | 85 | model.grid[order(stopwords, stemming, strPreprocessing, nearest.neighbors.multiplier)] 86 | 87 | 88 | } 89 | } 90 | \seealso{ 91 | \code{\link{predictGweonsNearestNeighbor}} 92 | 93 | Gweon, H.; Schonlau, M., Kaczmirek, L., Blohm, M., Steiner, S. (2017). Three Methods for Occupation Coding Based on Statistical Learning. Journal of Official Statistics 33(1), pp.
101--122 94 | } 95 | -------------------------------------------------------------------------------- /man/trainLogisticRegressionWithPenalization.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trainLogisticRegressionWithPenalization.R 3 | \name{trainLogisticRegressionWithPenalization} 4 | \alias{trainLogisticRegressionWithPenalization} 5 | \title{Train a logistic regression model with penalization} 6 | \usage{ 7 | trainLogisticRegressionWithPenalization( 8 | data, 9 | preprocessing = list(stopwords = character(0), stemming = NULL, countWords = FALSE), 10 | tuning = list(alpha = 0.05, maxit = 10^5, nlambda = 100, thresh = 1e-07) 11 | ) 12 | } 13 | \arguments{ 14 | \item{data}{a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}}} 15 | 16 | \item{preprocessing}{a list with elements 17 | \describe{ 18 | \item{stopwords}{a character vector, use \code{tm::stopwords("de")} for German stopwords.} 19 | \item{stemming}{\code{NULL} for no stemming and \code{"de"} for stemming using the German porter stemmer.} 20 | \item{countWords}{Set to TRUE if the predictor matrix should contain a column for answer length.} 21 | }} 22 | 23 | \item{tuning}{a list with elements that will be passed to \code{\link[glmnet]{glmnet}}} 24 | } 25 | \value{ 26 | a logistic regression model. Commands from \code{\link[glmnet]{glmnet}} should work. 27 | } 28 | \description{ 29 | Function does some preprocessing and calls glmnet for a logistic regression model 30 | } 31 | \details{ 32 | Setting \code{tuning$alpha = 0} (Ridge Penalty) seems to be most stable. 33 | 34 | In our experience, \code{glmnet} often returns a warning like \code{3: from glmnet Fortran code (error code -72); Convergence for 72th lambda value not reached after maxit=100000 iterations; solutions for larger lambdas returned}. To solve this issue, we can increase \code{maxit} to try more iterations or we can decrease the threshold \code{thresh}. 
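A minimal sketch of reacting to such a warning (illustrative values; \code{alpha = 0} follows the stability advice above):

model <- trainLogisticRegressionWithPenalization(proc.occupations,
  preprocessing = list(stopwords = character(0), stemming = "de", countWords = FALSE),
  tuning = list(alpha = 0, maxit = 10^7, nlambda = 100, thresh = 1e-7))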
35 | } 36 | \examples{ 37 | # set up data 38 | data(occupations) 39 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 40 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 41 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 42 | "Not precise enough for coding", "Student assistants") 43 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 44 | 45 | # Recommended configuration 46 | trainLogisticRegressionWithPenalization(proc.occupations, 47 | preprocessing = list(stopwords = character(0), stemming = "de", countWords = FALSE), 48 | tuning = list(alpha = 0.05, maxit = 10^6, nlambda = 100, thresh = 1e-7)) 49 | 50 | # Other possibility 51 | trainLogisticRegressionWithPenalization(proc.occupations, 52 | preprocessing = list(stopwords = tm::stopwords("de"), stemming = NULL, countWords = TRUE), 53 | tuning = list(alpha = 0.05, maxit = 10^6, nlambda = 100, thresh = 1e-7)) 54 | } 55 | \seealso{ 56 | \code{\link{predictLogisticRegressionWithPenalization}}, \code{\link[glmnet]{glmnet}} 57 | } 58 | --------------------------------------------------------------------------------