├── DESCRIPTION
├── LICENSE
├── NAMESPACE
├── R
│   ├── accuracy.R
│   ├── asDocumentTermMatrix.R
│   ├── calcAccurateAmongTopK.R
│   ├── cosineSimilarity.R
│   ├── createDescriptives.R
│   ├── createSimilarityTableStringdist.R
│   ├── createSimilarityTableSubstring.R
│   ├── createSimilarityTableWordwiseStringdist.R
│   ├── data.R
│   ├── expandPredictionResults.R
│   ├── logLoss.R
│   ├── plotAgreementRateVsProductionRate.R
│   ├── plotReliabilityDiagram.R
│   ├── plotTruePredictionsVsFalsePredictions.R
│   ├── predictCreecysMemoryBasedReasoning.R
│   ├── predictGweonsNearestNeighbor.R
│   ├── predictLogisticRegressionWithPenalization.R
│   ├── predictSimilarityBasedReasoning.R
│   ├── predictWithCodingIndex.R
│   ├── predictXgboost.R
│   ├── prepare_German_coding_index_Gesamtberufsliste_der_BA.R
│   ├── produceResults.R
│   ├── removeFaultyAndUncodableAnswers_And_PrepareForAnalysis.R
│   ├── selectMaxProbMethod.R
│   ├── sharpness.R
│   ├── stringPreprocessing.R
│   ├── trainCreecysMemoryBasedReasoning.R
│   ├── trainGweonsNearestNeighbor.R
│   ├── trainLogisticRegressionWithPenalization.R
│   ├── trainSimilarityBasedReasoning.R
│   ├── trainSimilarityBasedReasoning2.R
│   └── trainXgboost.R
├── README.md
├── data
│   ├── coding_index_excerpt.RData
│   ├── frequent_phrases.RData
│   ├── kldb2010PlusFive.RData
│   ├── occupations.RData
│   ├── surveyCountsSubstringSimilarity.RData
│   └── surveyCountsWordwiseSimilarity.RData
└── man
    ├── accuracy.Rd
    ├── asDocumentTermMatrix.Rd
    ├── calcAccurateAmongTopK.Rd
    ├── coding_index_excerpt.Rd
    ├── cosineSimilarity.Rd
    ├── createDescriptives.Rd
    ├── createSimilarityTableStringdist.Rd
    ├── createSimilarityTableSubstring.Rd
    ├── createSimilarityTableWordwiseStringdist.Rd
    ├── expandPredictionResults.Rd
    ├── frequent_phrases.Rd
    ├── kldb2010PlusFive.Rd
    ├── logLoss.Rd
    ├── occupations.Rd
    ├── plotAgreementRateVsProductionRate.Rd
    ├── plotReliabilityDiagram.Rd
    ├── plotTruePredictionsVsFalsePredictions.Rd
    ├── predictCreecysMemoryBasedReasoning.Rd
    ├── predictGweonsNearestNeighbor.Rd
    ├── predictLogisticRegressionWithPenalization.Rd
    ├── predictSimilarityBasedReasoning.Rd
    ├── predictWithCodingIndex.Rd
    ├── predictXgboost.Rd
    ├── prepare_German_coding_index_Gesamtberufsliste_der_BA.Rd
    ├── produceResults.Rd
    ├── removeFaultyAndUncodableAnswers_And_PrepareForAnalysis.Rd
    ├── selectMaxProbMethod.Rd
    ├── sharpness.Rd
    ├── stringPreprocessing.Rd
    ├── surveyCountsSubstringSimilarity.Rd
    ├── surveyCountsWordwiseSimilarity.Rd
    ├── trainCreecysMemoryBasedReasoning.Rd
    ├── trainGweonsNearestNeighbor.Rd
    ├── trainLogisticRegressionWithPenalization.Rd
    ├── trainSimilarityBasedReasoning.Rd
    ├── trainSimilarityBasedReasoning2.Rd
    └── trainXgboost.Rd
/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: occupationCoding 2 | Type: Package 3 | Title: Supervised Learning for Occupation Coding 4 | Version: 0.1.0 5 | Author: Malte Schierholz 6 | Maintainer: Malte Schierholz 7 | Description: Occupation Coding refers to coding short verbal texts (survey answers) 8 | into an occupational classification. This package implements several supervised learning 9 | techniques to simplify its automation. More details can be found in an upcoming publication. 10 | See predict-functions for some examples of what this package can do. 
11 | Depends: R (>= 3.5.0) 12 | License: GPL-3 13 | RoxygenNote: 7.1.1 14 | Encoding: UTF-8 15 | LazyData: true 16 | Imports: 17 | text2vec (== 0.4.0), 18 | tm (== 0.6-2), 19 | stringr, 20 | ggplot2, 21 | data.table 22 | Suggests: 23 | glmnet, 24 | xgboost, 25 | SnowballC, 26 | mvtnorm, 27 | stringdist, 28 | Matrix, 29 | parallel, 30 | readxl 31 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(accuracy) 4 | export(asDocumentTermMatrix) 5 | export(calcAccurateAmongTopK) 6 | export(cosineSimilarity) 7 | export(createDescriptives) 8 | export(createSimilarityTableStringdist) 9 | export(createSimilarityTableSubstring) 10 | export(createSimilarityTableWordwiseStringdist) 11 | export(expandPredictionResults) 12 | export(logLoss) 13 | export(plotAgreementRateVsProductionRate) 14 | export(plotReliabilityDiagram) 15 | export(plotTruePredictionsVsFalsePredictions) 16 | export(predictCreecysMemoryBasedReasoning) 17 | export(predictGweonsNearestNeighbor) 18 | export(predictLogisticRegressionWithPenalization) 19 | export(predictSimilarityBasedReasoning) 20 | export(predictWithCodingIndex) 21 | export(predictXgboost) 22 | export(prepare_German_coding_index_Gesamtberufsliste_der_BA) 23 | export(produceResults) 24 | export(removeFaultyAndUncodableAnswers_And_PrepareForAnalysis) 25 | export(selectMaxProbMethod) 26 | export(sharpness) 27 | export(stringPreprocessing) 28 | export(trainCreecysMemoryBasedReasoning) 29 | export(trainGweonsNearestNeighbor) 30 | export(trainLogisticRegressionWithPenalization) 31 | export(trainSimilarityBasedReasoning) 32 | export(trainSimilarityBasedReasoning2) 33 | export(trainXgboost) 34 | import(data.table) 35 | import(ggplot2) 36 | import(text2vec) 37 | -------------------------------------------------------------------------------- /R/accuracy.R: -------------------------------------------------------------------------------- 1 | #' Accuracy 2 | #' 3 | #' Calculate accuracy \eqn{p = \frac{1}{n} \sum acc} and standard errors \eqn{\sqrt{\frac{1}{n} p (1-p)}}. 4 | #' 5 | #' Note that this function also works if \code{occupationalPredictionsAmongTopK} contains fewer than \code{n} individuals. 6 | #' 7 | #' @param occupationalPredictionsAmongTopK a data table created with \code{\link{calcAccurateAmongTopK}}. 
8 | #' @param n Number of unique persons in test data (may be larger than the number of persons in occupationalPredictionsAmongTopK if for some persons no codes were suggested) 9 | #' 10 | #' @return a data.table with columns N (number of unique IDs in occupationalPredictionsAmongTopK), acc (\code{sum(acc) / n}), se (standard error of acc), mean.in.subset.N (\code{mean(acc)}), and count.pred.prob0 (\code{sum(pred.prob == 0)}) 11 | #' @import data.table 12 | #' @export 13 | #' 14 | #' @examples 15 | #' # set up data 16 | #' data(occupations) 17 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 18 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 19 | #' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 20 | #' "Not precise enough for coding", "Student assistants") 21 | #' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 22 | #' 23 | #' ## split sample 24 | #' set.seed(3451345) 25 | #' n.test <- 50 26 | #' group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 27 | #' splitted.data <- split(proc.occupations, group) 28 | #' 29 | #' # train model and make predictions 30 | #' model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 31 | #' res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 32 | #' 33 | #' # expand to contain more categories than the initial ones 34 | #' res.proc1 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1") 35 | #' 36 | #' # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once. 
37 | #' res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2") 38 | #' res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete") 39 | #' 40 | #' calcAccurateAmongTopK(res.proc, k = 5)[,mean(acc), by = method.name] 41 | #' accuracy(calcAccurateAmongTopK(res.proc, k = 5), n = 50) 42 | #' accuracy(calcAccurateAmongTopK(res.proc, k = 1), n = 50) 43 | accuracy <- function(occupationalPredictionsAmongTopK, n) { 44 | 45 | occupationalPredictionsAmongTopK[, list(.N, acc = sum(acc) / n, se = sqrt(sum(acc) * (n - sum(acc)) / n^3), mean.in.subset.N = mean(acc), count.pred.prob0 = sum(pred.prob == 0)), by = method.name] # se = sqrt(1/n * p * (1-p)) = sqrt(n.true * n.false / n^3) 46 | 47 | } 48 | -------------------------------------------------------------------------------- /R/asDocumentTermMatrix.R: -------------------------------------------------------------------------------- 1 | #' Document-Term Matrix 2 | #' 3 | #' Constructs a document-term matrix. 4 | #' 5 | #' @param input a character vector. 6 | #' @param vect.vocab a vocabulary created with \code{\link[text2vec:vectorizers]{vocab_vectorizer}}. If \code{NULL}, the vocabulary is created from the input. See example for a typical use case. 7 | #' @param stopwords character vector of stopwords to exclude when creating the vocabulary. \code{tm::stopwords("de")} provides German stopwords. 8 | #' @param stemming \code{NULL} for no stemming and \code{"de"} for stemming using the German porter stemmer. 9 | #' @param type character, one of c("dgCMatrix", "dgTMatrix", "lda_c") taken from \code{\link[text2vec]{create_dtm}}. \code{dgCMatrix} are useful for glmnet; \code{dgTMatrix} matrix refers to sparse matrices in triplet form, i.e. positions of all non-zero values are stored (easier to work with, but non-unique). 10 | #' 11 | #' @return A list with two elements 12 | #' \describe{ 13 | #' \item{dtm}{a sparse document-term-matrix, depending on the \code{type}-parameter} 14 | #' \item{vect.vocab}{a vocabulary that can be inserted as \code{vect.vocab} to build a document term matrix on new data with the same vocabulary.} 15 | #' } 16 | #' 17 | #' @seealso 18 | #' \url{http://text2vec.org/vectorization.html} for details on the implementation used here, 19 | #' another implementation \code{\link[tm]{TermDocumentMatrix}} is slower 20 | #' 21 | #' @export 22 | #' 23 | #' @examples 24 | #' x <- c("Verkauf von Schreibwaren", "Verkauf", "Schreibwaren") 25 | #' asDocumentTermMatrix(x) 26 | #' asDocumentTermMatrix(x, type = "dgTMatrix") 27 | #' asDocumentTermMatrix(x, stopwords = tm::stopwords("de")) 28 | #' 29 | #' (x <- c("Verkauf von B\xfcchern, Schreibwaren", "Fach\xe4rzin f\xfcr Kinder- und Jugendmedizin im \xf6ffentlichen Gesundheitswesen", "Industriemechaniker", "Dipl.-Ing. 
- Agrarwirtschaft (Landwirtschaft)")) 30 | #' x <- stringPreprocessing(x) 31 | #' dtm <- asDocumentTermMatrix(x, stemming = "de") 32 | #' print(dtm$dtm) 33 | #' dimnames(dtm$dtm)[[2]] 34 | #' 35 | #' # use the newly created vocab_vectorizer 36 | #'(x <- stringPreprocessing(c("WILL NOT SHOW UP", "Verkauf von B\xfcchern, Schreibwaren", "Fach\xe4rzin f\xfcr Kinder- und Jugendmedizin"))) 37 | #' asDocumentTermMatrix(x, vect.vocab = dtm$vect.vocab, stopwords = character(0), stemming = "de")$dtm 38 | asDocumentTermMatrix <- function(input, vect.vocab = NULL, 39 | stopwords = character(0), 40 | stemming = NULL, 41 | type = c("dgCMatrix", "dgTMatrix", "lda_c")) { 42 | 43 | # tokenize with or without stemming? 44 | if (!is.null(stemming)) { 45 | if (!requireNamespace("SnowballC", quietly = TRUE)) { 46 | stop("Package \"SnowballC\" needed if you want to do stemming. Please install it.", 47 | call. = FALSE) 48 | } 49 | tok_fun <-function(x) { 50 | tokens <- text2vec::word_tokenizer(x) 51 | lapply(tokens, SnowballC::wordStem, language = stemming) 52 | } 53 | } else { 54 | tok_fun <- text2vec::word_tokenizer 55 | } 56 | 57 | # prep_fun = toupper 58 | it_train = text2vec::itoken(tolower(input), 59 | # preprocessor = prep_fun, 60 | tokenizer = tok_fun, 61 | progressbar = FALSE) 62 | if (is.null(vect.vocab)) { 63 | vocab = text2vec::create_vocabulary(it_train, stopwords = tolower(stopwords)) 64 | vect.vocab = text2vec::vocab_vectorizer(vocab) 65 | } 66 | input.dtm = text2vec::create_dtm(it_train, vect.vocab, type = type) 67 | return(list(dtm = input.dtm, vect.vocab = vect.vocab)) 68 | } 69 | -------------------------------------------------------------------------------- /R/calcAccurateAmongTopK.R: -------------------------------------------------------------------------------- 1 | #' Calculate aggregate properties for top k predicted categories 2 | #' 3 | #' Start with a data.table of class 'occupationalPredictions' (for each combination of pred.code and answer one prediction) and calulate if one of the top k entries is accurate. 4 | #' 5 | #' \code{num.suggested} and \code{general.among.top5} is currently not used. Relates to situations if the prediction algorithm does not provide all codes. 6 | #' 7 | #' @param occupationalPredictions a data.table created with a \code{\link{expandPredictionResults}}-function from this package. 8 | #' @param k how many top k categories to aggregate over? 9 | #' 10 | #' @seealso ... 
11 | #' 12 | #' @return a data.table 13 | #' @import data.table 14 | #' @export 15 | #' 16 | #' @examples 17 | #' # set up data 18 | #' data(occupations) 19 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 20 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 21 | #' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 22 | #' "Not precise enough for coding", "Student assistants") 23 | #' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 24 | #' 25 | #' ## split sample 26 | #' set.seed(3451345) 27 | #' n.test <- 50 28 | #' group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 29 | #' splitted.data <- split(proc.occupations, group) 30 | #' 31 | #' # train model and make predictions 32 | #' model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 33 | #' res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 34 | #' 35 | #' # expand to contain more categories than the initial ones 36 | #' res.proc1 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1") 37 | #' 38 | #' # aggregate over top k categories 39 | #' calcAccurateAmongTopK(res.proc1, k = 1)[,mean(acc)] 40 | #' calcAccurateAmongTopK(res.proc1, k = 5)[,mean(acc)] 41 | #' 42 | #' # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once. 
43 | #' res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2") 44 | #' res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete") 45 | #' 46 | #' calcAccurateAmongTopK(res.proc, k = 5)[,mean(acc), by = method.name] 47 | #' # res[, calcAccurateAmongTopK(.SD, k = 5), by = method][,mean(acc), by = method] 48 | calcAccurateAmongTopK <- function(occupationalPredictions, k = 1) { 49 | 50 | if (!("occupationalPredictionsComplete" %in% class(occupationalPredictions))){ 51 | stop("'occupationalPredictions' needs to have class 'occupationalPredictionsComplete' (= constructed with an expandPredictionResults method).") 52 | } 53 | 54 | # determine if a pred.code is accurate (acc) and get the (at most) k most probable pred.codes per id 55 | # only codes if they were probable in the first place (among.suggested.code == TRUE) 56 | occupationalPredictions2 <- occupationalPredictions[among.suggested.code == TRUE, head(.SD[order(sample(1:.N, .N))][order(pred.prob, decreasing = TRUE), list(ans, pred.code, pred.prob, code, acc = code == pred.code, among.suggested.code)], k), by = list(id, method.name)] 57 | 58 | if (k == 1) { 59 | randomChoices <- merge(occupationalPredictions, occupationalPredictions2, by = c("pred.prob", "id", "method.name"))[, .N, by = list(method.name, id)][N > 2] 60 | 61 | if (nrow(randomChoices) > 0) { 62 | warning("Some ids have more than one most probable code (see above). Random choice made (or lowest pred.code selected?)") 63 | print(randomChoices) 64 | } 65 | } 66 | 67 | # calculate key statistics (pred.code: set of predicted codes among top k, pred.prob: sum of probabilities, acc: number of correct predictions, num.suggested: number (1-k), general.among.top5: indicator for any non-included code), 68 | return(occupationalPredictions2[, list(pred.code = paste(pred.code, collapse = ", "), pred.prob = sum(pred.prob), acc = sum(acc), num.suggested.codes = sum(!among.suggested.code), num.suggested = .N, general.among.top5 = "-9999" %in% pred.code), by = list(id, ans, code, method.name)]) 69 | } 70 | -------------------------------------------------------------------------------- /R/cosineSimilarity.R: -------------------------------------------------------------------------------- 1 | #' Cosine Similarity 2 | #' 3 | #' Calculate cosine similarity between every row in \code{matrix1} and every row in \code{matrix2}. 4 | #' 5 | #' Cosine similarity is a measure of similarity between two vectors \eqn{x} and \eqn{y} that measures the cosine of the angle between them. Since we consider positive vectors, its maximal value is 1 if both vectors are identical and its minimal value is 0 if \eqn{x \times y = 0}. 6 | #' 7 | #' The definition is: \eqn{similarity = (x \times y) / (||x|| \times ||y||) = (\sum_i x_i \times y_i) / (\sqrt{(\sum_i x_i^2)} \times \sqrt{(\sum_i y_i^2)})} 8 | #' 9 | #' @param matrix1 a matrix of type \code{dgCMatrix}. 10 | #' @param matrix2 a matrix of type \code{dgCMatrix}. 11 | #' 12 | #' @seealso \code{\link[Matrix]{Matrix}} 13 | #' 14 | #' @return A \code{dgCMatrix} where element \code{A[index1, index2]} is the cosine similarity between \code{matrix1[index1,]} and \code{matrix2[index2,]}. 
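# [Editor's aside, not package source] A quick numeric check of the cosine-similarity
# formula quoted above, on two made-up dense vectors.
x <- c(1, 0, 2)
y <- c(1, 1, 1)
sum(x * y) / (sqrt(sum(x^2)) * sqrt(sum(y^2)))  # 3 / sqrt(15) = 0.7745967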
15 | #' @export 16 | #' 17 | #' @examples 18 | #' x <- c("Verkauf von Schreibwaren", "Verkauf", "Schreibwaren", "Industriemechaniker", "NOTINDOCUMENTTERMMATRIX") 19 | #' (y <- c("Verkauf von B\xfcchern, Schreibwaren", "Fach\xe4rzin f\xfcr Kinder- und Jugendmedizin im \xf6ffentlichen Gesundheitswesen", "Industriemechaniker", "Dipl.-Ing. - Agrarwirtschaft (Landwirtschaft)")) 20 | #' 21 | #' tok_fun = text2vec::word_tokenizer 22 | #' it_train = text2vec::itoken(tolower(y), tokenizer = tok_fun, progressbar = FALSE) 23 | #' vocab = text2vec::create_vocabulary(it_train) 24 | #' vect.vocab = text2vec::vocab_vectorizer(vocab) 25 | #' 26 | #' matrix1 <- asDocumentTermMatrix(x, vect.vocab = vect.vocab)$dtm 27 | #' matrix2 <- asDocumentTermMatrix(y, vect.vocab = vect.vocab)$dtm 28 | #' 29 | #' cosineSimilarity(matrix1, matrix1) 30 | #' cosineSimilarity(matrix1, matrix2) 31 | cosineSimilarity <- function(matrix1, matrix2) { 32 | 33 | if (!inherits(matrix1, "dgCMatrix")) stop("matrix1 needs to have class dgCMatrix") 34 | if (!inherits(matrix2, "dgCMatrix")) stop("matrix2 needs to have class dgCMatrix") 35 | 36 | if (matrix1@Dim[2] != matrix2@Dim[2]) stop("matrix1 and matrix2 need to have identical numbers of columns.") 37 | 38 | similarity <- Matrix::tcrossprod(matrix1, matrix2) / outer(sqrt(Matrix::rowSums(matrix1^2)), sqrt(Matrix::rowSums(matrix2^2))) # row norms are square roots of summed squares; rowSums() without squaring is only equivalent for 0/1 matrices 39 | similarity@x[is.na(similarity@x)] <- 0 # if a row in either matrix1 or matrix2 is all zero, the denominator is 0 and similarity is NA -> replace with zero. 40 | 41 | return(similarity) 42 | } 43 | -------------------------------------------------------------------------------- /R/createDescriptives.R: -------------------------------------------------------------------------------- 1 | #' Describe occupational data 2 | #' 3 | #' Outputs the following descriptives: frequencies, mean, median, and max of the number of words; the 
number of codes used; and frequencies of special codes. 4 | #' 5 | #' @param data either a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}} or a character vector 6 | #' @param noninformative a character vector: if an answer is in \code{noninformative}, it is excluded 7 | #' 8 | #' @return Side effects only: a plot and text 9 | #' @import data.table 10 | #' @export 11 | #' 12 | #' @examples 13 | #' data <- data.table(answers = c("LEITER VERTRIEB", "Kfz-Schlossermeister", "Aushilfe im Hotel(Bereich Housekeeping)", "Studentische Hilfskraft"), 14 | #' codes = c("61194", "25213", "63221", "-0001")) 15 | #' (allowed.codes <- c("11101", "61194", "25213", "63221", "-0001")) 16 | #' (allowed.codes.titles <- c("Berufe in der Landwirtschaft (ohne Spezialisierung) - Helfer-/Anlernt\xe4tigkeiten", "Berufe in der Kraftfahrzeugtechnik - komplexe Spezialistent\xe4tigkeiten", "F\xfchrungskr\xe4fte - Einkauf und Vertrieb", "Berufe im Hotelservice - Helfer-/Anlernt\xe4tigkeiten", "a negative label (used for categories like 'student assistant' that are not in the official classification)")) 17 | #' data <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(data, colNames = c("answers", "codes"), allowed.codes, allowed.codes.titles) 18 | #' createDescriptives(data) 19 | #' 20 | #' (answer <- c("LEITER VERTRIEB", "Kfz-Schlossermeister", "Aushilfe im Hotel(Bereich Housekeeping)")) 21 | #' createDescriptives(answer) 22 | #' 23 | #' data(occupations) 24 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 25 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", 26 | #' "Gastronomy occupations (without specialisation)-skilled tasks", "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 27 | #' "Not precise enough for coding", "Student assistants") 28 | #' occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 29 | #' createDescriptives(occupations) 30 | createDescriptives <- function (data, noninformative = c("")) { 31 | if ("occupationData" %in% class(data)) { 32 | ans <- data[, gsub(" {2,}", " ", ans)] # collapse 2+ consecutive spaces into one 33 | } 34 | if (is.character(data)) { 35 | ans <- gsub(" {2,}", " ", data) 36 | } 37 | 38 | ans <- ans[!(ans %in% noninformative)] 39 | cat("Number of informative answers:", length(ans), "\n") 40 | 41 | # answer length 42 | ans_freq <- sapply(strsplit(ans, " "), length) 43 | plot(table(ans_freq)) 44 | cat("Frequency of Number of Words:\n") 45 | print(table(ans_freq)) 46 | cat("\n") 47 | cat("Mean Number of Words: ", mean(ans_freq), "\n") 48 | cat("Median Number of Words: ", median(ans_freq), "\n") 49 | cat("Maximal Number of Words: ", max(ans_freq), "\n") 50 | 51 | if (is.data.table(data)) { 52 | cat("Number of categories (with code > 0) from classification used: ", data[code > 0 & code %in% 
attr(data, "classification")[, code], .N, by = code][, .N], "\n") 53 | cat("Frequency of categories (wih code < 0) from classification:\n") 54 | print(data[code < 0 & code %in% attr(data, "classification")[, code], table(code)]) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /R/createSimilarityTableStringdist.R: -------------------------------------------------------------------------------- 1 | #' Similarity Table with Coding index 2 | #' 3 | #' Calculate string similarity between \code{unique.string} and \code{(coding_index_w_codes, coding_index_without_codes)}. 4 | #' 5 | #' Special function for similarity-based reasoning: creates distance data with osa-method c(d = 1, i = 1, s = 1, t = 1) 6 | #' dist == 0: strings in dict and data are identical 7 | # dist == 1: exactly one deletion, insertion, substitution, or transposition 8 | # dist == 2: exactly two deletion, insertion, substitution, or transposition, the majority is a mismatch with dist == 2 9 | #' 10 | #' @param unique.string a character vector (usually unique(answer)) 11 | #' @param coding_index_w_codes a data.table with columns "title" and "Code". 12 | #' @param coding_index_without_codes a character vector of additional titles 13 | #' @param dist.control a list that will be passed to \code{\link[stringdist:stringdist]{stringdistmatrix}}. Currently only two elements are implemented: 14 | #' \describe{ 15 | #' \item{method}{Method for distance calculation.} 16 | #' \item{weight}{For method='osa' or 'dl'.} 17 | #' } 18 | #' @param threshold All entries with distance above this threshold will be removed from the result 19 | #' 20 | #' @seealso \code{\link{trainSimilarityBasedReasoning}}, \code{\link{createSimilarityTableWordwiseStringdist}}, \code{\link{createSimilarityTableSubstring}} 21 | #' 22 | #' @return a list with elements 23 | #' \describe{ 24 | #' \item{dist_table_w_code}{a data.table with colummns \code{intString}, \code{dictString.title}, \code{dictString.Code}, \code{dist}} 25 | #' \item{dist_table_without_code}{\code{NULL} or a data.table with colummns \code{intString}, \code{dictString}, \code{dist}} 26 | #' \item{vect_vocab}{see \code{link{asDocumentTermMatrix}}} 27 | #' } 28 | #' 29 | #' @export 30 | #' @examples 31 | #' ## Prepare coding index 32 | #' # write female titles beneath the male title 33 | #' coding_index <- rbind(coding_index_excerpt[, list(title = bezMale, Code)], 34 | #' coding_index_excerpt[, list(title = bezFemale, Code)]) 35 | #' # standardize titles from the coding index 36 | #' coding_index <- coding_index[,title := stringPreprocessing(title)] 37 | #' # drop duplicate lines, might be suboptimal because we keep each title and its associated code only a single time. This means we delete duplicates and the associated, possibly relevant codes. 38 | #' coding_index <- coding_index[!duplicated(title)] 39 | #' 40 | #' (x <- c("Abgeordneter", "Abgeordneter", "Abgeordnete", "abgeordnet", "FSJ", "FSJ2", "Industriemechaniker", "Dipl.-Ing. 
- Agrarwirtschaft (Landwirtschaft)")) 41 | #' createSimilarityTableStringdist(unique.string = stringPreprocessing(x), 42 | #' coding_index_w_codes = coding_index, 43 | #' coding_index_without_codes = frequent_phrases, 44 | #' dist.control = list(method = "osa", weight = c(d = 1, i = 1, s = 1, t = 1)), 45 | #' threshold = 3) 46 | createSimilarityTableStringdist <- function(unique.string, 47 | coding_index_w_codes, 48 | coding_index_without_codes, 49 | dist.control = list(method = "osa", weight = c(d = 1, i = 1, s = 1, t = 1)), 50 | threshold = 3) { 51 | 52 | distmat <- stringdist::stringdistmatrix(coding_index_w_codes[, title], unique.string, method = dist.control$method, weight = dist.control$weight) 53 | dist_table_w_code <- data.table(intString = unique.string[which(distmat <= threshold, arr.ind = TRUE)[, 2]], dictString = coding_index_w_codes[, list(title, Code)][which(distmat <= threshold, arr.ind = TRUE)[, 1]], dist = distmat[which(distmat <= threshold, arr.ind = TRUE)]) 54 | 55 | dist_table_without_code <- NULL 56 | # do the same for coding_index_without_codes 57 | if (length(coding_index_without_codes) > 2) { 58 | distmat <- stringdist::stringdistmatrix(coding_index_without_codes, unique.string, method = dist.control$method, weight = dist.control$weight) 59 | dist_table_without_code <- data.table(intString = unique.string[which(distmat <= threshold, arr.ind = TRUE)[, 2]], dictString = coding_index_without_codes[which(distmat <= threshold, arr.ind = TRUE)[, 1]], dist = distmat[which(distmat <= threshold, arr.ind = TRUE)]) 60 | } 61 | 62 | return(list(dist_table_w_code = dist_table_w_code, dist_table_without_code = dist_table_without_code, vect_vocab = NULL)) 63 | 64 | } 65 | -------------------------------------------------------------------------------- /R/createSimilarityTableSubstring.R: -------------------------------------------------------------------------------- 1 | #' Similarity Table with Coding index 2 | #' 3 | #' Calculate SUBSTRING similarity between \code{unique.string} and \code{(coding_index_w_codes, coding_index_without_codes)}. unique.string and coding_index are similar if coding_index is a substring of unique.string. 4 | #' 5 | #' Special function for similarity-based reasoning: creates distance data with substring-method 6 | #' 7 | #' @param unique.string a character vector (usually unique(answer)) 8 | #' @param coding_index_w_codes a data.table with columns "title" and "Code". 
9 | #' @param coding_index_without_codes a character vector of additional titles 10 | #' 11 | #' @seealso \code{\link{trainSimilarityBasedReasoning}}, \code{\link{createSimilarityTableWordwiseStringdist}}, \code{\link{createSimilarityTableStringdist}} 12 | #' 13 | #' @return a list with elements 14 | #' \describe{ 15 | #' \item{dist_table_w_code}{a data.table with columns \code{intString}, \code{dictString.title}, \code{dictString.Code}, \code{dist}} 16 | #' \item{dist_table_without_code}{\code{NULL} or a data.table with columns \code{intString}, \code{dictString}, \code{dist}} 17 | #' \item{vect_vocab}{see \code{\link{asDocumentTermMatrix}}} 18 | #' } 19 | #' 20 | #' @export 21 | #' 22 | #' @examples 23 | #' ## Prepare coding index 24 | #' # write female titles beneath the male title 25 | #' coding_index <- rbind(coding_index_excerpt[, list(title = bezMale, Code)], 26 | #' coding_index_excerpt[, list(title = bezFemale, Code)]) 27 | #' # standardize titles from the coding index 28 | #' coding_index <- coding_index[,title := stringPreprocessing(title)] 29 | #' # drop duplicate lines, might be suboptimal because we keep each title and its associated code only a single time. This means we delete duplicates and the associated, possibly relevant codes. 30 | #' coding_index <- coding_index[!duplicated(title)] 31 | #' 32 | #' (x <- c("Abgeordneter", "Abgeordneter", "Abgeordnete", "abgeordnet", "FSJ", "FSJ2", "Industriemechaniker", "Dipl.-Ing. - Agrarwirtschaft (Landwirtschaft)")) 33 | #' createSimilarityTableSubstring(unique.string = stringPreprocessing(x), 34 | #' coding_index_w_codes = coding_index, 35 | #' coding_index_without_codes = frequent_phrases) 36 | createSimilarityTableSubstring <- function(unique.string, 37 | coding_index_w_codes, 38 | coding_index_without_codes) { 39 | 40 | 41 | # create distance data where the dictionary string is a substring of the survey string 42 | ##################################################### 43 | # find all indices of survey strings where coding_index[i,] is a substring 44 | distmat.ind <- lapply(1:nrow(coding_index_w_codes), FUN = function(i) grep(coding_index_w_codes[i, title], unique.string)) 45 | # bring them into data.table format 46 | length.ind <- sapply(distmat.ind, length) 47 | distmat.ind <- data.table(dict = rep(1:nrow(coding_index_w_codes), times = length.ind), u.string = unlist(distmat.ind)) 48 | # get strings instead of indices 49 | dist_table_w_code <- data.table(intString = unique.string[distmat.ind[, u.string]], dictString = coding_index_w_codes[, list(title, Code)][distmat.ind[, dict]], dist = 0L) 50 | 51 | dist_table_without_code <- NULL 52 | # do the same for coding_index_without_codes 53 | if (length(coding_index_without_codes) > 2) { 54 | distmat.ind <- lapply(1:length(coding_index_without_codes), FUN = function(i) grep(coding_index_without_codes[i], unique.string)) 55 | # bring them into data.table format 56 | length.ind <- sapply(distmat.ind, length) 57 | distmat.ind <- data.table(dict = rep(1:length(coding_index_without_codes), times = length.ind), u.string = unlist(distmat.ind)) 58 | # get strings instead of indices 59 | dist_table_without_code <- data.table(intString = unique.string[distmat.ind[, u.string]], dictString = coding_index_without_codes[distmat.ind[, dict]], dist = 0L) 60 | } 61 | 62 | return(list(dist_table_w_code = dist_table_w_code, dist_table_without_code = dist_table_without_code, vect_vocab = NULL)) 63 | } 64 | 
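# [Editor's aside, not package source] The substring logic above reduces to grep():
# a coding-index title matches an answer whenever it occurs verbatim inside it.
# 'titles' and 'answers' are invented illustrations, not data from the package.
titles  <- c("KOCH", "MECHANIKER")
answers <- c("BEIKOCH", "INDUSTRIEMECHANIKER", "LEHRER")
lapply(titles, function(title) grep(title, answers))  # KOCH matches answer 1, MECHANIKER matches answer 2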
-------------------------------------------------------------------------------- /R/createSimilarityTableWordwiseStringdist.R: -------------------------------------------------------------------------------- 1 | #' Wordwise Similarity Table with Coding index 2 | #' 3 | #' Calculate string similarity on a word basis between \code{unique.string} and \code{(coding_index_w_codes, coding_index_without_codes)}. 4 | #' 5 | #' Special function for similarity-based reasoning: creates WORDWISE distance data with osa-method c(d = 1, i = 1, s = 1, t = 1) 6 | #' --> allows correcting one letter in a single word and matching this word with the dictionary. 7 | #' This means: unique.string is split into words, and the word with the lowest osa-distance is used for matching (all such words in case of a tie) 8 | #' example: 9 | #' "KUESTER and HAUSMEISTER" has distance 0 to both dictString.title HAUSMEISTER and KUESTER. Because the word HAUSMEISTER has minimal distance, another dictString.title HAUMEISTER, which has dist = 1, is not included. 10 | #' 11 | #' @param unique.string a character vector (usually unique(answer)) 12 | #' @param coding_index_w_codes a data.table with columns "title" and "Code". 13 | #' @param coding_index_without_codes a character vector of additional titles 14 | #' @param preprocessing a list with elements 15 | #' \describe{ 16 | #' \item{stopwords}{a character vector, use \code{tm::stopwords("de")} for German stopwords.} 17 | #' \item{stemming}{\code{NULL} for no stemming and \code{"de"} for stemming using the German porter stemmer.} 18 | #' \item{strPreprocessing}{\code{TRUE} if \code{\link{stringPreprocessing}} shall be used.} 19 | #' \item{removePunct}{\code{TRUE} if \code{\link[tm]{removePunctuation}} shall be used.} 20 | #' } 21 | #' @param dist.control a list that will be passed to \code{\link[stringdist:stringdist]{stringdistmatrix}}. Currently only two elements are implemented: 22 | #' \describe{ 23 | #' \item{method}{Method for distance calculation.} 24 | #' \item{weight}{For method='osa' or 'dl'.} 25 | #' } 26 | #' @param threshold All entries with distance above this threshold will be removed from the result 27 | #' 28 | #' @seealso \code{\link{trainSimilarityBasedReasoning}}, \code{\link{createSimilarityTableStringdist}}, \code{\link{createSimilarityTableSubstring}} 29 | #' 30 | #' @return a list with elements 31 | #' \describe{ 32 | #' \item{dist_table_w_code}{a data.table with columns \code{intString}, \code{dictString.title}, \code{dictString.Code}, \code{dist}} 33 | #' \item{dist_table_without_code}{\code{NULL} or a data.table with columns \code{intString}, \code{dictString}, \code{dist}} 34 | #' \item{vect_vocab}{see \code{\link{asDocumentTermMatrix}}} 35 | #' } 36 | #' 37 | #' @export 38 | #' @examples 39 | #' ## Prepare coding index 40 | #' # write female titles beneath the male title 41 | #' coding_index <- rbind(coding_index_excerpt[, list(title = bezMale, Code)], 42 | #' coding_index_excerpt[, list(title = bezFemale, Code)]) 43 | #' # standardize titles from the coding index 44 | #' coding_index <- coding_index[,title := stringPreprocessing(title)] 45 | #' # drop duplicate lines, might be suboptimal because we keep each title and its associated code only a single time. This means we delete duplicates and the associated, possibly relevant codes. 46 | #' coding_index <- coding_index[!duplicated(title)] 47 | #' 48 | #' (x <- c("Abgeordneter", "Abgeordneter", "Abgeordnete", "abgeordnet", "abgeordnet zu xxx", "FSJ", "FSJ2", "Industriemechaniker", "Dipl.-Ing. 
- Agrarwirtschaft (Landwirtschaft)")) 49 | #' createSimilarityTableWordwiseStringdist(unique.string = stringPreprocessing(x), 50 | #' coding_index_w_codes = coding_index, 51 | #' coding_index_without_codes = frequent_phrases, 52 | #' preprocessing = list(stopwords = tm::stopwords("de"), stemming = NULL, strPreprocessing = TRUE, removePunct = FALSE), 53 | #' dist.control = list(method = "osa", weight = c(d = 1, i = 1, s = 1, t = 1)), 54 | #' threshold = 1) 55 | createSimilarityTableWordwiseStringdist <- function(unique.string, 56 | coding_index_w_codes, 57 | coding_index_without_codes, 58 | preprocessing, 59 | dist.control = list(method = "osa", weight = c(d = 1, i = 1, s = 1, t = 1)), 60 | threshold = 1) { 61 | 62 | # prepare text for efficient computation -> transform to sparse matrix 63 | matrix <- asDocumentTermMatrix(unique.string, vect.vocab = NULL, 64 | stopwords = preprocessing$stopwords, 65 | stemming = preprocessing$stemming, 66 | type = "dgTMatrix") 67 | 68 | distmat <- stringdist::stringdistmatrix(coding_index_w_codes[, title], toupper(matrix$dtm@Dimnames[[2]]), method = dist.control$method, weight = dist.control$weight) # calculate distances between each word in matrix1 and the coding_index_w_codes 69 | distmat.table <- data.table(word.id = which(distmat <= threshold, arr.ind = TRUE)[, 2], dictString = coding_index_w_codes[, list(title, Code)][which(distmat <= threshold, arr.ind = TRUE)[, 1]], dist = distmat[which(distmat <= threshold, arr.ind = TRUE)]) 70 | distmat.table <- merge(distmat.table, data.table(intString = unique.string[matrix$dtm@i + 1], word.id = matrix$dtm@j + 1), by = "word.id", allow.cartesian=TRUE) # we have now columns for each unique.string, word, dictString, dist 71 | # now keep all words that have at least one dictString with minimal distance 72 | distmat.table[, dist2 := min(dist), by = list(intString, word.id)] 73 | dist_table_w_code <- distmat.table[, .SD[dist == dist2, list(dictString.title, dictString.Code, dist)], by = intString] 74 | 75 | dist_table_without_code <- NULL 76 | # do the same for coding_index_without_codes 77 | if (length(coding_index_without_codes) > 2) { 78 | distmat <- stringdist::stringdistmatrix(coding_index_without_codes, toupper(matrix$dtm@Dimnames[[2]]), method = dist.control$method, weight = dist.control$weight) # calculate distances between each word in matrix1 and the coding_index 79 | distmat.table <- data.table(word.id = which(distmat <= threshold, arr.ind = TRUE)[, 2], dictString = coding_index_without_codes[which(distmat <= threshold, arr.ind = TRUE)[, 1]], dist = distmat[which(distmat <= threshold, arr.ind = TRUE)]) 80 | distmat.table <- merge(distmat.table, data.table(intString = unique.string[matrix$dtm@i + 1], word.id = matrix$dtm@j + 1), by = "word.id", allow.cartesian=TRUE) # we have now columns for each unique.string, word, dictString, dist 81 | # now keep all words that have at least one dictString with minimal distance 82 | distmat.table[, dist2 := min(dist), by = list(intString, word.id)] 83 | dist_table_without_code <- distmat.table[, .SD[dist == dist2, list(dictString, dist)], by = intString] 84 | } 85 | 86 | return(list(dist_table_w_code = dist_table_w_code, dist_table_without_code = dist_table_without_code, vect_vocab = matrix$vect.vocab)) 87 | 88 | } 89 | -------------------------------------------------------------------------------- /R/expandPredictionResults.R: -------------------------------------------------------------------------------- 1 | #' Expands predicted datasets to contain all allowed codes 2 
| #' 3 | #' Start with a data.table of class 'occupationalPredictions' (for each combination of pred.code and answer one prediction) and expand it to contain all allowed codes. 4 | #' 5 | #' The problem solved here is this: Most algorithms do not provide codes for all categories from the classification, because this would require that the categories are in the training data. This function expands the dataset and inserts very small probabilities (or 0) for classification codes that the training algorithm found impossible to predict. 6 | #' 7 | #' @param occupationalPredictions a data.table created with a \code{predict}-function from this package. 8 | #' @param allowed.codes a character vector of all allowed codes. 9 | #' @param method.name the name under which this method's results shall be reported. 10 | #' 11 | #' @seealso \code{\link{produceResults}} 12 | #' 13 | #' @return a data.table 14 | #' @import data.table 15 | #' @export 16 | #' 17 | #' @examples 18 | #' # set up data 19 | #' data(occupations) 20 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 21 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 22 | #' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 23 | #' "Not precise enough for coding", "Student assistants") 24 | #' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 25 | #' 26 | #' ## split sample 27 | #' set.seed(3451345) 28 | #' n.test <- 50 29 | #' group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 30 | #' splitted.data <- split(proc.occupations, group) 31 | #' 32 | #' # train model and make predictions 33 | #' model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 34 | #' res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 35 | #' 36 | #' expandPredictionResults(res, allowed.codes, method.name = "Logistic Regression") 37 | expandPredictionResults <- function(occupationalPredictions, allowed.codes, method.name) { 38 | 39 | if (!("occupationalPredictions" %in% class(occupationalPredictions))){ 40 | stop("'occupationalPredictions' needs to have class 'occupationalPredictions' (= constructed with a predict method).") 41 | } 42 | 43 | # 1. construct with CJ an expanded data.table that contains one row for every combination of id and allowed.code. 2. 
Merge with original data to expand that dataset 44 | res.complete <- merge(occupationalPredictions[, list(id, pred.code, ans, pred.prob, code, among.suggested.code = TRUE)], CJ(id = unique(occupationalPredictions[, id]), pred.code = allowed.codes), by = c("id", "pred.code"), all.y = TRUE) 45 | 46 | # special case: in some prediction datasets a code "-9999" was predicted that is a placeholder for all other codes. Use this probability 47 | if (occupationalPredictions[, any(pred.code == "-9999")]) { 48 | res.complete <- merge(res.complete, occupationalPredictions[pred.code == "-9999", list(pred.prob.insert = pred.prob, id, true.code.insert = code)], by = "id", all.x = TRUE) 49 | } else { 50 | res.complete <- merge(res.complete, occupationalPredictions[pred.code == min(pred.code), list(pred.prob.insert = 0, id, true.code.insert = code)], by = "id", all.x = TRUE) 51 | } 52 | 53 | # fill new rows with appropriate values 54 | res.complete[is.na(pred.prob), pred.prob := pred.prob.insert] 55 | res.complete[is.na(code), code := true.code.insert] 56 | res.complete[, true.code.insert := NULL] 57 | res.complete[, pred.prob.insert := NULL] 58 | 59 | # an indicator that tells us if the original algorithm came up with this code (among.suggested.code = FALSE if not) 60 | res.complete[is.na(among.suggested.code), among.suggested.code := FALSE] 61 | 62 | res.complete[, method.name := method.name] 63 | 64 | class(res.complete) <- c(class(res.complete), "occupationalPredictionsComplete") 65 | return(res.complete) 66 | } 67 | -------------------------------------------------------------------------------- /R/logLoss.R: -------------------------------------------------------------------------------- 1 | #' Log loss 2 | #' 3 | #' Calculate log loss \eqn{logloss = \frac{1}{N} \sum_n loss_n} and its standard error \eqn{\sqrt{\frac{1}{N(N-1)} \sum_n (loss_n - logloss)^2}} with \eqn{loss_n = -\sum_k y_{nk} \log_2 p_{nk}} 4 | #' 5 | #' Log loss is the negative mean log2-probability of the categories that actually realized; smaller values are better. 6 | #' 7 | #' @param occupationalPredictions a data.table created with an \code{\link{expandPredictionResults}}-function from this package. 
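# [Editor's aside, not package source] The log loss defined above, computed on made-up
# predicted probabilities of the categories that actually realized; mirrors the
# -mean(log2(pred.prob)) and sqrt(var(...)/N) expressions used in logLoss() below.
p.true <- c(0.8, 0.5, 0.25)                        # hypothetical pred.prob of the true codes
c(logscore = -mean(log2(p.true)),                  # (0.32 + 1 + 2) / 3 = 1.107
  se = sqrt(var(log2(p.true)) / length(p.true)))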
8 | #' 9 | #' @seealso \code{\link{sharpness}} 10 | #' 11 | #' @return a data.table 12 | #' @import data.table 13 | #' @export 14 | #' 15 | #' @examples 16 | #' # set up data 17 | #' data(occupations) 18 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 19 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 20 | #' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 21 | #' "Not precise enough for coding", "Student assistants") 22 | #' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 23 | #' 24 | #' ## split sample 25 | #' set.seed(3451345) 26 | #' n.test <- 50 27 | #' group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 28 | #' splitted.data <- split(proc.occupations, group) 29 | #' 30 | #' # train model and make predictions 31 | #' model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 32 | #' res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 33 | #' 34 | #' # expand to contain more categories than the initial ones 35 | #' res.proc1 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1") 36 | #' 37 | #' # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once. 
38 | #' res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2") 39 | #' res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete") 40 | #' 41 | #' logLoss(res.proc) 42 | logLoss <- function(occupationalPredictions) { 43 | 44 | if (!("occupationalPredictionsComplete" %in% class(occupationalPredictions))){ 45 | stop("'occupationalPredictions' needs to have class 'occupationalPredictionsComplete' (= constructed with an expandPredictionResults method).") 46 | } 47 | 48 | # calculate mean log loss +- standard errors 49 | res <- occupationalPredictions[code == pred.code, list(.N, logscore = -mean(log2(pred.prob)), se = sqrt(var(log2(pred.prob))/.N)), by = method.name] 50 | 51 | if (any(res[, is.infinite(logscore)])) { 52 | res[, type := "all observations (true log score)"] 53 | 54 | res2 <- occupationalPredictions[code == pred.code & pred.prob > 0, list(.N, logscore = -mean(log2(pred.prob)), se = sqrt(var(log2(pred.prob))/.N), type = "observations with pred.prob = 0 of true category excluded"), by = method.name] 55 | return(rbind(res, res2)) 56 | } else { 57 | return(res) 58 | } 59 | 60 | # calculate mean log loss 61 | # check that no other category is predicted with maximal probability that is lower than pred.prob (result should all be -9999) 62 | # res[, .SD[acc == TRUE | pred.code == "-9999"][which.min(pred.prob)], by = list(id, sim.name)][, .N, by = pred.code] 63 | # an example of how the probabilities found this way look 64 | # summary(res[, .SD[acc == TRUE | pred.code == "-9999"][which.max(pred.prob)], by = list(id, sim.name)][sim.name == "substring", list(pred.prob, log(pred.prob), log2(pred.prob))]) 65 | # if so, we may calculate the mean log loss (+- standard errors) 66 | # res[, .SD[acc == TRUE | pred.code == "-9999"][which.max(pred.prob)], by = list(id, sim.name)][, list(.N, logscore = -mean(log2(pred.prob)), se = sqrt(var(log2(pred.prob))/.N)), by = sim.name] 67 | 68 | } 69 | -------------------------------------------------------------------------------- /R/plotAgreementRateVsProductionRate.R: -------------------------------------------------------------------------------- 1 | #' Plot agreement rate vs. production rate 2 | #' 3 | #' Plot the proportion of answers coded correctly for any given production rate (proportion of \code{n} that is coded automatically). 4 | #' 5 | #' @param occupationalPredictionsAmongTopK a data table created with \code{\link{calcAccurateAmongTopK}}. 6 | #' @param n Number of unique persons in test data (may be larger than the number of persons in occupationalPredictionsAmongTopK if for some persons no codes were suggested) 7 | #' @param yintercept Where to draw a horizontal line? 8 | #' @param filename If a \code{filename} is specified the diagram will be saved under this name. 
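# [Editor's aside, not package source] The curve plotted by this function orders answers
# by decreasing pred.prob and traces cumulative accuracy against the share coded so far.
# 'acc' below is an invented 0/1 vector, already sorted by decreasing pred.prob.
acc <- c(1, 1, 0, 1, 0)
n <- 5
data.frame(production.rate = seq_len(n) / n, agreement.rate = cumsum(acc) / seq_len(n))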
9 | #' 10 | #' @seealso \code{\link{plotTruePredictionsVsFalsePredictions}}, \code{\link{calcAccurateAmongTopK}} 11 | #' 12 | #' @return a ggplot 13 | #' @import data.table 14 | #' @import ggplot2 15 | #' @export 16 | #' 17 | #' @examples 18 | #' # set up data 19 | #' data(occupations) 20 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 21 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 22 | #' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 23 | #' "Not precise enough for coding", "Student assistants") 24 | #' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 25 | #' 26 | #' ## split sample 27 | #' set.seed(3451345) 28 | #' n.test <- 50 29 | #' group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 30 | #' splitted.data <- split(proc.occupations, group) 31 | #' 32 | #' # train model and make predictions 33 | #' model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 34 | #' res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 35 | #' 36 | #' # expand to contain more categories than the initial ones 37 | #' res.proc1 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1") 38 | #' 39 | #' # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once. 40 | #' res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2") 41 | #' res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete") 42 | #' 43 | #' calcAccurateAmongTopK(res.proc, k = 5)[,mean(acc), by = method.name] 44 | #' plotAgreementRateVsProductionRate(calcAccurateAmongTopK(res.proc, k = 5), n = n.test, yintercept = 0.85) 45 | #' plotAgreementRateVsProductionRate(calcAccurateAmongTopK(res.proc, k = 1), n = n.test, yintercept = 0.85, filename = "test.pdf") 46 | plotAgreementRateVsProductionRate <- function(occupationalPredictionsAmongTopK, n, yintercept, filename = NULL) { 47 | 48 | print(paste0("If these numbers are different than the size of the test data ", n, ", make sure to understand what this means. 
(x-axis will end before 1)")) 49 | print(occupationalPredictionsAmongTopK[, .N, by = method.name]) 50 | 51 | # calculate coordinates to plot 52 | agreementProductionGraph <- occupationalPredictionsAmongTopK[, .SD[order(pred.prob, acc, decreasing = TRUE), list(y = cumsum(acc)/ 1:.N, x = 1:.N/n)], by = method.name] 53 | 54 | p <- ggplot(agreementProductionGraph, aes(x = x, y = y, colour = method.name, group = paste(method.name))) + 55 | geom_line() + 56 | geom_hline(yintercept = yintercept) + 57 | # geom_line(data = agreementProductionGraph[method == "dw averaged"], aes(x = x, y = y, colour = method, size = method)) 58 | scale_colour_manual(values = c("chocolate", "blue4", "red", c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"), c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"), c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"), c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")), # 59 | name = "Prediction Method", 60 | guide = guide_legend(direction = "horizontal", 61 | nrow = 2, ncol=4, 62 | title.position = "top", 63 | title.hjust = 0.5, 64 | label.position="right", 65 | label.hjust = 0, 66 | label.vjust = 0.5, 67 | reverse = TRUE 68 | # label.theme = element_text(angle = 90) 69 | )) + 70 | labs(x = "Production Rate", y = "Agreement Rate") + # , title = "Agreement Rates for most probable category at various production rates") + 71 | theme(legend.position="bottom") 72 | 73 | if (!is.null(filename)) ggsave(filename, plot = p, width = 7, height = 7) 74 | 75 | return(p) 76 | } 77 | 78 | -------------------------------------------------------------------------------- /R/plotReliabilityDiagram.R: -------------------------------------------------------------------------------- 1 | #' Reliability Diagram 2 | #' 3 | #' Plots the observed relative frequency of correctness against the forecasted probability. 4 | #' 5 | #' @param occupationalPredictions a data.table created with an \code{\link{expandPredictionResults}}-function from this package. 6 | #' @param k how many top k categories to aggregate over? 7 | #' @param num.codes Number of allowed categories in classification 8 | #' @param filename If a \code{filename} is specified the diagram will be saved under this name. 
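# [Editor's aside, not package source] The reliability diagram bins forecasts into deciles
# (cut(pred.prob, breaks = seq(0, 1, 0.1)), as in the function below) and compares the mean
# forecast probability with the observed relative frequency per bin. Invented data:
pred.prob <- c(0.05, 0.12, 0.18, 0.55, 0.61, 0.93)
acc       <- c(0, 0, 1, 1, 0, 1)
bin <- cut(pred.prob, breaks = seq(0, 1, 0.1))
aggregate(cbind(forecast.probability = pred.prob, observed.frequency = acc),
          by = list(bin = bin), FUN = mean)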
9 | #' 10 | #' @seealso \code{\link{sharpness}} 11 | #' 12 | #' @return a ggplot 13 | #' @import data.table 14 | #' @import ggplot2 15 | #' @export 16 | #' 17 | #' @examples 18 | #' # set up data 19 | #' data(occupations) 20 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 21 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 22 | #' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 23 | #' "Not precise enough for coding", "Student assistants") 24 | #' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 25 | #' 26 | #' ## split sample 27 | #' set.seed(3451345) 28 | #' n.test <- 50 29 | #' group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 30 | #' splitted.data <- split(proc.occupations, group) 31 | #' 32 | #' # train model and make predictions 33 | #' model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 34 | #' res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 35 | #' 36 | #' # expand to contain more categories than the initial ones 37 | #' res.proc1 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1") 38 | #' 39 | #' # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once. 
40 | #' res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2")
41 | #' res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete")
42 | #' 
43 | #' plotReliabilityDiagram(res.proc, k = 5, num.codes = length(allowed.codes) + 1) # + 1 because we introduced the code "12345" later
44 | #' plotReliabilityDiagram(res.proc, k = 1, num.codes = length(allowed.codes) + 1, filename = "test.pdf")
45 | plotReliabilityDiagram <- function(occupationalPredictions, k, num.codes, filename = NULL) {
46 | 
47 |   # same as calcAccurateAmongTopK but we also keep among.suggested.code == FALSE
48 |   occupationalPredictionsAmongTopK <- occupationalPredictions[, head(.SD[order(pred.prob, decreasing = TRUE), list(pred.prob, acc = code == pred.code)], k), by = list(id, method.name)][, list(pred.prob = sum(pred.prob), acc = sum(acc)), by = list(id, method.name)]
49 | 
50 |   print("Make sure the following matches the size of your test data (otherwise something went wrong).")
51 |   print(occupationalPredictionsAmongTopK[, .N, by = method.name])
52 | 
53 |   # calculate coordinates to plot: bin predictions into deciles of predicted probability
54 |   reliabilityDiagram <- occupationalPredictionsAmongTopK[, list(.N, forecast.probability = mean(pred.prob), observed.frequency = mean(acc)), by = list(cut(pred.prob, breaks = seq(0, 1, 0.1)), method.name)]
55 | 
56 |   p <- ggplot(reliabilityDiagram, aes(x = forecast.probability, y = observed.frequency, colour = method.name, group = paste(method.name))) +
57 |     geom_point(aes(size = N), shape = 15) + scale_size(range = c(0.01, 4)) + guides(size = "none") +
58 |     geom_line(linetype = "dotdash", size = 0.5) + # http://ggplot2.tidyverse.org/articles/ggplot2-specs.html
59 |     geom_abline(intercept = 0, slope = 1, size = 1) +
60 |     geom_hline(yintercept = 1/num.codes) +
61 |     scale_x_continuous(breaks = seq(0, 1, 0.1), limits = c(0, 1)) +
62 |     scale_y_continuous(breaks = seq(0, 1, 0.1), limits = c(0, 1)) +
63 |     scale_colour_manual(values = c("chocolate", "blue4", "red", rep(c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"), 4)),
64 |                         name = "Prediction Method",
65 |                         guide = guide_legend(direction = "horizontal",
66 |                                              nrow = 2, ncol = 4,
67 |                                              title.position = "top",
68 |                                              title.hjust = 0.5,
69 |                                              label.position = "right",
70 |                                              label.hjust = 0,
71 |                                              label.vjust = 0.5,
72 |                                              reverse = TRUE
73 |                                              # label.theme = element_text(angle = 90)
74 |                         )) +
75 |     labs(x = "Mean Forecast Probability within Decile", y = "Observed Relative Frequency of Agreement") +
76 |     theme(legend.position = "bottom")
77 | 
78 |   if (!is.null(filename)) ggsave(filename, plot = p, width = 7, height = 7)
79 | 
80 |   return(p)
81 | }
82 | 
--------------------------------------------------------------------------------
/R/plotTruePredictionsVsFalsePredictions.R:
--------------------------------------------------------------------------------
1 | #' Plot true predictions versus false predictions
2 | #' 
3 | #' Show how many predictions would be correct as a function of how many would be incorrect.
4 | #' 
5 | #' @param occupationalPredictionsAmongTopK a data.table created with \code{\link{calcAccurateAmongTopK}}.
6 | #' @param filename If a \code{filename} is specified, the diagram will be saved under this name.
7 | #' 
8 | #' @seealso \code{\link{plotAgreementRateVsProductionRate}}, \code{\link{calcAccurateAmongTopK}}
9 | #' 
10 | #' @return a ggplot
11 | #' @import data.table
12 | #' @import ggplot2
13 | #' @export
14 | #' 
15 | #' @examples
16 | #' # set up data
17 | #' data(occupations)
18 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030")
19 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks",
20 | #' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work",
21 | #' "Not precise enough for coding", "Student assistants")
22 | #' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles)
23 | #' 
24 | #' ## split sample
25 | #' set.seed(3451345)
26 | #' n.test <- 50
27 | #' group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test)))
28 | #' splitted.data <- split(proc.occupations, group)
29 | #' 
30 | #' # train model and make predictions
31 | #' model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5))
32 | #' res <- predictLogisticRegressionWithPenalization(model, splitted.data$test)
33 | #' 
34 | #' # expand to contain more categories than the initial ones
35 | #' res.proc1 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1")
36 | #' 
37 | #' # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once.
38 | #' res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2")
39 | #' res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete")
40 | #' 
41 | #' calcAccurateAmongTopK(res.proc, k = 5)[,mean(acc), by = method.name]
42 | #' plotTruePredictionsVsFalsePredictions(calcAccurateAmongTopK(res.proc, k = 5))
43 | #' plotTruePredictionsVsFalsePredictions(calcAccurateAmongTopK(res.proc, k = 1), filename = "test.pdf")
44 | plotTruePredictionsVsFalsePredictions <- function(occupationalPredictionsAmongTopK, filename = NULL) {
45 | 
46 |   print("Make sure this is the size of your test data.
(or count the missing ones as additional false predictions in the upper right)")
47 |   print(occupationalPredictionsAmongTopK[, .N, by = method.name])
48 | 
49 |   # calculate coordinates to plot
50 |   n.true.predictions.vs.n.false.predictions <- occupationalPredictionsAmongTopK[, .SD[order(pred.prob, decreasing = TRUE), list(y = cumsum(acc), x = cumsum(1 - acc))], by = method.name]
51 | 
52 |   p <- ggplot(n.true.predictions.vs.n.false.predictions, aes(x = x, y = y, colour = method.name, group = paste(method.name))) +
53 |     geom_line(size = 0.1, linetype = "solid") + scale_size(range = c(0, 1)) +
54 |     # geom_line(data = agreementProductionGraph[method == "dw averaged"], aes(x = x, y = y, colour = method, size = method))
55 |     scale_colour_manual(values = c("chocolate", "blue4", "red", rep(c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"), 4)),
56 |                         name = "Prediction Method",
57 |                         guide = guide_legend(direction = "horizontal",
58 |                                              nrow = 2, ncol = 4,
59 |                                              title.position = "top",
60 |                                              title.hjust = 0.5,
61 |                                              label.position = "right",
62 |                                              label.hjust = 0,
63 |                                              label.vjust = 0.5,
64 |                                              reverse = TRUE
65 |                                              # label.theme = element_text(angle = 90)
66 |                         )) +
67 |     labs(x = "No. of False Predictions", y = "No. of True Predictions") +
68 |     theme(legend.position = "bottom")
69 | 
70 |   if (!is.null(filename)) ggsave(filename, plot = p, width = 7, height = 7)
71 | 
72 |   return(p)
73 | }
74 | 
75 | 
--------------------------------------------------------------------------------
/R/predictWithCodingIndex.R:
--------------------------------------------------------------------------------
1 | #' Code answers with a coding index
2 | #' 
3 | #' Look up the correct code in a coding index. Depending on the answer, this look-up may return no code, exactly one code, or several possible codes.
4 | #' 
5 | #' @param newdata either a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}} or a character vector.
6 | #' @param coding_index a data.table as created with function \code{\link{prepare_German_coding_index_Gesamtberufsliste_der_BA}}
7 | #' @param include.substrings (default: \code{FALSE}). If \code{FALSE}, a match is found if, after preprocessing, the entry from the coding index and the string-element are exactly identical. If \code{TRUE} (attention: this is slow!), a match is found if, after preprocessing, the entry from the coding index is a substring of the string-element.
8 | #' @param max.count.categories (default: \code{Inf}). Should we search the whole coding index (default) or should we exclude entries with large \code{count_categories}, an indicator of job title ambiguity? Only entries in the coding index with \code{count_categories \eqn{\le} max.count.categories} are searched.
9 | #' 
10 | #' @seealso
11 | #' \code{\link{predictSimilarityBasedReasoning}}
12 | #' 
13 | #' @return a data.table with columns id, ans, and pred.code (the format is not comparable to the other prediction formats in this package; multiple codes are comma-separated).
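#' 
#' # Shape of the returned table, with hypothetical answers and placeholder codes
#' # (real codes depend on the coding index used):
#' #  id        ans   pred.code
#' #   1    KELLNER        <NA>     # no match in the coding index
#' #   2 VERKAEUFER       12345     # exactly one code found
#' #   3 ERZIEHERIN 12345,67890     # several codes found, comma-separated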
14 | #' @export 15 | #' 16 | #' @examples 17 | #' # set up data 18 | #' data(occupations) 19 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 20 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 21 | #' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 22 | #' "Not precise enough for coding", "Student assistants") 23 | #' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 24 | #' 25 | #' # recommended default 26 | #' res <- predictWithCodingIndex(proc.occupations, 27 | #' coding_index = coding_index_excerpt, 28 | #' include.substrings = FALSE, 29 | #' max.count.categories = Inf) 30 | #' 31 | #' # playing around with the parameters to obtain other results 32 | #' res <- predictWithCodingIndex(proc.occupations, 33 | #' coding_index = coding_index_excerpt, 34 | #' include.substrings = TRUE, 35 | #' max.count.categories = 15) 36 | #' 37 | #' ################################# 38 | #' # Analysis: Standard functions from this package won't work here. 
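#' # (The reason: predictWithCodingIndex returns a single pred.code column without
#' # predicted probabilities, so the 'occupationalPredictions' analysis functions,
#' # e.g. expandPredictionResults or calcAccurateAmongTopK, do not apply.)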
39 | #' # Absolute numbers: either nothing is predicted (nPredictedCodes = NA), or one or more codes are predicted (each code has 5 characters and codes are comma-separated, hence the division by 6)
40 | #' res[ , .N, by = list(nPredictedCodes = 1 + nchar(pred.code) %/% 6 )]
41 | #' # Relative numbers
42 | #' res[ , .N / res[, .N], by = list(nPredictedCodes = 1 + nchar(pred.code) %/% 6 )]
43 | #' # Agreement rate among answers where only a single code was predicted
44 | #' res[nchar(pred.code) == 5, mean(pred.code == code)]
45 | predictWithCodingIndex <- function(newdata, coding_index,
46 |                                    include.substrings = FALSE, max.count.categories = Inf) {
47 | 
48 |   # get character vector depending on type of input
49 |   if ("occupationData" %in% class(newdata)) {
50 |     ans <- newdata[, ans]
51 |     if (exists("id", newdata)) {
52 |       id <- newdata[, id]
53 |     } else {
54 |       id <- 1:length(ans)
55 |     }
56 |   }
57 |   if (is.character(newdata)) {
58 |     ans <- newdata
59 |     id <- 1:length(ans)
60 |   }
61 | 
62 |   ###########################################
63 |   ######    prepare coding index       ######
64 | 
65 |   # write female titles beneath the male titles
66 |   coding_index <- rbind(coding_index[, list(title = bezMale, Code, count_categories)],
67 |                         coding_index[, list(title = bezFemale, Code, count_categories)])
68 |   # drop duplicate lines
69 |   coding_index <- coding_index[,.N, by = list(title, Code, count_categories)
70 |                                ][,list(title, Code, count_categories)]
71 | 
72 |   # standardize titles from the coding index
73 |   coding_index <- coding_index[,title := stringPreprocessing(title)]
74 | 
75 | 
76 |   ###########################################
77 |   ####### do coding with the dictionary #####
78 |   string.proc <- stringPreprocessing(ans)
79 |   if (!include.substrings) {
80 |     # substrings are not considered as matches
81 |     res <- sapply(string.proc, function(str) {
82 |       kldb_code <- coding_index[title == str & count_categories <= max.count.categories,
83 |                                 paste(unique(Code), collapse = ",")]
84 |       if (length(kldb_code) == 0 || kldb_code == "") kldb_code <- NA # check length first: comparing a zero-length vector inside if() would fail
85 |       return(kldb_code)
86 |     })
87 |   } else {
88 |     # substrings are considered as matches
89 | 
90 |     # loop through coding_index$titles
91 |     coding_index_matches <- sapply(coding_index$title, grepl, string.proc)
92 |     # loop through string.proc
93 |     res <- apply(coding_index_matches, 1, function(vec) coding_index[vec & count_categories <= max.count.categories, paste(unique(Code), collapse = ",")])
94 |     res[res == ""] <- NA # an empty string means no code was found
95 |   }
96 | 
97 |   res <- data.table(id, ans, pred.code = res)
98 | 
99 |   # add additional columns from new data to the result
100 |   if ("occupationData" %in% class(newdata)) {
101 |     for (i in seq_along(names(newdata))) {
102 |       if (names(newdata)[i] != "ans") {
103 |         set(res, i = NULL, j = names(newdata)[i], value = newdata[, i, with = FALSE])
104 |       }
105 |     }
106 | 
107 |     # class(res) <- c(class(res), "occupationalPredictions") Only if we had probabilities for more than one category. One could adapt the above function to deliver this.
108 |   }
109 | 
110 |   return(res)
111 | }
112 | 
--------------------------------------------------------------------------------
/R/produceResults.R:
--------------------------------------------------------------------------------
1 | #' Produces summaries of predictive performance
2 | #' 
3 | #' Produces \code{\link{accuracy}}, \code{\link{plotReliabilityDiagram}}, \code{\link{sharpness}}, \code{\link{logLoss}}, \code{\link{plotTruePredictionsVsFalsePredictions}}, and \code{\link{plotAgreementRateVsProductionRate}}.
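#' 
#' # Typical call pattern, condensed from the example below:
#' # res.proc <- expandPredictionResults(res, allowed.codes, method.name = "glmnet")
#' # produceResults(res.proc, k = 1, n = n.test, num.codes = length(allowed.codes))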
4 | #' 
5 | #' @param occupationalPredictions a data.table created with an \code{\link{expandPredictionResults}}-function from this package.
6 | #' @param k the number of top categories to aggregate over.
7 | #' @param n the number of unique persons in the test data (may be larger than the number of persons in \code{occupationalPredictions} if no codes were suggested for some persons)
8 | #' @param num.codes the number of allowed categories in the classification.
9 | #' 
10 | #' @seealso \code{\link{calcAccurateAmongTopK}}
11 | #' 
12 | #' @return a data.table
13 | #' @import data.table
14 | #' @export
15 | #' 
16 | #' @examples
17 | #' # set up data
18 | #' data(occupations)
19 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030")
20 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks",
21 | #' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work",
22 | #' "Not precise enough for coding", "Student assistants")
23 | #' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles)
24 | #' 
25 | #' ## split sample
26 | #' set.seed(3451345)
27 | #' n.test <- 50
28 | #' group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test)))
29 | #' splitted.data <- split(proc.occupations, group)
30 | #' 
31 | #' # train model and make predictions
32 | #' model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5))
33 | #' res <- predictLogisticRegressionWithPenalization(model, splitted.data$test)
34 | #' 
35 | #' # expand to contain more categories than the initial ones
36 | #' res.proc1 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1")
37 | #' 
38 | #' # aggregate over top k categories
39 | #' calcAccurateAmongTopK(res.proc1, k = 1)[,mean(acc)]
40 | #' calcAccurateAmongTopK(res.proc1, k = 5)[,mean(acc)]
41 | #' 
42 | #' # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once.
43 | #' res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2")
44 | #' res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete")
45 | #' 
46 | #' produceResults(res.proc, k = 1, n = n.test, num.codes = length(allowed.codes) + 1)
47 | produceResults <- function(occupationalPredictions, k = 1, n, num.codes) {
48 | 
49 |   if (!("occupationalPredictionsComplete" %in% class(occupationalPredictions))){
50 |     stop("'occupationalPredictions' needs to have class 'occupationalPredictionsComplete' (i.e., constructed with an expandPredictionResults method).")
51 |   }
52 | 
53 |   no.individuals <- occupationalPredictions[, length(unique(id))]
54 |   print(paste("Number of individuals:", no.individuals))
55 |   print(paste("Expected Number of rows in dataset:", no.individuals*num.codes, "=", no.individuals, "Individuals *", num.codes, "Codes"))
56 |   print("Observed Number of rows in dataset:")
57 | 
58 |   print(occupationalPredictions[, .N, by = method.name])
59 | 
60 |   accurateAmong <- calcAccurateAmongTopK(occupationalPredictions, k = k)
61 | 
62 |   print("## Agreement Rate (at 100% production rate):")
63 |   print(accuracy(accurateAmong, n = n))
64 | 
65 |   print("## Plot Reliability Diagram")
66 |   print(plotReliabilityDiagram(occupationalPredictions, k = k, num.codes = num.codes))
67 | 
68 |   print("## Sharpness:")
69 |   print(sharpness(occupationalPredictions))
70 | 
71 |   print("## Log2 loss:")
72 |   print(logLoss(occupationalPredictions))
73 | 
74 |   print("## Plot True Predictions vs False Predictions")
75 |   print(plotTruePredictionsVsFalsePredictions(accurateAmong))
76 | 
77 |   print("## Plot Agreement Rate vs Production Rate")
78 |   print(plotAgreementRateVsProductionRate(accurateAmong, n = n, yintercept = 0.85))
79 | 
80 |   # calculate AUC (I don't like ROC because the number of positive conditions (used in the tpr/recall) is not meaningful as a denominator here)
81 |   # library(verification)
82 |   # calcAccurateAmongTopK(occupationalPredictions, k = k)[, verification::roc.area(acc, pred.prob), by = sim.name] # used in the paper
83 | 
84 |   # ###########################
85 |   # # idea: the absolute number in a category should be similar to the predicted number (aggregates per category). But the results are a bit strange... should this be included in the paper?
86 |   # 
87 |   # # calculate residuals
88 |   # res.complete[, acc := pred.code == true.code]
89 |   # res.complete[, residual := acc - pred.prob] # y_{n_fk} - \hat{p}_{n_fk}
90 |   # 
91 |   # # this should be exactly 1 for all ids
92 |   # summary(res.complete[sim.name == "osa1111 = 0" & pred.code != "-9999", sum(pred.prob), by = id])
93 |   # res.complete[sim.name == "osa1111 = 0" & pred.code != "-9999", sum(residual), by = list(pred.code, sim.name)][order(V1), sum(V1)]
94 |   # # this should be 1064 -> our numbers are up to 1.8 off.
95 |   # res.complete[sim.name == "osa1111 = 0" & pred.code != "-9999", sum(pred.prob), by = id][,sum(V1)]
96 |   # 
97 |   # pdf("Z:/Eigene Dateien/in work/Algorithms for Occupation Coding/paper/graphs/category_residuals.pdf")
98 |   # res.complete[sim.name == "osa1111 = 0" & pred.code != "-9999", sum(residual), by = list(pred.code, sim.name)][order(V1), plot(V1, ylab = "Residual")]
99 |   # dev.off()
100 |   # 
101 |   # pdf("Z:/Eigene Dateien/in work/Algorithms for Occupation Coding/paper/graphs/category_residuals2.pdf")
102 |   # plot(res.complete[sim.name == "osa1111 = 0" & pred.code != "-9999", list(residual = sum(residual)), by = list(pred.code, sim.name)][order(residual), ][residual < -5 | residual > 5, list(pred.code = as.factor(pred.code), by = residual)], las = 3, ylab = "Residual")
103 |   # abline(h = 0)
104 |   # dev.off()
105 |   # res[sim.name == "osa1111 = 0", list(sum(pred.prob)), by = pred.code][order(V1)]
106 |   # 
107 |   # res.complete[sim.name == "osa1111 = 0" & pred.code != "-9999", list(residual = sum(residual)), by = list(pred.code, sim.name)][residual > -0.5 & residual < 0.5][, .N, by = residual]
108 |   # 
109 |   # res.complete[sim.name == "osa1111 = 0" & pred.code != "-9999", list(residual = sum(residual)), by = list(pred.code, sim.name)][residual > -0.5 & residual < 0.5, list(.N, mean(residual))]
110 |   # turtle[code == 71402, .N]
111 |   # res.complete[sim.name == "osa1111 = 0" & pred.code == "71402", list(sum(pred.prob), sum(residual))]
112 |   # 
113 | }
--------------------------------------------------------------------------------
/R/removeFaultyAndUncodableAnswers_And_PrepareForAnalysis.R:
--------------------------------------------------------------------------------
1 | #' Data Preparation
2 | #' 
3 | #' Prepare data (i.e. columns 'id', 'ans', and 'code' are appended to the dataset - only these columns will be used later) and remove answers that we cannot use (i.e. answers that have non-ASCII characters after preprocessing and answers that are at most one character long). During data preparation you should make sure that nothing important is lost here.
4 | #' 
5 | #' The 2010 German classification is available at \url{https://www.klassifikationsserver.de/}.
6 | #' 
7 | #' @param data a data.table or data.frame containing the verbal answers and their codes.
8 | #' @param colNames a character vector of length 2: the names of the columns in \code{data} that hold the verbal answers and the codes. Codes will be transformed to character.
9 | #' @param allowed.codes a vector of allowed codes from the classification.
10 | #' @param allowed.codes.titles Labels for \code{allowed.codes}. Should have the same length as \code{allowed.codes}.
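#' 
#' # The returned data.table carries two attributes, set at the end of the function body:
#' # attr(proc.occupations, "classification")  # data.table of allowed codes and their titles
#' # attr(proc.occupations, "overview_tally")  # counts of removed answers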
11 | #' 
12 | #' @seealso \code{\link{createDescriptives}}
13 | #' 
14 | #' @return a data.table with attributes \code{classification} and \code{overview_tally}
15 | #' @import data.table
16 | #' @export
17 | #' 
18 | #' @examples
19 | #' occupations <- data.table(answers = c("LEITER VERTRIEB", "Kfz-Schlossermeister", "Aushilfe im Hotel(Bereich Housekeeping)"),
20 | #'                           codes = c("61194", "25213", "63221"))
21 | #' (allowed.codes <- c("11101", "61194", "25213", "63221", "..."))
22 | #' (allowed.codes.titles <- c("Berufe in der Landwirtschaft (ohne Spezialisierung) - Helfer-/Anlernt\xe4tigkeiten", "Berufe in der Kraftfahrzeugtechnik - komplexe Spezialistent\xe4tigkeiten", "F\xfchrungskr\xe4fte - Einkauf und Vertrieb", "Berufe im Hotelservice - Helfer-/Anlernt\xe4tigkeiten", "many more category labels from the classification"))
23 | #' removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("answers", "codes"), allowed.codes, allowed.codes.titles)
24 | #' 
25 | #' data(occupations)
26 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030")
27 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks",
28 | #' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work",
29 | #' "Not precise enough for coding", "Student assistants")
30 | #' removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles)
31 | #' 
32 | #' ## we could also paste both answers together
33 | #' occupations[, answer_combined := paste(orig_answer, orig_answer2)]
34 | #' removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("answer_combined", "orig_code"), allowed.codes, allowed.codes.titles)
35 | removeFaultyAndUncodableAnswers_And_PrepareForAnalysis <- function(data, colNames = c("answer", "code"), allowed.codes, allowed.codes.titles = 1:length(allowed.codes)) {
36 | 
37 |   # the classification contains all allowed codes
38 |   classification <- data.table(code = allowed.codes, title = allowed.codes.titles)
39 | 
40 |   # insert two new columns into data, based on colNames
41 |   res <- copy(data)
42 |   res[, "ans" := as.character(get(colNames[1]))]
43 |   res[, "code" := as.character(get(colNames[2]))]
44 | 
45 |   # provide some additional info about removed answers
46 |   overview_tally_preparation <- list(N_verbatims_start = nrow(res))
47 | 
48 |   ## remove non-ASCII answers
49 |   overview_tally_preparation$N_ASCII_removed <- res[,sum(!tau::is.ascii(stringPreprocessing(ans)))]
50 |   res <- res[tau::is.ascii(stringPreprocessing(ans)),]
51 | 
52 |   ## remove answers with at most one character
53 |   overview_tally_preparation$N_short_verbatims <- res[nchar(stringPreprocessing(ans)) <= 1, .N]
54 |   res <- res[nchar(stringPreprocessing(ans)) > 1, ]
55 | 
56 |   ## remove answers and codes if the code is not allowed
57 | 
overview_tally_preparation$N_Codes_removed <- res[!(code %in% allowed.codes), .N]
58 |   res <- res[(code %in% allowed.codes)]
59 | 
60 |   # set ids
61 |   res[, id := 1:.N]
62 |   setkey(res, id)
63 | 
64 |   attr(res, "classification") <- classification
65 |   attr(res, "overview_tally") <- overview_tally_preparation
66 |   class(res) <- c(class(res), "occupationData")
67 | 
68 |   cat("Number of cases at start:", overview_tally_preparation$N_verbatims_start, "\n")
69 |   cat("Number of cases with non-ASCII characters (removed):", overview_tally_preparation$N_ASCII_removed, "\n")
70 |   cat("Number of cases with codes that are not allowed (removed):", overview_tally_preparation$N_Codes_removed, " (check with 'data[!(code %in% allowed.codes), table(code)]')\n")
71 |   cat("Number of cases with at most one character (removed):", overview_tally_preparation$N_short_verbatims, "\n")
72 |   cat("Number of remaining cases:", res[, .N], "\n")
73 | 
74 |   return(res)
75 | }
76 | 
--------------------------------------------------------------------------------
/R/sharpness.R:
--------------------------------------------------------------------------------
1 | #' Sharpness
2 | #' 
3 | #' Calculate sharpness \eqn{sharpness = \frac{1}{N} \sum_n entropy_n} and its standard error \eqn{\sqrt{\frac{1}{N(N-1)} \sum_n (entropy_n - sharpness)^2}} with \eqn{entropy_n = - \sum_k p_{nk} \log_2 p_{nk}}
4 | #' 
5 | #' Sharpness is zero (optimal) if a single category is predicted with probability 1. It is maximal if all categories have equal probability \eqn{p = \frac{1}{K}}.
6 | #' 
7 | #' Note: What should be done if a predicted probability is zero? \eqn{0 \times \log_2 0} is not defined but necessary to calculate sharpness. We set \eqn{0 \times \log_2 0 = 0}. This also means we exclude observations from our analysis if all probabilities are predicted as zero. An alternative could be to set such zeros to \eqn{1/K}, which would lead to very different sharpness values.
8 | #' 
9 | #' @param occupationalPredictions a data.table created with an \code{\link{expandPredictionResults}}-function from this package.
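#' 
#' # A worked example of the entropy term: an answer whose probability is spread evenly
#' # over two codes has entropy -2 * 0.5 * log2(0.5) = 1 bit; a prediction that puts
#' # probability 1 on a single code has entropy 0, the sharpest possible forecast.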
10 | #' 
11 | #' @seealso \code{\link{plotReliabilityDiagram}}, \code{\link{logLoss}}
12 | #' 
13 | #' @return a data.table
14 | #' @import data.table
15 | #' @export
16 | #' 
17 | #' @examples
18 | #' # set up data
19 | #' data(occupations)
20 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030")
21 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks",
22 | #' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work",
23 | #' "Not precise enough for coding", "Student assistants")
24 | #' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles)
25 | #' 
26 | #' ## split sample
27 | #' set.seed(3451345)
28 | #' n.test <- 50
29 | #' group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test)))
30 | #' splitted.data <- split(proc.occupations, group)
31 | #' 
32 | #' # train model and make predictions
33 | #' model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5))
34 | #' res <- predictLogisticRegressionWithPenalization(model, splitted.data$test)
35 | #' 
36 | #' # expand to contain more categories than the initial ones
37 | #' res.proc1 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1")
38 | #' 
39 | #' # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once.
40 | #' res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2")
41 | #' res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete")
42 | #' 
43 | #' sharpness(res.proc)
44 | sharpness <- function(occupationalPredictions) {
45 | 
46 |   if (!("occupationalPredictionsComplete" %in% class(occupationalPredictions))){
47 |     stop("'occupationalPredictions' needs to have class 'occupationalPredictionsComplete' (i.e., constructed with an expandPredictionResults method).")
48 |   }
49 | 
50 |   # we set 0 * log_2 0 = 0 (implemented as pred.prob > 0)
51 |   occupationalPredictions[pred.prob > 0, list(entropy = sum(-pred.prob * log2(pred.prob))), by = list(id, method.name)][, list(.N, sharpness = mean(entropy), se = sqrt(var(entropy) / .N)), by = method.name]
52 |   # res.complete[, list(entropy = sum(-pred.prob * log2(pred.prob))), by = list(id, sim.name)][, list(sharpness = mean(entropy), se = sqrt(var(entropy) / .N)), by = sim.name]
53 | 
54 | }
55 | 
--------------------------------------------------------------------------------
/R/stringPreprocessing.R:
--------------------------------------------------------------------------------
1 | #' Preprocess German occupational text
2 | #' 
3 | #' The function converts text to upper case, transliterates German umlauts and some accented characters (e.g., Ä -> AE, Ü -> UE), expands common abbreviations (e.g., "DIPL.-ING." -> "DIPLOMINGENIEUR"), and removes punctuation, superfluous spaces, and the word "Diplom".
4 | #' 
5 | #' \code{\link{charToRaw}} helps to find UTF-8 characters.
6 | #' 
7 | #' @encoding UTF-8
8 | #' @param verbatim a character vector.
9 | #' @param lang (default: \code{"de"}). Any other value will throw an error.
10 | #' 
11 | #' @return the same character vector after processing
12 | #' @export
13 | #' 
14 | #' @examples
15 | #' (x <- c("Verkauf von B\xfcchern, Schreibwaren", "Fach\xe4rzin f\xfcr Kinder- und Jugendmedizin im \xf6ffentlichen Gesundheitswesen", "Industriemechaniker", "Dipl.-Ing.
- Agrarwirtschaft (Landwirtschaft)"))
16 | #' stringPreprocessing(x)
17 | stringPreprocessing <- function(verbatim, lang = "de") {
18 |   if(!is.character(verbatim))
19 |     stop("Character input required for function stringPreprocessing")
20 |   if (lang != "de")
21 |     stop("This function was programmed for German characters and German occupational titles")
22 |   verbatim <- toupper(verbatim)
23 |   verbatim <- gsub(intToUtf8(0xc4), "AE", verbatim, fixed=TRUE)
24 |   verbatim <- gsub(intToUtf8(0xd6), "OE", verbatim, fixed=TRUE)
25 |   verbatim <- gsub(intToUtf8(0xdc), "UE", verbatim, fixed=TRUE)
26 |   verbatim <- gsub(intToUtf8(0xdf), "SS", verbatim, fixed=TRUE)
27 |   verbatim <- gsub(intToUtf8(0xc2), "A", verbatim, fixed=TRUE)
28 |   verbatim <- gsub(intToUtf8(0xc9), "E", verbatim, fixed=TRUE)
29 |   verbatim <- gsub(intToUtf8(0xca), "E", verbatim, fixed=TRUE)
30 |   verbatim <- gsub(intToUtf8(0xcd), "I", verbatim, fixed=TRUE)
31 |   verbatim <- gsub(intToUtf8(0xce), "I", verbatim, fixed=TRUE)
32 |   verbatim <- gsub(intToUtf8(0xd4), "O", verbatim, fixed=TRUE)
33 |   verbatim <- gsub("/", " ", verbatim , fixed=TRUE)
34 |   verbatim <- gsub("+", " ", verbatim , fixed=TRUE)
35 |   verbatim <- gsub("-", " ", verbatim , fixed=TRUE)
36 |   verbatim <- gsub(")", " ", verbatim , fixed=TRUE)
37 |   verbatim <- gsub("(", " ", verbatim , fixed=TRUE)
38 |   verbatim <- gsub("\u20ac", "EURO", verbatim, fixed=TRUE) # no solution with intToUtf8
39 |   verbatim <- gsub("\u0080", "EURO", verbatim, fixed = TRUE) # there must be a solution with intToUtf8
40 |   # verbatim <- gsub(" [A-Za-z]{1,3}\\.", "", verbatim) # abbreviations of at most 3 letters before a period would be deleted
41 |   verbatim <- gsub("DIPL.-ING.", "DIPLOMINGENIEUR", verbatim , fixed=TRUE) # this abbreviation is often used in the coding index
42 |   verbatim <- gsub("ING.", "INGENIEUR", verbatim , fixed=TRUE) # also sometimes used
43 |   verbatim <- gsub("DIPL.", "DIPLOM", verbatim , fixed=TRUE) # this abbreviation is often used in the coding index
44 |   verbatim <- gsub("DIPLOM", "DIPLOM ", verbatim , fixed=TRUE) # better to think of DIPLOM as a separate word
45 |   verbatim <- tm::removePunctuation(verbatim)
46 |   verbatim <- stringr::str_trim(verbatim)
47 |   verbatim <- gsub(" {2,}", " ", verbatim) # replace runs of two or more spaces with a single space
48 |   # verbatim <- gsub("  ", " ", verbatim , fixed=TRUE) # in case we have multiple double spaces
49 |   # verbatim <- gsub("  ", " ", verbatim , fixed=TRUE) # in case we have multiple double spaces
50 |   # verbatim <- gsub("  ", " ", verbatim , fixed=TRUE) # in case we have multiple double spaces
51 |   # verbatim <- gsub( " *\\(.*?\\) *", "", verbatim) # would it make sense to remove text in parenthesis completely?
52 |   verbatim <- gsub("DIPLOM ","",verbatim)
53 |   verbatim <- gsub("DIPL ","",verbatim)
54 |   return(verbatim)
55 | }
56 | 
--------------------------------------------------------------------------------
/R/trainCreecysMemoryBasedReasoning.R:
--------------------------------------------------------------------------------
1 | #' Train Creecy's memory-based reasoning model
2 | #' 
3 | #' The function does some preprocessing and calculates the importance of various features.
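#' 
#' # Sketch of the feature construction used in the function body (internal logic,
#' # not a separate API): for an answer with the alphabetically ordered words
#' # c("HOTEL", "KOCH"), the features are the single words plus their pairwise
#' # conjunction: c("HOTEL", "KOCH", "HOTEL.KOCH")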
4 | #' 5 | #' @param data a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}} 6 | #' @param preprocessing a list with elements 7 | #' \describe{ 8 | #' \item{stopwords}{a character vector, use \code{tm::stopwords("de")} for German stopwords.} 9 | #' \item{stemming}{\code{NULL} for no stemming and \code{"de"} for stemming using the German porter stemmer.} 10 | #' \item{strPreprocessing}{\code{TRUE} if \code{\link{stringPreprocessing}} shall be used.} 11 | #' \item{removePunct}{\code{TRUE} if \code{\link[tm]{removePunctuation}} shall be used.} 12 | #' } 13 | #' 14 | #' @seealso 15 | #' \code{\link{predictCreecysMemoryBasedReasoning}} 16 | #' 17 | #' Creecy, R. H., Masand, B. M., Smith, S. J., Waltz, D. L. (1992). Trading MIPS and Memory for Knowledge Engineering. Comm. ACM 35(8). pp. 48--65. 18 | #' 19 | #' @return a processed feature matrix to be used in \code{\link{predictCreecysMemoryBasedReasoning}} 20 | #' @import data.table 21 | #' @export 22 | #' 23 | #' @examples 24 | #' # set up data 25 | #' data(occupations) 26 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 27 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 28 | #' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 29 | #' "Not precise enough for coding", "Student assistants") 30 | #' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 31 | #' 32 | #' # Recommended configuration (and commonly used in this package) 33 | #' memModel <- trainCreecysMemoryBasedReasoning(proc.occupations, 34 | #' preprocessing = list(stopwords = character(0), stemming = NULL, strPreprocessing = TRUE, removePunct = FALSE)) 35 | trainCreecysMemoryBasedReasoning <- function(data, 36 | preprocessing = list(stopwords = character(0), stemming = NULL, strPreprocessing = TRUE, removePunct = TRUE)) { 37 | 38 | codes <- data[,code] 39 | 40 | # preprocessing 41 | if (preprocessing$removePunct) { 42 | ans <- data[, tm::removePunctuation(ans)] 43 | } else { 44 | ans <- data[, ans] 45 | } 46 | 47 | if (preprocessing$strPreprocessing) { 48 | ans <- stringPreprocessing(ans) 49 | } 50 | 51 | # prepare text for efficient computation -> transform to sparse matrix 52 | matrix <- asDocumentTermMatrix(ans, vect.vocab = NULL, 53 | stopwords = preprocessing$stopwords, 54 | stemming = preprocessing$stemming, 55 | type = "dgTMatrix") 56 | 57 | # create features (every word and every combination of 2 words is a feature, neglecting the word order: "single word fields along with all pairwise conjunctions") 58 | DT.wordwise <- data.table(id.training = matrix$dtm@i + 1, words = matrix$dtm@Dimnames[[2]][matrix$dtm@j + 1])[, .SD[order(words)], by = id.training] # [, .SD[order(words)], by = id.training] creates 
alphabetical ordering, possibly not needed
59 |   create_features <- function(words) {
60 |     combined_words <- outer(words, words, FUN = paste, sep = ".")
61 |     c(words, combined_words[upper.tri(combined_words)]) # exploiting alphabetic ordering
62 |   }
63 |   DT.wordwise <- DT.wordwise[,.SD[,list(feature = create_features(words))], keyby = id.training]
64 |   # DT.wordwise[id.training == 8]
65 |   # save for each feature the assigned code
66 |   training.prepared <- data.table(id.training = 1:length(ans), code = codes, key = "id.training")[DT.wordwise]
67 | 
68 |   # calculate feature importance
69 |   training.prepared[, feature_freq := .N, by = feature]
70 |   # "per category" feature importance: P(C_i | f_k)
71 |   training.prepared[, p.code.given.feature := .N / feature_freq, by = list(feature, code)]
72 |   # "cross category" feature importance: Weight(f_k) = sum_i P(C_i | f_k)^2
73 |   training.prepared[, cross.category.feature.weight := .SD[!duplicated(code), sum(p.code.given.feature^2)], by = feature]
74 | 
75 |   return(list(training.prepared = training.prepared, vect.vocab = matrix$vect.vocab, preprocessing = preprocessing))
76 | }
--------------------------------------------------------------------------------
/R/trainGweonsNearestNeighbor.R:
--------------------------------------------------------------------------------
1 | #' Train Gweon's Nearest Neighbor model
2 | #' 
3 | #' The function does some preprocessing and creates a document-term matrix to be used in the nearest neighbor model.
4 | #' 
5 | #' @param data a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}}
6 | #' @param preprocessing a list with elements
7 | #' \describe{
8 | #'   \item{stopwords}{a character vector, use \code{tm::stopwords("de")} for German stopwords.}
9 | #'   \item{stemming}{\code{NULL} for no stemming and \code{"de"} for stemming using the German Porter stemmer.}
10 | #'   \item{strPreprocessing}{\code{TRUE} if \code{\link{stringPreprocessing}} shall be used.}
11 | #'   \item{removePunct}{\code{TRUE} if \code{\link[tm]{removePunctuation}} shall be used.}
12 | #' }
13 | #' 
14 | #' @seealso
15 | #' \code{\link{predictGweonsNearestNeighbor}}
16 | #' 
17 | #' Gweon, H.; Schonlau, M., Kaczmirek, L., Blohm, M., Steiner, S. (2017). Three Methods for Occupation Coding Based on Statistical Learning. Journal of Official Statistics 33(1), pp.
101--122 18 | #' 19 | #' @return a document term matrix with some additional attributes 20 | #' @import data.table 21 | #' @import text2vec 22 | #' @export 23 | #' 24 | #' @examples 25 | #' # set up data 26 | #' data(occupations) 27 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 28 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 29 | #' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 30 | #' "Not precise enough for coding", "Student assistants") 31 | #' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 32 | #' 33 | #' # Recommended configuration 34 | #' dtmModel <- trainGweonsNearestNeighbor(proc.occupations, 35 | #' preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", strPreprocessing = TRUE, removePunct = FALSE)) 36 | #' # Configuration used by Gweon et al. (2017) 37 | #' dtmModel <- trainGweonsNearestNeighbor(proc.occupations, 38 | #' preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", strPreprocessing = FALSE, removePunct = TRUE)) 39 | #' # Configuration used for most other approaches in this package 40 | #' dtmModel <- trainGweonsNearestNeighbor(proc.occupations, 41 | #' preprocessing = list(stopwords = character(0), stemming = NULL, strPreprocessing = TRUE, removePunct = FALSE)) 42 | #' 43 | #' ####################################################### 44 | #' ## RUN A GRID SEARCH (takes some time) 45 | #' \donttest{ 46 | #' # create a grid of all combinations to be tried 47 | #' model.grid <- data.table(expand.grid(stopwords = c(TRUE, FALSE), stemming = c(FALSE, "de"), strPreprocessing = c(TRUE, FALSE), nearest.neighbors.multiplier = c(0.05, 0.1, 0.2))) 48 | #' 49 | #' # Do grid search 50 | #' for (i in 1:nrow(model.grid)) { 51 | #' res.model <- trainGweonsNearestNeighbor(splitted.data$training, preprocessing = list(stopwords = if (model.grid[i, stopwords]) tm::stopwords("de") else character(0), 52 | #' stemming = if (model.grid[i, stemming == "de"]) "de" else NULL, 53 | #' strPreprocessing = model.grid[i, strPreprocessing], 54 | #' removePunct = !model.grid[i, strPreprocessing])) 55 | #' 56 | #' res.proc <- predictGweonsNearestNeighbor(res.model, splitted.data$test, 57 | #' tuning = list(nearest.neighbors.multiplier = model.grid[i, nearest.neighbors.multiplier])) 58 | #' res.proc <- expandPredictionResults(res.proc, allowed.codes = allowed.codes, method.name = "NearestNeighbor_Gweon") 59 | #' 60 | #' ac <- accuracy(calcAccurateAmongTopK(res.proc, k = 1), n = nrow(splitted.data$test)) 61 | #' ll <- logLoss(res.proc) 62 | #' sh <- sharpness(res.proc) 63 | #' 64 | #' model.grid[i, acc := ac[, acc]] 65 | #' model.grid[i, acc.se := ac[, se]] 66 | #' model.grid[i, acc.N := ac[, N]] 67 | #' model.grid[i, acc.prob0 := ac[, 
count.pred.prob0]]
68 | #'   model.grid[i, loss.full := ll[1, logscore]]
69 | #'   model.grid[i, loss.full.se := ll[1, se]]
70 | #'   model.grid[i, loss.full.N := ll[1, N]]
71 | #'   model.grid[i, loss.sub := ll[2, logscore]]
72 | #'   model.grid[i, loss.sub.se := ll[2, se]]
73 | #'   model.grid[i, loss.sub.N := ll[2, N]]
74 | #'   model.grid[i, sharp := sh[, sharpness]]
75 | #'   model.grid[i, sharp.se := sh[, se]]
76 | #'   model.grid[i, sharp.N := sh[, N]]
77 | #' }
78 | #' 
79 | #' model.grid[order(stopwords, stemming, strPreprocessing, nearest.neighbors.multiplier)]
80 | #' 
81 | #' 
82 | #' }
83 | trainGweonsNearestNeighbor <- function(data,
84 |                                        preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", strPreprocessing = FALSE, removePunct = TRUE)) {
85 | 
86 |   # preprocessing
87 |   if (preprocessing$removePunct) {
88 |     ans <- data[, tm::removePunctuation(ans)]
89 |   } else {
90 |     ans <- data[, ans]
91 |   }
92 | 
93 |   if (preprocessing$strPreprocessing) {
94 |     ans <- stringPreprocessing(ans)
95 |   }
96 | 
97 |   # prepare text for efficient computation -> transform to sparse matrix
98 |   matrix <- asDocumentTermMatrix(ans, vect.vocab = NULL,
99 |                                  stopwords = preprocessing$stopwords,
100 |                                  stemming = preprocessing$stemming,
101 |                                  type = "dgCMatrix")
102 | 
103 |   return(list(matrix = matrix$dtm, vect.vocab = matrix$vect.vocab, preprocessing = preprocessing, code = data[,code],
104 |               num.allowed.codes = length(attr(data, "classification")$code)))
105 | }
106 | 
--------------------------------------------------------------------------------
/R/trainLogisticRegressionWithPenalization.R:
--------------------------------------------------------------------------------
1 | #' Train a logistic regression model with penalization
2 | #' 
3 | #' The function does some preprocessing and calls \code{glmnet} to fit a multinomial logistic regression model with penalization.
4 | #' 
5 | #' Setting \code{tuning$alpha = 0} (ridge penalty) seems to be most stable.
6 | #' 
7 | #' In our experience, \code{glmnet} often returns a warning like \code{3: from glmnet Fortran code (error code -72); Convergence for 72th lambda value not reached after maxit=100000 iterations; solutions for larger lambdas returned}. To solve this issue, we can increase \code{maxit} to try more iterations or we can decrease the threshold \code{thresh}.
8 | #' 
9 | #' @param data a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}}
10 | #' @param preprocessing a list with elements
11 | #' \describe{
12 | #'   \item{stopwords}{a character vector, use \code{tm::stopwords("de")} for German stopwords.}
13 | #'   \item{stemming}{\code{NULL} for no stemming and \code{"de"} for stemming using the German Porter stemmer.}
14 | #'   \item{countWords}{Set to TRUE if the predictor matrix should contain a column for answer length.}
15 | #' }
16 | #' @param tuning a list with elements that will be passed to \code{\link[glmnet]{glmnet}}
17 | #' 
18 | #' @seealso \code{\link{predictLogisticRegressionWithPenalization}}, \code{\link[glmnet]{glmnet}}
19 | #' 
20 | #' @return a logistic regression model. Commands from \code{\link[glmnet]{glmnet}} should work.
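#' 
#' # For instance, standard glmnet accessors should work on the returned fit
#' # (a sketch; 0.01 is an arbitrary penalty value):
#' # fit <- trainLogisticRegressionWithPenalization(proc.occupations)
#' # plot(fit)            # coefficient paths across the lambda sequence
#' # coef(fit, s = 0.01)  # coefficients at penalty lambda = 0.01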
21 | #' @import data.table 22 | #' @import text2vec 23 | #' @export 24 | #' 25 | #' @examples 26 | #' # set up data 27 | #' data(occupations) 28 | #' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 29 | #' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 30 | #' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 31 | #' "Not precise enough for coding", "Student assistants") 32 | #' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 33 | #' 34 | #' # Recommended configuration 35 | #' trainLogisticRegressionWithPenalization(proc.occupations, 36 | #' preprocessing = list(stopwords = character(0), stemming = "de", countWords = FALSE), 37 | #' tuning = list(alpha = 0.05, maxit = 10^6, nlambda = 100, thresh = 1e-7)) 38 | #' 39 | #' # Other possibility 40 | #' trainLogisticRegressionWithPenalization(proc.occupations, 41 | #' preprocessing = list(stopwords = tm::stopwords("de"), stemming = NULL, countWords = TRUE), 42 | #' tuning = list(alpha = 0.05, maxit = 10^6, nlambda = 100, thresh = 1e-7)) 43 | trainLogisticRegressionWithPenalization <- function(data, 44 | preprocessing = list(stopwords = character(0), stemming = NULL, countWords = FALSE), 45 | tuning = list(alpha = 0.05, maxit = 10^5, nlambda = 100, thresh = 1e-07)) { 46 | 47 | if (!requireNamespace("glmnet", quietly = TRUE)) { 48 | stop("Package \"glmnet\" needed if you want to use this function. Please install it.", 49 | call. = FALSE) 50 | } 51 | 52 | # remove seldom codes 53 | seldom.codes <- data[, .N, by = code][N <= 1, code] 54 | if (length(seldom.codes) > 0) { 55 | warning(paste("This algorithm requires that every outcome code appears at least twice in the training data. 
The following codes were removed:", paste(seldom.codes, collapse = ", "))) 56 | 57 | data <- data[!(code %in% seldom.codes)] 58 | } 59 | 60 | # and prepare codes 61 | outcome <- as.factor(data[,code]) 62 | 63 | # preprocessing 64 | ans <- data[, stringPreprocessing(ans)] 65 | 66 | # prepare text for efficient computation -> transform to sparse matrix 67 | matrix <- asDocumentTermMatrix(ans, vect.vocab = NULL, 68 | stopwords = preprocessing$stopwords, 69 | stemming = preprocessing$stemming, 70 | type = "dgCMatrix") 71 | 72 | # include feature for number of words 73 | if (preprocessing$countWords) { 74 | ans_freq <- sapply(strsplit(ans, " "), length) 75 | matrix$dtm <- cbind(matrix$dtm, ans_freq) 76 | } 77 | 78 | # estimate model 79 | fit <- glmnet::glmnet(matrix$dtm, outcome, family="multinomial", alpha = tuning$alpha, maxit = tuning$maxit, nlambda = tuning$nlambda, thresh = tuning$thresh) 80 | 81 | # and save preprocessing 82 | fit$vect.vocab <- matrix$vect.vocab 83 | fit$preprocessing <- preprocessing 84 | fit$seldom.codes <- seldom.codes 85 | return(fit) 86 | } 87 | -------------------------------------------------------------------------------- /data/coding_index_excerpt.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malsch/occupationCoding/940a6f6117f264e093246ad1868ec9bde45ba40c/data/coding_index_excerpt.RData -------------------------------------------------------------------------------- /data/frequent_phrases.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malsch/occupationCoding/940a6f6117f264e093246ad1868ec9bde45ba40c/data/frequent_phrases.RData -------------------------------------------------------------------------------- /data/kldb2010PlusFive.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malsch/occupationCoding/940a6f6117f264e093246ad1868ec9bde45ba40c/data/kldb2010PlusFive.RData -------------------------------------------------------------------------------- /data/occupations.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malsch/occupationCoding/940a6f6117f264e093246ad1868ec9bde45ba40c/data/occupations.RData -------------------------------------------------------------------------------- /data/surveyCountsSubstringSimilarity.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malsch/occupationCoding/940a6f6117f264e093246ad1868ec9bde45ba40c/data/surveyCountsSubstringSimilarity.RData -------------------------------------------------------------------------------- /data/surveyCountsWordwiseSimilarity.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malsch/occupationCoding/940a6f6117f264e093246ad1868ec9bde45ba40c/data/surveyCountsWordwiseSimilarity.RData -------------------------------------------------------------------------------- /man/accuracy.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/accuracy.R 3 | \name{accuracy} 4 | \alias{accuracy} 5 | \title{Accuracy} 6 | \usage{ 7 | accuracy(occupationalPredictionsAmongTopK, n) 8 | } 9 | \arguments{ 10 | \item{occupationalPredictionsAmongTopK}{a data table created with 
\code{\link{calcAccurateAmongTopK}}.} 11 | 12 | \item{n}{Number of unique persons in test data (may be larger than the number of persons in occupationalPredictionsAmongTopK if for some persons no codes were suggested)} 13 | } 14 | \value{ 15 | a data.table with columns N (number of unique IDs in occupationalPredictionsAmongTopK), acc (\code{sum(acc) / n}), se (standard error of acc), mean.in.subset.N (\code{mean(acc)}), and count.pred.prob0 (\code{sum(pred.prob == 0)}) 16 | } 17 | \description{ 18 | Calculate accuracy \eqn{p = \frac{1}{n} \sum acc} and standard errors \eqn{\sqrt{\frac{1}{n} * p * (1-p)}}. 19 | } 20 | \details{ 21 | Note that this function also works if \code{occupationalPredictionsAmongTopK} contains less than \code{n} individuals. 22 | } 23 | \examples{ 24 | # set up data 25 | data(occupations) 26 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 27 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 28 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 29 | "Not precise enough for coding", "Student assistants") 30 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 31 | 32 | ## split sample 33 | set.seed(3451345) 34 | n.test <- 50 35 | group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 36 | splitted.data <- split(proc.occupations, group) 37 | 38 | # train model and make predictions 39 | model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 40 | res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 41 | 42 | # expand to contain more categories than the initial ones 43 | res.proc1 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1") 44 | 45 | # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once. 
46 | res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2") 47 | res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete") 48 | 49 | calcAccurateAmongTopK(res.proc, k = 5)[,mean(acc), by = method.name] 50 | accuracy(calcAccurateAmongTopK(res.proc, k = 5), n = 50) 51 | accuracy(calcAccurateAmongTopK(res.proc, k = 1), n = 50) 52 | } 53 | -------------------------------------------------------------------------------- /man/asDocumentTermMatrix.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/asDocumentTermMatrix.R 3 | \name{asDocumentTermMatrix} 4 | \alias{asDocumentTermMatrix} 5 | \title{Document-Term Matrix} 6 | \usage{ 7 | asDocumentTermMatrix( 8 | input, 9 | vect.vocab = NULL, 10 | stopwords = character(0), 11 | stemming = NULL, 12 | type = c("dgCMatrix", "dgTMatrix", "lda_c") 13 | ) 14 | } 15 | \arguments{ 16 | \item{input}{a character vector.} 17 | 18 | \item{vect.vocab}{a vocabulary created with \code{\link[text2vec:vectorizers]{vocab_vectorizer}}. If \code{NULL}, the vocabulary is created from the input. See example for a typical use case.} 19 | 20 | \item{stopwords}{character vector of stopwords to exclude when creating the vocabulary. \code{tm::stopwords("de")} provides German stopwords.} 21 | 22 | \item{stemming}{\code{NULL} for no stemming and \code{"de"} for stemming using the German porter stemmer.} 23 | 24 | \item{type}{character, one of c("dgCMatrix", "dgTMatrix", "lda_c") taken from \code{\link[text2vec]{create_dtm}}. \code{dgCMatrix} are useful for glmnet; \code{dgTMatrix} matrix refers to sparse matrices in triplet form, i.e. positions of all non-zero values are stored (easier to work with, but non-unique).} 25 | } 26 | \value{ 27 | A list with two elements 28 | \describe{ 29 | \item{dtm}{a sparse document-term-matrix, depending on the \code{type}-parameter} 30 | \item{vect.vocab}{a vocabulary that can be inserted as \code{vect.vocab} to build a document term matrix on new data with the same vocabulary.} 31 | } 32 | } 33 | \description{ 34 | Constructs a document-term matrix. 35 | } 36 | \examples{ 37 | x <- c("Verkauf von Schreibwaren", "Verkauf", "Schreibwaren") 38 | asDocumentTermMatrix(x) 39 | asDocumentTermMatrix(x, type = "dgTMatrix") 40 | asDocumentTermMatrix(x, stopwords = tm::stopwords("de")) 41 | 42 | (x <- c("Verkauf von B\xfcchern, Schreibwaren", "Fach\xe4rzin f\xfcr Kinder- und Jugendmedizin im \xf6ffentlichen Gesundheitswesen", "Industriemechaniker", "Dipl.-Ing. 
- Agrarwirtschaft (Landwirtschaft)")) 43 | x <- stringPreprocessing(x) 44 | dtm <- asDocumentTermMatrix(x, stemming = "de") 45 | print(dtm$dtm) 46 | dimnames(dtm$dtm)[[2]] 47 | 48 | # use the newly created vocab_vectorizer 49 | (x <- stringPreprocessing(c("WILL NOT SHOW UP", "Verkauf von B\xfcchern, Schreibwaren", "Fach\xe4rzin f\xfcr Kinder- und Jugendmedizin"))) 50 | asDocumentTermMatrix(x, vect.vocab = dtm$vect.vocab, stopwords = character(0), stemming = "de")$dtm 51 | } 52 | \seealso{ 53 | \url{http://text2vec.org/vectorization.html} for details on the implementation used here; 54 | \code{\link[tm]{TermDocumentMatrix}} is a slower alternative implementation 55 | } 56 | -------------------------------------------------------------------------------- /man/calcAccurateAmongTopK.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/calcAccurateAmongTopK.R 3 | \name{calcAccurateAmongTopK} 4 | \alias{calcAccurateAmongTopK} 5 | \title{Calculate aggregate properties for top k predicted categories} 6 | \usage{ 7 | calcAccurateAmongTopK(occupationalPredictions, k = 1) 8 | } 9 | \arguments{ 10 | \item{occupationalPredictions}{a data.table created with the \code{\link{expandPredictionResults}} function from this package.} 11 | 12 | \item{k}{how many top k categories to aggregate over?} 13 | } 14 | \value{ 15 | a data.table 16 | } 17 | \description{ 18 | Start with a data.table of class 'occupationalPredictions' (one prediction for each combination of pred.code and answer) and calculate whether one of the top k entries is accurate. 19 | } 20 | \details{ 21 | \code{num.suggested} and \code{general.among.top5} are currently not used. They relate to situations where the prediction algorithm does not provide all codes.
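To illustrate the aggregation, here is a minimal sketch of the idea (toy data only; not the function's actual implementation, but the columns mimic the output of \code{\link{expandPredictionResults}}):
\preformatted{
library(data.table)
toy <- data.table(id = rep(1:2, each = 3),
                  code = rep(c("71402", "83112"), each = 3),  # true codes
                  pred.code = rep(c("71402", "83112", "63302"), 2),
                  pred.prob = c(0.6, 0.3, 0.1, 0.2, 0.5, 0.3))
k <- 2
# is the true code among the k most probable suggestions?
toy[order(-pred.prob), .(acc = any(head(pred.code, k) == code[1])), by = id]
}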
22 | } 23 | \examples{ 24 | # set up data 25 | data(occupations) 26 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 27 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 28 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 29 | "Not precise enough for coding", "Student assistants") 30 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 31 | 32 | ## split sample 33 | set.seed(3451345) 34 | n.test <- 50 35 | group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 36 | splitted.data <- split(proc.occupations, group) 37 | 38 | # train model and make predictions 39 | model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 40 | res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 41 | 42 | # expand to contain more categories than the initial ones 43 | res.proc1 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1") 44 | 45 | # aggregate over top k categories 46 | calcAccurateAmongTopK(res.proc1, k = 1)[,mean(acc)] 47 | calcAccurateAmongTopK(res.proc1, k = 5)[,mean(acc)] 48 | 49 | # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once. 50 | res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2") 51 | res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete") 52 | 53 | calcAccurateAmongTopK(res.proc, k = 5)[,mean(acc), by = method.name] 54 | # res[, calcAccurateAmongTopK(.SD, k = 5), by = method][,mean(acc), by = method] 55 | } 56 | \seealso{ 57 | ... 58 | } 59 | -------------------------------------------------------------------------------- /man/coding_index_excerpt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \encoding{UTF-8} 5 | \name{coding_index_excerpt} 6 | \alias{coding_index_excerpt} 7 | \title{An excerpt from the Gesamtberufsliste der BA} 8 | \format{ 9 | A data frame with 90 rows and 5 variables: 10 | \describe{ 11 | \item{Berufsbenennungen}{neutral short title} 12 | \item{bezMale}{male long title} 13 | \item{bezFemale}{female long title} 14 | \item{Code}{Code from the German Classification of Occupations (KldB 2010)} 15 | \item{count_categories}{An indicator of job title ambiguity. 
Only used within function \code{\link{predictWithCodingIndex}}} 16 | } 17 | } 18 | \source{ 19 | Bundesagentur für Arbeit (2019). Gesamtberufsliste der Bundesagentur für Arbeit. Stand: 03.01.2019. The \code{Gesamtberufsliste der BA} is available at \url{https://download-portal.arbeitsagentur.de/files/}. 20 | 21 | The function \code{\link{prepare_German_coding_index_Gesamtberufsliste_der_BA}} was used to process the downloaded \code{.xlsx}-file and prepare this coding index. The resulting coding index has 27853 rows. We selected 90 rows that are related to the \code{\link{occupations}} dataset. 22 | } 23 | \usage{ 24 | coding_index_excerpt 25 | } 26 | \description{ 27 | 90 selected job titles from the \code{Gesamtberufsliste der BA}. This dataset is not complete and is only used to demonstrate the functionality of the package. We recommend that users build their own coding index. \code{\link{prepare_German_coding_index_Gesamtberufsliste_der_BA}} can be used to build a German coding index. 28 | } 29 | \keyword{datasets} 30 | -------------------------------------------------------------------------------- /man/cosineSimilarity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cosineSimilarity.R 3 | \name{cosineSimilarity} 4 | \alias{cosineSimilarity} 5 | \title{Cosine Similarity} 6 | \usage{ 7 | cosineSimilarity(matrix1, matrix2) 8 | } 9 | \arguments{ 10 | \item{matrix1}{a matrix of type \code{dgCMatrix}.} 11 | 12 | \item{matrix2}{a matrix of type \code{dgCMatrix}.} 13 | } 14 | \value{ 15 | A \code{dgCMatrix} where element \code{A[index1, index2]} is the cosine similarity between \code{matrix1[index1,]} and \code{matrix2[index2,]}. 16 | } 17 | \description{ 18 | Calculate cosine similarity between every row in \code{matrix1} and every row in \code{matrix2}. 19 | } 20 | \details{ 21 | Cosine similarity between two vectors \eqn{x} and \eqn{y} is the cosine of the angle between them. Since we consider vectors with non-negative entries, its maximal value is 1, reached whenever both vectors point in the same direction (e.g. if they are identical), and its minimal value is 0 if the vectors are orthogonal, i.e. \eqn{x \times y = 0}. 22 | 23 | The definition is: \eqn{similarity = (x \times y) / (||x|| \times ||y||) = (\sum_i x_i \times y_i) / (\sqrt{(\sum_i x_i^2)} \times \sqrt{(\sum_i y_i^2)})} 24 | } 25 | \examples{ 26 | x <- c("Verkauf von Schreibwaren", "Verkauf", "Schreibwaren", "Industriemechaniker", "NOTINDOCUMENTTERMMATRIX") 27 | (y <- c("Verkauf von B\xfcchern, Schreibwaren", "Fach\xe4rzin f\xfcr Kinder- und Jugendmedizin im \xf6ffentlichen Gesundheitswesen", "Industriemechaniker", "Dipl.-Ing.
- Agrarwirtschaft (Landwirtschaft)")) 28 | 29 | tok_fun = text2vec::word_tokenizer 30 | it_train = text2vec::itoken(tolower(y), tokenizer = tok_fun, progressbar = FALSE) 31 | vocab = text2vec::create_vocabulary(it_train) 32 | vect.vocab = text2vec::vocab_vectorizer(vocab) 33 | 34 | matrix1 <- asDocumentTermMatrix(x, vect.vocab = vect.vocab)$dtm 35 | matrix2 <- asDocumentTermMatrix(y, vect.vocab = vect.vocab)$dtm 36 | 37 | cosineSimilarity(matrix1, matrix1) 38 | cosineSimilarity(matrix1, matrix2) 39 | } 40 | \seealso{ 41 | \code{\link[Matrix]{Matrix}} 42 | } 43 | -------------------------------------------------------------------------------- /man/createDescriptives.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/createDescriptives.R 3 | \name{createDescriptives} 4 | \alias{createDescriptives} 5 | \title{Describe occupational data} 6 | \usage{ 7 | createDescriptives(data, noninformative = c("")) 8 | } 9 | \arguments{ 10 | \item{data}{either a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}} or a character vector} 11 | 12 | \item{noninformative}{a character vector: if an answer is in \code{noninformative}, it is excluded} 13 | } 14 | \value{ 15 | Side effects only: a plot and text 16 | } 17 | \description{ 18 | Outputs the following descriptives: frequencies; mean, median, and maximum of the number of words; the number of codes used; and frequencies of special codes. 19 | } 20 | \examples{ 21 | data <- data.table(answers = c("LEITER VERTRIEB", "Kfz-Schlossermeister", "Aushilfe im Hotel(Bereich Housekeeping)", "Studentische Hilfskraft"), 22 | codes = c("61194", "25213", "63221", "-0001")) 23 | (allowed.codes <- c("11101", "61194", "25213", "63221", "-0001")) 24 | (allowed.codes.titles <- c("Berufe in der Landwirtschaft (ohne Spezialisierung) - Helfer-/Anlernt\xe4tigkeiten", "Berufe in der Kraftfahrzeugtechnik - komplexe Spezialistent\xe4tigkeiten", "F\xfchrungskräfte - Einkauf und Vertrieb", "Berufe im Hotelservice - Helfer-/Anlernt\xe4tigkeiten", "a negative label (used for categories like 'student assistant' that are not in the official classification)")) 25 | data <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(data, colNames = c("answers", "codes"), allowed.codes, allowed.codes.titles) 26 | createDescriptives(data) 27 | 28 | (answer <- c("LEITER VERTRIEB", "Kfz-Schlossermeister", "Aushilfe im Hotel(Bereich Housekeeping)")) 29 | createDescriptives(answer) 30 | 31 | data(occupations) 32 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 33 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", 34 | "Gastronomy occupations (without specialisation)-skilled tasks", "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 35 | "Not precise enough for coding", "Student assistants") 36 | occupations <-
removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 37 | createDescriptives(occupations) 38 | } 39 | -------------------------------------------------------------------------------- /man/createSimilarityTableStringdist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/createSimilarityTableStringdist.R 3 | \name{createSimilarityTableStringdist} 4 | \alias{createSimilarityTableStringdist} 5 | \title{Similarity Table with Coding index} 6 | \usage{ 7 | createSimilarityTableStringdist( 8 | unique.string, 9 | coding_index_w_codes, 10 | coding_index_without_codes, 11 | dist.control = list(method = "osa", weight = c(d = 1, i = 1, s = 1, t = 1)), 12 | threshold = 3 13 | ) 14 | } 15 | \arguments{ 16 | \item{unique.string}{a character vector (usually unique(answer))} 17 | 18 | \item{coding_index_w_codes}{a data.table with columns "title" and "Code".} 19 | 20 | \item{coding_index_without_codes}{a character vector of additional titles} 21 | 22 | \item{dist.control}{a list that will be passed to \code{\link[stringdist:stringdist]{stringdistmatrix}}. Currently only two elements are implemented: 23 | \describe{ 24 | \item{method}{Method for distance calculation.} 25 | \item{weight}{For method='osa' or 'dl'.} 26 | }} 27 | 28 | \item{threshold}{All entries with distance above this threshold will be removed from the result} 29 | } 30 | \value{ 31 | a list with elements 32 | \describe{ 33 | \item{dist_table_w_code}{a data.table with columns \code{intString}, \code{dictString.title}, \code{dictString.Code}, \code{dist}} 34 | \item{dist_table_without_code}{\code{NULL} or a data.table with columns \code{intString}, \code{dictString}, \code{dist}} 35 | \item{vect_vocab}{see \code{\link{asDocumentTermMatrix}}} 36 | } 37 | } 38 | \description{ 39 | Calculate string similarity between \code{unique.string} and \code{(coding_index_w_codes, coding_index_without_codes)}. 40 | } 41 | \details{ 42 | Special function for similarity-based reasoning: creates distance data using the osa method with weights c(d = 1, i = 1, s = 1, t = 1). 43 | A distance of 0 means that the strings in the dictionary and in the data are identical. 44 | } 45 | \examples{ 46 | ## Prepare coding index 47 | # write female titles beneath the male title 48 | coding_index <- rbind(coding_index_excerpt[, list(title = bezMale, Code)], 49 | coding_index_excerpt[, list(title = bezFemale, Code)]) 50 | # standardize titles from the coding index 51 | coding_index <- coding_index[,title := stringPreprocessing(title)] 52 | # drop duplicate lines; this might be suboptimal because we keep each title and its associated code only a single time, i.e. we delete duplicates together with their possibly relevant codes. 53 | coding_index <- coding_index[!duplicated(title)] 54 | 55 | (x <- c("Abgeordneter", "Abgeordneter", "Abgeordnete", "abgeordnet", "FSJ", "FSJ2", "Industriemechaniker", "Dipl.-Ing.
- Agrarwirtschaft (Landwirtschaft)")) 56 | createSimilarityTableStringdist(unique.string = stringPreprocessing(x), 57 | coding_index_w_codes = coding_index, 58 | coding_index_without_codes = frequent_phrases, 59 | dist.control = list(method = "osa", weight = c(d = 1, i = 1, s = 1, t = 1)), 60 | threshold = 3) 61 | } 62 | \seealso{ 63 | \code{\link{trainSimilarityBasedReasoning}}, \code{\link{createSimilarityTableWordwiseStringdist}}, \code{\link{createSimilarityTableSubstring}} 64 | } 65 | -------------------------------------------------------------------------------- /man/createSimilarityTableSubstring.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/createSimilarityTableSubstring.R 3 | \name{createSimilarityTableSubstring} 4 | \alias{createSimilarityTableSubstring} 5 | \title{Similarity Table with Coding index} 6 | \usage{ 7 | createSimilarityTableSubstring( 8 | unique.string, 9 | coding_index_w_codes, 10 | coding_index_without_codes 11 | ) 12 | } 13 | \arguments{ 14 | \item{unique.string}{a character vector (usually unique(answer))} 15 | 16 | \item{coding_index_w_codes}{a data.table with columns "title" and "Code".} 17 | 18 | \item{coding_index_without_codes}{a character vector of additional titles} 19 | } 20 | \value{ 21 | a list with elements 22 | \describe{ 23 | \item{dist_table_w_code}{a data.table with columns \code{intString}, \code{dictString.title}, \code{dictString.Code}, \code{dist}} 24 | \item{dist_table_without_code}{\code{NULL} or a data.table with columns \code{intString}, \code{dictString}, \code{dist}} 25 | \item{vect_vocab}{see \code{\link{asDocumentTermMatrix}}} 26 | } 27 | } 28 | \description{ 29 | Calculate SUBSTRING similarity between \code{unique.string} and \code{(coding_index_w_codes, coding_index_without_codes)}. A string in \code{unique.string} and a coding index entry are considered similar if the coding index entry is a substring of the string. 30 | } 31 | \details{ 32 | Special function for similarity-based reasoning: creates distance data using the substring method. 33 | } 34 | \examples{ 35 | ## Prepare coding index 36 | # write female titles beneath the male title 37 | coding_index <- rbind(coding_index_excerpt[, list(title = bezMale, Code)], 38 | coding_index_excerpt[, list(title = bezFemale, Code)]) 39 | # standardize titles from the coding index 40 | coding_index <- coding_index[,title := stringPreprocessing(title)] 41 | # drop duplicate lines; this might be suboptimal because we keep each title and its associated code only a single time, i.e. we delete duplicates together with their possibly relevant codes. 42 | coding_index <- coding_index[!duplicated(title)] 43 | 44 | (x <- c("Abgeordneter", "Abgeordneter", "Abgeordnete", "abgeordnet", "FSJ", "FSJ2", "Industriemechaniker", "Dipl.-Ing.
- Agrarwirtschaft (Landwirtschaft)")) 45 | createSimilarityTableSubstring(unique.string = stringPreprocessing(x), 46 | coding_index_w_codes = coding_index, 47 | coding_index_without_codes = frequent_phrases) 48 | } 49 | \seealso{ 50 | \code{\link{trainSimilarityBasedReasoning}}, \code{\link{createSimilarityTableWordwiseStringdist}}, \code{\link{createSimilarityTableStringdist}} 51 | } 52 | -------------------------------------------------------------------------------- /man/createSimilarityTableWordwiseStringdist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/createSimilarityTableWordwiseStringdist.R 3 | \name{createSimilarityTableWordwiseStringdist} 4 | \alias{createSimilarityTableWordwiseStringdist} 5 | \title{Wordwise Similarity Table with Coding index} 6 | \usage{ 7 | createSimilarityTableWordwiseStringdist( 8 | unique.string, 9 | coding_index_w_codes, 10 | coding_index_without_codes, 11 | preprocessing, 12 | dist.control = list(method = "osa", weight = c(d = 1, i = 1, s = 1, t = 1)), 13 | threshold = 1 14 | ) 15 | } 16 | \arguments{ 17 | \item{unique.string}{a character vector (usually unique(answer))} 18 | 19 | \item{coding_index_w_codes}{a data.table with columns "title" and "Code".} 20 | 21 | \item{coding_index_without_codes}{a character vector of additional titles} 22 | 23 | \item{preprocessing}{a list with elements 24 | \describe{ 25 | \item{stopwords}{a character vector, use \code{tm::stopwords("de")} for German stopwords.} 26 | \item{stemming}{\code{NULL} for no stemming and \code{"de"} for stemming using the German porter stemmer.} 27 | \item{strPreprocessing}{\code{TRUE} if \code{\link{stringPreprocessing}} shall be used.} 28 | \item{removePunct}{\code{TRUE} if \code{\link[tm]{removePunctuation}} shall be used.} 29 | }} 30 | 31 | \item{dist.control}{a list that will be passed to \code{\link[stringdist:stringdist]{stringdistmatrix}}. Currently only two elements are implemented: 32 | \describe{ 33 | \item{method}{Method for distance calculation.} 34 | \item{weight}{For method='osa' or 'dl'.} 35 | }} 36 | 37 | \item{threshold}{All entries with distance above this threshold will be removed from the result} 38 | } 39 | \value{ 40 | a list with elements 41 | \describe{ 42 | \item{dist_table_w_code}{a data.table with columns \code{intString}, \code{dictString.title}, \code{dictString.Code}, \code{dist}} 43 | \item{dist_table_without_code}{\code{NULL} or a data.table with columns \code{intString}, \code{dictString}, \code{dist}} 44 | \item{vect_vocab}{see \code{\link{asDocumentTermMatrix}}} 45 | } 46 | } 47 | \description{ 48 | Calculate string similarity on a word basis between \code{unique.string} and \code{(coding_index_w_codes, coding_index_without_codes)}. 49 | } 50 | \details{ 51 | Special function for similarity-based reasoning: creates WORDWISE distance data using the osa method with weights c(d = 1, i = 1, s = 1, t = 1). 52 | This allows correcting one letter in a single word and matching that word with the dictionary. 53 | That is, unique.string is split into words, and the word with the lowest osa-distance to a dictionary entry is used for matching (all such words in case of a tie). 54 | Example: 55 | "KUESTER and HAUSMEISTER" has distance 0 to both dictString.title HAUSMEISTER and KUESTER. Because the word HAUSMEISTER has minimal distance, another dictString.title HAUMEISTER, which has dist = 1, is not included.
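The flavor of this wordwise computation can be sketched with \code{stringdist} directly (an illustration with assumed toy inputs, not the package's exact code):
\preformatted{
words <- strsplit("KUESTER AND HAUSMEISTER", " ")[[1]]  # answer split into words
dict <- c("HAUSMEISTER", "KUESTER", "HAUMEISTER")       # dictionary titles
stringdist::stringdistmatrix(words, dict, method = "osa",
                             weight = c(d = 1, i = 1, s = 1, t = 1))
# the word with the minimal distance determines which dictionary titles are kept
}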
56 | } 57 | \examples{ 58 | ## Prepare coding index 59 | # write female titles beneath the male title 60 | coding_index <- rbind(coding_index_excerpt[, list(title = bezMale, Code)], 61 | coding_index_excerpt[, list(title = bezFemale, Code)]) 62 | # standardize titles from the coding index 63 | coding_index <- coding_index[,title := stringPreprocessing(title)] 64 | # drop duplicate lines; this might be suboptimal because we keep each title and its associated code only a single time, i.e. we delete duplicates together with their possibly relevant codes. 65 | coding_index <- coding_index[!duplicated(title)] 66 | 67 | (x <- c("Abgeordneter", "Abgeordneter", "Abgeordnete", "abgeordnet", "abgeordnet zu xxx", "FSJ", "FSJ2", "Industriemechaniker", "Dipl.-Ing. - Agrarwirtschaft (Landwirtschaft)")) 68 | createSimilarityTableWordwiseStringdist(unique.string = stringPreprocessing(x), 69 | coding_index_w_codes = coding_index, 70 | coding_index_without_codes = frequent_phrases, 71 | preprocessing = list(stopwords = tm::stopwords("de"), stemming = NULL, strPreprocessing = TRUE, removePunct = FALSE), 72 | dist.control = list(method = "osa", weight = c(d = 1, i = 1, s = 1, t = 1)), 73 | threshold = 1) 74 | } 75 | \seealso{ 76 | \code{\link{trainSimilarityBasedReasoning}}, \code{\link{createSimilarityTableStringdist}}, \code{\link{createSimilarityTableSubstring}} 77 | } 78 | -------------------------------------------------------------------------------- /man/expandPredictionResults.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/expandPredictionResults.R 3 | \name{expandPredictionResults} 4 | \alias{expandPredictionResults} 5 | \title{Expands predicted datasets to contain all allowed codes} 6 | \usage{ 7 | expandPredictionResults(occupationalPredictions, allowed.codes, method.name) 8 | } 9 | \arguments{ 10 | \item{allowed.codes}{a character vector of all allowed codes.} 11 | 12 | \item{method.name}{the name under which the method shall be reported.} 13 | 14 | \item{occupationalPredictions}{a data.table created with a \code{predict}-function from this package.} 15 | } 16 | \value{ 17 | a data.table 18 | } 19 | \description{ 20 | Start with a data.table of class 'occupationalPredictions' (one prediction for each combination of pred.code and answer) and expand it to contain all allowed codes. 21 | } 22 | \details{ 23 | The problem solved here is this: most algorithms do not provide codes for all categories of the classification, because this would require that those categories appear in the training data. This function expands the dataset and assigns some very small probability (or 0) to classification codes that the training algorithm found impossible to predict.
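A minimal sketch of the expansion idea (toy data with assumed names; the actual function additionally keeps ids, answers and the method name):
\preformatted{
pred <- data.table::data.table(pred.code = c("71402", "83112"),
                               pred.prob = c(0.7, 0.3))
allowed.codes <- c("71402", "83112", "63302", "-0004")
expanded <- merge(data.table::data.table(pred.code = allowed.codes),
                  pred, by = "pred.code", all.x = TRUE)
expanded[is.na(pred.prob), pred.prob := 0]  # codes never suggested get probability 0
}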
24 | } 25 | \examples{ 26 | # set up data 27 | data(occupations) 28 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 29 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 30 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 31 | "Not precise enough for coding", "Student assistants") 32 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 33 | 34 | ## split sample 35 | set.seed(3451345) 36 | n.test <- 50 37 | group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 38 | splitted.data <- split(proc.occupations, group) 39 | 40 | # train model and make predictions 41 | model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 42 | res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 43 | 44 | expandPredictionResults(res, allowed.codes, method.name = "Logistic Regression") 45 | } 46 | \seealso{ 47 | \code{\link{produceResults}} 48 | } 49 | -------------------------------------------------------------------------------- /man/frequent_phrases.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \encoding{UTF-8} 5 | \name{frequent_phrases} 6 | \alias{frequent_phrases} 7 | \title{Some job titles and job descriptions} 8 | \format{ 9 | A character vector with 701 elements 10 | } 11 | \usage{ 12 | frequent_phrases 13 | } 14 | \description{ 15 | There exist some job titles/descriptions that are not part of the \code{Gesamtberufsliste_der_BA} (see \code{\link{prepare_German_coding_index_Gesamtberufsliste_der_BA}}), possibly because they are too general or because they are misspelled. They are still common answers that should not be ignored. We use these answers within the function \code{\link{trainSimilarityBasedReasoning}}. 16 | } 17 | \details{ 18 | As a general rule, we included some of the most frequent verbal answers (after \code{\link{stringPreprocessing}}) if they have more than three characters and contain no spaces. Creating predictions for these answers would be impossible if only the job titles from the Gesamtberufsliste were used.
19 | } 20 | \keyword{datasets} 21 | -------------------------------------------------------------------------------- /man/kldb2010PlusFive.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \encoding{UTF-8} 5 | \name{kldb2010PlusFive} 6 | \alias{kldb2010PlusFive} 7 | \title{2010 German Classification of Occupations (KldB 2010)} 8 | \format{ 9 | A data frame with 2 variables: 10 | \describe{ 11 | \item{code}{5-digit number (Berufsgattung)} 12 | \item{title}{Category label} 13 | } 14 | } 15 | \source{ 16 | Federal Employment Agency (2011). Klassifikation der Berufe 2010, Bundesagentur für Arbeit, Nuremberg. 17 | } 18 | \usage{ 19 | kldb2010PlusFive 20 | } 21 | \description{ 22 | This file contains the labels for all 1286 five-digit categories (Berufsgattungen) from the 2010 German Classification of Occupations. 23 | } 24 | \details{ 25 | Five additional categories were added for coding purposes: -0030 = Studentische Hilfskraft, -0004 = Berufsbeschreibung zu unpräzise/nicht kodierbar, -0006 = Mehrere Jobs, -0012 = Arbeiter/in (nicht näher spezifiziert), -0019 = Freiwilligendienst/FSJ/Zivildienst. 26 | } 27 | \keyword{datasets} 28 | -------------------------------------------------------------------------------- /man/logLoss.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/logLoss.R 3 | \name{logLoss} 4 | \alias{logLoss} 5 | \title{Log loss} 6 | \usage{ 7 | logLoss(occupationalPredictions) 8 | } 9 | \arguments{ 10 | \item{occupationalPredictions}{a data.table created with the \code{\link{expandPredictionResults}} function from this package.} 11 | } 12 | \value{ 13 | a data.table 14 | } 15 | \description{ 16 | Calculate log loss \eqn{\log_2 loss = \frac{1}{N} \sum_n \log_2 loss_n} and standard error \eqn{\sqrt{\frac{1}{N(N-1)} \sum_n (\log_2 loss_n - \log_2 loss)^2}} with \eqn{\log_2 loss_n = \sum_k -y_{nk} \log_2 p_{nk}} 17 | } 18 | \details{ 19 | The log loss is the average negative \eqn{\log_2}-probability assigned to the categories that actually realized; smaller values are better.
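For intuition, the per-observation loss is simply the negative \eqn{\log_2}-probability that was predicted for the category that realized (hand-computed toy numbers, not package code):
\preformatted{
p <- c(0.9, 0.25, 0.5)  # predicted probabilities of the realized codes
-log2(p)                # per-observation losses: 0.152, 2, 1
mean(-log2(p))          # log2 loss, approx. 1.05
}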
20 | } 21 | \examples{ 22 | # set up data 23 | data(occupations) 24 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 25 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 26 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 27 | "Not precise enough for coding", "Student assistants") 28 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 29 | 30 | ## split sample 31 | set.seed(3451345) 32 | n.test <- 50 33 | group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 34 | splitted.data <- split(proc.occupations, group) 35 | 36 | # train model and make predictions 37 | model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 38 | res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 39 | 40 | # expand to contain more categories than the initial ones 41 | res.proc1 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1") 42 | 43 | # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once. 44 | res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2") 45 | res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete") 46 | 47 | logLoss(res.proc) 48 | } 49 | \seealso{ 50 | \code{\link{sharpness}} 51 | } 52 | -------------------------------------------------------------------------------- /man/occupations.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \encoding{UTF-8} 5 | \name{occupations} 6 | \alias{occupations} 7 | \title{A selection of 250 coded occupational answers} 8 | \format{ 9 | A data frame with 250 rows and 3 variables: 10 | \describe{ 11 | \item{orig_answer}{verbal answer given to the first question about occupation} 12 | \item{orig_answer2}{verbal answer given to the second question about occupation; not used much in this package, but it illustrates that this and several other variables are used by human coders} 13 | \item{orig_code}{5-digit codes from the German Classification of Occupations. Negative codes are not part of the official classification but were used for coding.
-0030 refers to student assistants, -0004 means that the job description was not precise enough for coding.} 14 | } 15 | } 16 | \source{ 17 | The following codes were hand-selected and double-checked to ensure anonymity: 71402 (50 cases), 71403 (20 cases), 63302 (20 cases), 83112 (25 cases), 83124 (25 cases), 83131 (20 cases), 83132 (20 cases), 83193 (15 cases), 83194 (5 cases), -0004 (25 cases), -0030 (25 cases) 18 | 19 | Antoni, M., Drasch, K., Kleinert, C., Matthes, B., Ruland, M. and Trahms, A. (2010). Arbeiten und Lernen im Wandel * Teil 1: Überblick über die Studie, FDZ-Methodenreport 05/2010, Forschungsdatenzentrum der Bundesagentur für Arbeit im Institut für Arbeitsmarkt- und Berufsforschung, Nuremberg. 20 | 21 | Drasch, K., Matthes, B., Munz, M., Paulus, W. and Valentin, M.-A. (2012). Arbeiten und Lernen im Wandel * Teil V: Die Codierung der offenen Angaben zur beruflichen Tätigkeit, Ausbildung und Branche, FDZ-Methodenreport 04/2012, Forschungsdatenzentrum der Bundesagentur für Arbeit im Institut für Arbeitsmarkt- und Berufsforschung, Nuremberg. 22 | } 23 | \usage{ 24 | occupations 25 | } 26 | \description{ 27 | A dataset containing 250 selected verbal answers and their associated codes from the 2010 German Classification of Occupations. This dataset is not meant to be representative, but is only used to demonstrate the functionality of the package. We anticipate that accurate occupation coding will be more difficult with real data compared to this toy example. 28 | } 29 | \keyword{datasets} 30 | -------------------------------------------------------------------------------- /man/plotAgreementRateVsProductionRate.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plotAgreementRateVsProductionRate.R 3 | \name{plotAgreementRateVsProductionRate} 4 | \alias{plotAgreementRateVsProductionRate} 5 | \title{Plot agreement rate vs. production rate} 6 | \usage{ 7 | plotAgreementRateVsProductionRate( 8 | occupationalPredictionsAmongTopK, 9 | n, 10 | yintercept, 11 | filename = NULL 12 | ) 13 | } 14 | \arguments{ 15 | \item{occupationalPredictionsAmongTopK}{a data table created with \code{\link{calcAccurateAmongTopK}}.} 16 | 17 | \item{n}{Number of unique persons in test data (may be larger than the number of persons in occupationalPredictionsAmongTopK if for some persons no codes were suggested)} 18 | 19 | \item{yintercept}{Where to draw a horizontal line?} 20 | 21 | \item{filename}{If a \code{filename} is specified the diagram will be saved under this name.} 22 | } 23 | \value{ 24 | a ggplot 25 | } 26 | \description{ 27 | Plot the proportion of answers coded correctly for any given production rate (the proportion of \code{n} that is coded automatically).
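The curve is constructed roughly as follows (a sketch under assumed names: \code{dt} stands for the output of \code{\link{calcAccurateAmongTopK}} with columns \code{pred.prob} and \code{acc}; this is not the function's exact code):
\preformatted{
dt <- dt[order(-pred.prob)]               # most confident predictions first
dt[, production.rate := .I / n]           # share of answers coded automatically
dt[, agreement.rate := cumsum(acc) / .I]  # share of those coded correctly
}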
28 | } 29 | \examples{ 30 | # set up data 31 | data(occupations) 32 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 33 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 34 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 35 | "Not precise enough for coding", "Student assistants") 36 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 37 | 38 | ## split sample 39 | set.seed(3451345) 40 | n.test <- 50 41 | group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 42 | splitted.data <- split(proc.occupations, group) 43 | 44 | # train model and make predictions 45 | model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 46 | res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 47 | 48 | # expand to contain more categories than the initial ones 49 | res.proc1 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1") 50 | 51 | # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once. 
52 | res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2") 53 | res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete") 54 | 55 | calcAccurateAmongTopK(res.proc, k = 5)[,mean(acc), by = method.name] 56 | plotAgreementRateVsProductionRate(calcAccurateAmongTopK(res.proc, k = 5), n = n.test, yintercept = 0.85) 57 | plotAgreementRateVsProductionRate(calcAccurateAmongTopK(res.proc, k = 1), n = n.test, yintercept = 0.85, filename = "test.pdf") 58 | } 59 | \seealso{ 60 | \code{\link{plotTruePredictionsVsFalsePredictions}}, \code{\link{calcAccurateAmongTopK}} 61 | } 62 | -------------------------------------------------------------------------------- /man/plotReliabilityDiagram.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plotReliabilityDiagram.R 3 | \name{plotReliabilityDiagram} 4 | \alias{plotReliabilityDiagram} 5 | \title{Reliability Diagram} 6 | \usage{ 7 | plotReliabilityDiagram(occupationalPredictions, k, num.codes, filename = NULL) 8 | } 9 | \arguments{ 10 | \item{k}{how many top k categories to aggregate over?} 11 | 12 | \item{num.codes}{Number of allowed categories in the classification} 13 | 14 | \item{filename}{If a \code{filename} is specified the diagram will be saved under this name.} 15 | 16 | \item{occupationalPredictions}{a data.table created with \code{\link{expandPredictionResults}} (see the examples below).} 17 | } 18 | \value{ 19 | a ggplot 20 | } 21 | \description{ 22 | Plots the observed relative frequency of correctness against the forecasted probability. 23 | } 24 | \examples{ 25 | # set up data 26 | data(occupations) 27 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 28 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 29 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 30 | "Not precise enough for coding", "Student assistants") 31 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 32 | 33 | ## split sample 34 | set.seed(3451345) 35 | n.test <- 50 36 | group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 37 | splitted.data <- split(proc.occupations, group) 38 | 39 | # train model and make predictions 40 | model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 41 | res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 42 | 43 | # expand to contain more categories than the initial ones 44 | res.proc1 <-
expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1") 45 | 46 | # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once. 47 | res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2") 48 | res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete") 49 | 50 | plotReliabilityDiagram(res.proc, k = 5, num.codes = length(allowed.codes) + 1) # + 1 because we introduced the code "12345" later 51 | plotReliabilityDiagram(res.proc, k = 1, num.codes = length(allowed.codes) + 1, filename = "test.pdf") 52 | } 53 | \seealso{ 54 | \code{\link{sharpness}} 55 | } 56 | -------------------------------------------------------------------------------- /man/plotTruePredictionsVsFalsePredictions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plotTruePredictionsVsFalsePredictions.R 3 | \name{plotTruePredictionsVsFalsePredictions} 4 | \alias{plotTruePredictionsVsFalsePredictions} 5 | \title{Plot true predictions versus false predictions} 6 | \usage{ 7 | plotTruePredictionsVsFalsePredictions( 8 | occupationalPredictionsAmongTopK, 9 | filename = NULL 10 | ) 11 | } 12 | \arguments{ 13 | \item{occupationalPredictionsAmongTopK}{a data table created with \code{\link{calcAccurateAmongTopK}}.} 14 | 15 | \item{filename}{If a \code{filename} is specified the diagram will be saved under this name.} 16 | } 17 | \value{ 18 | a ggplot 19 | } 20 | \description{ 21 | Show how many predictions would be correct as a function of how many would be incorrect.
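Conceptually (a sketch with assumed names: \code{dt} stands for the output of \code{\link{calcAccurateAmongTopK}} with columns \code{pred.prob} and \code{acc}; this is not the function's exact code):
\preformatted{
dt <- dt[order(-pred.prob)]              # most confident predictions first
dt[, true.predictions := cumsum(acc)]
dt[, false.predictions := cumsum(!acc)]
# the plot traces true.predictions against false.predictions
}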
22 | } 23 | \examples{ 24 | # set up data 25 | data(occupations) 26 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 27 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 28 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 29 | "Not precise enough for coding", "Student assistants") 30 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 31 | 32 | ## split sample 33 | set.seed(3451345) 34 | n.test <- 50 35 | group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 36 | splitted.data <- split(proc.occupations, group) 37 | 38 | # train model and make predictions 39 | model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 40 | res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 41 | 42 | # expand to contain more categories than the initial ones 43 | res.proc1 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1") 44 | 45 | # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once. 
46 | res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2") 47 | res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete") 48 | 49 | calcAccurateAmongTopK(res.proc, k = 5)[,mean(acc), by = method.name] 50 | plotTruePredictionsVsFalsePredictions(calcAccurateAmongTopK(res.proc, k = 5)) 51 | plotTruePredictionsVsFalsePredictions(calcAccurateAmongTopK(res.proc, k = 1), filename = "test.pdf") 52 | } 53 | \seealso{ 54 | \code{\link{plotAgreementRateVsProductionRate}}, \code{\link{calcAccurateAmongTopK}} 55 | } 56 | -------------------------------------------------------------------------------- /man/predictCreecysMemoryBasedReasoning.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/predictCreecysMemoryBasedReasoning.R 3 | \name{predictCreecysMemoryBasedReasoning} 4 | \alias{predictCreecysMemoryBasedReasoning} 5 | \title{Predict codes with Creecy's Memory-Based Reasoning model} 6 | \usage{ 7 | predictCreecysMemoryBasedReasoning( 8 | model, 9 | newdata, 10 | tuning = list(k.neighbors = 12, metric = c("SUM", "ERROR", "MAX")) 11 | ) 12 | } 13 | \arguments{ 14 | \item{model}{the output created from \code{\link{trainCreecysMemoryBasedReasoning}}} 15 | 16 | \item{newdata}{either a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}} or a character vector} 17 | 18 | \item{tuning}{a list with elements 19 | \describe{ 20 | \item{k.neighbors}{Number of nearest neighbors to use.} 21 | \item{metric}{\code{metric} determines how to calculate 'nearness'. Setting \code{metric = "MAX"} is not recommended. See Creecy et al. for their reasoning and testing of different metrics.} 22 | }} 23 | } 24 | \value{ 25 | a data.table that provides a confidence score for the most likely category. Unlike other prediction functions in this package, no probabilities for all categories are provided, which makes post-processing a bit more difficult. See examples. 26 | } 27 | \description{ 28 | The function does the same preprocessing as in \code{\link{trainCreecysMemoryBasedReasoning}} and predicts codes with a modified \code{k}-nearest-neighbor approach.
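Because only the top category is returned, the output needs to be converted by hand into the format that the analysis functions expect, e.g. (mirroring the conversion used in the examples below):
\preformatted{
resMem[, list(pred.code, pred.prob = confidence.score,
              acc = pred.code == code, method.name = "Creecy.Sum.12")]
}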
29 | } 30 | \examples{ 31 | # set up data 32 | data(occupations) 33 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 34 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 35 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 36 | "Not precise enough for coding", "Student assistants") 37 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 38 | 39 | ## split sample 40 | set.seed(3451345) 41 | n.test <- 50 42 | group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 43 | splitted.data <- split(proc.occupations, group) 44 | 45 | # train model and make predictions 46 | memModel <- trainCreecysMemoryBasedReasoning(splitted.data$training, 47 | preprocessing = list(stopwords = character(0), stemming = NULL, strPreprocessing = TRUE, removePunct = FALSE)) 48 | testi <- predictCreecysMemoryBasedReasoning(memModel, c("test", "HIWI", "Hilfswissenschaftler"), tuning = list(k.neighbors = 12, metric = c("SUM"))) 49 | testi 50 | 51 | resMem <- predictCreecysMemoryBasedReasoning(memModel, splitted.data$test, tuning = list(k.neighbors = 12, metric = c("SUM"))) 52 | 53 | # Analyze predictions 54 | accuracy(resMem[, list(pred.code, pred.prob = confidence.score, acc = pred.code == code, num.suggested.codes = 1, num.suggested = 1, general.among.top5 = pred.code == "-9999", method.name = "Creecy.Sum.12")], n = n.test) 55 | plotTruePredictionsVsFalsePredictions(resMem[, list(pred.code, pred.prob = confidence.score, acc = pred.code == code, num.suggested.codes = 1, num.suggested = 1, general.among.top5 = pred.code == "-9999", method.name = "Creecy.Sum.12")]) 56 | plotAgreementRateVsProductionRate(resMem[, list(pred.code, pred.prob = confidence.score, acc = pred.code == code, num.suggested.codes = 1, num.suggested = 1, general.among.top5 = pred.code == "-9999", method.name = "Creecy.Sum.12")], n = n.test, yintercept = 0.85) 57 | 58 | ####################################################### 59 | ## RUN A GRID SEARCH (takes some time) 60 | \donttest{ 61 | # create a grid of all tuning combinations to try 62 | model.grid <- data.table(expand.grid(stopwords = c(TRUE, FALSE), stemming = c(FALSE, "de"), metric = c("SUM", "ERROR"), k.neighbors = c(2, 10, 17))) 63 | 64 | # Do grid search 65 | for (i in 1:nrow(model.grid)) { 66 | 67 | res.model <- trainCreecysMemoryBasedReasoning(splitted.data$training, preprocessing = list(stopwords = if (model.grid[i, stopwords]) tm::stopwords("de") else character(0), 68 | stemming = if (model.grid[i, stemming == "de"]) "de" else NULL, 69 | strPreprocessing = TRUE, 70 | removePunct = FALSE)) 71 | 72 | res.proc <- predictCreecysMemoryBasedReasoning(res.model, splitted.data$test, 73 | tuning = list(k.neighbors = model.grid[i,
k.neighbors], 74 | metric = model.grid[i, metric])) 75 | 76 | ac <- accuracy(res.proc[, list(pred.code, pred.prob = confidence.score, acc = pred.code == code, num.suggested.codes = 1, num.suggested = 1, general.among.top5 = pred.code == "-9999", method.name = "Creecy.Sum.12")], n = nrow(splitted.data$test)) 77 | 78 | model.grid[i, acc := ac[, acc]] 79 | model.grid[i, acc.se := ac[, se]] 80 | model.grid[i, acc.N := ac[, N]] 81 | model.grid[i, acc.prob0 := ac[, count.pred.prob0]] 82 | } 83 | 84 | model.grid[order(metric, k.neighbors, stemming)] 85 | } 86 | } 87 | \seealso{ 88 | \code{\link{trainCreecysMemoryBasedReasoning}} 89 | 90 | Creecy, R. H., Masand, B. M., Smith, S. J., Waltz, D. L. (1992). Trading MIPS and Memory for Knowledge Engineering. Comm. ACM 35(8), pp. 48--65. 91 | } 92 | -------------------------------------------------------------------------------- /man/predictGweonsNearestNeighbor.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/predictGweonsNearestNeighbor.R 3 | \name{predictGweonsNearestNeighbor} 4 | \alias{predictGweonsNearestNeighbor} 5 | \title{Predict codes with Gweon's Nearest Neighbor Method} 6 | \usage{ 7 | predictGweonsNearestNeighbor( 8 | model, 9 | newdata, 10 | tuning = list(nearest.neighbors.multiplier = 0.1) 11 | ) 12 | } 13 | \arguments{ 14 | \item{model}{the output created from \code{\link{trainGweonsNearestNeighbor}}} 15 | 16 | \item{newdata}{either a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}} or a character vector} 17 | 18 | \item{tuning}{a list with element 19 | \describe{ 20 | \item{nearest.neighbors.multiplier}{defaults to 0.1. Gweon et al. (2017) show that 0.1 is a better choice than 0 but the exact value is a bit arbitrary.} 21 | }} 22 | } 23 | \value{ 24 | a data.table of class \code{occupationalPredictions} that contains predicted probabilities \code{pred.prob} for every combination of \code{ans} and \code{pred.code}. pred.code may not cover the full set of possible codes. If all predicted codes have probability 0, these predictions are removed and we instead insert \code{pred.code := "-9999"} with \code{pred.prob = 1/num.allowed.codes}. 25 | } 26 | \description{ 27 | The function does the same preprocessing as in \code{\link{trainGweonsNearestNeighbor}} and predicts codes with a modified 1-nearest-neighbor approach.
28 | } 29 | \examples{ 30 | # set up data 31 | data(occupations) 32 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 33 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 34 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 35 | "Not precise enough for coding", "Student assistants") 36 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 37 | 38 | ## split sample 39 | set.seed(3451345) 40 | n.test <- 50 41 | group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 42 | splitted.data <- split(proc.occupations, group) 43 | 44 | # train model and make predictions 45 | model <- trainGweonsNearestNeighbor(splitted.data$train, 46 | preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", strPreprocessing = TRUE, removePunct = FALSE)) 47 | predictGweonsNearestNeighbor(model, c("test", "HIWI", "Hilfswissenschaftler")) 48 | res <- predictGweonsNearestNeighbor(model, splitted.data$test) 49 | 50 | # look at most probable answer from each id 51 | res[, .SD[which.max(pred.prob), list(ans, true.code = code, pred.code, acc = code == pred.code)], by = id] 52 | res[, .SD[which.max(pred.prob), list(ans, true.code = code, pred.code, acc = code == pred.code)], by = id][, mean(acc)] # calculate accuracy of predictions 53 | 54 | # for further analysis we usually require further processing: 55 | produceResults(expandPredictionResults(res, allowed.codes, method.name = "GweonsNearestNeighbor"), k = 1, n = n.test, num.codes = length(allowed.codes)) 56 | } 57 | \seealso{ 58 | \code{\link{trainGweonsNearestNeighbor}} 59 | 60 | Gweon, H., Schonlau, M., Kaczmirek, L., Blohm, M., Steiner, S. (2017). Three Methods for Occupation Coding Based on Statistical Learning. Journal of Official Statistics 33(1), pp. 101--122. 61 | 62 | This function is based on \url{https://github.com/hgweon/occupation-coding/blob/master/Modified_NN.r}. Considerable speed improvements were implemented.
63 | } 64 | -------------------------------------------------------------------------------- /man/predictLogisticRegressionWithPenalization.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/predictLogisticRegressionWithPenalization.R 3 | \name{predictLogisticRegressionWithPenalization} 4 | \alias{predictLogisticRegressionWithPenalization} 5 | \title{Predict codes using a logistic regression model} 6 | \usage{ 7 | predictLogisticRegressionWithPenalization( 8 | model, 9 | newdata, 10 | lambda = min(model$lambda) 11 | ) 12 | } 13 | \arguments{ 14 | \item{model}{the output created from \code{\link{trainLogisticRegressionWithPenalization}}} 15 | 16 | \item{newdata}{either a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}} or a character vector} 17 | 18 | \item{lambda}{see \code{\link[glmnet]{glmnet}}} 19 | } 20 | \value{ 21 | a data.table of class \code{occupationalPredictions} that contains predicted probabilities \code{pred.prob} for every combination of \code{ans} and \code{pred.code}. \code{pred.code} may not cover the full set of possible codes. 22 | } 23 | \description{ 24 | Function does the same preprocessing as in \code{\link{trainLogisticRegressionWithPenalization}} and calls the glmnet predict-function. 25 | } 26 | \details{ 27 | (Not confirmed:) Since overfitting is not an issue with occupational data, our experience is that the smaller \code{lambda}, the better the predictions. Check whether this holds for your data using \code{deviance(model)}. 28 | } 29 | \examples{ 30 | # set up data 31 | data(occupations) 32 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 33 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 34 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 35 | "Not precise enough for coding", "Student assistants") 36 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 37 | 38 | ## split sample 39 | set.seed(3451345) 40 | n.test <- 50 41 | group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 42 | splitted.data <- split(proc.occupations, group) 43 | 44 | # train model and make predictions 45 | model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 46 | predictLogisticRegressionWithPenalization(model, c("test", "HIWI", "Hilfswissenschaftler")) 47 | res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 48 | 49 | # look at most probable answer from each id 50 | res[,
.SD[which.max(pred.prob), list(ans, true.code = code, pred.code, acc = code == pred.code)], by = id] 51 | res[, .SD[which.max(pred.prob), list(ans, true.code = code, pred.code, acc = code == pred.code)], by = id][, mean(acc)] # calculate accuracy of predictions 52 | 53 | # for further analysis we usually require further processing: 54 | produceResults(expandPredictionResults(res, allowed.codes, method.name = "LogisticRegression"), k = 1, n = n.test, num.codes = length(allowed.codes)) 55 | 56 | ####################################################### 57 | ## RUN A GRID SEARCH (takes some time) 58 | \donttest{ 59 | model.grid <- data.table(expand.grid(stopwords = c(TRUE, FALSE), stemming = c(FALSE, "de"), countWords = c(TRUE, FALSE), alpha = c(0, 0.05, 0.2), thresh = 1e-4, stringsAsFactors = FALSE)) 60 | 61 | # save results here 62 | model.grid2 <- rbind(model.grid, model.grid, model.grid, model.grid) 63 | model.grid2[, lambda_ind := rep(c(50, 70, 80, 100), each = nrow(model.grid))] 64 | 65 | # Do grid search 66 | for (i in 1:nrow(model.grid)) { 67 | res.model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = if (model.grid[i, stopwords]) tm::stopwords("de") else character(0), 68 | stemming = if (model.grid[i, stemming == "de"]) "de" else NULL, 69 | countWords = model.grid[i, countWords]), 70 | tuning = list(alpha = model.grid[i, alpha], 71 | maxit = 10^6, nlambda = 100, thresh = model.grid[i, thresh])) 72 | 73 | for (j in 1:4) { # loop over all lambda_ind-values (c(50, 70, 80, 100)) 74 | # if glmnet does not converge, we want to use lambda_ind = length(res.model$lambda) 75 | model.grid2[nrow(model.grid)*(j-1) + i, lambda_ind := min(lambda_ind, length(res.model$lambda))] 76 | lambdav <- res.model$lambda[model.grid2[nrow(model.grid)*(j-1) + i, lambda_ind]] 77 | res.proc <- expandPredictionResults(predictLogisticRegressionWithPenalization(res.model, splitted.data$test, lambda = lambdav), allowed.codes = allowed.codes, method.name = paste0("glmnet.elnet.Stopwords=", model.grid[i, stopwords], "Stemming=", model.grid[i, stemming], "Countwords=", model.grid[i, countWords], "Lambda=", lambdav)) 78 | 79 | ac <- accuracy(calcAccurateAmongTopK(res.proc, k = 1), n = nrow(splitted.data$test)) 80 | ll <- logLoss(res.proc) 81 | sh <- sharpness(res.proc) 82 | 83 | model.grid2[nrow(model.grid)*(j-1) + i, lambda := lambdav] 84 | model.grid2[nrow(model.grid)*(j-1) + i, acc := ac[, acc]] 85 | model.grid2[nrow(model.grid)*(j-1) + i, acc.se := ac[, se]] 86 | model.grid2[nrow(model.grid)*(j-1) + i, acc.N := ac[, N]] 87 | model.grid2[nrow(model.grid)*(j-1) + i, acc.prob0 := ac[, count.pred.prob0]] 88 | model.grid2[nrow(model.grid)*(j-1) + i, loss.full := ll[1, logscore]] 89 | model.grid2[nrow(model.grid)*(j-1) + i, loss.full.se := ll[1, se]] 90 | model.grid2[nrow(model.grid)*(j-1) + i, loss.full.N := ll[1, N]] 91 | model.grid2[nrow(model.grid)*(j-1) + i, loss.sub := ll[2, logscore]] 92 | model.grid2[nrow(model.grid)*(j-1) + i, loss.sub.se := ll[2, se]] 93 | model.grid2[nrow(model.grid)*(j-1) + i, loss.sub.N := ll[2, N]] 94 | model.grid2[nrow(model.grid)*(j-1) + i, sharp := sh[, sharpness]] 95 | model.grid2[nrow(model.grid)*(j-1) + i, sharp.se := sh[, se]] 96 | model.grid2[nrow(model.grid)*(j-1) + i, sharp.N := sh[, N]] 97 | } 98 | } 99 | 100 | # how do alpha and lambda behave?
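# (note: glmnet returns lambda as a decreasing sequence, so a larger lambda_ind picks a smaller lambda, i.e. a weaker penalty)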
101 | model.grid2[order(alpha, -lambda)][stopwords == FALSE & stemming == FALSE & countWords == FALSE,] 102 | # Pick one combination of alpha and lambda and explore the influence of stopwords, stemming, and countWords 103 | model.grid2[alpha == 0.05 & lambda < 0.03][order(-lambda, stemming)] 104 | 105 | } 106 | } 107 | \seealso{ 108 | \code{\link{trainLogisticRegressionWithPenalization}}, \code{\link[glmnet]{glmnet}} 109 | } 110 | -------------------------------------------------------------------------------- /man/predictWithCodingIndex.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/predictWithCodingIndex.R 3 | \name{predictWithCodingIndex} 4 | \alias{predictWithCodingIndex} 5 | \title{Code answers with a coding index} 6 | \usage{ 7 | predictWithCodingIndex( 8 | newdata, 9 | coding_index, 10 | include.substrings = FALSE, 11 | max.count.categories = Inf 12 | ) 13 | } 14 | \arguments{ 15 | \item{newdata}{either a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}} or a character vector.} 16 | 17 | \item{coding_index}{a data.table as created with function \code{\link{prepare_German_coding_index_Gesamtberufsliste_der_BA}}} 18 | 19 | \item{include.substrings}{(default: \code{FALSE}). If \code{FALSE}, a match is found if, after preprocessing, the entry from the coding index and the string-element are exactly identical. If \code{TRUE} (attention: this is slow!), a match is found if, after preprocessing, the entry from the coding index is a substring of the string-element.} 20 | 21 | \item{max.count.categories}{(default: \code{Inf}). Should we search the whole coding index (default) or should we exclude entries with large \code{count_categories}, an indicator of job title ambiguity? Only entries in the coding index with \code{count_categories \eqn{\le} max.count.categories} are searched.} 22 | } 23 | \value{ 24 | a data.table with columns id, ans, and pred.code (format is not comparable to other formats in this package.) 25 | } 26 | \description{ 27 | Look up the correct code in a coding index. This way we often find no code, exactly one code, or sometimes more than one possible code.
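When several entries match, the codes are returned concatenated within \code{pred.code} (the examples below count them via \code{nchar(pred.code)}). A minimal sketch of unnesting such multi-code predictions; the comma separator is an assumption inferred from the 6-character stride in the examples, not a documented guarantee:

res <- predictWithCodingIndex(proc.occupations, coding_index = coding_index_excerpt)
# one row per candidate code (separator assumed to be ",")
res[!is.na(pred.code), list(cand.code = unlist(strsplit(pred.code, ",", fixed = TRUE))), by = id]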
28 | } 29 | \examples{ 30 | # set up data 31 | data(occupations) 32 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 33 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 34 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 35 | "Not precise enough for coding", "Student assistants") 36 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 37 | 38 | # recommended default 39 | res <- predictWithCodingIndex(proc.occupations, 40 | coding_index = coding_index_excerpt, 41 | include.substrings = FALSE, 42 | max.count.categories = Inf) 43 | 44 | # playing around with the parameters to obtain other results 45 | res <- predictWithCodingIndex(proc.occupations, 46 | coding_index = coding_index_excerpt, 47 | include.substrings = TRUE, 48 | max.count.categories = 15) 49 | 50 | ################################# 51 | # Analysis: Standard functions from this package won't work here. 52 | # Absolute numbers: either nothing is predicted (nPredictedCodes = NA), or one or more codes are predicted 53 | res[ , .N, by = list(nPredictedCodes = 1 + nchar(pred.code) \%/\% 6 )] 54 | # Relative numbers 55 | res[ , .N / res[, .N], by = list(nPredictedCodes = 1 + nchar(pred.code) \%/\% 6 )] 56 | # Agreement rate among answers where only a single code was predicted 57 | res[nchar(pred.code) == 5, mean(pred.code == code)] 58 | } 59 | \seealso{ 60 | \code{\link{predictSimilarityBasedReasoning}} 61 | } 62 | -------------------------------------------------------------------------------- /man/prepare_German_coding_index_Gesamtberufsliste_der_BA.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in 3 | % R/prepare_German_coding_index_Gesamtberufsliste_der_BA.R 4 | \name{prepare_German_coding_index_Gesamtberufsliste_der_BA} 5 | \alias{prepare_German_coding_index_Gesamtberufsliste_der_BA} 6 | \title{Prepares the Gesamtberufsliste der BA to be used with this package} 7 | \usage{ 8 | prepare_German_coding_index_Gesamtberufsliste_der_BA( 9 | path_to_file, 10 | count.categories = FALSE 11 | ) 12 | } 13 | \arguments{ 14 | \item{path_to_file}{path to downloaded file} 15 | 16 | \item{count.categories}{(default: \code{FALSE}). The default sets the column \code{count_categories} to 0, avoiding lengthy computations. With \code{count.categories = TRUE} an indicator of job title ambiguity is calculated. See below.} 17 | } 18 | \value{ 19 | a data.table with columns Berufsbenennungen, bezMale, bezFemale, Code, and count_categories. 20 | } 21 | \description{ 22 | To use this function go to \url{https://download-portal.arbeitsagentur.de/files/} and download the file \code{Gesamtberufsliste_der_BA.xlsx}.
This function prepares a coding index from the Excel file, useful for other functions in this package. 23 | } 24 | \details{ 25 | Part of the German Classification of Occupations (KldB 2010) is an alphabetic directory of job titles, available from \url{https://statistik.arbeitsagentur.de/Navigation/Statistik/Grundlagen/Klassifikation-der-Berufe/KldB2010/Systematik-Verzeichnisse/Systematik-Verzeichnisse-Nav.html}. Almost 28,000 job titles are listed in the alphabetic directory along with their associated codes. However, using the alphabetic directory for automatic coding is difficult because it consists only of neutral job titles (e.g. Digital-Media-Designer/in). They are neutral because the '/' divides the male and female spelling. However, people will never use neutral titles that contain a '/' when they describe their own job. An alternative is the \code{Gesamtberufsliste_der_BA.xlsx}, which contains separate male and female job titles next to the neutral title. The Gesamtberufsliste is used for yearly updates of the alphabetic directory, entailing that both documents are mostly identical. 26 | 27 | This function loads the \code{Gesamtberufsliste_der_BA.xlsx} into memory. It corrects some errors in the Gesamtberufsliste, may remove a few entries, renames some column names, calculates an indicator of job title ambiguity, and, most importantly, it removes parentheses and the text between them. The resulting data.table can be used with the function \code{\link{predictWithCodingIndex}} for coding. If coded training data from earlier studies is available, \code{\link{trainSimilarityBasedReasoning}} is another option for using this coding index. 28 | 29 | With \code{count.categories = TRUE} an indicator of job title ambiguity is calculated. See the source code for examples of what exactly is calculated. This indicator is only needed if one wishes to change the default behaviour in \code{\link{predictWithCodingIndex}} by setting \code{max.count.categories}. This parameter allows excluding ambiguous job titles (those with large values of \code{count_categories}) from the coding index. 30 | } 31 | \examples{ 32 | \dontrun{ 33 | # Illustrative results from this function (if you have not downloaded the file yet).
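# coding_index_excerpt ships with this package and illustrates the structure returned by this function: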
34 | View(coding_index_excerpt) 35 | 36 | # point path_to_file to your local file 37 | path_to_file <- ".../Gesamtberufsliste_der_BA.xlsx" 38 | coding_index <- prepare_German_coding_index_Gesamtberufsliste_der_BA(path_to_file, count.categories = FALSE) 39 | } 40 | } 41 | \seealso{ 42 | \code{\link{coding_index_excerpt}}, \code{\link{predictWithCodingIndex}}, \code{\link{trainSimilarityBasedReasoning}} 43 | } 44 | -------------------------------------------------------------------------------- /man/produceResults.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/produceResults.R 3 | \name{produceResults} 4 | \alias{produceResults} 5 | \title{Produces summaries of predictive performance} 6 | \usage{ 7 | produceResults(occupationalPredictions, k = 1, n, num.codes) 8 | } 9 | \arguments{ 10 | \item{occupationalPredictions}{a data.table created with the \code{\link{expandPredictionResults}} function from this package.} 11 | 12 | \item{k}{how many of the top categories to aggregate over} 13 | 14 | \item{n}{Number of unique persons in test data (may be larger than the number of persons in occupationalPredictionsAmongTopK if for some persons no codes were suggested)} 15 | 16 | \item{num.codes}{Number of allowed categories in classification} 17 | } 18 | \value{ 19 | a data.table 20 | } 21 | \description{ 22 | Produces \code{\link{accuracy}}, \code{\link{plotReliabilityDiagram}}, \code{\link{sharpness}}, \code{\link{logLoss}}, \code{\link{plotTruePredictionsVsFalsePredictions}}, and \code{\link{plotAgreementRateVsProductionRate}}. 23 | } 24 | \examples{ 25 | # set up data 26 | data(occupations) 27 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 28 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 29 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 30 | "Not precise enough for coding", "Student assistants") 31 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 32 | 33 | ## split sample 34 | set.seed(3451345) 35 | n.test <- 50 36 | group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 37 | splitted.data <- split(proc.occupations, group) 38 | 39 | # train model and make predictions 40 | model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 41 | res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 42 | 43 | # expand to contain more categories than the initial ones 44 | res.proc1 <- expandPredictionResults(res, allowed.codes =
c("12345", allowed.codes), method.name = "glmnet1") 45 | 46 | # aggregate over top k categories 47 | calcAccurateAmongTopK(res.proc1, k = 1)[,mean(acc)] 48 | calcAccurateAmongTopK(res.proc1, k = 5)[,mean(acc)] 49 | 50 | # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once. 51 | res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2") 52 | res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete") 53 | 54 | produceResults(res.proc, k = 1, n = n.test, num.codes = length(allowed.codes) + 1) 55 | } 56 | \seealso{ 57 | \code{\link{calcAccurateAmongTopK}} 58 | } 59 | -------------------------------------------------------------------------------- /man/removeFaultyAndUncodableAnswers_And_PrepareForAnalysis.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in 3 | % R/removeFaultyAndUncodableAnswers_And_PrepareForAnalysis.R 4 | \name{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis} 5 | \alias{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis} 6 | \title{Data Preparation} 7 | \usage{ 8 | removeFaultyAndUncodableAnswers_And_PrepareForAnalysis( 9 | data, 10 | colNames = c("answer", "code"), 11 | allowed.codes, 12 | allowed.codes.titles = 1:length(allowed.codes) 13 | ) 14 | } 15 | \arguments{ 16 | \item{allowed.codes}{a vector of allowed codes from the classification.} 17 | 18 | \item{allowed.codes.titles}{Labels for \code{allowed.codes}. Should have the same length as \code{allowed.codes}.} 19 | 20 | \item{answers}{a character vector of answers} 21 | 22 | \item{codes}{a vector of classification codes having the same length as \code{answers}. Will be transformed to character.} 23 | } 24 | \value{ 25 | a data.table with attributes \code{classification} and \code{overview_tally} 26 | } 27 | \description{ 28 | Prepare data (i.e. columns 'id', 'ans', and 'code' are appended to the dataset - only these columns will be used later) and remove answer that we cannot use (i.e. answers that have non-ASCII characters after preprocessing and answers that are at most one character long). During data preparation you should make sure that nothing important is lost here. 29 | } 30 | \details{ 31 | The 2010 German classification is available at \url{https://www.klassifikationsserver.de/}. 
32 | } 33 | \examples{ 34 | occupations <- data.table(answers = c("LEITER VERTRIEB", "Kfz-Schlossermeister", "Aushilfe im Hotel(Bereich Housekeeping)"), 35 | codes = c("61194", "25213", "63221")) 36 | (allowed.codes <- c("11101", "61194", "25213", "63221", "...")) 37 | (allowed.codes.titles <- c("Berufe in der Landwirtschaft (ohne Spezialisierung) - Helfer-/Anlernt\xe4tigkeiten", "Berufe in der Kraftfahrzeugtechnik - komplexe Spezialistent\xe4tigkeiten", "F\xfchrungskr\xe4fte - Einkauf und Vertrieb", "Berufe im Hotelservice - Helfer-/Anlernt\xe4tigkeiten", "many more category labels from the classification")) 38 | removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("answers", "codes"), allowed.codes, allowed.codes.titles) 39 | 40 | data(occupations) 41 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 42 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 43 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 44 | "Not precise enough for coding", "Student assistants") 45 | removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 46 | 47 | ## we could also paste both answers together 48 | occupations[, answer_combined := paste(orig_answer, orig_answer2)] 49 | removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("answer_combined", "orig_code"), allowed.codes, allowed.codes.titles) 50 | } 51 | \seealso{ 52 | \code{\link{createDescriptives}} 53 | } 54 | -------------------------------------------------------------------------------- /man/selectMaxProbMethod.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/selectMaxProbMethod.R 3 | \name{selectMaxProbMethod} 4 | \alias{selectMaxProbMethod} 5 | \title{From multiple prediction methods, select for each id the prediction method that returns the highest probability} 6 | \usage{ 7 | selectMaxProbMethod( 8 | occupationalPredictions, 9 | combined.methods = c("xgboost", "SimilarityBasedSubstring", 10 | "SimilarityBasedWordwise"), 11 | k = 1, 12 | new.method.name = "maxProbAmong1" 13 | ) 14 | } 15 | \arguments{ 16 | \item{occupationalPredictions}{a data.table created with the \code{\link{expandPredictionResults}} function from this package. This function is only useful if we \code{rbind} several such data.tables together (see example).} 17 | 18 | \item{combined.methods}{a character vector of methods to select from.
We will only use the subset of rows from \code{occupationalPredictions} with \code{method.name \%in\% combined.methods} (same names as assigned in \code{\link{expandPredictionResults}}).} 19 | 20 | \item{k}{Calculate probability over \code{k} most probable categories.} 21 | 22 | \item{new.method.name}{the name to be assigned to the highest-probability method.} 23 | } 24 | \value{ 25 | a data.table 26 | } 27 | \description{ 28 | Start with a data.table of class 'occupationalPredictionsComplete' (one prediction for each combination of pred.code and answer, ideally from multiple prediction methods), calculate for each id the probability of the top k categories, and select for each id the prediction method that returns the highest probability. The method found this way is then used for this id and is called \code{new.method.name}. 29 | } 30 | \details{ 31 | The problem solved here is this: \code{\link{trainXgboost}} is good for most answers and for interactions. But xgboost fails if a keyword was misspelled or if a job title is in the alphabetic dictionary but not in the training data. In those cases we would like to use a prediction method from \code{\link{trainSimilarityBasedReasoning}}, which will return higher probabilities. 32 | } 33 | \examples{ 34 | # set up data 35 | data(occupations) 36 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 37 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 38 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 39 | "Not precise enough for coding", "Student assistants") 40 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 41 | 42 | ## split sample 43 | set.seed(3451345) 44 | n.test <- 50 45 | group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 46 | splitted.data <- split(proc.occupations, group) 47 | attr(splitted.data$training, "classification")$code <- attr(proc.occupations, "classification")$code 48 | 49 | ####### train models 50 | # first model uses dist.type = wordwise and some other recommended settings (n.draws could be higher) 51 | simBasedModel <- trainSimilarityBasedReasoning(data = splitted.data$training, 52 | coding_index_w_codes = coding_index_excerpt, 53 | coding_index_without_codes = frequent_phrases, 54 | preprocessing = list(stopwords = tm::stopwords("de"), stemming = NULL, strPreprocessing = TRUE, removePunct = FALSE), 55 | dist.type = "wordwise", 56 | dist.control = list(method = "osa", weight = c(d = 1, i = 1, s = 1, t = 1)), 57 | threshold = c(max = 3, use = 1), simulation.control = list(n.draws = 50, check.normality = FALSE), 58 | tmp_folder = "similarityTables") 59 | 60 | res1 <- expandPredictionResults(predictSimilarityBasedReasoning(simBasedModel, splitted.data$test),
allowed.codes, method.name = "WordwiseSimilarityOsa1111") 61 | 62 | # second model uses dist.type = substring and some other recommended settings (n.draws could be higher) 63 | simBasedModel <- trainSimilarityBasedReasoning(data = splitted.data$training, 64 | coding_index_w_codes = coding_index_excerpt, 65 | coding_index_without_codes = frequent_phrases, 66 | preprocessing = list(stopwords = NULL, stemming = NULL, strPreprocessing = TRUE, removePunct = FALSE), 67 | dist.type = "substring", 68 | dist.control = list(method = "substring", weight = numeric()), 69 | threshold = c(0, 0), simulation.control = list(n.draws = 50, check.normality = FALSE), 70 | tmp_folder = "similarityTables") 71 | 72 | res2 <- expandPredictionResults(predictSimilarityBasedReasoning(simBasedModel, splitted.data$test, parallel = TRUE), allowed.codes, method.name = "substringSimilarity") 73 | 74 | # third model uses dist.type = fulltext and some other recommended settings (n.draws could be higher) 75 | simBasedModel <- trainSimilarityBasedReasoning(data = proc.occupations, 76 | coding_index_w_codes = coding_index_excerpt, 77 | coding_index_without_codes = frequent_phrases, 78 | preprocessing = list(stopwords = NULL, stemming = NULL, strPreprocessing = TRUE, removePunct = FALSE), 79 | dist.type = "fulltext", 80 | dist.control = list(method = "osa", weight = c(d = 1, i = 1, s = 1, t = 1)), 81 | threshold = c(max = 3, use = 1), simulation.control = list(n.draws = 50, check.normality = FALSE), 82 | tmp_folder = "similarityTables") 83 | res3 <- expandPredictionResults(predictSimilarityBasedReasoning(simBasedModel, splitted.data$test), allowed.codes, method.name = "FulltextSimilarityOsa1111") 84 | 85 | res.combined <- rbind(res1, res2, res3); class(res.combined) <- class(res1) 86 | 87 | res.max <- selectMaxProbMethod(res.combined, combined.methods = c("WordwiseSimilarityOsa1111", "substringSimilarity"), k = 1, new.method.name = "maxProbAmong1") 88 | res.combined <- rbind(res.combined, res.max); class(res.combined) <- class(res1) 89 | produceResults(res.combined, k = 1, n = n.test, num.codes = length(allowed.codes)) 90 | } 91 | -------------------------------------------------------------------------------- /man/sharpness.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sharpness.R 3 | \name{sharpness} 4 | \alias{sharpness} 5 | \title{Sharpness} 6 | \usage{ 7 | sharpness(occupationalPredictions) 8 | } 9 | \arguments{ 10 | \item{occupationalPredictions}{a data.table created with the \code{\link{expandPredictionResults}} function from this package.} 11 | } 12 | \value{ 13 | a data.table 14 | } 15 | \description{ 16 | Calculate sharpness \eqn{sharpness = \frac{1}{N} \sum_n entropy_n} and standard error \eqn{\sqrt{\frac{1}{N(N-1)} \sum_n (entropy_n - sharpness)^2}} with \eqn{entropy_n = - \sum_k p_{nk} \log_2 p_{nk}} 17 | } 18 | \details{ 19 | Sharpness is zero (optimal) if a single category is predicted with probability 1. It is maximal if all categories have equal probability \eqn{p = \frac{1}{K}}. 20 | 21 | Note: What should be done if a predicted probability is zero? \eqn{0 \times log(0)} is not defined but necessary to calculate sharpness. We set \eqn{0 \times log(0) = 0}. This also means we exclude observations from our analysis if all probabilities are predicted as zero. An alternative could be to set such zeros to \eqn{1/K}, which would lead to very different sharpness.
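As a hand-computed illustration of this convention: with \eqn{K = 4} categories and predicted probabilities \eqn{(0.5, 0.5, 0, 0)}, the two zero terms drop out and \eqn{entropy = -(0.5 \log_2 0.5 + 0.5 \log_2 0.5) = 1} bit, whereas the uniform prediction \eqn{(0.25, 0.25, 0.25, 0.25)} attains the maximum \eqn{\log_2 4 = 2} bits.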
22 | } 23 | \examples{ 24 | # set up data 25 | data(occupations) 26 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 27 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 28 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 29 | "Not precise enough for coding", "Student assistants") 30 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 31 | 32 | ## split sample 33 | set.seed(3451345) 34 | n.test <- 50 35 | group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test))) 36 | splitted.data <- split(proc.occupations, group) 37 | 38 | # train model and make predictions 39 | model <- trainLogisticRegressionWithPenalization(splitted.data$train, preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE), tuning = list(alpha = 0.05, maxit = 50^5, nlambda = 100, thresh = 1e-5)) 40 | res <- predictLogisticRegressionWithPenalization(model, splitted.data$test) 41 | 42 | # expand to contain more categories than the initial ones 43 | res.proc1 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet1") 44 | 45 | # we can use different methods to create a combined dataset. This is how to run the subsequent analysis functions only once. 46 | res.proc2 <- expandPredictionResults(res, allowed.codes = c("12345", allowed.codes), method.name = "glmnet2") 47 | res.proc <- rbind(res.proc1, res.proc2); class(res.proc) <- c(class(res.proc), "occupationalPredictionsComplete") 48 | 49 | sharpness(res.proc) 50 | } 51 | \seealso{ 52 | \code{\link{plotReliabilityDiagram}}, \code{\link{logLoss}} 53 | } 54 | -------------------------------------------------------------------------------- /man/stringPreprocessing.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stringPreprocessing.R 3 | \encoding{UTF-8} 4 | \name{stringPreprocessing} 5 | \alias{stringPreprocessing} 6 | \title{Preprocess German occupational text} 7 | \usage{ 8 | stringPreprocessing(verbatim, lang = "de") 9 | } 10 | \arguments{ 11 | \item{verbatim}{a character vector.} 12 | 13 | \item{lang}{(default: \code{"de"}). Everything else will throw an error.} 14 | } 15 | \value{ 16 | the same character vector after processing 17 | } 18 | \description{ 19 | Function replaces some common characters / character sequences (e.g., Ä, Ü, "DIPL.-ING.") with standardized uppercase equivalents and removes punctuation, empty spaces, and the word "Diplom". 20 | } 21 | \details{ 22 | \code{\link{charToRaw}} helps to find UTF-8 characters.
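For instance, \code{charToRaw("B\xe4cker")} reveals the byte \code{e4} (the latin-1 encoding of 'ä'), which helps to spot characters that still need special handling.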
23 | } 24 | \examples{ 25 | (x <- c("Verkauf von B\xfcchern, Schreibwaren", "Fach\xe4rztin f\xfcr Kinder- und Jugendmedizin im \xf6ffentlichen Gesundheitswesen", "Industriemechaniker", "Dipl.-Ing. - Agrarwirtschaft (Landwirtschaft)")) 26 | stringPreprocessing(x) 27 | } 28 | -------------------------------------------------------------------------------- /man/surveyCountsSubstringSimilarity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \encoding{UTF-8} 5 | \name{surveyCountsSubstringSimilarity} 6 | \alias{surveyCountsSubstringSimilarity} 7 | \title{Anonymized training data (substring similarity) to be used with Similarity-based Reasoning} 8 | \format{ 9 | A data.table with 39242 rows and 3 variables: 10 | \describe{ 11 | \item{dictString}{Job titles (and similar terms). They were either taken from the \code{Gesamtberufsliste_der_BA} or from \code{\link{frequent_phrases}}} 12 | \item{survCode}{5-digit codes from the survey data} 13 | \item{N}{Frequency of how often a survey text identical or similar to \code{dictString} was coded as \code{survCode} (using substring-similarity)} 14 | } 15 | 16 | Substring-similarity means that, to be counted, the survey response must contain the text in dictString, i.e., dictString must be identical to, or a substring of, the original text provided by the respondent. 17 | } 18 | \source{ 19 | Data from the following surveys were pooled: 20 | 21 | Antoni, M., Drasch, K., Kleinert, C., Matthes, B., Ruland, M. and Trahms, A. (2010): Arbeiten und Lernen im Wandel * Teil 1: Überblick über die Studie, FDZ-Methodenreport 05/2010, Forschungsdatenzentrum der Bundesagentur für Arbeit im Institut für Arbeitsmarkt- und Berufsforschung, Nuremberg. 22 | 23 | Rohrbach-Schmidt, D., Hall, A. (2013): BIBB/BAuA Employment Survey 2012, BIBB-FDZ Data and Methodological Reports Nr. 1/2013. Version 4.1, Federal Institute for Vocational Education and Training (Research Data Centre), Bonn. 24 | 25 | Lange, C., Finger, J., Allen, J., Born, S., Hoebel, J., Kuhnert, R., Müters, S., Thelen, J., Schmich, P., Varga, M., von der Lippe, E., Wetzstein, M., Ziese, T. (2017): Implementation of the European Health Interview Survey (EHIS) into the German Health Update (GEDA), Archives of Public Health, 75, 1–14. 26 | 27 | Hoffmann, R., Lange, M., Butschalowsky, H., Houben, R., Schmich, P., Allen, J., Kuhnert, R., Schaffrath Rosario, A., Gößwald, A. (2018): KiGGS Wave 2 Cross-Sectional Study—Participant Acquisition, Response Rates and Representativeness, Journal of Health Monitoring, 3, 78–91. (only wave 2) 28 | 29 | Trappmann, M., Beste, J., Bethmann, A., Müller, G. (2013): The PASS Panel Survey after Six Waves, Journal for Labour Market Research, 46, 275–281. (only wave 10) 30 | 31 | Job titles were taken from the following publication: 32 | 33 | Bundesagentur für Arbeit (2019). Gesamtberufsliste der Bundesagentur für Arbeit. Stand: 03.01.2019. The \code{Gesamtberufsliste der BA} is available at \url{https://download-portal.arbeitsagentur.de/files/}. 34 | } 35 | \usage{ 36 | surveyCountsSubstringSimilarity 37 | } 38 | \description{ 39 | This aggregated (anonymized) training data is to be used within the \code{\link{trainSimilarityBasedReasoning2}}-function (\code{dist.type = "substring"}), see the documentation therein. It allows the coding of German-language occupations into the German Classification of Occupations (KldB 2010).
40 | } 41 | \examples{ 42 | ## what follows is the source code used to create this data set 43 | ## 44 | 45 | # load toy example data 46 | data(occupations) # toy example, the five data sets cited above were used instead 47 | # In addition to codes from the 2010 German Classification of Occupations, our data make use of the following special codes: 48 | (special_codes <- c("-0004" = "Precise coding not possible", "-0006" = "Multiple jobs", "-0012" = "Blue-collar worker", "-0030" = "Student employee/assistant, work placement student, research assistant", "-0019" = "Federal volunteer service, voluntary social year (FSJ), civil service")) 49 | data(coding_index_excerpt) # toy example, the Gesamtberufsliste was used instead. After running ?prepare_German_coding_index_Gesamtberufsliste_der_BA, our version of the coding index had 27853 entries. 50 | data(frequent_phrases) 51 | 52 | # prepare coding index for our purposes 53 | coding_index <- coding_index_excerpt[!(Berufsbenennungen \%in\% c("Bundeskanzler/in", "Bundespräsident/in", "Admiral", "General"))] # remove very rare occupations that might violate privacy regulations 54 | coding_index_w_codes <- rbind(coding_index[, list(title = bezMale, Code)], coding_index[, list(title = bezFemale, Code)]) 55 | coding_index_w_codes <- coding_index_w_codes[,title := stringPreprocessing(title)] 56 | coding_index_w_codes <- coding_index_w_codes[!duplicated(title)] 57 | 58 | # prepare the training data (special codes were harmonized in advance) 59 | training_data <- occupations[, .(answer = stringPreprocessing(orig_answer), code = orig_code)] 60 | 61 | # trick to save time: do this only once for each unique string and merge later 62 | similarityTableSubstring <- createSimilarityTableSubstring(unique.string = unique(training_data$answer), 63 | coding_index_w_codes = coding_index_w_codes, 64 | coding_index_without_codes = frequent_phrases) 65 | similarityTableSubstring2 <- rbind(similarityTableSubstring$dist_table_w_code[, .(intString, dictString = dictString.title)], 66 | similarityTableSubstring$dist_table_without_code[, .(intString, dictString)]) 67 | surveyCountsSubstringSimilarity_toyExample <- merge(training_data[, .(answer, survCode = code)], similarityTableSubstring2, by.x = "answer", by.y = "intString", allow.cartesian = TRUE)[, .N, by = list(dictString, survCode)][order(dictString)] 68 | 69 | } 70 | \seealso{ 71 | See \code{\link{trainSimilarityBasedReasoning2}}, for which this data set was created, and \code{\link{surveyCountsWordwiseSimilarity}}, which has been created the same way but uses a different metric to calculate string similarities. 72 | } 73 | \keyword{datasets} 74 | -------------------------------------------------------------------------------- /man/surveyCountsWordwiseSimilarity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \encoding{UTF-8} 5 | \name{surveyCountsWordwiseSimilarity} 6 | \alias{surveyCountsWordwiseSimilarity} 7 | \title{Anonymized training data (wordwise similarity) to be used with Similarity-based Reasoning} 8 | \format{ 9 | A data.table with 26710 rows and 3 variables: 10 | \describe{ 11 | \item{dictString}{Job titles (and similar terms).
They were either taken from the \code{Gesamtberufsliste_der_BA} or from \code{\link{frequent_phrases}}} 12 | \item{survCode}{5-digit codes from the survey data} 13 | \item{N}{Frequency of how often a survey text identical or similar to \code{dictString} was coded as \code{survCode} (using wordwise-similarity)} 14 | } 15 | 16 | Wordwise-similarity means that, to be counted, the verbal survey answer must be similar to dictString; more specifically, dictString must match at least one word in the survey response (a difference of at most one character is allowed to account for spelling errors). 17 | } 18 | \source{ 19 | Data from the following surveys were pooled: 20 | 21 | Antoni, M., Drasch, K., Kleinert, C., Matthes, B., Ruland, M. and Trahms, A. (2010): Arbeiten und Lernen im Wandel * Teil 1: Überblick über die Studie, FDZ-Methodenreport 05/2010, Forschungsdatenzentrum der Bundesagentur für Arbeit im Institut für Arbeitsmarkt- und Berufsforschung, Nuremberg. 22 | 23 | Rohrbach-Schmidt, D., Hall, A. (2013): BIBB/BAuA Employment Survey 2012, BIBB-FDZ Data and Methodological Reports Nr. 1/2013. Version 4.1, Federal Institute for Vocational Education and Training (Research Data Centre), Bonn. 24 | 25 | Lange, C., Finger, J., Allen, J., Born, S., Hoebel, J., Kuhnert, R., Müters, S., Thelen, J., Schmich, P., Varga, M., von der Lippe, E., Wetzstein, M., Ziese, T. (2017): Implementation of the European Health Interview Survey (EHIS) into the German Health Update (GEDA), Archives of Public Health, 75, 1–14. 26 | 27 | Hoffmann, R., Lange, M., Butschalowsky, H., Houben, R., Schmich, P., Allen, J., Kuhnert, R., Schaffrath Rosario, A., Gößwald, A. (2018): KiGGS Wave 2 Cross-Sectional Study—Participant Acquisition, Response Rates and Representativeness, Journal of Health Monitoring, 3, 78–91. (only wave 2) 28 | 29 | Trappmann, M., Beste, J., Bethmann, A., Müller, G. (2013): The PASS Panel Survey after Six Waves, Journal for Labour Market Research, 46, 275–281. (only wave 10) 30 | 31 | Job titles were taken from the following publication: 32 | 33 | Bundesagentur für Arbeit (2019). Gesamtberufsliste der Bundesagentur für Arbeit. Stand: 03.01.2019. The \code{Gesamtberufsliste der BA} is available at \url{https://download-portal.arbeitsagentur.de/files/}. 34 | } 35 | \usage{ 36 | surveyCountsWordwiseSimilarity 37 | } 38 | \description{ 39 | This aggregated (anonymized) training data is to be used within the \code{\link{trainSimilarityBasedReasoning2}}-function (\code{dist.type = "wordwise", dist.control = list(method = "osa", weight = c(d = 1, i = 1, s = 1, t = 1)), threshold = c(max = NA, use = 1))}, see the documentation therein. It allows the coding of German-language occupations into the German Classification of Occupations (KldB 2010). 40 | } 41 | \examples{ 42 | ## what follows is the source code used to create this data set 43 | ## 44 | 45 | # load toy example data 46 | data(occupations) # toy example, the five data sets cited above were used instead 47 | # In addition to codes from the 2010 German Classification of Occupations, our data make use of the following special codes: 48 | (special_codes <- c("-0004" = "Precise coding not possible", "-0006" = "Multiple jobs", "-0012" = "Blue-collar worker", "-0030" = "Student employee/assistant, work placement student, research assistant", "-0019" = "Federal volunteer service, voluntary social year (FSJ), civil service")) 49 | data(coding_index_excerpt) # toy example, the Gesamtberufsliste was used instead.
After running ?prepare_German_coding_index_Gesamtberufsliste_der_BA, our version of the coding index had 27853 entries. 50 | data(frequent_phrases) 51 | 52 | # prepare coding index for our purposes 53 | coding_index <- coding_index_excerpt[!(Berufsbenennungen \%in\% c("Bundeskanzler/in", "Bundespräsident/in", "Admiral", "General"))] # remove very rare occupations that might violate privacy regulations 54 | coding_index_w_codes <- rbind(coding_index[, list(title = bezMale, Code)], coding_index[, list(title = bezFemale, Code)]) 55 | coding_index_w_codes <- coding_index_w_codes[,title := stringPreprocessing(title)] 56 | coding_index_w_codes <- coding_index_w_codes[!duplicated(title)] 57 | 58 | # prepare the training data (special codes were harmonized in advance) 59 | training_data <- occupations[, .(answer = stringPreprocessing(orig_answer), code = orig_code)] 60 | 61 | # trick to save time: do this only once for each unique string and merge later 62 | similarityTableWordwise <- createSimilarityTableWordwiseStringdist(unique.string = unique(training_data$answer), 63 | coding_index_w_codes = coding_index_w_codes, 64 | coding_index_without_codes = occupationCoding::frequent_phrases, 65 | preprocessing = list(stopwords = NULL, stemming = NULL, strPreprocessing = TRUE, removePunct = FALSE), 66 | dist.control = list(method = "osa", weight = c(d = 1, i = 1, s = 1, t = 1)), 67 | threshold = 1) 68 | similarityTableWordwise2 <- rbind(similarityTableWordwise$dist_table_w_code[, .(intString, dictString = dictString.title)], 69 | similarityTableWordwise$dist_table_without_code[, .(intString, dictString)]) 70 | surveyCountsWordwiseSimilarity_toyExample <- merge(training_data[, .(answer, survCode = code)], similarityTableWordwise2, by.x = "answer", by.y = "intString", allow.cartesian = TRUE)[, .N, by = list(dictString, survCode)][order(dictString)] 71 | } 72 | \seealso{ 73 | See \code{\link{trainSimilarityBasedReasoning2}}, for which this data set was created, and \code{\link{surveyCountsSubstringSimilarity}}, which has been created the same way but uses a different metric to calculate string similarities. 
74 | } 75 | \keyword{datasets} 76 | -------------------------------------------------------------------------------- /man/trainCreecysMemoryBasedReasoning.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trainCreecysMemoryBasedReasoning.R 3 | \name{trainCreecysMemoryBasedReasoning} 4 | \alias{trainCreecysMemoryBasedReasoning} 5 | \title{Train Creecy's Memory-Based Reasoning model} 6 | \usage{ 7 | trainCreecysMemoryBasedReasoning( 8 | data, 9 | preprocessing = list(stopwords = character(0), stemming = NULL, strPreprocessing = 10 | TRUE, removePunct = TRUE) 11 | ) 12 | } 13 | \arguments{ 14 | \item{data}{a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}}} 15 | 16 | \item{preprocessing}{a list with elements 17 | \describe{ 18 | \item{stopwords}{a character vector, use \code{tm::stopwords("de")} for German stopwords.} 19 | \item{stemming}{\code{NULL} for no stemming and \code{"de"} for stemming using the German Porter stemmer.} 20 | \item{strPreprocessing}{\code{TRUE} if \code{\link{stringPreprocessing}} shall be used.} 21 | \item{removePunct}{\code{TRUE} if \code{\link[tm]{removePunctuation}} shall be used.} 22 | }} 23 | } 24 | \value{ 25 | a processed feature matrix to be used in \code{\link{predictCreecysMemoryBasedReasoning}} 26 | } 27 | \description{ 28 | The function does some preprocessing and calculates the importance of various features. 29 | } 30 | \examples{ 31 | # set up data 32 | data(occupations) 33 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 34 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 35 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 36 | "Not precise enough for coding", "Student assistants") 37 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 38 | 39 | # Recommended configuration (and commonly used in this package) 40 | memModel <- trainCreecysMemoryBasedReasoning(proc.occupations, 41 | preprocessing = list(stopwords = character(0), stemming = NULL, strPreprocessing = TRUE, removePunct = FALSE)) 42 | } 43 | \seealso{ 44 | \code{\link{predictCreecysMemoryBasedReasoning}} 45 | 46 | Creecy, R. H., Masand, B. M., Smith, S. J., Waltz, D. L. (1992). Trading MIPS and Memory for Knowledge Engineering. Comm. ACM 35(8). pp. 48--65.
47 | } 48 | -------------------------------------------------------------------------------- /man/trainGweonsNearestNeighbor.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trainGweonsNearestNeighbor.R 3 | \name{trainGweonsNearestNeighbor} 4 | \alias{trainGweonsNearestNeighbor} 5 | \title{Trains Gweon's Nearest Neighbor model} 6 | \usage{ 7 | trainGweonsNearestNeighbor( 8 | data, 9 | preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", 10 | strPreprocessing = FALSE, removePunct = TRUE) 11 | ) 12 | } 13 | \arguments{ 14 | \item{data}{a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}}} 15 | 16 | \item{preprocessing}{a list with elements 17 | \describe{ 18 | \item{stopwords}{a character vector, use \code{tm::stopwords("de")} for German stopwords.} 19 | \item{stemming}{\code{NULL} for no stemming and \code{"de"} for stemming using the German Porter stemmer.} 20 | \item{strPreprocessing}{\code{TRUE} if \code{\link{stringPreprocessing}} shall be used.} 21 | \item{removePunct}{\code{TRUE} if \code{\link[tm]{removePunctuation}} shall be used.} 22 | }} 23 | } 24 | \value{ 25 | a document term matrix with some additional attributes 26 | } 27 | \description{ 28 | Function does some preprocessing and creates a document term matrix to be used for the Nearest Neighbor model. 29 | } 30 | \examples{ 31 | # set up data 32 | data(occupations) 33 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 34 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 35 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 36 | "Not precise enough for coding", "Student assistants") 37 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 38 | 39 | # Recommended configuration 40 | dtmModel <- trainGweonsNearestNeighbor(proc.occupations, 41 | preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", strPreprocessing = TRUE, removePunct = FALSE)) 42 | # Configuration used by Gweon et al.
(2017) 43 | dtmModel <- trainGweonsNearestNeighbor(proc.occupations, 44 | preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", strPreprocessing = FALSE, removePunct = TRUE)) 45 | # Configuration used for most other approaches in this package 46 | dtmModel <- trainGweonsNearestNeighbor(proc.occupations, 47 | preprocessing = list(stopwords = character(0), stemming = NULL, strPreprocessing = TRUE, removePunct = FALSE)) 48 | 49 | ####################################################### 50 | ## RUN A GRID SEARCH (takes some time) 51 | \donttest{
## split sample (defines splitted.data used below)
set.seed(3451345)
n.test <- 50
group <- sample(c(rep("test", n.test), rep("training", nrow(proc.occupations) - n.test)))
splitted.data <- split(proc.occupations, group)
52 | # create a grid of all combinations to be tried 53 | model.grid <- data.table(expand.grid(stopwords = c(TRUE, FALSE), stemming = c(FALSE, "de"), strPreprocessing = c(TRUE, FALSE), nearest.neighbors.multiplier = c(0.05, 0.1, 0.2))) 54 | 55 | # Do grid search 56 | for (i in 1:nrow(model.grid)) { 57 | res.model <- trainGweonsNearestNeighbor(splitted.data$training, preprocessing = list(stopwords = if (model.grid[i, stopwords]) tm::stopwords("de") else character(0), 58 | stemming = if (model.grid[i, stemming == "de"]) "de" else NULL, 59 | strPreprocessing = model.grid[i, strPreprocessing], 60 | removePunct = !model.grid[i, strPreprocessing])) 61 | 62 | res.proc <- predictGweonsNearestNeighbor(res.model, splitted.data$test, 63 | tuning = list(nearest.neighbors.multiplier = model.grid[i, nearest.neighbors.multiplier])) 64 | res.proc <- expandPredictionResults(res.proc, allowed.codes = allowed.codes, method.name = "NearestNeighbor_Gweon") 65 | 66 | ac <- accuracy(calcAccurateAmongTopK(res.proc, k = 1), n = nrow(splitted.data$test)) 67 | ll <- logLoss(res.proc) 68 | sh <- sharpness(res.proc) 69 | 70 | model.grid[i, acc := ac[, acc]] 71 | model.grid[i, acc.se := ac[, se]] 72 | model.grid[i, acc.N := ac[, N]] 73 | model.grid[i, acc.prob0 := ac[, count.pred.prob0]] 74 | model.grid[i, loss.full := ll[1, logscore]] 75 | model.grid[i, loss.full.se := ll[1, se]] 76 | model.grid[i, loss.full.N := ll[1, N]] 77 | model.grid[i, loss.sub := ll[2, logscore]] 78 | model.grid[i, loss.sub.se := ll[2, se]] 79 | model.grid[i, loss.sub.N := ll[2, N]] 80 | model.grid[i, sharp := sh[, sharpness]] 81 | model.grid[i, sharp.se := sh[, se]] 82 | model.grid[i, sharp.N := sh[, N]] 83 | } 84 | 85 | model.grid[order(stopwords, stemming, strPreprocessing, nearest.neighbors.multiplier)] 86 | 87 | 88 | } 89 | } 90 | \seealso{ 91 | \code{\link{predictGweonsNearestNeighbor}} 92 | 93 | Gweon, H.; Schonlau, M., Kaczmirek, L., Blohm, M., Steiner, S. (2017). Three Methods for Occupation Coding Based on Statistical Learning. Journal of Official Statistics 33(1), pp.
101--122 94 | } 95 | -------------------------------------------------------------------------------- /man/trainLogisticRegressionWithPenalization.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/trainLogisticRegressionWithPenalization.R 3 | \name{trainLogisticRegressionWithPenalization} 4 | \alias{trainLogisticRegressionWithPenalization} 5 | \title{Train a logistic regression model with penalization} 6 | \usage{ 7 | trainLogisticRegressionWithPenalization( 8 | data, 9 | preprocessing = list(stopwords = character(0), stemming = NULL, countWords = FALSE), 10 | tuning = list(alpha = 0.05, maxit = 10^5, nlambda = 100, thresh = 1e-07) 11 | ) 12 | } 13 | \arguments{ 14 | \item{data}{a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}}} 15 | 16 | \item{preprocessing}{a list with elements 17 | \describe{ 18 | \item{stopwords}{a character vector, use \code{tm::stopwords("de")} for German stopwords.} 19 | \item{stemming}{\code{NULL} for no stemming and \code{"de"} for stemming using the German porter stemmer.} 20 | \item{countWords}{Set to TRUE if the predictor matrix should contain a column for answer length.} 21 | }} 22 | 23 | \item{tuning}{a list with elements that will be passed to \code{\link[glmnet]{glmnet}}} 24 | } 25 | \value{ 26 | a logistic regression model. Commands from \code{\link[glmnet]{glmnet}} should work. 27 | } 28 | \description{ 29 | Function does some preprocessing and calls glmnet for a logistic regression model 30 | } 31 | \details{ 32 | Setting \code{tuning$alpha = 0} (Ridge Penalty) seems to be most stable. 33 | 34 | In our experience, \code{glmnet} often returns a warning like \code{3: from glmnet Fortran code (error code -72); Convergence for 72th lambda value not reached after maxit=100000 iterations; solutions for larger lambdas returned}. To solve this issue, we can increase \code{maxit} to try more iterations or we can decrease the threshold \code{thresh}. 
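A minimal sketch of reacting to such a warning (illustrative values; \code{alpha = 0} follows the stability advice above):

model <- trainLogisticRegressionWithPenalization(proc.occupations,
  preprocessing = list(stopwords = character(0), stemming = "de", countWords = FALSE),
  tuning = list(alpha = 0, maxit = 10^7, nlambda = 100, thresh = 1e-7))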
35 | } 36 | \examples{ 37 | # set up data 38 | data(occupations) 39 | allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030") 40 | allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks", 41 | "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work", 42 | "Not precise enough for coding", "Student assistants") 43 | proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles) 44 | 45 | # Recommended configuration 46 | trainLogisticRegressionWithPenalization(proc.occupations, 47 | preprocessing = list(stopwords = character(0), stemming = "de", countWords = FALSE), 48 | tuning = list(alpha = 0.05, maxit = 10^6, nlambda = 100, thresh = 1e-7)) 49 | 50 | # Other possibility 51 | trainLogisticRegressionWithPenalization(proc.occupations, 52 | preprocessing = list(stopwords = tm::stopwords("de"), stemming = NULL, countWords = TRUE), 53 | tuning = list(alpha = 0.05, maxit = 10^6, nlambda = 100, thresh = 1e-7)) 54 | } 55 | \seealso{ 56 | \code{\link{predictLogisticRegressionWithPenalization}}, \code{\link[glmnet]{glmnet}} 57 | } 58 | --------------------------------------------------------------------------------