├── .Rbuildignore
├── .gitignore
├── CRAN-SUBMISSION
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── NEWS.md
├── R
│   ├── complete_chat.R
│   ├── complete_prompt.R
│   ├── data.R
│   ├── format_chat.R
│   ├── format_prompt.R
│   └── openai_api_key.R
├── README.Rmd
├── README.md
├── cran-comments.md
├── data-raw
│   ├── DALL·E 2024-02-21 05.59.41 - A line drawing of a teleprompter..png
│   ├── logo.R
│   ├── logo.png
│   └── masterpiece_tweets.RData
├── data
│   ├── occupations.rda
│   ├── occupations_examples.rda
│   ├── scotus_tweets.rda
│   └── scotus_tweets_examples.rda
├── man
│   ├── complete_chat.Rd
│   ├── complete_prompt.Rd
│   ├── figures
│   │   ├── README-unnamed-chunk-15-1.png
│   │   ├── README-unnamed-chunk-16-1.png
│   │   └── logo.png
│   ├── format_chat.Rd
│   ├── format_prompt.Rd
│   ├── occupations.Rd
│   ├── occupations_examples.Rd
│   ├── openai_api_key.Rd
│   ├── scotus_tweets.Rd
│   └── scotus_tweets_examples.Rd
└── promptr.Rproj

/.Rbuildignore:
--------------------------------------------------------------------------------
^promptr\.Rproj$
^\.Rproj\.user$
^LICENSE\.md$
^README\.Rmd$
^data-raw$
^cran-comments\.md$
^CRAN-SUBMISSION$

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.Rproj.user
.Rhistory
.Rdata
.httr-oauth
.DS_Store
.quarto

--------------------------------------------------------------------------------
/CRAN-SUBMISSION:
--------------------------------------------------------------------------------
Version: 1.0.0
Date: 2024-08-15 12:27:04 UTC
SHA: 6fcf4960634342156a3c6fdb3069fa07b90a8963

--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
Package: promptr
Title: Format and Complete Few-Shot LLM Prompts
Version: 1.0.0
Authors@R: 
    person("Joe", "Ornstein", , "jornstein@uga.edu", role = c("aut", "cre", "cph"),
           comment = c(ORCID = "0000-0002-5704-2098"))
Description: Format and submit few-shot prompts to OpenAI's Large Language Models (LLMs). Designed to be particularly useful for text classification problems in the social sciences. Methods are described in Ornstein, Blasingame, and Truscott (2024).
8 | License: MIT + file LICENSE 9 | Encoding: UTF-8 10 | Roxygen: list(markdown = TRUE) 11 | RoxygenNote: 7.3.2 12 | Imports: 13 | curl, 14 | dplyr, 15 | glue, 16 | httr2, 17 | jsonlite, 18 | stringr 19 | Depends: 20 | R (>= 2.10) 21 | LazyData: true 22 | URL: https://github.com/joeornstein/promptr 23 | BugReports: https://github.com/joeornstein/promptr/issues 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2024 2 | COPYRIGHT HOLDER: promptr authors 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2024 promptr authors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(complete_chat) 4 | export(complete_prompt) 5 | export(format_chat) 6 | export(format_prompt) 7 | export(openai_api_key) 8 | importFrom(utils,read.table) 9 | importFrom(utils,write.table) 10 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # promptr 1.0.0 2 | 3 | * Initial CRAN submission. 4 | -------------------------------------------------------------------------------- /R/complete_chat.R: -------------------------------------------------------------------------------- 1 | #' Complete an LLM Chat 2 | #' 3 | #' @description 4 | #' Submits a prompt to OpenAI's "Chat" API endpoint and formats the response into a string or tidy dataframe. 5 | #' 6 | #' 7 | #' @param prompt The prompt 8 | #' @param model Which OpenAI model to use. Defaults to 'gpt-3.5-turbo' 9 | #' @param openai_api_key Your API key. By default, looks for a system environment variable called "OPENAI_API_KEY" (recommended option). Otherwise, it will prompt you to enter the API key as an argument. 10 | #' @param max_tokens How many tokens (roughly 4 characters of text) should the model return? Defaults to a single token (next word prediction). 
#' @param temperature A numeric between 0 and 2. When set to zero, the model will always return the most probable next token. For values greater than zero, the model selects the next word probabilistically.
#' @param seed An integer. If specified, the OpenAI API will "make a best effort to sample deterministically".
#' @param parallel TRUE to submit API requests in parallel. Setting to FALSE can reduce rate limit errors at the expense of longer runtime.
#'
#' @return If max_tokens = 1, returns a dataframe with the 20 most likely next-word responses and their probabilities. If max_tokens > 1, returns a single string of text generated by the model.
#' @export
#'
#' @examples \dontrun{
#' format_chat('Are frogs sentient? Yes or No.') |> complete_chat()
#' format_chat('Write a haiku about frogs.') |> complete_chat(max_tokens = 100)
#' }
complete_chat <- function(prompt,
                          model = 'gpt-3.5-turbo',
                          openai_api_key = Sys.getenv('OPENAI_API_KEY'),
                          max_tokens = 1,
                          temperature = 0,
                          seed = NULL,
                          parallel = FALSE) {

  if(openai_api_key == ''){
    stop("No API key detected in system environment. You can enter it manually using the 'openai_api_key' argument.")
  }

  # function to return a formatted API request
  format_request <- function(prompt,
                             base_url = "https://api.openai.com/v1/chat/completions"){

    # when requesting a single token, also request the top 20 log probabilities
    logprobs <- max_tokens == 1
    top_logprobs <- NULL
    if(logprobs) top_logprobs <- 20

    httr2::request(base_url) |>
      # headers
      httr2::req_headers('Authorization' = paste("Bearer", openai_api_key)) |>
      httr2::req_headers("Content-Type" = "application/json") |>
      # body
      httr2::req_body_json(list(model = model,
                                messages = prompt,
                                temperature = temperature,
                                max_tokens = max_tokens,
                                logprobs = logprobs,
                                top_logprobs = top_logprobs,
                                seed = seed)) #|>
      #httr2::req_retry(max_tries = 10)
  }

  # format a list of requests
  if(is.character(prompt[[1]][[1]])) prompt <- list(prompt) # if prompt is singular, this condition will be true
  reqs <- lapply(prompt, format_request) # Map(f = format_request, prompt = prompt)

  # submit prompts sequentially or in parallel
  if(parallel){
    # 20 concurrent requests per host seems to be the optimum
    resps <- httr2::req_perform_parallel(reqs, pool = curl::new_pool(host_con = 20))
  } else{
    resps <- httr2::req_perform_sequential(reqs)
  }

  # parse the responses
  parsed <- resps |>
    lapply(httr2::resp_body_string) |>
    lapply(jsonlite::fromJSON, flatten = TRUE)

  # if max_tokens > 1, return the text
  to_return <- unlist(lapply(parsed, function(x) x$choices$message.content))

  # if max_tokens == 1, return a tidy dataframe of probabilities for each prompt
  if(max_tokens == 1){

    df <- lapply(parsed, function(x) x$choices$logprobs.content[[1]]$top_logprobs[[1]])

    # to_return <- df |>
    #   lapply(function(x) dplyr::mutate(x, probability = exp(logprob))) |>
    #   lapply(function(x) dplyr::select(x, token, probability))
    to_return <- df |>
      lapply(function(x) cbind(x, probability = exp(x[['logprob']]))) |>
      lapply(function(x) x[, c('token', 'probability')])

    # don't return it as a list if there's only one prompt in the input
    if(length(prompt) == 1){
      to_return <- to_return[[1]]
    }

  }

  return(to_return)

  # # httr code adapted from https://github.com/irudnyts/openai
  #
  # ## Build path parameters ----------------------
  #
  # task <- "chat/completions"
  #
  # base_url <- glue::glue("https://api.openai.com/v1/{task}")
  #
  # headers <- c(
  #   "Authorization" = paste("Bearer", openai_api_key),
  #   "Content-Type" = "application/json"
  # )
  #
  # ## Build request body ----------------------------
  #
  # body <- list()
  # body[['model']] <- model
  # body[['messages']] <- prompt
  # body[['max_tokens']] <- max_tokens
  # body[['temperature']] <- temperature
  # if(max_tokens == 1){
  #   body[['logprobs']] <- TRUE
  #   body[['top_logprobs']] <- 5
  # }
  # body[['seed']] <- seed
  #
  # ## Make a request and parse it ----------------
  # response <- httr::POST(
  #   url = base_url,
  #   httr::add_headers(.headers = headers),
  #   body = body,
  #   encode = "json"
  # )
  #
  # parsed <- response |>
  #   httr::content(as = "text", encoding = "UTF-8") |>
  #   jsonlite::fromJSON(flatten = TRUE)
  #
  # ## Check whether request failed and return parsed --------------
  #
  # if (httr::http_error(response)) {
  #   paste0(
  #     "OpenAI API request failed [",
  #     httr::status_code(response),
  #     "]:\n\n",
  #     parsed$error$message
  #   ) |>
  #     stop(call. = FALSE)
  # }
  #
  # # if max_tokens > 1, return the text
  # to_return <- parsed$choices$message.content
  #
  # # if max_tokens == 1, return a tidy dataframe of probabilities for each prompt
  # if(max_tokens == 1){
  #
  #   df <- parsed$choices$logprobs.content[[1]]$top_logprobs[[1]]
  #
  #   df$probability <- exp(df$logprob)
  #
  #   to_return <- data.frame(token = df$token,
  #                           probability = df$probability)
  #
  # }
  #
  # return(to_return)

}

--------------------------------------------------------------------------------
/R/complete_prompt.R:
--------------------------------------------------------------------------------
#' Complete an LLM Prompt
#'
#' @description
#' Submits a text prompt to OpenAI's "Completion" API endpoint and formats the response into a string or tidy dataframe. (Note that, as of 2024, this endpoint is considered "Legacy" by OpenAI and is likely to be deprecated.)
#'
#'
#' @param prompt The prompt
#' @param model Which OpenAI model to use. Defaults to 'gpt-3.5-turbo-instruct'
#' @param openai_api_key Your API key. By default, looks for a system environment variable called "OPENAI_API_KEY" (recommended option). Otherwise, it will prompt you to enter the API key as an argument.
#' @param max_tokens How many tokens (roughly 4 characters of text) should the model return? Defaults to a single token (next word prediction).
#' @param temperature A numeric between 0 and 2. When set to zero, the model will always return the most probable next token. For values greater than zero, the model selects the next word probabilistically.
#' @param seed An integer. If specified, the OpenAI API will "make a best effort to sample deterministically".
#' @param parallel TRUE to submit API requests in parallel. Setting to FALSE can reduce rate limit errors at the expense of longer runtime.
#'
#' @return If max_tokens = 1, returns a dataframe with the 5 most likely next words and their probabilities.
If max_tokens > 1, returns a single string of text generated by the model. 16 | #' @export 17 | #' 18 | #' @examples \dontrun{ 19 | #' complete_prompt('I feel like a') 20 | #' complete_prompt('Here is my haiku about frogs:', 21 | #' max_tokens = 100) 22 | #' } 23 | complete_prompt <- function(prompt, 24 | model = 'gpt-3.5-turbo-instruct', 25 | openai_api_key = Sys.getenv('OPENAI_API_KEY'), 26 | max_tokens = 1, 27 | temperature = 0, 28 | seed = NULL, 29 | parallel = FALSE) { 30 | 31 | if(openai_api_key == ''){ 32 | stop("No API key detected in system environment. You can enter it manually using the 'openai_api_key' argument.") 33 | } 34 | 35 | # function to return a formatted API request ----------------- 36 | format_request <- function(prompt, 37 | base_url = "https://api.openai.com/v1/completions"){ 38 | 39 | logprobs <- NULL 40 | if(max_tokens == 1) logprobs <- 5 41 | 42 | httr2::request(base_url) |> 43 | # headers 44 | httr2::req_headers('Authorization' = paste("Bearer", openai_api_key)) |> 45 | httr2::req_headers("Content-Type" = "application/json") |> 46 | # body 47 | httr2::req_body_json(list(model = model, 48 | prompt = prompt, 49 | temperature = temperature, 50 | max_tokens = max_tokens, 51 | logprobs = logprobs, 52 | seed = seed)) 53 | } 54 | 55 | # split the prompt into chunks; the API will accept 2048 at most 56 | chunks <- split(prompt, ceiling(seq_along(prompt) / 2048)) 57 | 58 | # format requests 59 | reqs <- lapply(chunks, format_request) 60 | 61 | # submit prompts sequentially or in parallel 62 | if(parallel){ 63 | # 20 concurrent requests per host seems to be the optimum 64 | resps <- httr2::req_perform_parallel(reqs, pool = curl::new_pool(host_con = 20)) 65 | } else{ 66 | resps <- httr2::req_perform_sequential(reqs) 67 | } 68 | 69 | # parse the responses 70 | parsed <- resps |> 71 | lapply(httr2::resp_body_string) |> 72 | lapply(jsonlite::fromJSON, flatten=TRUE) 73 | 74 | 75 | # if max_tokens > 1, return the text 76 | to_return <- unlist(lapply(parsed, function(x) x$choices$text)) 77 | 78 | # if max_tokens == 1, return a tidy dataframe of probabilities for each prompt 79 | if(max_tokens == 1){ 80 | # get list of logprobs 81 | top_logprobs <- parsed |> 82 | lapply(function(x) x$choices$logprobs.top_logprobs) |> 83 | unlist(recursive = FALSE) 84 | 85 | # convert to list of dataframes 86 | tokens <- lapply(top_logprobs, names) 87 | logprobs <- lapply(top_logprobs, as.numeric) 88 | 89 | to_return <- Map(function(token,logprob){ 90 | data.frame(token = trimws(token), 91 | probability = exp(logprob)) 92 | }, tokens, logprobs) 93 | 94 | # don't return it as a list if there's only one prompt in the input 95 | if(length(prompt) == 1){ 96 | to_return <- to_return[[1]] 97 | } 98 | } 99 | 100 | return(to_return) 101 | 102 | } 103 | -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' Tweets About The Supreme Court of the United States 2 | #' 3 | #' This dataset contains 945 tweets referencing the US Supreme Court. 4 | #' Roughly half were collected on June 4, 2018 following the *Masterpiece Cakeshop* 5 | #' ruling, and the other half were collected on July 9, 2020 following the 6 | #' Court's concurrently released opinions in *Trump v. Mazars* and *Trump v. Vance*. 7 | #' Each tweet includes three independent human-coded sentiment scores (-1 to +1). 
#'
#' CONTENT WARNING: These texts come from social media, and many contain explicit
#' or offensive language.
#'
#' @docType data
#'
#' @usage data(scotus_tweets)
#'
#' @format
#' A data frame with 945 rows and 6 columns:
#' \describe{
#'   \item{tweet_id}{A unique ID}
#'   \item{text}{The text of the tweet}
#'   \item{case}{An identifier denoting which Supreme Court ruling the tweet was collected after.}
#'   \item{expert1, expert2, expert3}{Hand-coded sentiment score (-1 = negative, 0 = neutral, 1 = positive)}
#' }
#'
#' @keywords datasets
#'
#' @references Ornstein et al. (2024). "How To Train Your Stochastic Parrot"
"scotus_tweets"


#' Labelled Example Tweets About The Supreme Court of the United States
#'
#' This dataset contains 12 example tweets referencing the Supreme Court
#' along with a sentiment label. These can be used as few-shot prompt
#' examples for classifying tweets in the `scotus_tweets` dataset.
#'
#' @docType data
#'
#' @usage data(scotus_tweets_examples)
#'
#' @format
#' A data frame with 12 rows and 4 columns:
#' \describe{
#'   \item{tweet_id}{A unique ID for each tweet}
#'   \item{text}{The text of the tweet}
#'   \item{case}{The case referenced in the tweet (Masterpiece Cakeshop or Trump v. Mazars)}
#'   \item{label}{The "true" label (Positive, Negative, or Neutral)}
#' }
#'
#' @keywords datasets
#'
#' @references Ornstein et al. (2024). "How To Train Your Stochastic Parrot"
"scotus_tweets_examples"

#' Occupations
#'
#' This dataset contains 3,948 ballot designations from municipal elections in California.
#' A random subset is hand-labeled as either "Working Class" or "Not Working Class" occupations.
#'
#' @docType data
#'
#' @usage data(occupations)
#'
#' @format
#' A data frame with 3948 rows and 2 columns:
#' \describe{
#'   \item{baldesig}{Ballot designation as it appears in the CEDA dataset}
#'   \item{hand_coded}{A hand-coded occupation classification (for a random subset)}
#' }
#'
#' @keywords datasets
#'
#' @references California Elections Data Archive (CEDA). https://hdl.handle.net/10211.3/210187
"occupations"


#' Labelled Occupations
#'
#' This dataset contains 9 example occupations
#' along with a classification. These can be used as few-shot
#' examples for classifying occupations in the `occupations` dataset.
#'
#' @docType data
#'
#' @usage data(occupations_examples)
#'
#' @format
#' A data frame with 9 rows and 2 columns:
#' \describe{
#'   \item{text}{The text of the ballot designation}
#'   \item{label}{The hand-coded label (Working Class, Not Working Class, NA)}
#' }
#'
#' @keywords datasets
#'
#' @references California Elections Data Archive (CEDA). https://hdl.handle.net/10211.3/210187
"occupations_examples"

--------------------------------------------------------------------------------
/R/format_chat.R:
--------------------------------------------------------------------------------
#' Format a Chat Prompt
#'
#' @description
#' Format a chat prompt to submit to OpenAI's ChatGPT or GPT-4 (particularly useful for classification tasks).
#'
#' @param text The text to be classified.
#' @param instructions Instructions to be included at the beginning of the prompt (format them like you would format instructions to a human research assistant).
#' @param examples A dataframe of "few-shot" examples. Must include one column called "text" with the example text(s) and another column called "label" with the correct label(s).
#' @param system_message An optional "system message" with high-level instructions (e.g. "You are a helpful research assistant.").
#'
#' @return Returns a series of messages formatted as a list object, which can be used as an input for promptr::complete_chat() or openai::create_chat_completion().
#' @export
#'
#' @examples
#' data(scotus_tweets_examples)
#'
#' format_chat(text = "I am disappointed with this ruling.",
#'             instructions = "Decide if the statement is Positive or Negative.",
#'             examples = scotus_tweets_examples)
format_chat <- function(text, instructions = NA,
                        examples = data.frame(),
                        system_message = NA){

  # initialize empty list
  result <- list()

  # start with system message (if applicable)
  if(!is.na(system_message)){
    result[[length(result) + 1]] <- list('role' = 'system',
                                         'content' = system_message)
  }

  # add instructions
  if(!is.na(instructions)){
    result[[length(result) + 1]] <- list('role' = 'user',
                                         'content' = instructions)
  }

  # loop through examples, formatting as user/assistant responses
  if(nrow(examples) > 0){
    for(i in 1:nrow(examples)){
      user_entry <- list('role' = 'user', 'content' = examples$text[i])
      assistant_entry <- list('role' = 'assistant', 'content' = examples$label[i])

      result[[length(result) + 1]] <- user_entry
      result[[length(result) + 1]] <- assistant_entry
    }
  }

  # add the text to be classified
  result[[length(result) + 1]] <- list('role' = 'user',
                                       'content' = text)

  return(result)

}

--------------------------------------------------------------------------------
/R/format_prompt.R:
--------------------------------------------------------------------------------
#' Format an LLM prompt
#'
#' @description
#' Format a text prompt for a Large Language Model. Particularly useful for few-shot text classification tasks. Note that if you are planning to use one of OpenAI's chat models, like ChatGPT or GPT-4, you will want to use the `format_chat()` function instead.
#'
#' @param text The text to be classified. Can be a character vector or a single string.
#' @param instructions Instructions to be included in the prompt (format them like you would format instructions to a human research assistant).
#' @param examples A dataframe of "few-shot" examples. Must include one column called "text" with the example text(s) and another column called "label" with the correct label(s).
#' @param template The template for how examples and completions should be formatted, in `glue` syntax. If you are including few-shot examples in the prompt, this must contain the \{text\} and \{label\} placeholders.
#' @param prompt_template The template for the entire prompt. Defaults to instructions, followed by few-shot examples, followed by the input to be classified.
#' @param separator A character that separates examples. Defaults to two newlines (a blank line between examples).
#'
#' @return Returns a formatted prompt that can be used as input for `complete_prompt()` or `openai::create_completion()`.
#' @export
#'
#' @examples
#' data(scotus_tweets_examples)
#'
#' format_prompt(text = "I am disappointed with this ruling.",
#'               instructions = "Decide if the sentiment of this statement is Positive or Negative.",
#'               examples = scotus_tweets_examples,
#'               template = "Statement: {text}\nSentiment: {label}")
#'
#' format_prompt(text = 'I am sad about the Supreme Court',
#'               examples = scotus_tweets_examples,
#'               template = '"{text}" is a {label} statement',
#'               separator = '\n')
format_prompt <- function(text,
                          instructions = '',
                          examples = data.frame(),
                          template = 'Text: {text}\nClassification: {label}',
                          prompt_template = '{instructions}{examples}{input}',
                          separator = '\n\n'){

  # convert examples dataframe to string
  if(nrow(examples) == 0){
    examples <- ''
  } else{
    examples <- examples |>
      dplyr::mutate(prompt_segment = glue::glue(template))

    examples <- examples$prompt_segment |>
      paste(collapse = separator) |>
      paste0(separator)
  }

  # add separator to instructions
  if(nchar(instructions) > 0){
    instructions <- paste0(instructions, separator)
  }

  # format input using template (removing the {label} tag and anything after it)
  input <- template |>
    stringr::str_replace('\\{label\\}.*', '') |>
    stringr::str_trim() |>
    glue::glue()

  # glue together the complete prompt template
  glue::glue(prompt_template)

}

--------------------------------------------------------------------------------
/R/openai_api_key.R:
--------------------------------------------------------------------------------
#' Install an OPENAI API KEY in Your \code{.Renviron} File for Repeated Use
#' @description This function will add your OpenAI API key to your \code{.Renviron} file so it can be called securely without being stored
#' in your code. After you have installed your key, it can be called any time by typing \code{Sys.getenv("OPENAI_API_KEY")} and will be
#' automatically called in package functions. If you do not have an \code{.Renviron} file, the function will create one for you.
#' If you already have an \code{.Renviron} file, the function will append the key to your existing file, while making a backup of your
#' original file for disaster recovery purposes.
#' @param key The API key provided to you by OpenAI, formatted in quotes.
#' @param install If TRUE, will install the key in your \code{.Renviron} file for use in future sessions. Defaults to FALSE.
#' @param overwrite If this is set to TRUE, it will overwrite an existing OPENAI_API_KEY that you already have in your \code{.Renviron} file.
#' @importFrom utils write.table read.table
#'
#' @return No return value, called for side effects
#'
#' @examples
#'
#' \dontrun{
#' openai_api_key("111111abc", install = TRUE)
#' # Reload your environment so you can use the key without restarting R.
#' readRenviron("~/.Renviron")
#' # You can check it with:
#' Sys.getenv("OPENAI_API_KEY")
#' }
#'
#' \dontrun{
#' # If you need to overwrite an existing key:
#' openai_api_key("111111abc", overwrite = TRUE, install = TRUE)
#' # Reload your environment so you can use the key without restarting R.
#' readRenviron("~/.Renviron")
#' # You can check it with:
#' Sys.getenv("OPENAI_API_KEY")
#' }
#' @export

openai_api_key <- function(key, overwrite = FALSE, install = FALSE){

  if (install) {
    home <- Sys.getenv("HOME")
    renv <- file.path(home, ".Renviron")
    if(file.exists(renv)){
      # Backup original .Renviron before doing anything else here.
      file.copy(renv, file.path(home, ".Renviron_backup"))
    }
    if(!file.exists(renv)){
      file.create(renv)
    } else{
      if(isTRUE(overwrite)){
        message("Your original .Renviron will be backed up and stored in your R HOME directory if needed.")
        oldenv <- read.table(renv, stringsAsFactors = FALSE)
        newenv <- oldenv[-grep("OPENAI_API_KEY", oldenv), ]
        write.table(newenv, renv, quote = FALSE, sep = "\n",
                    col.names = FALSE, row.names = FALSE)
      } else{
        tv <- readLines(renv)
        if(any(grepl("OPENAI_API_KEY", tv))){
          stop("An OPENAI_API_KEY already exists. You can overwrite it with the argument overwrite=TRUE", call. = FALSE)
        }
      }
    }

    keyconcat <- paste0("OPENAI_API_KEY='", key, "'")
    # Append API key to .Renviron file
    write(keyconcat, renv, sep = "\n", append = TRUE)
    message('Your API key has been stored in your .Renviron and can be accessed by Sys.getenv("OPENAI_API_KEY"). \nTo use now, restart R or run `readRenviron("~/.Renviron")`')
    return(key)
  } else {
    message("To install your API key for use in future sessions, run this function with `install = TRUE`.")
    Sys.setenv(OPENAI_API_KEY = key)
  }

}

--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
---
output: github_document
---



```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  warning = FALSE,
  message = FALSE,
  comment = "#>",
  fig.path = "man/figures/README-",
  out.width = "100%"
)
```

# promptr




We developed the `promptr` package so that researchers could easily format and submit LLM prompts using the R programming language. It provides a handful of convenient functions to query the OpenAI API and return the output as a tidy R dataframe. The package is intended to be particularly useful for social scientists using LLMs for text classification and scaling tasks.

## Installation

You can install the release version of `promptr` from CRAN with:

```{r, eval = FALSE}
install.packages('promptr')
```

You can install the development version of `promptr` from [GitHub](https://github.com/) with:

``` r
# install.packages("devtools")
devtools::install_github("joeornstein/promptr")
```

You will also need a developer account with OpenAI and an API key. For best performance, you may also want to provide credit card information (this significantly boosts your API rate limit, even if you're not spending money).

Once your account is created, copy-paste your API key into the following line of R code.

```
library(promptr)

openai_api_key('YOUR API KEY GOES HERE', install = TRUE)
```

Now you're all set up!
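
If you ever want to confirm that the key is visible to R (for instance, in a fresh session), you can check the environment variable directly. This is plain base R, and it assumes you stored the key under the default name `OPENAI_API_KEY`:

``` r
# should return TRUE once the key is installed
nzchar(Sys.getenv('OPENAI_API_KEY'))
```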

## Completing Prompts

The workhorse function of the `promptr` package is `complete_prompt()`. This function submits a prompt to the OpenAI API and returns a dataframe with the five most likely next word predictions and their associated probabilities.

```{r}
library(promptr)

complete_prompt('I feel like a')
```

If you prefer the model to autoregressively generate text instead of outputting the next-word probabilities, you can set the `max_tokens` input greater than 1. The function will return a character object with the most likely completion.

```{r}
complete_prompt('I feel like a', max_tokens = 18)
```

Note that by default, the `temperature` input is set to 0, which means the model will always return the most likely completion for your prompt. Increasing temperature allows the model to randomly select words from its estimated probability distribution (see the API reference for more on these parameters).
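
For example, a higher-temperature call might look like the sketch below (not evaluated here, since sampled completions change from run to run):

``` r
# with temperature > 0, the completion is sampled rather than deterministic;
# the seed argument asks the API to make a best effort at reproducible sampling
complete_prompt('I feel like a', max_tokens = 18, temperature = 1, seed = 42)
```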

You can also change which model variant the function calls using the `model` input. By default, it is set to "gpt-3.5-turbo-instruct", the RLHF variant of GPT-3.5. For the base GPT-3 variants, try "davinci-002" (175 billion parameters) or "babbage-002" (1.3 billion parameters).

## Formatting Prompts

Manually typing prompts with multiple few-shot examples can be tedious and error-prone, particularly if you want to include context-specific instructions or few-shot examples. We include the `format_prompt()` function to aid in that process.

The function is designed with classification problems in mind. If you input the text you would like to classify along with a set of instructions, the default prompt template looks like this:

```{r}
prompt <- format_prompt(text = 'I feel positively morose today.', 
                        instructions = 'Decide whether this statement is happy or sad.')
prompt
```

You can customize the template using `glue` syntax, with placeholders for {text} and {label}.

```{r}
format_prompt(text = 'I feel positively morose today.',
              instructions = 'Decide whether this statement is happy or sad.',
              template = 'Statement: {text}\nSentiment: {label}')
```

This function is particularly useful when including few-shot examples in the prompt. If you input these examples as a tidy dataframe, the `format_prompt()` function will paste them into the prompt according to the template. The `examples` dataframe must have at least two columns, one called "text" and the other called "label".

```{r}
examples <- data.frame(
  text = c('What a pleasant day!', 
           'Oh bother.',
           'Merry Christmas!',
           ':-('),
  label = c('happy', 'sad', 'happy', 'sad')
)

examples

prompt <- format_prompt(text = 'I feel positively morose today.', 
                        instructions = 'Decide whether this statement is happy or sad.',
                        examples = examples,
                        template = 'Statement: {text}\nSentiment: {label}')

prompt
```

Once you're satisfied with the format of the prompt, you can submit it with `complete_prompt()`:

```{r}
complete_prompt(prompt)
```

The full pipeline---first formatting the text into a prompt, then submitting the prompt for completion---looks like this:

```{r}
'What a joyous day for our adversaries.' |> 
  format_prompt(instructions = 'Classify this text as happy or sad.',
                examples = examples) |> 
  complete_prompt()
```

The biggest advantage of using text prompts like these is **efficiency**. One can request up to 2,048 next-word probability distributions in a single API call, whereas ChatGPT prompts (see next section) can only be submitted one at a time. Both the `format_prompt()` function and the `complete_prompt()` function are vectorized so that users can submit multiple texts to be classified simultaneously.

```{r}
texts <- c('What a wonderful world??? As if!', 'Things are looking up.', 'Me gusta mi vida.')

texts |> 
  format_prompt(instructions = 'Classify these texts as happy or sad.',
                examples = examples) |> 
  complete_prompt()
```

## Example: Supreme Court Tweets

To illustrate the entire workflow, let's classify the sentiment of social media posts from the Supreme Court Tweets dataset included in the package.

```{r}
data(scotus_tweets) # the full dataset
data(scotus_tweets_examples) # a dataframe with few-shot examples
```

Let's focus on tweets posted following the *Masterpiece Cakeshop v Colorado* (2018) decision, formatting the prompts with a set of instructions and few-shot examples tailored to that context.

```{r}
library(tidyverse)

masterpiece_tweets <- scotus_tweets |> 
  filter(case == 'masterpiece')

instructions <- 'Read these tweets posted the day after the US Supreme Court ruled in favor of a baker who refused to bake a wedding cake for a same-sex couple (Masterpiece Cakeshop, 2018). For each tweet, decide whether its sentiment is Positive, Neutral, or Negative.'

masterpiece_examples <- scotus_tweets_examples |> 
  filter(case == 'masterpiece')

masterpiece_tweets$prompt <- format_prompt(text = masterpiece_tweets$text, 
                                           instructions = instructions,
                                           examples = masterpiece_examples)

masterpiece_tweets$prompt[3]
```

Then we can submit this list of prompts using `complete_prompt()`:

```{r, echo=TRUE, eval=FALSE}
masterpiece_tweets$out <- complete_prompt(masterpiece_tweets$prompt)
```

```{r, echo = FALSE, eval = TRUE}
load('data-raw/masterpiece_tweets.RData')
```

The estimated probability distribution for each completion is now a list of dataframes in the `out` column.
We can compute a simple sentiment score by taking the estimated probability each tweet is Positive minus the estimated probability the tweet is Negative:

```{r}
masterpiece_tweets$score <- masterpiece_tweets$out |> 
  lapply(mutate, token = str_to_lower(token)) |> 
  lapply(summarize, 
         positive = sum(probability[token == 'positive']),
         negative = sum(probability[token == 'negative'])) |> 
  lapply(summarize, score = positive - negative) |> 
  unlist()
```

Finally, let's compare those scores from GPT-3.5 with the authors' hand-coded sentiment scores (-1 for Negative, 0 for Neutral, and +1 for Positive).

```{r}
ggplot(data = masterpiece_tweets,
       mapping = aes(
         x = (expert1 + expert2 + expert3) / 3,
         y = score
       )) +
  geom_jitter(width = 0.1) +
  labs(x = 'Hand-Coded Sentiment',
       y = 'GPT-3.5 Sentiment Score') +
  theme_bw()
```


## Chat Completions

The most recent OpenAI language models---including ChatGPT and GPT-4---have been fine-tuned to function as "chat" models, and interacting with them through the API requires a slightly different format for the inputs. Instead of a single text prompt, few-shot prompts are expressed in the form of a "dialogue" between the user and the model, which we can represent in `R` as a "list of lists".

```{r}
prompt <- list(
  list(role = 'user',
       content = 'Hello can you help me with a homework problem?'),
  list(role = 'assistant',
       content = 'Sure thing! What is the problem?'),
  list(role = 'user',
       content = 'I need to explain why Frederick the Great was so fond of potatoes?')
)
```

Users can submit a chat prompt to the API using the `complete_chat()` function. The default model is "gpt-3.5-turbo" (the most cost-effective chat model offered through the API as of February 2024).

```{r}
complete_chat(prompt, max_tokens = 300)
```

The `format_chat()` function allows users to create a chat prompt using the same syntax as `format_prompt()`.

```{r}
tweet <- masterpiece_tweets$text[4]
cat(tweet)

prompt <- format_chat(tweet, 
                      instructions = 'Read these tweets posted the day after the US Supreme Court ruled in favor of a baker who refused to bake a wedding cake for a same-sex couple (Masterpiece Cakeshop, 2018). For each tweet, decide whether its sentiment is Positive, Neutral, or Negative.',
                      examples = masterpiece_examples)

prompt
```

One advantage of these chat models is that they typically do not require as many few-shot examples to perform well, but their big practical disadvantage is that we can only submit one chat to the API at a time.

```{r}
response <- complete_chat(prompt)
response
```
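
Classifying an entire column of texts with a chat model is therefore just a loop over single requests. Here is a minimal sketch (not evaluated here; it reuses the `instructions` and `masterpiece_examples` objects created above and keeps the single most probable label for each tweet):

``` r
# submit one chat per tweet; with the default max_tokens = 1, complete_chat()
# returns a dataframe of token probabilities, so keep the most probable token
labels <- sapply(masterpiece_tweets$text[1:5], function(tweet){
  out <- tweet |>
    format_chat(instructions = instructions,
                examples = masterpiece_examples) |>
    complete_chat()
  out$token[which.max(out$probability)]
})
```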

```{r, echo=FALSE, eval=FALSE}
'Rate the sentiment of this tweet on a scale from 0 to 100, where 0 means "Extremely Negative" and 100 means "Extremely Positive".'
```

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------



# promptr




We developed the `promptr` package so that researchers could easily
format and submit LLM prompts using the R programming language. It
provides a handful of convenient functions to query the OpenAI API and
return the output as a tidy R dataframe. The package is intended to be
particularly useful for social scientists using LLMs for text
classification and scaling tasks.

## Installation

You can install the release version of `promptr` from CRAN with:

``` r
install.packages('promptr')
```

Or you can install the latest development version from
[GitHub](https://github.com/) with:

``` r
# install.packages("devtools")
devtools::install_github("joeornstein/promptr")
```

You will also need a developer account with OpenAI and an API key. For
best performance, you may also want to provide credit card information
(this significantly boosts your API rate limit, even if you're not
spending money).

Once your account is created, copy-paste your API key into the following
line of R code.

    library(promptr)

    openai_api_key('YOUR API KEY GOES HERE', install = TRUE)

Now you're all set up!

## Completing Prompts

The workhorse function of the `promptr` package is `complete_prompt()`.
This function submits a prompt to the OpenAI API and returns a dataframe
with the five most likely next word predictions and their associated
probabilities.

``` r
library(promptr)

complete_prompt('I feel like a')
#>    token probability
#> 1    lot  0.20985606
#> 2 little  0.02118042
#> 3    kid  0.01374532
#> 4    new  0.01208388
#> 5    big  0.01204145
```

If you prefer the model to autoregressively generate text instead of
outputting the next-word probabilities, you can set the `max_tokens`
input greater than 1. The function will return a character object with
the most likely completion.

``` r
complete_prompt('I feel like a', max_tokens = 18)
#> [1] " lot of people are gonna be like, \"Oh, I'm gonna be a doctor.\"\n\n"
```

Note that by default, the `temperature` input is set to 0, which means
the model will always return the most likely completion for your prompt.
Increasing temperature allows the model to randomly select words from
its estimated probability distribution (see the API reference for more
on these parameters).

You can also change which model variant the function calls using the
`model` input. By default, it is set to "gpt-3.5-turbo-instruct", the
RLHF variant of GPT-3.5. For the base GPT-3 variants, try "davinci-002"
(175 billion parameters) or "babbage-002" (1.3 billion parameters).

## Formatting Prompts

Manually typing prompts with multiple few-shot examples can be tedious
and error-prone, particularly if you want to include context-specific
instructions or few-shot examples. We include the `format_prompt()`
function to aid in that process.

The function is designed with classification problems in mind. If you
input the text you would like to classify along with a set of
instructions, the default prompt template looks like this:

``` r
prompt <- format_prompt(text = 'I feel positively morose today.', 
                        instructions = 'Decide whether this statement is happy or sad.')
prompt
#> Decide whether this statement is happy or sad.
#> 
#> Text: I feel positively morose today.
#> Classification:
```

You can customize the template using `glue` syntax, with placeholders
for {text} and {label}.

``` r
format_prompt(text = 'I feel positively morose today.',
              instructions = 'Decide whether this statement is happy or sad.',
              template = 'Statement: {text}\nSentiment: {label}')
#> Decide whether this statement is happy or sad.
#> 
#> Statement: I feel positively morose today.
#> Sentiment:
```

This function is particularly useful when including few-shot examples in
the prompt. If you input these examples as a tidy dataframe, the
`format_prompt()` function will paste them into the prompt according to
the template. The `examples` dataframe must have at least two columns,
one called "text" and the other called "label".

``` r
examples <- data.frame(
  text = c('What a pleasant day!', 
           'Oh bother.',
           'Merry Christmas!',
           ':-('),
  label = c('happy', 'sad', 'happy', 'sad')
)

examples
#>                   text label
#> 1 What a pleasant day! happy
#> 2           Oh bother.   sad
#> 3     Merry Christmas! happy
#> 4                  :-(   sad

prompt <- format_prompt(text = 'I feel positively morose today.', 
                        instructions = 'Decide whether this statement is happy or sad.',
                        examples = examples,
                        template = 'Statement: {text}\nSentiment: {label}')

prompt
#> Decide whether this statement is happy or sad.
#> 
#> Statement: What a pleasant day!
#> Sentiment: happy
#> 
#> Statement: Oh bother.
#> Sentiment: sad
#> 
#> Statement: Merry Christmas!
#> Sentiment: happy
#> 
#> Statement: :-(
#> Sentiment: sad
#> 
#> Statement: I feel positively morose today.
#> Sentiment:
```

Once you're satisfied with the format of the prompt, you can submit it
with `complete_prompt()`:

``` r
complete_prompt(prompt)
#>     token  probability
#> 1     sad 9.990284e-01
#> 2     sad 6.382159e-04
#> 3     Sad 1.961563e-04
#> 4   happy 3.677703e-05
#> 5 sadness 2.776648e-05
```

The full pipeline—first formatting the text into a prompt, then
submitting the prompt for completion—looks like this:

``` r
'What a joyous day for our adversaries.' |> 
  format_prompt(instructions = 'Classify this text as happy or sad.',
                examples = examples) |> 
  complete_prompt()
#>     token  probability
#> 1     sad 0.9931754130
#> 2   happy 0.0023576333
#> 3     sad 0.0021634900
#> 4     Sad 0.0007275062
#> 5 unhappy 0.0006792638
```

The biggest advantage of using text prompts like these is
**efficiency**. One can request up to 2,048 next-word probability
distributions in a single API call, whereas ChatGPT prompts (see next
section) can only be submitted one at a time. Both the `format_prompt()`
function and the `complete_prompt()` function are vectorized so that
users can submit multiple texts to be classified simultaneously.

``` r
texts <- c('What a wonderful world???
As if!', 'Things are looking up.', 'Me gusta mi vida.') 204 | 205 | texts |> 206 | format_prompt(instructions = 'Classify these texts as happy or sad.', 207 | examples = examples) |> 208 | complete_prompt() 209 | #> [[1]] 210 | #> token probability 211 | #> 1 sad 0.9845923503 212 | #> 2 happy 0.0101702041 213 | #> 3 sad 0.0022756506 214 | #> 4 unhappy 0.0005526699 215 | #> 5 0.0005016985 216 | #> 217 | #> [[2]] 218 | #> token probability 219 | #> 1 happy 9.989103e-01 220 | #> 2 happy 8.046505e-04 221 | #> 3 7.620519e-05 222 | #> 4 5.893237e-05 223 | #> 5 Happy 2.052843e-05 224 | #> 225 | #> [[3]] 226 | #> token probability 227 | #> 1 happy 0.9957006846 228 | #> 2 happy 0.0012367921 229 | #> 3 0.0009202636 230 | #> 4 unsure 0.0002593114 231 | #> 5 0.0001682163 232 | ``` 233 | 234 | ## Example: Supreme Court Tweets 235 | 236 | To illustrate the entire workflow, let’s classify the sentiment of 237 | social media posts from the Supreme Court Tweets dataset included in the 238 | package. 239 | 240 | ``` r 241 | data(scotus_tweets) # the full dataset 242 | data(scotus_tweets_examples) # a dataframe with few-shot examples 243 | ``` 244 | 245 | Let’s focus on tweets posted following the *Masterpiece Cakeshop v 246 | Colorado* (2018) decision, formatting the prompts with a set of 247 | instructions and few-shot examples tailored to that context. 248 | 249 | ``` r 250 | library(tidyverse) 251 | 252 | masterpiece_tweets <- scotus_tweets |> 253 | filter(case == 'masterpiece') 254 | 255 | instructions <- 'Read these tweets posted the day after the US Supreme Court ruled in favor of a baker who refused to bake a wedding cake for a same-sex couple (Masterpiece Cakeshop, 2018). For each tweet, decide whether its sentiment is Positive, Neutral, or Negative.' 256 | 257 | masterpiece_examples <- scotus_tweets_examples |> 258 | filter(case == 'masterpiece') 259 | 260 | masterpiece_tweets$prompt <- format_prompt(text = masterpiece_tweets$text, 261 | instructions = instructions, 262 | examples = masterpiece_examples) 263 | 264 | masterpiece_tweets$prompt[3] 265 | #> Read these tweets posted the day after the US Supreme Court ruled in favor of a baker who refused to bake a wedding cake for a same-sex couple (Masterpiece Cakeshop, 2018). For each tweet, decide whether its sentiment is Positive, Neutral, or Negative. 266 | #> 267 | #> Text: Thank you Supreme Court I take pride in your decision!!!!✝️ #SCOTUS 268 | #> Classification: Positive 269 | #> 270 | #> Text: Supreme Court rules in favor of Colorado baker! This day is getting better by the minute! 271 | #> Classification: Positive 272 | #> 273 | #> Text: Can’t escape the awful irony of someone allowed to use religion to discriminate against people in love. 274 | #> Not my Jesus. 275 | #> #opentoall #SCOTUS #Hypocrisy #MasterpieceCakeshop 276 | #> Classification: Negative 277 | #> 278 | #> Text: I can’t believe this cake case went all the way to #SCOTUS . Can someone let me know what cake was ultimately served at the wedding? Are they married and living happily ever after? 279 | #> Classification: Neutral 280 | #> 281 | #> Text: Supreme Court rules in favor of baker who would not make wedding cake for gay couple 282 | #> Classification: Neutral 283 | #> 284 | #> Text: #SCOTUS set a dangerous precedent today. Although the Court limited the scope to which a business owner could deny services to patrons, the legal argument has been legitimized that one's subjective religious convictions trump (no pun intended) #humanrights. 
#LGBTQRights 285 | #> Classification: Negative 286 | #> 287 | #> Text: The @Scotus ruling was a 🥧 pie-in-the-face to liberal lunacy. 288 | #> 289 | #> @charliekirk11 @Richzeoli @DennisDMZ 290 | #> 291 | #> 🎂🎂🎂🎂🎂🎂🎂🎂🎂 292 | #> 293 | #> #CakeEquality #SCOTUS #liberaltears 294 | #> Classification: 295 | ``` 296 | 297 | Then we can submit this list of prompts using `complete_prompt()`: 298 | 299 | ``` r 300 | masterpiece_tweets$out <- complete_prompt(masterpiece_tweets$prompt) 301 | ``` 302 | 303 | The estimated probability distribution for each completion is now a list 304 | of dataframes in the `out` column. We can compute a simple sentiment 305 | score by taking the estimated probability each tweet is Positive minus 306 | the estimated probability the tweet is Negative: 307 | 308 | ``` r 309 | masterpiece_tweets$score <- masterpiece_tweets$out |> 310 | lapply(mutate, token = str_to_lower(token)) |> 311 | lapply(summarize, 312 | positive = sum(probability[token=='positive']), 313 | negative = sum(probability[token=='negative'])) |> 314 | lapply(summarize,score=positive-negative) |> 315 | unlist() 316 | ``` 317 | 318 | Finally, let’s compare those scores from GPT-3.5 with the authors’ 319 | hand-coded sentiment scores (-1 for Negative, 0 for Neutral, and +1 for 320 | Positive). 321 | 322 | ``` r 323 | ggplot(data = masterpiece_tweets, 324 | mapping = aes( 325 | x = (expert1 + expert2 + expert3) / 3, 326 | y = score 327 | )) + 328 | geom_jitter(width = 0.1) + 329 | labs(x = 'Hand-Coded Sentiment', 330 | y = 'GPT-3.5 Sentiment Score') + 331 | theme_bw() 332 | ``` 333 | 334 | 335 | 336 | ## Chat Completions 337 | 338 | The most recent OpenAI language models—including ChatGPT and GPT-4—have 339 | been fine-tuned to function as “chat” models, and interacting with them 340 | through the API requires a slightly different format for the inputs. 341 | Instead of a single text prompt, few-shot prompts are expressed in the 342 | form of a “dialogue” between the user and the model, which we can 343 | represent in `R` as a “list of lists”. 344 | 345 | ``` r 346 | prompt <- list( 347 | list(role = 'user', 348 | content = 'Hello can you help me with a homework problem?'), 349 | list(role = 'assistant', 350 | content = 'Sure thing! What is the problem?'), 351 | list(role = 'user', 352 | content = 'I need to explain why Frederick the Great was so fond of potatoes?') 353 | ) 354 | ``` 355 | 356 | Users can submit a chat prompt to the API using the `complete_chat()` 357 | function. The default model is “gpt-3.5-turbo” (the most cost-effective 358 | chat model offered through the API as of February 2024). 359 | 360 | ``` r 361 | complete_chat(prompt, max_tokens = 300) 362 | #> [1] "Frederick the Great, also known as Frederick II of Prussia, was fond of potatoes for several reasons. One of the main reasons was that he recognized the nutritional value and versatility of potatoes. Potatoes are a rich source of carbohydrates, vitamins, and minerals, making them a valuable food source for his subjects, especially during times of famine or food shortages.\n\nAdditionally, Frederick promoted the cultivation of potatoes in Prussia because they were easy to grow and had a high yield compared to other crops. This made potatoes a cost-effective and efficient food source for the population.\n\nFurthermore, Frederick saw the potential of potatoes as a way to improve the agricultural productivity of his kingdom. 
By encouraging the cultivation of potatoes, he aimed to increase food security and reduce dependence on imported grains.\n\nOverall, Frederick the Great's fondness for potatoes was driven by their nutritional value, ease of cultivation, and potential to improve agricultural productivity in Prussia." 363 | ``` 364 | 365 | The `format_chat()` function allows users to create a chat prompt using 366 | the same syntax as `format_prompt()`. 367 | 368 | ``` r 369 | tweet <- masterpiece_tweets$text[4] 370 | cat(tweet) 371 | #> Let’s be real, lame anti-gay cake probably sucks anyway. 372 | #> 373 | #> Also, I love you Sonia Sotomayor and RBG ❤️🧡💛💚💙💜 374 | #> 375 | #> #masterpiececakeshop #scotus 376 | 377 | prompt <- format_chat(tweet, 378 | instructions = 'Read these tweets posted the day after the US Supreme Court ruled in favor of a baker who refused to bake a wedding cake for a same-sex couple (Masterpiece Cakeshop, 2018). For each tweet, decide whether its sentiment is Positive, Neutral, or Negative.', 379 | examples = masterpiece_examples) 380 | 381 | prompt 382 | #> [[1]] 383 | #> [[1]]$role 384 | #> [1] "user" 385 | #> 386 | #> [[1]]$content 387 | #> [1] "Read these tweets posted the day after the US Supreme Court ruled in favor of a baker who refused to bake a wedding cake for a same-sex couple (Masterpiece Cakeshop, 2018). For each tweet, decide whether its sentiment is Positive, Neutral, or Negative." 388 | #> 389 | #> 390 | #> [[2]] 391 | #> [[2]]$role 392 | #> [1] "user" 393 | #> 394 | #> [[2]]$content 395 | #> [1] "Thank you Supreme Court I take pride in your decision!!!!✝️ #SCOTUS" 396 | #> 397 | #> 398 | #> [[3]] 399 | #> [[3]]$role 400 | #> [1] "assistant" 401 | #> 402 | #> [[3]]$content 403 | #> [1] "Positive" 404 | #> 405 | #> 406 | #> [[4]] 407 | #> [[4]]$role 408 | #> [1] "user" 409 | #> 410 | #> [[4]]$content 411 | #> [1] "Supreme Court rules in favor of Colorado baker! This day is getting better by the minute!" 412 | #> 413 | #> 414 | #> [[5]] 415 | #> [[5]]$role 416 | #> [1] "assistant" 417 | #> 418 | #> [[5]]$content 419 | #> [1] "Positive" 420 | #> 421 | #> 422 | #> [[6]] 423 | #> [[6]]$role 424 | #> [1] "user" 425 | #> 426 | #> [[6]]$content 427 | #> [1] "Can’t escape the awful irony of someone allowed to use religion to discriminate against people in love. \r\nNot my Jesus. \r\n#opentoall #SCOTUS #Hypocrisy #MasterpieceCakeshop" 428 | #> 429 | #> 430 | #> [[7]] 431 | #> [[7]]$role 432 | #> [1] "assistant" 433 | #> 434 | #> [[7]]$content 435 | #> [1] "Negative" 436 | #> 437 | #> 438 | #> [[8]] 439 | #> [[8]]$role 440 | #> [1] "user" 441 | #> 442 | #> [[8]]$content 443 | #> [1] "I can’t believe this cake case went all the way to #SCOTUS . Can someone let me know what cake was ultimately served at the wedding? Are they married and living happily ever after?" 444 | #> 445 | #> 446 | #> [[9]] 447 | #> [[9]]$role 448 | #> [1] "assistant" 449 | #> 450 | #> [[9]]$content 451 | #> [1] "Neutral" 452 | #> 453 | #> 454 | #> [[10]] 455 | #> [[10]]$role 456 | #> [1] "user" 457 | #> 458 | #> [[10]]$content 459 | #> [1] "Supreme Court rules in favor of baker who would not make wedding cake for gay couple" 460 | #> 461 | #> 462 | #> [[11]] 463 | #> [[11]]$role 464 | #> [1] "assistant" 465 | #> 466 | #> [[11]]$content 467 | #> [1] "Neutral" 468 | #> 469 | #> 470 | #> [[12]] 471 | #> [[12]]$role 472 | #> [1] "user" 473 | #> 474 | #> [[12]]$content 475 | #> [1] "#SCOTUS set a dangerous precedent today. 
Although the Court limited the scope to which a business owner could deny services to patrons, the legal argument has been legitimized that one's subjective religious convictions trump (no pun intended) #humanrights. #LGBTQRights" 476 | #> 477 | #> 478 | #> [[13]] 479 | #> [[13]]$role 480 | #> [1] "assistant" 481 | #> 482 | #> [[13]]$content 483 | #> [1] "Negative" 484 | #> 485 | #> 486 | #> [[14]] 487 | #> [[14]]$role 488 | #> [1] "user" 489 | #> 490 | #> [[14]]$content 491 | #> [1] "Let’s be real, lame anti-gay cake probably sucks anyway. \r\n\r\nAlso, I love you Sonia Sotomayor and RBG ❤️🧡💛💚💙💜\r\n\r\n#masterpiececakeshop #scotus" 492 | ``` 493 | 494 | One advantage of these chat models is that they typically do not require 495 | as many few-shot examples to perform well, but their big practical 496 | disadvantage is that we can only submit one chat to the API at a time. 497 | 498 | ``` r 499 | response <- complete_chat(prompt) 500 | response 501 | #> token probability 502 | #> 1 Positive 7.849799e-01 503 | #> 2 Neutral 2.110320e-01 504 | #> 3 Negative 2.354229e-03 505 | #> 4 Mixed 1.621902e-03 506 | #> 5 positive 2.702952e-06 507 | #> 6 Post 1.892515e-06 508 | #> 7 Positive 1.472733e-06 509 | #> 8 Neutral 1.242802e-06 510 | #> 9 Mix 1.100770e-06 511 | #> 10 neutral 5.678884e-07 512 | #> 11 Ne 5.622518e-07 513 | #> 12 Pos 5.392126e-07 514 | #> 13 N 3.356456e-07 515 | #> 14 Net 2.261731e-07 516 | #> 15 _positive 8.153610e-08 517 | #> 16 - 6.318000e-08 518 | #> 17 M 5.630869e-08 519 | #> 18 I 4.956445e-08 520 | #> 19 mixed 4.791496e-08 521 | #> 20 .Positive 4.363649e-08 522 | ``` 523 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## R CMD check results 2 | 3 | 0 errors | 0 warnings | 0 notes 4 | 5 | * This is a new release. 
6 | 7 | * Added a reference in the description field of the DESCRIPTION file 8 | 9 | * Added \value tag to openai_api_key.Rd 10 | -------------------------------------------------------------------------------- /data-raw/DALL·E 2024-02-21 05.59.41 - A line drawing of a teleprompter..png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joeornstein/promptr/1939d640dbc5af5b20b1521a93aa04ace5e76a90/data-raw/DALL·E 2024-02-21 05.59.41 - A line drawing of a teleprompter..png -------------------------------------------------------------------------------- /data-raw/logo.R: -------------------------------------------------------------------------------- 1 | ## code to prepare the hex sticker logo 2 | 3 | library(hexSticker) 4 | library(ggplot2) 5 | library(here) 6 | 7 | img <- magick::image_read(here('data-raw/DALL·E 2024-02-21 05.59.41 - A line drawing of a teleprompter..png')) 8 | 9 | img <- magick::image_crop(img, geometry = '1024x1024+0-80') 10 | 11 | logo <- sticker(img, s_x = 1.05, s_y = 0.95, s_width = 1.5, s_height=1.5, 12 | package = 'promptr', p_color = 'black', p_size = 14, p_x = 0.965, p_y=1.42, 13 | h_color = 'black', h_size = 2, h_fill = 'white') 14 | logo 15 | 16 | ggsave(filename = here('data-raw/logo.png')) 17 | 18 | usethis::use_logo(here('data-raw/logo.png')) 19 | -------------------------------------------------------------------------------- /data-raw/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joeornstein/promptr/1939d640dbc5af5b20b1521a93aa04ace5e76a90/data-raw/logo.png -------------------------------------------------------------------------------- /data-raw/masterpiece_tweets.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joeornstein/promptr/1939d640dbc5af5b20b1521a93aa04ace5e76a90/data-raw/masterpiece_tweets.RData -------------------------------------------------------------------------------- /data/occupations.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joeornstein/promptr/1939d640dbc5af5b20b1521a93aa04ace5e76a90/data/occupations.rda -------------------------------------------------------------------------------- /data/occupations_examples.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joeornstein/promptr/1939d640dbc5af5b20b1521a93aa04ace5e76a90/data/occupations_examples.rda -------------------------------------------------------------------------------- /data/scotus_tweets.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joeornstein/promptr/1939d640dbc5af5b20b1521a93aa04ace5e76a90/data/scotus_tweets.rda -------------------------------------------------------------------------------- /data/scotus_tweets_examples.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joeornstein/promptr/1939d640dbc5af5b20b1521a93aa04ace5e76a90/data/scotus_tweets_examples.rda -------------------------------------------------------------------------------- /man/complete_chat.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/complete_chat.R 3 | \name{complete_chat} 4 | 
\alias{complete_chat} 5 | \title{Complete an LLM Chat} 6 | \usage{ 7 | complete_chat( 8 | prompt, 9 | model = "gpt-3.5-turbo", 10 | openai_api_key = Sys.getenv("OPENAI_API_KEY"), 11 | max_tokens = 1, 12 | temperature = 0, 13 | seed = NULL, 14 | parallel = FALSE 15 | ) 16 | } 17 | \arguments{ 18 | \item{prompt}{The prompt} 19 | 20 | \item{model}{Which OpenAI model to use. Defaults to 'gpt-3.5-turbo'} 21 | 22 | \item{openai_api_key}{Your API key. By default, looks for a system environment variable called "OPENAI_API_KEY" (recommended option). Otherwise, it will prompt you to enter the API key as an argument.} 23 | 24 | \item{max_tokens}{How many tokens (roughly 4 characters of text) should the model return? Defaults to a single token (next word prediction).} 25 | 26 | \item{temperature}{A numeric between 0 and 2. When set to zero, the model will always return the most probable next token. For values greater than zero, the model selects the next word probabilistically.} 27 | 28 | \item{seed}{An integer. If specified, the OpenAI API will "make a best effort to sample deterministically".} 29 | 30 | \item{parallel}{TRUE to submit API requests in parallel. Setting to FALSE can reduce rate limit errors at the expense of longer runtime.} 31 | } 32 | \value{ 33 | If max_tokens = 1, returns a dataframe with the 5 most likely next-word responses and their probabilities. If max_tokens > 1, returns a single string of text generated by the model. 34 | } 35 | \description{ 36 | Submits a prompt to OpenAI's "Chat" API endpoint and formats the response into a string or tidy dataframe. 37 | } 38 | \examples{ 39 | \dontrun{ 40 | format_chat('Are frogs sentient? Yes or No.') |> complete_chat() 41 | format_chat('Write a haiku about frogs.') |> complete_chat(max_tokens = 100) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /man/complete_prompt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/complete_prompt.R 3 | \name{complete_prompt} 4 | \alias{complete_prompt} 5 | \title{Complete an LLM Prompt} 6 | \usage{ 7 | complete_prompt( 8 | prompt, 9 | model = "gpt-3.5-turbo-instruct", 10 | openai_api_key = Sys.getenv("OPENAI_API_KEY"), 11 | max_tokens = 1, 12 | temperature = 0, 13 | seed = NULL, 14 | parallel = FALSE 15 | ) 16 | } 17 | \arguments{ 18 | \item{prompt}{The prompt} 19 | 20 | \item{model}{Which OpenAI model to use. Defaults to 'gpt-3.5-turbo-instruct'} 21 | 22 | \item{openai_api_key}{Your API key. By default, looks for a system environment variable called "OPENAI_API_KEY" (recommended option). Otherwise, it will prompt you to enter the API key as an argument.} 23 | 24 | \item{max_tokens}{How many tokens (roughly 4 characters of text) should the model return? Defaults to a single token (next word prediction).} 25 | 26 | \item{temperature}{A numeric between 0 and 2. When set to zero, the model will always return the most probable next token. For values greater than zero, the model selects the next word probabilistically.} 27 | 28 | \item{seed}{An integer. If specified, the OpenAI API will "make a best effort to sample deterministically".} 29 | 30 | \item{parallel}{TRUE to submit API requests in parallel. Setting to FALSE can reduce rate limit errors at the expense of longer runtime.} 31 | } 32 | \value{ 33 | If max_tokens = 1, returns a dataframe with the 5 most likely next words and their probabilities. 
If max_tokens > 1, returns a single string of text generated by the model. 34 | } 35 | \description{ 36 | Submits a text prompt to OpenAI's "Completion" API endpoint and formats the response into a string or tidy dataframe. (Note that, as of 2024, this endpoint is considered "Legacy" by OpenAI and is likely to be deprecated.) 37 | } 38 | \examples{ 39 | \dontrun{ 40 | complete_prompt('I feel like a') 41 | complete_prompt('Here is my haiku about frogs:', 42 | max_tokens = 100) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-15-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joeornstein/promptr/1939d640dbc5af5b20b1521a93aa04ace5e76a90/man/figures/README-unnamed-chunk-15-1.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-16-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joeornstein/promptr/1939d640dbc5af5b20b1521a93aa04ace5e76a90/man/figures/README-unnamed-chunk-16-1.png -------------------------------------------------------------------------------- /man/figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joeornstein/promptr/1939d640dbc5af5b20b1521a93aa04ace5e76a90/man/figures/logo.png -------------------------------------------------------------------------------- /man/format_chat.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/format_chat.R 3 | \name{format_chat} 4 | \alias{format_chat} 5 | \title{Format a Chat Prompt} 6 | \usage{ 7 | format_chat( 8 | text, 9 | instructions = NA, 10 | examples = data.frame(), 11 | system_message = NA 12 | ) 13 | } 14 | \arguments{ 15 | \item{text}{The text to be classified.} 16 | 17 | \item{instructions}{Instructions to be included at the beginning of the prompt (format them like you would format instructions to a human research assistant).} 18 | 19 | \item{examples}{A dataframe of "few-shot" examples. Must include one column called 'text' with the example text(s) and another column called "label" with the correct label(s).} 20 | 21 | \item{system_message}{An optional "system message" with high-level instructions (e.g. "You are a helpful research assistant.")} 22 | } 23 | \value{ 24 | Returns a series of messages formatted as a list object, which can be used as an input for promptr::complete_chat() or openai::create_chat_completion(). 25 | } 26 | \description{ 27 | Format a chat prompt to submit to OpenAI's ChatGPT or GPT-4 (particularly useful for classification tasks). 
28 | } 29 | \examples{ 30 | data(scotus_tweets_examples) 31 | 32 | format_chat(text = "I am disappointed with this ruling.", 33 | instructions = "Decide if the statement is Positive or Negative.", 34 | examples = scotus_tweets_examples) 35 | } 36 | -------------------------------------------------------------------------------- /man/format_prompt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/format_prompt.R 3 | \name{format_prompt} 4 | \alias{format_prompt} 5 | \title{Format an LLM Prompt} 6 | \usage{ 7 | format_prompt( 8 | text, 9 | instructions = "", 10 | examples = data.frame(), 11 | template = "Text: {text}\\nClassification: {label}", 12 | prompt_template = "{instructions}{examples}{input}", 13 | separator = "\\n\\n" 14 | ) 15 | } 16 | \arguments{ 17 | \item{text}{The text to be classified. Can be a character vector or a single string.} 18 | 19 | \item{instructions}{Instructions to be included in the prompt (format them like you would format instructions to a human research assistant).} 20 | 21 | \item{examples}{A dataframe of "few-shot" examples. Must include one column called 'text' with the example text(s) and another column called "label" with the correct label(s).} 22 | 23 | \item{template}{The template for how examples and completions should be formatted, in \code{glue} syntax. If you are including few-shot examples in the prompt, this must contain the \{text\} and \{label\} placeholders.} 24 | 25 | \item{prompt_template}{The template for the entire prompt. Defaults to instructions, followed by few-shot examples, followed by the input to be classified.} 26 | 27 | \item{separator}{A string that separates examples. Defaults to two newlines.} 28 | } 29 | \value{ 30 | Returns a formatted prompt that can be used as input for \code{complete_prompt()} or \code{openai::create_completion()}. 31 | } 32 | \description{ 33 | Format a text prompt for a Large Language Model. Particularly useful for few-shot text classification tasks. Note that if you are planning to use one of OpenAI's chat models, like ChatGPT or GPT-4, you will want to use the \code{format_chat()} function instead. 34 | } 35 | \examples{ 36 | data(scotus_tweets_examples) 37 | 38 | format_prompt(text = "I am disappointed with this ruling.", 39 | instructions = "Decide if the sentiment of this statement is Positive or Negative.", 40 | examples = scotus_tweets_examples, 41 | template = "Statement: {text}\nSentiment: {label}") 42 | 43 | format_prompt(text = 'I am sad about the Supreme Court', 44 | examples = scotus_tweets_examples, 45 | template = '"{text}" is a {label} statement', 46 | separator = '\n') 47 | } 48 | -------------------------------------------------------------------------------- /man/occupations.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{occupations} 5 | \alias{occupations} 6 | \title{Occupations} 7 | \format{ 8 | A data frame with 3948 rows and 2 columns: 9 | \describe{ 10 | \item{baldesig}{Ballot designation as it appears in the CEDA dataset} 11 | \item{hand_coded}{A hand-coded occupation classification (for a random subset)} 12 | } 13 | } 14 | \usage{ 15 | data(occupations) 16 | } 17 | \description{ 18 | This dataset contains 3,948 ballot designations from municipal elections in California. 
19 | A random subset is hand-labeled as either "Working Class" or "Not Working Class" occupations. 20 | } 21 | \references{ 22 | California Elections Data Archive (CEDA). https://hdl.handle.net/10211.3/210187 23 | } 24 | \keyword{datasets} 25 | -------------------------------------------------------------------------------- /man/occupations_examples.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{occupations_examples} 5 | \alias{occupations_examples} 6 | \title{Labelled Occupations} 7 | \format{ 8 | A data frame with 9 rows and 2 columns: 9 | \describe{ 10 | \item{text}{The text of the ballot designation} 11 | \item{label}{The hand-coded label (Working Class, Not Working Class, NA)} 12 | } 13 | } 14 | \usage{ 15 | data(occupations_examples) 16 | } 17 | \description{ 18 | This dataset contains 9 example occupations 19 | along with a classification. These can be used as few-shot 20 | examples for classifying occupations in the \code{occupations} dataset. 21 | } 22 | \references{ 23 | California Elections Data Archive (CEDA). https://hdl.handle.net/10211.3/210187 24 | } 25 | \keyword{datasets} 26 | -------------------------------------------------------------------------------- /man/openai_api_key.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/openai_api_key.R 3 | \name{openai_api_key} 4 | \alias{openai_api_key} 5 | \title{Install an OPENAI API KEY in Your \code{.Renviron} File for Repeated Use} 6 | \usage{ 7 | openai_api_key(key, overwrite = FALSE, install = FALSE) 8 | } 9 | \arguments{ 10 | \item{key}{The API key provided to you by OpenAI, formatted in quotes.} 11 | 12 | \item{overwrite}{If this is set to TRUE, it will overwrite an existing OPENAI_API_KEY that you already have in your \code{.Renviron} file.} 13 | 14 | \item{install}{If TRUE, will install the key in your \code{.Renviron} file for use in future sessions. Defaults to FALSE.} 15 | } 16 | \value{ 17 | No return value, called for side effects 18 | } 19 | \description{ 20 | This function will add your OpenAI API key to your \code{.Renviron} file so it can be called securely without being stored 21 | in your code. After you have installed your key, it can be called any time by typing \code{Sys.getenv("OPENAI_API_KEY")} and will be 22 | automatically called in package functions. If you do not have an \code{.Renviron} file, the function will create one for you. 23 | If you already have an \code{.Renviron} file, the function will append the key to your existing file, while making a backup of your 24 | original file for disaster recovery purposes. 25 | } 26 | \examples{ 27 | 28 | \dontrun{ 29 | openai_api_key("111111abc", install = TRUE) 30 | # The first time, reload your environment so you can use the key without restarting R. 31 | readRenviron("~/.Renviron") 32 | # You can check it with: 33 | Sys.getenv("OPENAI_API_KEY") 34 | } 35 | 36 | \dontrun{ 37 | # If you need to overwrite an existing key: 38 | openai_api_key("111111abc", overwrite = TRUE, install = TRUE) 39 | # The first time, reload your environment so you can use the key without restarting R. 
40 | readRenviron("~/.Renviron") 41 | # You can check it with: 42 | Sys.getenv("OPENAI_API_KEY") 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /man/scotus_tweets.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{scotus_tweets} 5 | \alias{scotus_tweets} 6 | \title{Tweets About The Supreme Court of the United States} 7 | \format{ 8 | A data frame with 945 rows and 5 columns: 9 | \describe{ 10 | \item{tweet_id}{A unique ID} 11 | \item{text}{The text of the tweet} 12 | \item{case}{An identifier denoting which Supreme Court ruling the tweet was collected after.} 13 | \item{expert1, expert2, expert3}{Hand-coded sentiment score (-1 = negative, 0 = neutral, 1 = positive)} 14 | } 15 | } 16 | \usage{ 17 | data(scotus_tweets) 18 | } 19 | \description{ 20 | This dataset contains 945 tweets referencing the US Supreme Court. 21 | Roughly half were collected on June 4, 2018 following the \emph{Masterpiece Cakeshop} 22 | ruling, and the other half were collected on July 9, 2020 following the 23 | Court's concurrently released opinions in \emph{Trump v. Mazars} and \emph{Trump v. Vance}. 24 | Each tweet includes three independent human-coded sentiment scores (-1 to +1). 25 | } 26 | \details{ 27 | CONTENT WARNING: These texts come from social media, and many contain explicit 28 | or offensive language. 29 | } 30 | \references{ 31 | Ornstein et al. (2024). "How To Train Your Stochastic Parrot" 32 | } 33 | \keyword{datasets} 34 | -------------------------------------------------------------------------------- /man/scotus_tweets_examples.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{scotus_tweets_examples} 5 | \alias{scotus_tweets_examples} 6 | \title{Labelled Example Tweets About The Supreme Court of the United States} 7 | \format{ 8 | A data frame with 12 rows and 4 columns: 9 | \describe{ 10 | \item{tweet_id}{A unique ID for each tweet} 11 | \item{text}{The text of the tweet} 12 | \item{case}{The case referenced in the tweet (Masterpiece Cakeshop or Trump v. Mazars)} 13 | \item{label}{The "true" label (Positive, Negative, or Neutral)} 14 | } 15 | } 16 | \usage{ 17 | data(scotus_tweets_examples) 18 | } 19 | \description{ 20 | This dataset contains 12 example tweets referencing the Supreme Court 21 | along with a sentiment label. These can be used as few-shot prompt 22 | examples for classifying tweets in the \code{scotus_tweets} dataset. 23 | } 24 | \references{ 25 | Ornstein et al. (2024). 
"How To Train Your Stochastic Parrot" 26 | } 27 | \keyword{datasets} 28 | -------------------------------------------------------------------------------- /promptr.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | ProjectId: cf07827b-f743-4ff0-845c-efc973cf5461 3 | 4 | RestoreWorkspace: No 5 | SaveWorkspace: No 6 | AlwaysSaveHistory: Default 7 | 8 | EnableCodeIndexing: Yes 9 | UseSpacesForTab: Yes 10 | NumSpacesForTab: 2 11 | Encoding: UTF-8 12 | 13 | RnwWeave: Sweave 14 | LaTeX: pdfLaTeX 15 | 16 | AutoAppendNewline: Yes 17 | StripTrailingWhitespace: Yes 18 | LineEndingConversion: Posix 19 | 20 | BuildType: Package 21 | PackageUseDevtools: Yes 22 | PackageInstallArgs: --no-multiarch --with-keep.source 23 | PackageRoxygenize: rd,collate,namespace 24 | --------------------------------------------------------------------------------