├── NAMESPACE ├── .Rbuildignore ├── man └── hello.Rd ├── DESCRIPTION ├── readgedcom.Rproj ├── R ├── hello.R └── read_gedcom.R ├── .gitignore └── README.md /NAMESPACE: -------------------------------------------------------------------------------- 1 | exportPattern("^[[:alpha:]]+") 2 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | -------------------------------------------------------------------------------- /man/hello.Rd: -------------------------------------------------------------------------------- 1 | \name{hello} 2 | \alias{hello} 3 | \title{Hello, World!} 4 | \usage{ 5 | hello() 6 | } 7 | \description{ 8 | Prints 'Hello, world!'. 9 | } 10 | \examples{ 11 | hello() 12 | } 13 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: readgedcom 2 | Type: Package 3 | Title: What the Package Does (Title Case) 4 | Version: 0.1.0 5 | Author: Who wrote it 6 | Maintainer: The package maintainer 7 | Description: More about what it does (maybe more than one line) 8 | Use four spaces when indenting paragraphs within the Description. 9 | License: What license is it under? 
10 | Encoding: UTF-8 11 | LazyData: true 12 | -------------------------------------------------------------------------------- /readgedcom.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | -------------------------------------------------------------------------------- /R/hello.R: -------------------------------------------------------------------------------- 1 | # Hello, world! 2 | # 3 | # This is an example function named 'hello' 4 | # which prints 'Hello, world!'. 5 | # 6 | # You can learn more about package authoring with RStudio at: 7 | # 8 | # http://r-pkgs.had.co.nz/ 9 | # 10 | # Some useful keyboard shortcuts for package authoring: 11 | # 12 | # Install Package: 'Cmd + Shift + B' 13 | # Check Package: 'Cmd + Shift + E' 14 | # Test Package: 'Cmd + Shift + T' 15 | 16 | hello <- function() { 17 | print("Hello, world!") 18 | } 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | .DS_Store 5 | 6 | # Session Data files 7 | .RData 8 | 9 | # User-specific files 10 | .Ruserdata 11 | 12 | # Example code in package build process 13 | *-Ex.R 14 | 15 | # Output files from R CMD build 16 | /*.tar.gz 17 | 18 | # Output files from R CMD check 19 | /*.Rcheck/ 20 | 21 | # RStudio files 22 | .Rproj.user/ 23 | 24 | # produced vignettes 25 | vignettes/*.html 26 | vignettes/*.pdf 27 | 28 | # OAuth2 token, see 
https://github.com/hadley/httr/releases/tag/v0.3 29 | .httr-oauth 30 | 31 | # knitr and R markdown default cache directories 32 | *_cache/ 33 | /cache/ 34 | 35 | # Temporary files created by R markdown 36 | *.utf8.md 37 | *.knit.md 38 | 39 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reading GEDCOM 2 | Reading GEDCOM files and doing analysis in R 3 | 4 | I was surprised to find that, with family history being the 2nd most popular hobby in the United States, there hasn't been a package built in R to read GEDCOM files. This is my stab at it. I am hoping that this repository can help others learn more about their ancestors using data analysis. 5 | Right now, with the `read_gedcom()` function, you can read GEDCOM files and store them in a tidy format. This is still in development, and hopefully it can work to fill your family history needs. 6 | 7 | To download a GEDCOM file from Ancestry: 8 | https://support.ancestry.com/s/article/Uploading-and-Downloading-Trees 9 | 10 | I also found this script to get a GEDCOM file from FamilySearch: 11 | https://github.com/Linekio/getmyancestors 12 | 13 | There are also many sites that offer free GEDCOM files for celebrities/royalty/etc. 14 | 15 | You can install this package with: 16 | ``` 17 | # install.packages("devtools") 18 | devtools::install_github("jjfitz/readgedcom") 19 | ``` 20 | 21 | Here is an article that I wrote about exploring my family history: 22 | https://jjfitz.github.io/posts/reading-gedcom-files-r/ 23 | -------------------------------------------------------------------------------- /R/read_gedcom.R: -------------------------------------------------------------------------------- 1 | #' Read GEDCOM 2 | #' 3 | #' This function allows you to read gedcom files into csv format. 
#' @param file_path Path to a GEDCOM file; passed unchanged to
#'   [base::readLines()].
#' @return A tibble with one row per individual (`INDI` record) whose
#'   identifier has the form `@<one character><digits>@`, and the columns
#'   `id`, `firstname`, `lastname`, `birthdate`, `birthplace`, `birthlat`,
#'   `birthlong`, `deathdate`, `deathplace`, `deathlat`, `deathlong`, `sex`,
#'   `FAMC` and `FAMS`. The four latitude/longitude columns are numeric and
#'   every other column is character. `FAMS` holds the numeric part of every
#'   family the individual is a spouse in, separated by single spaces.
#'   Missing values are `NA`.
#' @keywords gedcom
#' @export
#' @examples
#' \dontrun{
#' people <- read_gedcom("my_family_tree.ged")
#' }
read_gedcom <- function(file_path) {
  columns <- c(
    "id", "firstname", "lastname", "birthdate", "birthplace",
    "birthlat", "birthlong", "deathdate", "deathplace",
    "deathlat", "deathlong", "sex", "FAMC", "FAMS"
  )
  coord_columns <- c("birthlat", "birthlong", "deathlat", "deathlong")

  lines <- readLines(file_path, warn = FALSE, encoding = "UTF-8")

  # Every GEDCOM line starts with its nesting level; lines without one
  # (blank or malformed) get NA and never open or close a record.
  levels <- as.integer(stringr::str_match(lines, "^\\s*(\\d+)")[, 2L])

  # A level-0 line opens a record that runs until the next level-0 line.
  # The trailing subset keeps `record_ends` empty when there are no records.
  record_starts <- which(!is.na(levels) & levels == 0L)
  record_ends <- c(record_starts[-1L] - 1L, length(lines))
  record_ends <- record_ends[seq_along(record_starts)]

  # Only individual records are parsed, so NAME/BIRT/... lines belonging to
  # submitter, repository or family records cannot leak into a person, and
  # the last individual in the file is emitted like every other one.
  indi <- which(stringr::str_detect(lines[record_starts], "@ INDI"))

  parsed <- vapply(
    indi,
    function(k) {
      span <- seq.int(record_starts[[k]], record_ends[[k]])
      .gedcom_parse_individual(lines[span], levels[span])
    },
    FUN.VALUE = character(length(columns))
  )

  # `parsed` holds one individual per column; lay it out one per row.
  people <- matrix(
    parsed,
    ncol = length(columns),
    byrow = TRUE,
    dimnames = list(NULL, columns)
  )
  people <- dplyr::as_tibble(as.data.frame(people, stringsAsFactors = FALSE))

  for (column in coord_columns) {
    people[[column]] <- .gedcom_parse_coordinate(people[[column]])
  }

  # Individuals whose identifier could not be extracted are dropped.
  people[!is.na(people$id), ]
}

# Extract the fields of one individual from the lines of its INDI record.
# `record` is the record's lines (header first) and `record_levels` their
# nesting levels. Returns a character vector in output-column order.
.gedcom_parse_individual <- function(record, record_levels) {
  id_pattern <- "(?<=@.)\\d*(?=@)"

  # Tag of each line, skipping the optional `@xref@` that precedes it.
  tags <- stringr::str_match(
    record, "^\\s*\\d+\\s+(?:@[^@]*@\\s+)?(\\S+)"
  )[, 2L]
  # Direct attributes of the individual sit at level 1.
  direct <- !is.na(record_levels) & record_levels == 1L & !is.na(tags)

  lines_with <- function(tag) {
    which(direct & tags == tag)
  }
  # When a tag is repeated the last occurrence wins.
  last_of <- function(idx) {
    if (length(idx) > 0L) idx[[length(idx)]] else NA_integer_
  }
  # Text following "TAG " on line `idx`, or NA when there is none.
  value_after <- function(idx, tag) {
    if (is.na(idx)) {
      return(NA_character_)
    }
    stringr::str_extract(record[[idx]], paste0("(?<=", tag, " ).+"))
  }

  # Date, place and coordinates of an event (BIRT/DEAT), looked up by tag
  # among the event's subordinate lines instead of at fixed line offsets, so
  # a missing DATE or PLAC no longer shifts or loses the other values.
  event_details <- function(event_tag) {
    details <- c(
      DATE = NA_character_, PLAC = NA_character_,
      LATI = NA_character_, LONG = NA_character_
    )
    start <- last_of(lines_with(event_tag))
    if (is.na(start)) {
      return(details)
    }
    # The event ends just before the next line at level 1 or above it.
    closers <- which(
      seq_along(record) > start & !is.na(record_levels) & record_levels <= 1L
    )
    end <- if (length(closers) > 0L) closers[[1L]] - 1L else length(record)
    if (end <= start) {
      return(details)
    }
    block <- seq.int(start + 1L, end)
    for (key in names(details)) {
      hits <- block[!is.na(tags[block]) & tags[block] == key]
      if (key %in% c("DATE", "PLAC")) {
        # Only the event's own DATE/PLAC, not e.g. a source citation's date.
        hits <- hits[record_levels[hits] %in% 2L]
      }
      if (length(hits) > 0L) {
        details[[key]] <- value_after(hits[[1L]], key)
      }
    }
    details
  }

  name_idx <- last_of(lines_with("NAME"))
  name_line <- if (is.na(name_idx)) NA_character_ else record[[name_idx]]

  famc_idx <- last_of(lines_with("FAMC"))
  famc_line <- if (is.na(famc_idx)) NA_character_ else record[[famc_idx]]

  fams_ids <- stringr::str_extract(record[lines_with("FAMS")], id_pattern)
  fams_ids <- fams_ids[!is.na(fams_ids)]
  fams <- if (length(fams_ids) > 0L) {
    paste(fams_ids, collapse = " ")
  } else {
    NA_character_
  }

  birth <- event_details("BIRT")
  death <- event_details("DEAT")

  c(
    id = stringr::str_extract(record[[1L]], id_pattern),
    firstname = stringr::str_extract(name_line, "(?<=NAME ).+(?= /+.)"),
    lastname = stringr::str_extract(name_line, "(?<=/).+(?=/)"),
    birthdate = birth[["DATE"]],
    birthplace = birth[["PLAC"]],
    birthlat = birth[["LATI"]],
    birthlong = birth[["LONG"]],
    deathdate = death[["DATE"]],
    deathplace = death[["PLAC"]],
    deathlat = death[["LATI"]],
    deathlong = death[["LONG"]],
    sex = value_after(last_of(lines_with("SEX")), "SEX"),
    FAMC = stringr::str_extract(famc_line, id_pattern),
    FAMS = fams
  )
}

# Convert GEDCOM coordinates to signed decimal degrees. Plain numbers are
# converted as they are; values carrying a hemisphere letter ("N42.36",
# "W71.05") are negated for S and W. Anything else becomes NA.
.gedcom_parse_coordinate <- function(x) {
  x <- trimws(x)
  hemisphere <- substr(x, 1L, 1L)
  has_hemisphere <- !is.na(hemisphere) &
    hemisphere %in% c("N", "S", "E", "W", "n", "s", "e", "w")
  digits <- ifelse(has_hemisphere, substring(x, 2L), x)
  # Unparseable coordinates are expected to turn into NA, so the coercion
  # warning of this single call is deliberately silenced.
  value <- suppressWarnings(as.numeric(digits))
  direction <- ifelse(
    has_hemisphere & hemisphere %in% c("S", "W", "s", "w"), -1, 1
  )
  direction * value
}