Standardize multiple files • standardizeSnapshot

Introduction

This vignette shows the typical workflow to standardize multiple camera trap files from Snapshot Safari.

library(standardizeSnapshot)

Set up a logger

This is an optional but recommended step. If you want not only to print messages to the console, but also to save them in a file, you can use a logger.

The function create_logger creates a file in the specified location and sets up the logging:

logfile <- file.path(tempdir(), "log", "logger.log") # create the logfile
logfile
#> [1] "/tmp/Rtmpzpfy7V/log/logger.log"

logger <- create_logger(my_logfile = logfile, 
                        console = FALSE)
#> Create logger /tmp/Rtmpzpfy7V/log/logger.log

Read the files

First, we need to read the data. Here, we read files that were previously written in /tmp/Rtmpzpfy7V/data_in (not shown).

in_folder <- file.path(tempdir(), "data_in")
in_folder
#> [1] "/tmp/Rtmpzpfy7V/data_in"
list.files(in_folder, recursive = TRUE)
#> [1] "APN/APN_S1_full_report_0-50__agreement_corrected_fin.csv"
#> [2] "ATH/ATH_Roll1_Snapshot.csv"                              
#> [3] "bar.csv"                                                 
#> [4] "foo.csv"                                                 
#> [5] "MOK/MOK_record_table_0min_deltaT_2021-05-07.csv"

The data we want to read are:

APN/APN_S1_full_report_0-50__agreement_corrected_fin.csv
ATH/ATH_Roll1_Snapshot.csv
MOK/MOK_record_table_0min_deltaT_2021-05-07.csv

NB: these are the same files as the zooniverse, traptagger and digikam datasets included in the package.

The files foo.csv and bar.csv should be ignored.

There is a built-in function to read multiple files into a list in the package: the read_snapshot_files function.

The 2 mandatory arguments to this function are:

input: a vector of valid paths, that can be files and/or folders
basepath: the leading part of the path that should be ignored when copying the final files (the remaining relative path is used to name the elements of the list).

dat_list <- read_snapshot_files(input = in_folder,
                                basepath = in_folder, 
                                except = c("foo.csv", "bar.csv"),
                                logger = logger)
#> The following file(s) will be ignored (they are in 'except'):
#>  foo.csv
#>  bar.csv
#> Reading file APN/APN_S1_full_report_0-50__agreement_corrected_fin.csv (1/3) ---
#> Reading file ATH/ATH_Roll1_Snapshot.csv (2/3) ---
#> Reading file MOK/MOK_record_table_0min_deltaT_2021-05-07.csv (3/3) ---

Here, we also use:

except, which is a vector of files that should be ignored.
logger: a logger created with create_logger. If you did not set up a logger, you can ignore this argument.

The result is a named list of dataframes:

names(dat_list)
#> [1] "APN/APN_S1_full_report_0-50__agreement_corrected_fin.csv"
#> [2] "ATH/ATH_Roll1_Snapshot.csv"                              
#> [3] "MOK/MOK_record_table_0min_deltaT_2021-05-07.csv"
head(dat_list$`APN/APN_S1_full_report_0-50__agreement_corrected_fin.csv`, 3)
#>          capture_id season site roll capture capture_date_local
#> 1  APN_S1#K021#1#32 APN_S1 K021    1      32         2017-07-06
#> 2    APN_S1#DW#2#94 APN_S1   DW    2      94         2017-07-06
#> 3 APN_S1#U64c#2#208 APN_S1 U64c    2     208         2017-07-06
#>   capture_time_local
#> 1           13:56:29
#> 2           18:16:19
#> 3           14:49:11
#>                                                                                         zooniverse_url_0
#> 1 https://panoptes-uploads.zooniverse.org/production/subject_location/12ced-0a4ae-08f79-0b5f1-1141f.jpeg
#> 2 https://panoptes-uploads.zooniverse.org/production/subject_location/0f158-16764-0e8c1-074fc-0f158.jpeg
#> 3 https://panoptes-uploads.zooniverse.org/production/subject_location/15ef4-10ef3-0643f-0dbe5-0e3aa.jpeg
#>                                                                                         zooniverse_url_1
#> 1 https://panoptes-uploads.zooniverse.org/production/subject_location/0aecc-180ee-0ef2c-0ef2c-168fa.jpeg
#> 2 https://panoptes-uploads.zooniverse.org/production/subject_location/0a3dd-0995b-16499-168aa-10b9d.jpeg
#> 3 https://panoptes-uploads.zooniverse.org/production/subject_location/100a4-09293-06a39-0b6c6-097f2.jpeg
#>                                                                                         zooniverse_url_2
#> 1 https://panoptes-uploads.zooniverse.org/production/subject_location/038b4-17531-123a6-0872f-046dc.jpeg
#> 2 https://panoptes-uploads.zooniverse.org/production/subject_location/11744-11744-02c31-02c31-06aae.jpeg
#> 3 https://panoptes-uploads.zooniverse.org/production/subject_location/0e8e6-02c31-11e1c-105f4-15df2.jpeg
#>   subject_id question__species question__count_max question__count_median
#> 1   17260415          elephant                   1                      1
#> 2   17754007            impala                   6                      6
#> 3   17153920            impala                   3                      2
#>   question__count_min question__standing question__resting question__moving
#> 1                   1               0.19                 0             0.81
#> 2                   5               0.24                 0             0.24
#> 3                   1               0.71                 0             0.59
#>   question__eating question__interacting question__young_present
#> 1             0.00                     0                    0.00
#> 2             0.88                     0                    0.06
#> 3             0.35                     0                    0.00
#>   question__horns_visible p_users_identified_this_species
#> 1                    0.00                            0.84
#> 2                    0.00                            0.85
#> 3                    0.06                            0.89
#>   pielous_evenness_index
#> 1                   0.49
#> 2                   0.47
#> 3                   0.47
head(dat_list$`ATH/ATH_Roll1_Snapshot.csv`, 3)
#>     Capture_ID Cam.Site      id latitude longitude       timestamp
#> 1 ATH356#1#F03  ATH_F03 2083476 -24.0595   20.8909 3/31/2020 13:24
#> 2 ATH611#1#B06  ATH_B06 2045520 -24.5076   22.9274  5/13/2020 9:04
#> 3  ATH38#1#E02  ATH_E02 2009604 -22.5372   20.8216 3/12/2020 10:31
#>   capture_labels capture_sighting_count
#> 1           None                      1
#> 2        giraffe                      1
#> 3 wildebeestblue                      3
#>                                                    capture_url
#> 1 https://traptagger.co.uk/imageViewer?type=capture&id=2083476
#> 2 https://traptagger.co.uk/imageViewer?type=capture&id=2045520
#> 3 https://traptagger.co.uk/imageViewer?type=capture&id=2009604
head(dat_list$`MOK/MOK_record_table_0min_deltaT_2021-05-07.csv`, 3)
#>   X.1 X Station   Species DateTimeOriginal       Date     Time delta.time.secs
#> 1   1 1     G03 porcupine  2018-06-28 8:56 2018-06-28 17:38:42               0
#> 2   2 2     D06      kudu 2018-06-25 16:13 2018-06-25  7:18:05               0
#> 3   3 3     E06 springbok 2018-06-29 18:33 2018-06-29  0:53:56          353978
#>   delta.time.mins delta.time.hours delta.time.days            Directory
#> 1             0.0              0.0             0.0 E:/MOK/MOK_Roll1/G03
#> 2             0.0              0.0             0.0 E:/MOK/MOK_Roll1/D06
#> 3          5899.6             98.3             4.1 E:/MOK/MOK_Roll1/E06
#>       FileName EXIF.Model EXIF.Make metadata_Species metadata_Number
#> 1 I_00006a.JPG         E3 CUDDEBACK        porcupine               1
#> 2 I_00003a.JPG         E3 CUDDEBACK             kudu               1
#> 3 I__00013.JPG         E3 CUDDEBACK        springbok               1
#>   metadata_Behaviour metadata_Sex n_images metadata_young_present
#> 1               <NA>         <NA>        1                   <NA>
#> 2             Moving       Female        1                   <NA>
#> 3             Moving         <NA>        1                   <NA>
#>   metadata_Numberofindividuals
#> 1                           NA
#> 2                           NA
#> 3                           NA
#>                                                                     HierarchicalSubject
#> 1                                          Species, Species|porcupine, Number|1, Number
#> 2 Species|kudu, Behaviour, Sex|Female, Number|1, Behaviour|Moving, Species, Number, Sex
#> 3             Number, Behaviour|Moving, Species, Number|1, Species|springbok, Behaviour

Warning: this structure of “lists of dataframes” can take up a lot of space. Therefore, it is better to assign the standardized results back to the same list, overwriting the older results, to save space.

You can reproduce the following results by using:

data(zooniverse)
data(traptagger)
data(digikam)

dat_list <- list("APN/APN_S1_full_report_0-50__agreement_corrected_fin.csv" = zooniverse,
                 "ATH/ATH_Roll1_Snapshot.csv" = traptagger,
                 "MOK/MOK_record_table_0min_deltaT_2021-05-07.csv" = digikam)

Standardize the files

Then, we standardize the files. The function standardize_snapshot_list standardizes a list of dataframes.

This function has a number of options, but the only ones that are mandatory are:

df_list: the list of dataframes to standardize
standard_df: the reference dataframe telling the function how to rename the columns. Here, we use the built-in dataset standard.

dat_list <- standardize_snapshot_list(df_list = dat_list,
                                      standard_df = standard,
                                      logger = logger)
#> 3 files to standardize.
#> Standardizing file APN/APN_S1_full_report_0-50__agreement_corrected_fin.csv (1/3) ---
#> Initial file: 24 columns, 100 rows.
#> Standardizing columns
#> Standardizing dates/times
#> Fill capture info
#> Cleaning location/camera, species and columns values
#> Final file: 27 columns, 100 rows. Here is a sneak peek:
#> locationID   cameraID    season  roll    eventID snapshotName    eventDate   eventTime
#> APN  APN_13U 1   1   APN_13U#1#50    kudu    2017-07-25  02:15:22
#> APN  APN_6U  1   1   APN_6U#1#40 steenbok    2017-07-17  08:56:18
#> APN  APN_6U  1   2   APN_6U#2#5  zebraburchells  2017-08-07  04:49:29
#> APN  APN_DW  1   2   APN_DW#2#94 impala  2017-07-06  18:16:19
#> APN  APN_DW  1   1   APN_DW#1#210    zebraburchells  2017-07-21  18:09:16
#> Standardizing file ATH/ATH_Roll1_Snapshot.csv (2/3) ---
#> Initial file: 9 columns, 100 rows.
#> Standardizing columns
#> Standardizing dates/times
#> Fill capture info
#> Cleaning location/camera, species and columns values
#> Final file: 27 columns, 100 rows. Here is a sneak peek:
#> locationID   cameraID    season  roll    eventID snapshotName    eventDate   eventTime
#> ATH  ATH_B04 NA  1   ATH_B04#1#242   blank   2020-03-08  12:33:00
#> ATH  ATH_B04 NA  1   ATH_B04#1#95    zebraburchells  2020-03-19  07:05:00
#> ATH  ATH_B04 NA  1   ATH_B04#1#40    zebraburchells  2020-03-24  12:03:00
#> ATH  ATH_B04 NA  1   ATH_B04#1#275   wildebeestblue  2020-03-26  13:49:00
#> ATH  ATH_B04 NA  1   ATH_B04#1#315   blank   2020-04-15  14:28:00
#> Standardizing file MOK/MOK_record_table_0min_deltaT_2021-05-07.csv (3/3) ---
#> Initial file: 23 columns, 100 rows.
#> Standardizing columns
#> Match found in column names: renaming column metadata_Numberofindividuals into metadata_NumberOfIndividuals
#> Standardizing dates/times
#> Getting location code for Digikam data
#> Fill capture info
#> Cleaning location/camera, species and columns values
#> Final file: 27 columns, 100 rows. Here is a sneak peek:
#> locationID   cameraID    season  roll    eventID snapshotName    eventDate   eventTime
#> MOK  MOK_A09 NA  1   MOK_A09#1#1 giraffe 2018-07-08  12:15:34
#> MOK  MOK_A09 NA  1   MOK_A09#1#2 springbok   2018-08-26  10:45:55
#> MOK  MOK_A09 NA  1   MOK_A09#1#3 unresolvable    2018-09-02  18:11:28
#> MOK  MOK_B07 NA  1   MOK_B07#1#1 zebraburchells  2018-06-28  07:49:42
#> MOK  MOK_B07 NA  1   MOK_B07#1#2 gemsbok 2018-08-19  09:11:55

Here, we also use the logger argument (as in the data reading step).

By default, the function displays the head of the first 8 columns of each file along with numerous messages.

Write files

The last step is to write the standardized files to a destination. For this, we use the function write_standardized_list.

This function has only 2 mandatory arguments:

df_list: the list of dataframes to write
to: the folder in which the files should be written.

out_folder <- file.path(tempdir(), "data_out") # the folder in which to copy the file
out_folder
#> [1] "/tmp/Rtmpzpfy7V/data_out"

write_standardized_list(df_list = dat_list,
                        to = out_folder,
                        logger = logger)
#> Creating folder /tmp/Rtmpzpfy7V/data_out/APN
#> Writing file APN/APN_S1_full_report_0-50__agreement_corrected_fin.csv -> APN/APN_S1_R1-2.csv (1/3) ---
#> Creating folder /tmp/Rtmpzpfy7V/data_out/ATH
#> Writing file ATH/ATH_Roll1_Snapshot.csv -> ATH/ATH_SNA_R1.csv (2/3) ---
#> Creating folder /tmp/Rtmpzpfy7V/data_out/MOK
#> Writing file MOK/MOK_record_table_0min_deltaT_2021-05-07.csv -> MOK/MOK_SNA_R1.csv (3/3) ---

Here, we also use the logger argument.

The files are now written in the destination.

list.files(out_folder, recursive = TRUE)
#> [1] "APN/APN_S1_R1-2.csv" "ATH/ATH_SNA_R1.csv"  "MOK/MOK_SNA_R1.csv"

We can check that the log file was filled:

readLines(logfile)
#>  [1] "INFO  [2024-04-13 18:03:01] Create logger /tmp/Rtmpzpfy7V/log/logger.log"                                                               
#>  [2] "INFO  [2024-04-13 18:03:01] The following file(s) will be ignored (they are in 'except'):"                                              
#>  [3] "\tfoo.csv"                                                                                                                              
#>  [4] "\tbar.csv"                                                                                                                              
#>  [5] "INFO  [2024-04-13 18:03:01] Reading file APN/APN_S1_full_report_0-50__agreement_corrected_fin.csv (1/3) ---"                            
#>  [6] "INFO  [2024-04-13 18:03:01] Reading file ATH/ATH_Roll1_Snapshot.csv (2/3) ---"                                                          
#>  [7] "INFO  [2024-04-13 18:03:01] Reading file MOK/MOK_record_table_0min_deltaT_2021-05-07.csv (3/3) ---"                                     
#>  [8] ""                                                                                                                                       
#>  [9] ""                                                                                                                                       
#> [10] "INFO  [2024-04-13 18:03:01] 3 files to standardize."                                                                                    
#> [11] ""                                                                                                                                       
#> [12] "INFO  [2024-04-13 18:03:01] Standardizing file APN/APN_S1_full_report_0-50__agreement_corrected_fin.csv (1/3) ---"                      
#> [13] "INFO  [2024-04-13 18:03:01] Initial file: 24 columns, 100 rows."                                                                        
#> [14] "INFO  [2024-04-13 18:03:01] Standardizing columns"                                                                                      
#> [15] "INFO  [2024-04-13 18:03:01] Standardizing dates/times"                                                                                  
#> [16] "INFO  [2024-04-13 18:03:02] Fill capture info"                                                                                          
#> [17] "INFO  [2024-04-13 18:03:02] Cleaning location/camera, species and columns values"                                                       
#> [18] "INFO  [2024-04-13 18:03:02] Final file: 27 columns, 100 rows. Here is a sneak peek:"                                                    
#> [19] "locationID\tcameraID\tseason\troll\teventID\tsnapshotName\teventDate\teventTime"                                                        
#> [20] "APN\tAPN_13U\t1\t1\tAPN_13U#1#50\tkudu\t2017-07-25\t02:15:22"                                                                           
#> [21] "APN\tAPN_6U\t1\t1\tAPN_6U#1#40\tsteenbok\t2017-07-17\t08:56:18"                                                                         
#> [22] "APN\tAPN_6U\t1\t2\tAPN_6U#2#5\tzebraburchells\t2017-08-07\t04:49:29"                                                                    
#> [23] "APN\tAPN_DW\t1\t2\tAPN_DW#2#94\timpala\t2017-07-06\t18:16:19"                                                                           
#> [24] "APN\tAPN_DW\t1\t1\tAPN_DW#1#210\tzebraburchells\t2017-07-21\t18:09:16"                                                                  
#> [25] ""                                                                                                                                       
#> [26] "INFO  [2024-04-13 18:03:02] Standardizing file ATH/ATH_Roll1_Snapshot.csv (2/3) ---"                                                    
#> [27] "INFO  [2024-04-13 18:03:02] Initial file: 9 columns, 100 rows."                                                                         
#> [28] "INFO  [2024-04-13 18:03:02] Standardizing columns"                                                                                      
#> [29] "INFO  [2024-04-13 18:03:02] Standardizing dates/times"                                                                                  
#> [30] "INFO  [2024-04-13 18:03:02] Fill capture info"                                                                                          
#> [31] "INFO  [2024-04-13 18:03:02] Cleaning location/camera, species and columns values"                                                       
#> [32] "INFO  [2024-04-13 18:03:02] Final file: 27 columns, 100 rows. Here is a sneak peek:"                                                    
#> [33] "locationID\tcameraID\tseason\troll\teventID\tsnapshotName\teventDate\teventTime"                                                        
#> [34] "ATH\tATH_B04\tNA\t1\tATH_B04#1#242\tblank\t2020-03-08\t12:33:00"                                                                        
#> [35] "ATH\tATH_B04\tNA\t1\tATH_B04#1#95\tzebraburchells\t2020-03-19\t07:05:00"                                                                
#> [36] "ATH\tATH_B04\tNA\t1\tATH_B04#1#40\tzebraburchells\t2020-03-24\t12:03:00"                                                                
#> [37] "ATH\tATH_B04\tNA\t1\tATH_B04#1#275\twildebeestblue\t2020-03-26\t13:49:00"                                                               
#> [38] "ATH\tATH_B04\tNA\t1\tATH_B04#1#315\tblank\t2020-04-15\t14:28:00"                                                                        
#> [39] ""                                                                                                                                       
#> [40] "INFO  [2024-04-13 18:03:02] Standardizing file MOK/MOK_record_table_0min_deltaT_2021-05-07.csv (3/3) ---"                               
#> [41] "INFO  [2024-04-13 18:03:02] Initial file: 23 columns, 100 rows."                                                                        
#> [42] "INFO  [2024-04-13 18:03:02] Standardizing columns"                                                                                      
#> [43] "INFO  [2024-04-13 18:03:02] Match found in column names: renaming column metadata_Numberofindividuals into metadata_NumberOfIndividuals"
#> [44] "INFO  [2024-04-13 18:03:02] Standardizing dates/times"                                                                                  
#> [45] "INFO  [2024-04-13 18:03:02] Getting location code for Digikam data"                                                                     
#> [46] "INFO  [2024-04-13 18:03:02] Fill capture info"                                                                                          
#> [47] "INFO  [2024-04-13 18:03:02] Cleaning location/camera, species and columns values"                                                       
#> [48] "INFO  [2024-04-13 18:03:02] Final file: 27 columns, 100 rows. Here is a sneak peek:"                                                    
#> [49] "locationID\tcameraID\tseason\troll\teventID\tsnapshotName\teventDate\teventTime"                                                        
#> [50] "MOK\tMOK_A09\tNA\t1\tMOK_A09#1#1\tgiraffe\t2018-07-08\t12:15:34"                                                                        
#> [51] "MOK\tMOK_A09\tNA\t1\tMOK_A09#1#2\tspringbok\t2018-08-26\t10:45:55"                                                                      
#> [52] "MOK\tMOK_A09\tNA\t1\tMOK_A09#1#3\tunresolvable\t2018-09-02\t18:11:28"                                                                   
#> [53] "MOK\tMOK_B07\tNA\t1\tMOK_B07#1#1\tzebraburchells\t2018-06-28\t07:49:42"                                                                 
#> [54] "MOK\tMOK_B07\tNA\t1\tMOK_B07#1#2\tgemsbok\t2018-08-19\t09:11:55"                                                                        
#> [55] ""                                                                                                                                       
#> [56] ""                                                                                                                                       
#> [57] "INFO  [2024-04-13 18:03:02] Creating folder /tmp/Rtmpzpfy7V/data_out/APN"                                                               
#> [58] "INFO  [2024-04-13 18:03:02] Writing file APN/APN_S1_full_report_0-50__agreement_corrected_fin.csv -> APN/APN_S1_R1-2.csv (1/3) ---"     
#> [59] "INFO  [2024-04-13 18:03:02] Creating folder /tmp/Rtmpzpfy7V/data_out/ATH"                                                               
#> [60] "INFO  [2024-04-13 18:03:02] Writing file ATH/ATH_Roll1_Snapshot.csv -> ATH/ATH_SNA_R1.csv (2/3) ---"                                    
#> [61] "INFO  [2024-04-13 18:03:02] Creating folder /tmp/Rtmpzpfy7V/data_out/MOK"                                                               
#> [62] "INFO  [2024-04-13 18:03:02] Writing file MOK/MOK_record_table_0min_deltaT_2021-05-07.csv -> MOK/MOK_SNA_R1.csv (3/3) ---"