Data Gathering and Cleaning


We ensure that all package dependencies are installed.

if (!require(tidyverse)) {
if (!require(googledrive)) {
TODO: Convert code from magrittr pipe (%>%) to R pipe (|>).

Download data

The survey was generated and data collected using Google Forms. The survey questions are here:

We have separated the data update process from the generation of this report since some manual cleaning of the department names must be done first. So, the typical workflow is to run the following at the console:

# Run at the console: update the data, then open the survey.
update_data(
  force_update = TRUE,
  google_credentials = Sys.getenv("GMAIL_ROG")
)
survey <- open_survey()

clean_names(survey) |>

The results of show_unique_depts(survey) can then be compared to the code below, and updates made, as needed, to handle various edge cases.

This workflow could be improved.

One idea would be to use the targets package to update the data at specified intervals and then trigger the data cleaning operations.

Load data

Load the data file.

# Read the survey export when it exists; otherwise report the missing file and
# set `survey` to NULL so that later steps can detect that nothing was loaded.
if (file.exists("csv/open-science-survey-2022-fall.csv")) {
  survey <- readr::read_csv(
    "csv/open-science-survey-2022-fall.csv",
    show_col_types = FALSE
  )
} else {
  message("File not found: ", "csv/open-science-survey-2022-fall.csv")
  survey <- NULL
}

There are \(n = 104\) responses.

Clean data

Examine the variable names.

if (is.null(survey)) {
  warning("Error loading data file")
} else {
Let’s rename them.

# Keep a copy of the current column names before they are replaced.
full_questions <- survey |> names()

short_names <- c(

# Apply the short names only when there is exactly one per existing column;
# otherwise leave the names untouched and say so.
if (length(short_names) == length(names(survey))) {
  names(survey) <- short_names
} else {
  message("Name vector lengths differ; no change made.")
}

Some of the variables have values that are easy to parse, while others, such as data_types, are more challenging.

Modify timestamp

Convert the timestamp to a standard date-time format.

# Parse the month/day/year timestamp strings as date-times in the
# America/New_York time zone.
survey <- dplyr::mutate(
  survey,
  timestamp = lubridate::mdy_hms(timestamp, tz = "America/New_York")
)

Modify campus

Make all campus locations lowercase and replace white space with underscores.

# Normalize campus names: lower-case them, then turn every space into an
# underscore. str_replace_all() is required here because str_replace() only
# touches the first match, which leaves later spaces in multi-word names.
survey <- survey |>
  dplyr::mutate(
    campus = stringr::str_replace_all(tolower(campus), " ", "_")
  )

Modify department

# Department cleanup is delegated to clean_depts(), which is defined elsewhere.
survey <- survey |> clean_depts()

Modify position

Make lowercase and replace spaces and dashes with underscores.

# Lower-case the position labels, then swap each space or hyphen for "_".
survey <- dplyr::mutate(
  survey,
  position = stringr::str_replace_all(tolower(position), "[ -]", "_")
)

Modify years_since_degree

TODO: Make ordinal

Modify data_types

# Derive one logical column per data type by searching `data_types` for each
# option's label. The calls that had lost their `dplyr::mutate(` wrapper and
# `data_types` argument are restored so the pipeline parses again.
survey <- survey |>
  dplyr::mutate(
    collect_audio = stringr::str_detect(data_types, "Audio files"),
    collect_video = stringr::str_detect(data_types, "Video files"),
    collect_photos = stringr::str_detect(
      data_types,
      "Digital photographs and/or other images"
    ),
    collect_computer_data = stringr::str_detect(
      data_types,
      "Data automatically generated from or by computer programs"
    ),
    collect_sensor = stringr::str_detect(
      data_types,
      "Data collected from sensors"
    ),
    collect_docs = stringr::str_detect(data_types, "Documents or reports"),
    collect_models = stringr::str_detect(data_types, "Models/algorithms"),
    collect_obs = stringr::str_detect(data_types, "Observational data"),
    collect_sims = stringr::str_detect(
      data_types,
      "Simulation data, models, and software code"
    ),
    collect_procedures = stringr::str_detect(
      data_types,
      "Standard operating procedures and protocols"
    ),
    collect_txt = stringr::str_detect(data_types, "Text files"),
    collect_genomic = stringr::str_detect(data_types, "Genomic"),
    collect_image = stringr::str_detect(data_types, "Image data"),
    collect_surveys = stringr::str_detect(data_types, "Survey results"),
    collect_spreadsheets = stringr::str_detect(data_types, "Spreadsheets"),
    collect_interviews = stringr::str_detect(
      data_types,
      "interview transcripts"
    ),
    collect_gis = stringr::str_detect(
      data_types,
      "Geographic Information Systems"
    ),
    collect_sketches = stringr::str_detect(
      data_types,
      "Sketches, diaries in digital form"
    ),
    collect_vr = stringr::str_detect(data_types, "Virtual reality, 3D models"),
    collect_xml_json = stringr::str_detect(
      data_types,
      "Structured text files"
    ),
    collect_web_social = stringr::str_detect(data_types, "Websites and blogs")
  )

Modify restricted_data

# Flag each kind of restriction mentioned in `restricted_data`. The enclosing
# `dplyr::mutate()` call is restored, and `str_detect()` is namespace-qualified
# throughout for consistency with the rest of the file.
survey <- survey |>
  dplyr::mutate(
    restricted_ethical = stringr::str_detect(
      restricted_data,
      "ethical concerns"
    ),
    restricted_legal_ip = stringr::str_detect(
      restricted_data,
      "legal/intellectual"
    ),
    restricted_sponsor = stringr::str_detect(
      restricted_data,
      "contractual restrictions"
    ),
    restricted_none = stringr::str_detect(
      restricted_data,
      "No; My data are not restricted"
    )
  )

Modify storage_active_projects

TODO: Handle other options

# Derive one logical column per storage location by searching
# `storage_active_projects` for each option's label. The missing
# `dplyr::mutate(` wrappers are restored so the pipeline parses again.
survey <- survey |>
  dplyr::mutate(
    store_usb = stringr::str_detect(
      storage_active_projects,
      "External USB or flash drive"
    ),
    store_pc_lab = stringr::str_detect(
      storage_active_projects,
      "Personal/lab computer"
    ),
    store_dept_coll_server = stringr::str_detect(
      storage_active_projects,
      "Departmental/college server"
    ),
    store_icds = stringr::str_detect(
      storage_active_projects,
      "ICDS/ROAR allocation"
    ),
    store_onedrive = stringr::str_detect(
      storage_active_projects,
      "Microsoft OneDrive/SharePoint"
    ),
    store_googledrive = stringr::str_detect(
      storage_active_projects,
      "Google Drive"
    ),
    store_dropbox = stringr::str_detect(storage_active_projects, "Dropbox"),
    # NOTE(review): the pattern for store_box was cut off in this listing;
    # "Box" is assumed (case-sensitive, so it does not match "Dropbox").
    # TODO confirm against the survey's option label.
    store_box = stringr::str_detect(storage_active_projects, "Box")
  )

Modify barriers_sharing_collab

TODO: Clean these.

Modify barriers_share_community

TODO: Handle “other” cases.

# Derive one logical column per sharing barrier by searching
# `barriers_share_community` for each option's label. The `dplyr::mutate(`
# wrapper, the column argument and the closing parentheses are restored so
# that every str_detect() call receives both a string and a pattern.
survey <- survey |>
  dplyr::mutate(
    barriers_sharing_security = stringr::str_detect(
      barriers_share_community,
      "Ensuring security/restricting access"
    ),
    barriers_sharing_curation = stringr::str_detect(
      barriers_share_community,
      "Taking time to curate, organize, document data"
    ),
    barriers_sharing_alter_before_share = stringr::str_detect(
      barriers_share_community,
      "Altering data to make it suitable to share"
    ),
    barriers_sharing_resources = stringr::str_detect(
      barriers_share_community,
      "Insufficient resources for sharing"
    ),
    barriers_sharing_staff = stringr::str_detect(
      barriers_share_community,
      "Lack of available or knowledgeable staff"
    )
  )

Modify where_shared_community

# Derive one logical column per sharing venue by searching
# `where_shared_community` for each option's label. The missing
# `dplyr::mutate(` wrappers and column argument are restored.
survey <- survey |>
  dplyr::mutate(
    share_inst_repo = stringr::str_detect(
      where_shared_community,
      "Institutional repository"
    ),
    share_journal_suppl = stringr::str_detect(
      where_shared_community,
      "Supplemental material linked to journal article"
    ),
    share_lab_web = stringr::str_detect(
      where_shared_community,
      "Lab/project website"
    ),
    share_ext_repo = stringr::str_detect(
      where_shared_community,
      "External data repository"
    ),
    share_govt_repo = stringr::str_detect(
      where_shared_community,
      "Government data repository"
    ),
    share_consortia = stringr::str_detect(
      where_shared_community,
      "Research consortia"
    )
  )

Modify knowledge_open_science

# Shorten the knowledge_open_science response labels to one word each. The
# `dplyr::mutate(` wrapper, the column passed to recode() and the closing
# parentheses are restored so the statement parses.
survey <- survey |>
  dplyr::mutate(
    knowledge_open_science = dplyr::recode(
      knowledge_open_science,
      `No experience/knowledge` = "None",
      `Limited experience/knowledge` = "Limited",
      `Some experience/knowledge` = "Some",
      `Considerable experience/knowledge` = "Considerable",
      `Extensive experience/knowledge` = "Extensive"
    )
  )

Modify awareness_FAIR

# Shorten the awareness_FAIR response labels to one word each. The
# `dplyr::mutate(` wrapper, the column passed to recode() and the closing
# parentheses are restored so the statement parses.
survey <- survey |>
  dplyr::mutate(
    awareness_FAIR = dplyr::recode(
      awareness_FAIR,
      `No awareness` = "None",
      `Limited awareness` = "Limited",
      `Some awareness` = "Some",
      `Considerable awareness` = "Considerable",
      `Extensive awareness` = "Extensive"
    )
  )

Modify benefit_psu_center

# Shorten the benefit_psu_center response labels to one word each. The
# `dplyr::mutate(` wrapper, the column passed to recode() and the closing
# parentheses are restored so the statement parses.
survey <- survey |>
  dplyr::mutate(
    benefit_psu_center = dplyr::recode(
      benefit_psu_center,
      `No benefit` = "None",
      `Minimal benefit` = "Minimal",
      `Some benefit` = "Some",
      `Considerable benefit` = "Considerable",
      `Extensive benefit` = "Extensive"
    )
  )

Modify service_psu_center

# Derive one logical column per requested service by searching
# `service_psu_center` for each option's label. The missing `dplyr::mutate(`
# wrappers and the final closing parentheses are restored.
survey <- survey |>
  dplyr::mutate(
    help_data_review_qa = stringr::str_detect(
      service_psu_center,
      "Data review and quality"
    ),
    help_data_mgmt_plan = stringr::str_detect(
      service_psu_center,
      "Data management plan"
    ),
    help_data_doc = stringr::str_detect(
      service_psu_center,
      "Data documentation"
    ),
    help_data_analysis_verif = stringr::str_detect(
      service_psu_center,
      "Third party verification"
    ),
    help_student_staff_train = stringr::str_detect(
      service_psu_center,
      "Training and technical assistance"
    ),
    help_data_deidentif = stringr::str_detect(
      service_psu_center,
      "De-identification or anonymization"
    ),
    help_funder_compliance = stringr::str_detect(
      service_psu_center,
      "Ensuring compliance with funding"
    ),
    help_where_to_share = stringr::str_detect(
      service_psu_center,
      "recommendation of suitable"
    )
  )

Re-export cleaned data

# Write the cleaned survey data to its own CSV file.
survey |>
  readr::write_csv(file = "csv/open-science-survey-2022-fall-clean.csv")

Select and export contact data for follow-up

contact_info <- survey |>
  dplyr::select(contact_info) |>