Data Gathering and Cleaning
Set-up
We ensure that all package dependencies are installed.
# Install each dependency if it is missing, then attach it.
# `library()` is called unconditionally so that a freshly installed package is
# actually attached (and so that a failed install stops the script here rather
# than surfacing later as a "could not find function" error).
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)

if (!requireNamespace("googledrive", quietly = TRUE)) {
  install.packages("googledrive")
}
library(googledrive)
## Loading required package: googledrive
TODO: Convert code from the magrittr pipe (`%>%`) to the native R pipe (`|>`).
Download data
The survey was generated and data collected using Google Forms. The survey questions are here: https://forms.gle/oT2ekzCsw7KVU8YU8.
We have separated the data update process from the generation of this report since some manual cleaning of the department names must be done first. So, the typical workflow is to run the following at the console:
# Load the project's helper functions (update_data(), open_survey(),
# clean_names(), show_unique_depts() are presumably defined there -- confirm).
source("../R/functions.R")

# Refresh the local copy of the survey data; the Google credentials are read
# from the GMAIL_ROG environment variable.
update_data(force_update = TRUE, google_credentials = Sys.getenv("GMAIL_ROG"))

# Open the survey, normalise its column names, and list the department values
# so they can be checked by hand.
survey <- open_survey()
survey |>
  clean_names() |>
  show_unique_depts()
The results of `show_unique_depts()` can then be compared to the code below, and updates made as needed to handle various edge cases.
This workflow could be improved.
One idea would be to use the targets package to update the data at specified intervals and then trigger the data cleaning operations.
Load data
Load the data file.
# Read the raw survey export when it is present; otherwise report the missing
# file and leave `survey` as NULL.
survey <- if (!file.exists("csv/open-science-survey-2022-fall.csv")) {
  message("File not found: ", "csv/open-science-survey-2022-fall.csv")
  NULL
} else {
  readr::read_csv(
    "csv/open-science-survey-2022-fall.csv",
    show_col_types = FALSE
  )
}
There are \(n = 104\) responses.
Clean data
Examine the variable names.
## [1] "Timestamp"
## [2] "What Penn State campus do you represent?"
## [3] "What is your primary department/unit?"
## [4] "What is your position at Penn State?"
## [5] "How many years have passed since you completed that degree?"
## [6] "What are the primary types of digital data that are used in your research? (choose all that apply)"
## [7] "Do you collect data that have legal or ethical restrictions governing who may access it or how it may be used?"
## [8] "Where do you store data for active projects where data collection and analysis is still ongoing?"
## [9] "How important to you is sharing data from active projects with research collaborators at Penn State or outside of Penn State?"
## [10] "How convenient is it for you to share data from active projects with research collaborators at Penn State or outside of Penn State?"
## [11] "What are the main barriers to sharing data from active projects with research collaborators?"
## [12] "How important to you is sharing data from completed projects with the broader research community (i.e., not direct collaborators)?"
## [13] "Which of the following obstacles make sharing data with the research community harder for you? Mark all that apply."
## [14] "Do research funders in your field require data sharing?"
## [15] "Do journals in your field require data sharing?"
## [16] "If you have shared data with the research community, where have you shared it?"
## [17] "How well-equipped do you feel you, your colleagues, and trainees are to meet data management and sharing requirements of sponsors/funders or journals?"
## [18] "How often do you create computer scripts or data analysis code in the conduct of your research?"
## [19] "How often do you share computer scripts or data analysis code with direct research collaborators ?"
## [20] "Do you create other kinds of software in the conduct of your research?"
## [21] "How often do you use open source code sharing tools (e.g., GitHub, GitLab, BitBucket)?"
## [22] "Do funders in your field require code sharing?"
## [23] "Do journals in your field require code sharing?"
## [24] "How often do you openly share other materials related to your research (protocols, reagents, samples, apparatus, designs, etc.) with other researchers?"
## [25] "What is your experience with/knowledge of open science practices?"
## [26] "Describe your awareness of the FAIR (findable, accessible, interoperable, reusable) principles pertaining to research data."
## [27] "Do you apply FAIR principles in your own data management and sharing practices?"
## [28] "Have you heard of the \"reproducibility crisis\" in science?"
## [29] "Is there a reproducibility crisis in your area of research?"
## [30] "How much benefit would you derive from a center at Penn State focused on supporting the adoption of best practices in data management and sharing, code sharing, open science, and reproducible research?"
## [31] "Select the services that would most benefit your research if offered by such a center."
## [32] "Any final comments about data management, data sharing, and open science?"
## [33] "(Optional) Provide us with your contact information if you would like us to follow up."
## [34] "What is the highest post-secondary degree you have earned?"
## [35] "How often do you share computer scripts or data analysis code openly?"
Let’s rename them.
# Keep the full question wording before the columns are renamed.
full_questions <- names(survey)

# Short column names, listed in the same order as the survey questions.
short_names <- c(
  # Respondent background
  "timestamp", "campus", "department", "position", "years_since_degree",
  # Data collected and how it is stored/shared
  "data_types", "restricted_data", "storage_active_projects",
  "importance_sharing_collab", "convenience_sharing_collab",
  "barriers_sharing_collab", "importance_share_community",
  "barriers_share_community", "funders_require_data_sharing",
  "journals_require_data_sharing", "where_shared_community",
  "equipped_data_mgmt_sharing",
  # Code and materials
  "create_analysis_code", "share_analysis_code_collab", "create_other_code",
  "use_code_sharing_tools", "funders_require_code_sharing",
  "journals_require_code_sharing", "share_materials_community",
  # Open science awareness
  "knowledge_open_science", "awareness_FAIR", "apply_FAIR",
  "heardof_reproducibility_crisis", "my_area_reproducibility_crisis",
  # Proposed center, free text, and trailing questions
  "benefit_psu_center", "service_psu_center", "comments", "contact_info",
  "highest_degree_earned", "share_analysis_code_community"
)

# Rename only when there is exactly one short name per column; otherwise leave
# the data untouched and say so.
if (length(short_names) != length(full_questions)) {
  message("Name vector lengths differ; no change made.")
} else {
  names(survey) <- short_names
}
Some of the variables have values that are easy to parse, while others (`data_types`, for example) are more challenging.
Modify campus
Make all campus locations lowercase and replace white space with underscores.
# Normalise campus names: lowercase, with every space replaced by an
# underscore. `str_replace_all()` is required here because `str_replace()`
# only replaces the first match, which would leave later spaces in
# multi-word campus names.
survey <- survey |>
  dplyr::mutate(
    campus = stringr::str_replace_all(tolower(campus), " ", "_")
  )
Modify position
Make lowercase and replace spaces and dashes with underscores.
# Normalise position labels: lowercase, with each space or dash turned into an
# underscore.
survey <- survey |>
  dplyr::mutate(
    position = stringr::str_replace_all(tolower(position), "[ -]", "_")
  )
Modify data_types
# `data_types` is a "choose all that apply" answer stored as one string.
# Derive one logical indicator column per option: each is TRUE when the
# option's text is found in the response.
survey <- survey |>
  dplyr::mutate(
    collect_audio = stringr::str_detect(data_types, "Audio files"),
    collect_video = stringr::str_detect(data_types, "Video files"),
    collect_photos = stringr::str_detect(
      data_types, "Digital photographs and/or other images"
    ),
    collect_computer_data = stringr::str_detect(
      data_types, "Data automatically generated from or by computer programs"
    ),
    collect_sensor = stringr::str_detect(
      data_types, "Data collected from sensors"
    ),
    collect_docs = stringr::str_detect(data_types, "Documents or reports"),
    collect_models = stringr::str_detect(data_types, "Models/algorithms"),
    collect_obs = stringr::str_detect(data_types, "Observational data"),
    collect_sims = stringr::str_detect(
      data_types, "Simulation data, models, and software code"
    ),
    collect_procedures = stringr::str_detect(
      data_types, "Standard operating procedures and protocols"
    ),
    collect_txt = stringr::str_detect(data_types, "Text files"),
    collect_genomic = stringr::str_detect(data_types, "Genomic"),
    collect_image = stringr::str_detect(data_types, "Image data"),
    collect_surveys = stringr::str_detect(data_types, "Survey results"),
    collect_spreadsheets = stringr::str_detect(data_types, "Spreadsheets"),
    collect_interviews = stringr::str_detect(
      data_types, "interview transcripts"
    ),
    collect_gis = stringr::str_detect(
      data_types, "Geographic Information Systems"
    ),
    collect_sketches = stringr::str_detect(
      data_types, "Sketches, diaries in digital form"
    ),
    collect_vr = stringr::str_detect(data_types, "Virtual reality, 3D models"),
    collect_xml_json = stringr::str_detect(
      data_types, "Structured text files"
    ),
    collect_web_social = stringr::str_detect(data_types, "Websites and blogs")
  )
Modify restricted_data
# Derive one logical indicator per kind of restriction mentioned in the
# `restricted_data` answer. Every call is namespace-qualified so the block
# does not depend on stringr (or tidyverse) being attached.
survey <- survey |>
  dplyr::mutate(
    restricted_ethical = stringr::str_detect(
      restricted_data, "ethical concerns"
    ),
    restricted_legal_ip = stringr::str_detect(
      restricted_data, "legal/intellectual"
    ),
    restricted_sponsor = stringr::str_detect(
      restricted_data, "contractual restrictions"
    ),
    restricted_none = stringr::str_detect(
      restricted_data, "No; My data are not restricted"
    )
  )
Modify storage_active_projects
TODO: Handle other options
# Derive one logical indicator per storage location named in the
# `storage_active_projects` answer.
survey <- survey |>
  dplyr::mutate(
    store_usb = stringr::str_detect(
      storage_active_projects, "External USB or flash drive"
    ),
    store_pc_lab = stringr::str_detect(
      storage_active_projects, "Personal/lab computer"
    ),
    store_dept_coll_server = stringr::str_detect(
      storage_active_projects, "Departmental/college server"
    ),
    store_icds = stringr::str_detect(
      storage_active_projects, "ICDS/ROAR allocation"
    ),
    store_onedrive = stringr::str_detect(
      storage_active_projects, "Microsoft OneDrive/SharePoint"
    ),
    store_googledrive = stringr::str_detect(
      storage_active_projects, "Google Drive"
    ),
    store_dropbox = stringr::str_detect(storage_active_projects, "Dropbox"),
    # Matching is case-sensitive, so "Box" does not also match "Dropbox".
    store_box = stringr::str_detect(storage_active_projects, "Box")
  )
Modify knowledge_open_science
# Shorten the open-science experience labels to a single word each.
# `recode()` is namespace-qualified, like the other dplyr calls in this file,
# so the block does not depend on dplyr (or tidyverse) being attached.
survey <- survey |>
  dplyr::mutate(
    knowledge_open_science = dplyr::recode(
      knowledge_open_science,
      `No experience/knowledge` = "None",
      `Limited experience/knowledge` = "Limited",
      `Some experience/knowledge` = "Some",
      `Considerable experience/knowledge` = "Considerable",
      `Extensive experience/knowledge` = "Extensive"
    )
  )
Modify service_psu_center
# Derive one logical indicator per service selected in the
# `service_psu_center` answer.
survey <- survey |>
  dplyr::mutate(
    help_data_review_qa = stringr::str_detect(
      service_psu_center, "Data review and quality"
    ),
    help_data_mgmt_plan = stringr::str_detect(
      service_psu_center, "Data management plan"
    ),
    help_data_doc = stringr::str_detect(
      service_psu_center, "Data documentation"
    ),
    help_data_analysis_verif = stringr::str_detect(
      service_psu_center, "Third party verification"
    ),
    help_student_staff_train = stringr::str_detect(
      service_psu_center, "Training and technical assistance"
    ),
    help_data_deidentif = stringr::str_detect(
      service_psu_center, "De-identification or anonymization"
    ),
    help_funder_compliance = stringr::str_detect(
      service_psu_center, "Ensuring compliance with funding"
    ),
    help_where_to_share = stringr::str_detect(
      service_psu_center, "recommendation of suitable"
    )
  )
Re-export cleaned data
# Write the cleaned survey to its own CSV file, separate from the raw export.
survey |>
  readr::write_csv("csv/open-science-survey-2022-fall-clean.csv")