Cleaning r/biotech Salary Data

Published

June 27, 2025

Modified

June 27, 2025

set out dir

if (interactive()){
  outdir <- here::here()
} else {
  outdir <- here::here(params$outdir)
}

libraries

library(readr)
library(here)
library(purrr)
library(glue)
library(lubridate)
library(forcats)
library(dplyr)
library(tidyr)
library(gt)
library(ggplot2)
library(skimr)
library(janitor)
library(stringr)
library(tictoc)
library(usethis)
library(reactable)

preprocess excel data

link to google sheets

I extract the sheets into separate csv files so that they can display on github and be more easily used.

# run this once, manually
library(googlesheets4)
gs4_deauth()
google_sheets_url <- 'https://docs.google.com/spreadsheets/d/1G0FmJhkOME_sv66hWmhnZS5qR2KMTY7nzkxksv46bfk/edit#gid=491268892'
sheet_names <- googlesheets4::sheet_names(google_sheets_url)

yearly survey version differences

sal <- sheet_names |>  
  # read it as character values to avoid list-column headaches
  map(
    \(x) read_sheet(google_sheets_url, sheet = x, col_types = 'c')
  ) 
✔ Reading from "r/biotech salary and company survey".
✔ Range ''2025''.
✔ Reading from "r/biotech salary and company survey".
✔ Range ''2024''.
✔ Reading from "r/biotech salary and company survey".
✔ Range ''2023''.
✔ Reading from "r/biotech salary and company survey".
✔ Range ''2022''.
sal <-  sal |> 
  set_names(sheet_names) |> 
  bind_rows(.id = 'sheet_name')
sal  |>  glimpse()
Rows: 3,664
Columns: 68
$ sheet_name                                                                                                                                                                                                                                                                                                                                                       <chr> …
$ Timestamp                                                                                                                                                                                                                                                                                                                                                        <chr> …
$ `Email Address`                                                                                                                                                                                                                                                                                                                                                  <chr> …
$ `Which country do you work in?`                                                                                                                                                                                                                                                                                                                                  <chr> …
$ `What City do you work in?`                                                                                                                                                                                                                                                                                                                                      <chr> …
$ `Which US state do you work in?`                                                                                                                                                                                                                                                                                                                                 <chr> …
$ `Which Canadian Province do you work in?`                                                                                                                                                                                                                                                                                                                        <chr> …
$ `Company or Institution Name`                                                                                                                                                                                                                                                                                                                                    <chr> …
$ `Which of the following best describes your company?`                                                                                                                                                                                                                                                                                                            <chr> …
$ `Biotech sub industry?`                                                                                                                                                                                                                                                                                                                                          <chr> …
$ `Company Detail - Approximate Company Size`                                                                                                                                                                                                                                                                                                                      <chr> …
$ `Provide a review and rate your company/institution and experience`                                                                                                                                                                                                                                                                                              <chr> …
$ `Work Life Balance - On average, how many hours do you work per week`                                                                                                                                                                                                                                                                                            <chr> …
$ `What is your official job title?\n\nThis should match the title that was present on your offer letter. \n\nSome examples: Research Associate, Scientist, Scientist I/II/III, Senior Scientist, Principal Scientist, Manager, Lab Technician, VP, Director etc.\n\nIf your job title includes additional detail, please include e.g. Director of Bioinformatics` <chr> …
$ `Which department best describes your role? If you can't find one that fits your specific role, select 'Other' and describe`                                                                                                                                                                                                                                     <chr> …
$ `Briefly describe your position and responsibilities`                                                                                                                                                                                                                                                                                                            <chr> …
$ `Do you work In-Person, Remote, or Hybrid?`                                                                                                                                                                                                                                                                                                                      <chr> …
$ `How many days on average per week do you work from home?`                                                                                                                                                                                                                                                                                                       <chr> …
$ `What degrees do you have?`                                                                                                                                                                                                                                                                                                                                      <chr> …
$ `How many total years of experience in post-docs do you have? (If none, enter 0)`                                                                                                                                                                                                                                                                                <chr> …
$ `List other relevant and recognized certifications`                                                                                                                                                                                                                                                                                                              <chr> …
$ `Please enter your total years of experience in the field\n\nDO NOT INCLUDE years in BSc, Masters, PhD, or post-bacc/post-doc`                                                                                                                                                                                                                                   <chr> …
$ `What currency will you be answering these questions in?`                                                                                                                                                                                                                                                                                                        <chr> …
$ `Are you a salaried or hourly employee`                                                                                                                                                                                                                                                                                                                          <chr> …
$ `If you are an hourly employee, how many hours per week do you work?`                                                                                                                                                                                                                                                                                            <chr> …
$ `Annual Base Salary \n\nWhat is your annual BASE salary? Do not include bonus or stocks\n\nIf you are paid hourly, multiply by 2080 to convert to yearly (assuming you work 40hr/week)`                                                                                                                                                                          <chr> …
$ `Overtime Pay\n\nHow is overtime handled on a per hour basis? (1.5x base, 2x base, etc)`                                                                                                                                                                                                                                                                         <chr> …
$ `Annual Target Bonus in PERCENTAGE (%)\n\nThis should be stipulated prior to your employment. DO NOT include one-time bonuses (spot bonuses, signing bonuses, referral bonuses).`                                                                                                                                                                                <chr> …
$ `Commission\n\nIf you are compensated with commission how much is your commission? (Please indicate if answering in percentage or absolute monetary value)`                                                                                                                                                                                                      <chr> …
$ `Annual Equity/Stock Option\n\nPlease enter the grant/market value of your long-term incentive. Use the annualized amount if distributed over several years. Enter 0 if none or not applicable. \n\nDO NOT include equity when signing on or one-time equity bonuses`                                                                                            <chr> …
$ `Most recent annual yearly raise (%)\n\nCompanies typically do yearly raises across the board for all employees. This is not to be confused with promotions or raises that were requested.`                                                                                                                                                                      <chr> …
$ `[Sign on] Bonus Value\nThis is a one-time bonus you may have received when signing-on. This is not a bonus that reoccurs`                                                                                                                                                                                                                                       <chr> …
$ `[Sign on] Stock/Equity Options\nBriefly describe sign on stock/equity options that is available`                                                                                                                                                                                                                                                                <chr> …
$ `[Sign on] Relocation Assistance Total Value\n\nThis is the total value of the relocation assistance including but not limited to tax-true ups company may have paid, movers expenses, shipping of vehicles, temporary living, etc.`                                                                                                                             <chr> …
$ `Retirement Benefits (free text)\nBriefly describe retirement benefits. For example 401k match %, pension, etc`                                                                                                                                                                                                                                                  <chr> …
$ `Healthcare Benefits (free text)\nDescribe healthcare benefits - costs, deductibles, outfamily plan, insurance provider, total out of pocket, HSA contributions, etc`                                                                                                                                                                                            <chr> …
$ `Select the highest level of education that you have that's relevant to your occupation. If you have multiple (e.g. PhD + MD) please select "Other" and describe`                                                                                                                                                                                                <chr> …
$ `How many years have you been working at this current position?`                                                                                                                                                                                                                                                                                                 <chr> …
$ `What currency will you be answering these quesitons in?`                                                                                                                                                                                                                                                                                                        <chr> …
$ `Survey feedback`                                                                                                                                                                                                                                                                                                                                                <chr> …
$ `Any other type of annual bonus? Please describe.\n\n DO NOT include one-time bonuses (spot bonuses, signing bonuses, referral bonuses).`                                                                                                                                                                                                                        <chr> …
$ `How much of your bonus did you receive in the last cycle?\n\ne.g. maybe your target bonus was 10%, but you actually got 8% this year. So here you would write "8%"`                                                                                                                                                                                             <chr> …
$ `What country do you work in?`                                                                                                                                                                                                                                                                                                                                   <chr> …
$ `Where is the closest major city or hub?`                                                                                                                                                                                                                                                                                                                        <chr> …
$ `Where are you located?`                                                                                                                                                                                                                                                                                                                                         <chr> …
$ `Company Details - public/private/start-up/ subsidiary of`                                                                                                                                                                                                                                                                                                       <chr> …
$ `Role / Title of current position`                                                                                                                                                                                                                                                                                                                               <chr> …
$ `[Optional] Briefly describe your position`                                                                                                                                                                                                                                                                                                                      <chr> …
$ `Years of Experience`                                                                                                                                                                                                                                                                                                                                            <chr> …
$ `Compensation - Annual Base Salary/Pay`                                                                                                                                                                                                                                                                                                                          <chr> …
$ `Compensation - Overtime Pay`                                                                                                                                                                                                                                                                                                                                    <chr> …
$ `Compensation - Annual Target Bonus ($)`                                                                                                                                                                                                                                                                                                                         <chr> …
$ `Compensation - Annual Equity/Stock Option`                                                                                                                                                                                                                                                                                                                      <chr> …
$ `Compensation - Most recent annual yearly raise (%)`                                                                                                                                                                                                                                                                                                             <chr> …
$ `Compensation - Sign on Bonus Value`                                                                                                                                                                                                                                                                                                                             <chr> …
$ `Compensation - Sign on Stock/Equity Options`                                                                                                                                                                                                                                                                                                                    <chr> …
$ `Compensation - Retirement Benefits (free text)`                                                                                                                                                                                                                                                                                                                 <chr> …
$ `Compensation - Retirement Percent Match (free text)`                                                                                                                                                                                                                                                                                                            <chr> …
$ `Compensation - Healthcare Benefits (free text)`                                                                                                                                                                                                                                                                                                                 <chr> …
$ `[OPTIONAL] Sign on - Relocation Assistance Total Value`                                                                                                                                                                                                                                                                                                         <chr> …
$ `[Optional] Company Review`                                                                                                                                                                                                                                                                                                                                      <chr> …
$ `[Optional] Work Life Balance - On average, how many hours do you work per week`                                                                                                                                                                                                                                                                                 <chr> …
$ `Survey Feedback`                                                                                                                                                                                                                                                                                                                                                <chr> …
$ `Highest achieved Formal Education`                                                                                                                                                                                                                                                                                                                              <chr> …
$ `Compensation - Annual Equity`                                                                                                                                                                                                                                                                                                                                   <chr> …
$ `Compensation - Stock Options`                                                                                                                                                                                                                                                                                                                                   <chr> …
$ `Compensation - Options`                                                                                                                                                                                                                                                                                                                                         <chr> …
$ `Have you one a post-doc?`                                                                                                                                                                                                                                                                                                                                       <chr> …
sal |> count(sheet_name)
# A tibble: 4 × 2
  sheet_name     n
  <chr>      <int>
1 2022         576
2 2023         670
3 2024        1750
4 2025         668
missing_variables_by_year <- sal |> 
  summarize(
    .by = sheet_name,
    across(everything(), ~sum(is.na(.)))
  ) |>  
  pivot_longer(
    cols = -c(sheet_name),
    names_to = 'variable',
    values_to = 'n_missing'
  ) |>  
  arrange(desc(sheet_name), desc(n_missing)) 
missing_variables_by_year |>   reactable(groupBy = 'sheet_name')

Initial exploratory

Completeness

Look at response rate per question (number of NAs):

# assess NAs
percent_na <- function(x, invert = FALSE) {
  p <- sum(is.na(x)) / length(x)
  if (invert) { p <- 1 - p}
  p
}

completeness <- sal |> 
  summarize(
    .by = sheet_name,
    across(everything(), ~percent_na(.x, invert = TRUE))
  )   |> 
  mutate(across(everything(), ~ifelse(. == 0, NA, .))) |> 
  pivot_longer(-sheet_name, names_to = 'variable', values_to = '% responded') |> 
  pivot_wider(names_from = sheet_name, values_from = '% responded')  

reactable_na <- function(df) {
  df |> 
    gt() |> 
    fmt_percent(columns = -variable, decimals = 0) |> 
    data_color(
      columns = -variable, 
      palette = c('white', 'blue'),
      na_color = 'white',
      domain = c(0, 1)) |> 
    sub_missing(missing_text = '0') |> 
    tab_options(table.width = px(500))
}
reactable_na(completeness) 
variable 2025 2024 2023 2022
Timestamp 100% 100% 100% 100%
Email Address 0 0 0 0
Which country do you work in? 100% 0 0 0
What City do you work in? 10% 0 0 0
Which US state do you work in? 87% 0 0 0
Which Canadian Province do you work in? 3% 0 0 0
Company or Institution Name 100% 100% 100% 26%
Which of the following best describes your company? 100% 0 0 0
Biotech sub industry? 100% 100% 100% 100%
Company Detail - Approximate Company Size 100% 99% 100% 99%
Provide a review and rate your company/institution and experience 35% 0 0 0
Work Life Balance - On average, how many hours do you work per week 100% 0 0 0
What is your official job title? This should match the title that was present on your offer letter. Some examples: Research Associate, Scientist, Scientist I/II/III, Senior Scientist, Principal Scientist, Manager, Lab Technician, VP, Director etc. If your job title includes additional detail, please include e.g. Director of Bioinformatics 100% 0 0 0
Which department best describes your role? If you can't find one that fits your specific role, select 'Other' and describe 100% 0 0 0
Briefly describe your position and responsibilities 57% 0 0 0
Do you work In-Person, Remote, or Hybrid? 100% 0 0 0
How many days on average per week do you work from home? 100% 0 0 0
What degrees do you have? 0 100% 0 0
How many total years of experience in post-docs do you have? (If none, enter 0) 100% 0 0 0
List other relevant and recognized certifications 9% 100% 100% 0
Please enter your total years of experience in the field DO NOT INCLUDE years in BSc, Masters, PhD, or post-bacc/post-doc 100% 0 0 0
What currency will you be answering these questions in? 0 0 0 0
Are you a salaried or hourly employee 100% 0 0 0
If you are an hourly employee, how many hours per week do you work? 100% 0 0 0
Annual Base Salary What is your annual BASE salary? Do not include bonus or stocks If you are paid hourly, multiply by 2080 to convert to yearly (assuming you work 40hr/week) 100% 0 0 0
Overtime Pay How is overtime handled on a per hour basis? (1.5x base, 2x base, etc) 100% 0 0 0
Annual Target Bonus in PERCENTAGE (%) This should be stipulated prior to your employment. DO NOT include one-time bonuses (spot bonuses, signing bonuses, referral bonuses). 86% 0 0 0
Commission If you are compensated with commission how much is your commission? (Please indicate if answering in percentage or absolute monetary value) 100% 0 0 0
Annual Equity/Stock Option Please enter the grant/market value of your long-term incentive. Use the annualized amount if distributed over several years. Enter 0 if none or not applicable. DO NOT include equity when signing on or one-time equity bonuses 100% 0 0 0
Most recent annual yearly raise (%) Companies typically do yearly raises across the board for all employees. This is not to be confused with promotions or raises that were requested. 100% 0 0 0
[Sign on] Bonus Value This is a one-time bonus you may have received when signing-on. This is not a bonus that reoccurs 41% 0 0 0
[Sign on] Stock/Equity Options Briefly describe sign on stock/equity options that is available 36% 0 0 0
[Sign on] Relocation Assistance Total Value This is the total value of the relocation assistance including but not limited to tax-true ups company may have paid, movers expenses, shipping of vehicles, temporary living, etc. 34% 0 0 0
Retirement Benefits (free text) Briefly describe retirement benefits. For example 401k match %, pension, etc 70% 0 0 0
Healthcare Benefits (free text) Describe healthcare benefits - costs, deductibles, outfamily plan, insurance provider, total out of pocket, HSA contributions, etc 47% 0 0 0
Select the highest level of education that you have that's relevant to your occupation. If you have multiple (e.g. PhD + MD) please select "Other" and describe 100% 0 0 0
How many years have you been working at this current position? 100% 0 0 0
What currency will you be answering these quesitons in? 100% 0 0 0
Survey feedback 9% 0 0 0
Any other type of annual bonus? Please describe. DO NOT include one-time bonuses (spot bonuses, signing bonuses, referral bonuses). 21% 0 0 0
How much of your bonus did you receive in the last cycle? e.g. maybe your target bonus was 10%, but you actually got 8% this year. So here you would write "8%" 57% 0 0 0
What country do you work in? 0 10% 100% 0
Where is the closest major city or hub? 0 10% 100% 100%
Where are you located? 0 90% 0 0
Company Details - public/private/start-up/ subsidiary of 0 100% 100% 100%
Role / Title of current position 0 100% 100% 100%
[Optional] Briefly describe your position 0 48% 52% 0
Years of Experience 0 100% 100% 100%
Compensation - Annual Base Salary/Pay 0 100% 100% 100%
Compensation - Overtime Pay 0 100% 100% 0
Compensation - Annual Target Bonus ($) 0 100% 100% 100%
Compensation - Annual Equity/Stock Option 0 100% 0 0
Compensation - Most recent annual yearly raise (%) 0 100% 100% 0
Compensation - Sign on Bonus Value 0 50% 51% 37%
Compensation - Sign on Stock/Equity Options 0 38% 0 0
Compensation - Retirement Benefits (free text) 0 97% 64% 0
Compensation - Retirement Percent Match (free text) 0 0 0 0
Compensation - Healthcare Benefits (free text) 0 44% 41% 0
[OPTIONAL] Sign on - Relocation Assistance Total Value 0 21% 18% 24%
[Optional] Company Review 0 23% 23% 0
[Optional] Work Life Balance - On average, how many hours do you work per week 0 94% 0 0
Survey Feedback 0 6% 0 0
Highest achieved Formal Education 0 0 100% 100%
Compensation - Annual Equity 0 0 100% 100%
Compensation - Stock Options 0 0 41% 0
Compensation - Options 0 0 0 0
Have you one a post-doc? 0 0 0 0

Remove those with 0 answers:

# remove variables that are all NA:
var_all_na <- sal |>  
  skim()  |>   
  filter(n_missing == nrow(sal)) |> 
  pull(skim_variable)

message('removing variables that are have all missing values')
removing variables that are have all missing values
sal_clean <- sal |> 
  select(-any_of(var_all_na))
message(glue(
  "{length(var_all_na)}/{ncol(sal)} columns removed that are all NA"
))
5/68 columns removed that are all NA
print(var_all_na)
[1] "Email Address"                                          
[2] "What currency will you be answering these questions in?"
[3] "Compensation - Retirement Percent Match (free text)"    
[4] "Compensation - Options"                                 
[5] "Have you one a post-doc?"                               

Rename column names

I rename column names to make them easier to call:

# view changed colnames
tibble(original = colnames(sal)) |> 
  mutate(new = janitor::make_clean_names(original)) |> 
  gt() 
original new
sheet_name sheet_name
Timestamp timestamp
Email Address email_address
Which country do you work in? which_country_do_you_work_in
What City do you work in? what_city_do_you_work_in
Which US state do you work in? which_us_state_do_you_work_in
Which Canadian Province do you work in? which_canadian_province_do_you_work_in
Company or Institution Name company_or_institution_name
Which of the following best describes your company? which_of_the_following_best_describes_your_company
Biotech sub industry? biotech_sub_industry
Company Detail - Approximate Company Size company_detail_approximate_company_size
Provide a review and rate your company/institution and experience provide_a_review_and_rate_your_company_institution_and_experience
Work Life Balance - On average, how many hours do you work per week work_life_balance_on_average_how_many_hours_do_you_work_per_week
What is your official job title? This should match the title that was present on your offer letter. Some examples: Research Associate, Scientist, Scientist I/II/III, Senior Scientist, Principal Scientist, Manager, Lab Technician, VP, Director etc. If your job title includes additional detail, please include e.g. Director of Bioinformatics what_is_your_official_job_title_this_should_match_the_title_that_was_present_on_your_offer_letter_some_examples_research_associate_scientist_scientist_i_ii_iii_senior_scientist_principal_scientist_manager_lab_technician_vp_director_etc_if_your_job_title_includes_additional_detail_please_include_e_g_director_of_bioinformatics
Which department best describes your role? If you can't find one that fits your specific role, select 'Other' and describe which_department_best_describes_your_role_if_you_cant_find_one_that_fits_your_specific_role_select_other_and_describe
Briefly describe your position and responsibilities briefly_describe_your_position_and_responsibilities
Do you work In-Person, Remote, or Hybrid? do_you_work_in_person_remote_or_hybrid
How many days on average per week do you work from home? how_many_days_on_average_per_week_do_you_work_from_home
What degrees do you have? what_degrees_do_you_have
How many total years of experience in post-docs do you have? (If none, enter 0) how_many_total_years_of_experience_in_post_docs_do_you_have_if_none_enter_0
List other relevant and recognized certifications list_other_relevant_and_recognized_certifications
Please enter your total years of experience in the field DO NOT INCLUDE years in BSc, Masters, PhD, or post-bacc/post-doc please_enter_your_total_years_of_experience_in_the_field_do_not_include_years_in_b_sc_masters_ph_d_or_post_bacc_post_doc
What currency will you be answering these questions in? what_currency_will_you_be_answering_these_questions_in
Are you a salaried or hourly employee are_you_a_salaried_or_hourly_employee
If you are an hourly employee, how many hours per week do you work? if_you_are_an_hourly_employee_how_many_hours_per_week_do_you_work
Annual Base Salary What is your annual BASE salary? Do not include bonus or stocks If you are paid hourly, multiply by 2080 to convert to yearly (assuming you work 40hr/week) annual_base_salary_what_is_your_annual_base_salary_do_not_include_bonus_or_stocks_if_you_are_paid_hourly_multiply_by_2080_to_convert_to_yearly_assuming_you_work_40hr_week
Overtime Pay How is overtime handled on a per hour basis? (1.5x base, 2x base, etc) overtime_pay_how_is_overtime_handled_on_a_per_hour_basis_1_5x_base_2x_base_etc
Annual Target Bonus in PERCENTAGE (%) This should be stipulated prior to your employment. DO NOT include one-time bonuses (spot bonuses, signing bonuses, referral bonuses). annual_target_bonus_in_percentage_percent_this_should_be_stipulated_prior_to_your_employment_do_not_include_one_time_bonuses_spot_bonuses_signing_bonuses_referral_bonuses
Commission If you are compensated with commission how much is your commission? (Please indicate if answering in percentage or absolute monetary value) commission_if_you_are_compensated_with_commission_how_much_is_your_commission_please_indicate_if_answering_in_percentage_or_absolute_monetary_value
Annual Equity/Stock Option Please enter the grant/market value of your long-term incentive. Use the annualized amount if distributed over several years. Enter 0 if none or not applicable. DO NOT include equity when signing on or one-time equity bonuses annual_equity_stock_option_please_enter_the_grant_market_value_of_your_long_term_incentive_use_the_annualized_amount_if_distributed_over_several_years_enter_0_if_none_or_not_applicable_do_not_include_equity_when_signing_on_or_one_time_equity_bonuses
Most recent annual yearly raise (%) Companies typically do yearly raises across the board for all employees. This is not to be confused with promotions or raises that were requested. most_recent_annual_yearly_raise_percent_companies_typically_do_yearly_raises_across_the_board_for_all_employees_this_is_not_to_be_confused_with_promotions_or_raises_that_were_requested
[Sign on] Bonus Value This is a one-time bonus you may have received when signing-on. This is not a bonus that reoccurs sign_on_bonus_value_this_is_a_one_time_bonus_you_may_have_received_when_signing_on_this_is_not_a_bonus_that_reoccurs
[Sign on] Stock/Equity Options Briefly describe sign on stock/equity options that is available sign_on_stock_equity_options_briefly_describe_sign_on_stock_equity_options_that_is_available
[Sign on] Relocation Assistance Total Value This is the total value of the relocation assistance including but not limited to tax-true ups company may have paid, movers expenses, shipping of vehicles, temporary living, etc. sign_on_relocation_assistance_total_value_this_is_the_total_value_of_the_relocation_assistance_including_but_not_limited_to_tax_true_ups_company_may_have_paid_movers_expenses_shipping_of_vehicles_temporary_living_etc
Retirement Benefits (free text) Briefly describe retirement benefits. For example 401k match %, pension, etc retirement_benefits_free_text_briefly_describe_retirement_benefits_for_example_401k_match_percent_pension_etc
Healthcare Benefits (free text) Describe healthcare benefits - costs, deductibles, outfamily plan, insurance provider, total out of pocket, HSA contributions, etc healthcare_benefits_free_text_describe_healthcare_benefits_costs_deductibles_outfamily_plan_insurance_provider_total_out_of_pocket_hsa_contributions_etc
Select the highest level of education that you have that's relevant to your occupation. If you have multiple (e.g. PhD + MD) please select "Other" and describe select_the_highest_level_of_education_that_you_have_thats_relevant_to_your_occupation_if_you_have_multiple_e_g_ph_d_md_please_select_other_and_describe
How many years have you been working at this current position? how_many_years_have_you_been_working_at_this_current_position
What currency will you be answering these quesitons in? what_currency_will_you_be_answering_these_quesitons_in
Survey feedback survey_feedback
Any other type of annual bonus? Please describe. DO NOT include one-time bonuses (spot bonuses, signing bonuses, referral bonuses). any_other_type_of_annual_bonus_please_describe_do_not_include_one_time_bonuses_spot_bonuses_signing_bonuses_referral_bonuses
How much of your bonus did you receive in the last cycle? e.g. maybe your target bonus was 10%, but you actually got 8% this year. So here you would write "8%" how_much_of_your_bonus_did_you_receive_in_the_last_cycle_e_g_maybe_your_target_bonus_was_10_percent_but_you_actually_got_8_percent_this_year_so_here_you_would_write_8_percent
What country do you work in? what_country_do_you_work_in
Where is the closest major city or hub? where_is_the_closest_major_city_or_hub
Where are you located? where_are_you_located
Company Details - public/private/start-up/ subsidiary of company_details_public_private_start_up_subsidiary_of
Role / Title of current position role_title_of_current_position
[Optional] Briefly describe your position optional_briefly_describe_your_position
Years of Experience years_of_experience
Compensation - Annual Base Salary/Pay compensation_annual_base_salary_pay
Compensation - Overtime Pay compensation_overtime_pay
Compensation - Annual Target Bonus ($) compensation_annual_target_bonus
Compensation - Annual Equity/Stock Option compensation_annual_equity_stock_option
Compensation - Most recent annual yearly raise (%) compensation_most_recent_annual_yearly_raise_percent
Compensation - Sign on Bonus Value compensation_sign_on_bonus_value
Compensation - Sign on Stock/Equity Options compensation_sign_on_stock_equity_options
Compensation - Retirement Benefits (free text) compensation_retirement_benefits_free_text
Compensation - Retirement Percent Match (free text) compensation_retirement_percent_match_free_text
Compensation - Healthcare Benefits (free text) compensation_healthcare_benefits_free_text
[OPTIONAL] Sign on - Relocation Assistance Total Value optional_sign_on_relocation_assistance_total_value
[Optional] Company Review optional_company_review
[Optional] Work Life Balance - On average, how many hours do you work per week optional_work_life_balance_on_average_how_many_hours_do_you_work_per_week
Survey Feedback survey_feedback_2
Highest achieved Formal Education highest_achieved_formal_education
Compensation - Annual Equity compensation_annual_equity
Compensation - Stock Options compensation_stock_options
Compensation - Options compensation_options
Have you one a post-doc? have_you_one_a_post_doc
sal_clean <- janitor::clean_names(sal_clean)

Now I begin cleaning the variable values

Column cleaning

It would be a massive effort to clean every column. Let’s prioritize to most important ones:

  • Salary
  • base
  • target bonus (%)
  • equity
  • Title
  • Experience
  • years
  • degree
  • Location
  • country
  • city

If I have time, can look also at:

  • 401k match

salary

  • base
  • target bonus (%)
  • equity

There are a handful of responses with annual salary reportedly less than $5000. There is one person that responded with “$1” - we remove them. And then the remaining report a range from 105-210, which likely represent 105k - 210k, I multiple these ones by 1000.

base

Which column corresponds to base salary?

completeness |> filter(str_detect(tolower(variable), 'salary')) |>  reactable_na()
variable 2025 2024 2023 2022
Annual Base Salary What is your annual BASE salary? Do not include bonus or stocks If you are paid hourly, multiply by 2080 to convert to yearly (assuming you work 40hr/week) 100% 0 0 0
Compensation - Annual Base Salary/Pay 0 100% 100% 100%

2025 we changed the question, added data validation so that it has to be digits

before 2025, it was a short-form text, so it needed a lot of data cleaning.

2025 looks like this

I rename to compensation_annual_base_salary_pay_2025 first

sal_clean <- sal_clean |> rename(
  compensation_annual_base_salary_pay_2025  = annual_base_salary_what_is_your_annual_base_salary_do_not_include_bonus_or_stocks_if_you_are_paid_hourly_multiply_by_2080_to_convert_to_yearly_assuming_you_work_40hr_week
) 
sal_clean <- sal_clean |> mutate(compensation_annual_base_salary_pay_2025 = as.numeric(compensation_annual_base_salary_pay_2025))

sal_clean |> 
  select(compensation_annual_base_salary_pay_2025 ) |> 
  ggplot(aes(x = compensation_annual_base_salary_pay_2025 )) + 
  geom_histogram() + 
  scale_x_continuous(labels = scales::dollar)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 2996 rows containing non-finite outside the scale range
(`stat_bin()`).

Since it looks good I will coaelesce 2025 with previous years:

sal_clean <- sal_clean |> 
  mutate(compensation_annual_base_salary_pay = ifelse(
    is.na(compensation_annual_base_salary_pay), 
    compensation_annual_base_salary_pay_2025,
    compensation_annual_base_salary_pay
  )) |> 
  select(-compensation_annual_base_salary_pay_2025)

Now what follows is cleaning the data: - remove too low salaries (<5k)

# make numeric
sal_clean <- sal_clean |>  
  mutate(
    across(
      c(compensation_annual_base_salary_pay, 
        compensation_annual_target_bonus), ~as.numeric(.x))
  )
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `across(...)`.
Caused by warning:
! NAs introduced by coercion
sal_clean |>  
  ggplot(aes(x = compensation_annual_base_salary_pay)) + 
  geom_histogram() +
  scale_x_continuous(labels = scales::number)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# a bunch of salary entries that are <1000:
sal_clean |> 
  filter(compensation_annual_base_salary_pay < 5000) |> 
  count(compensation_annual_base_salary_pay)
# A tibble: 39 × 2
   compensation_annual_base_salary_pay     n
                                 <dbl> <int>
 1                                 0       1
 2                                 1       1
 3                                15       1
 4                                23.6     1
 5                                24       1
 6                                28.4     1
 7                                38       1
 8                                58       1
 9                                62       1
10                                70       1
# ℹ 29 more rows
# remove guy with "1" in base
sal_clean_filt <- sal_clean |> 
  filter(compensation_annual_base_salary_pay > 1)
n_removed <- nrow(sal_clean) - nrow(sal_clean_filt)

# 1 guy added extra 0
sal_clean_filt <- sal_clean_filt |> 
  mutate(compensation_annual_base_salary_pay = ifelse(
    compensation_annual_base_salary_pay == 1350000 &
      sheet_name == '2023', 
    1350000/10, 
    compensation_annual_base_salary_pay
  ))

# for the remainder of individuals with salary < 5000 (all in hundreds), 
# multiply by 1000
sal_clean_filt <- sal_clean_filt |>  
  mutate(
    salary_base = ifelse(
      compensation_annual_base_salary_pay < 1000,
      compensation_annual_base_salary_pay*1000, 
      compensation_annual_base_salary_pay     
    ))

n_changed <- sal_clean_filt |> 
  filter(salary_base != compensation_annual_base_salary_pay) |>
  nrow() 

We removed 2 data point, and changed / cleaned 41

sal_clean_filt |> 
  filter(salary_base != compensation_annual_base_salary_pay) |>
  select(compensation_annual_base_salary_pay, salary_base) |> 
  gt()
compensation_annual_base_salary_pay salary_base
200.00 200000
100.00 100000
168.00 168000
195.00 195000
175.00 175000
77.00 77000
120.00 120000
102.00 102000
38.00 38000
210.00 210000
170.00 170000
198.00 198000
105.00 105000
127.00 127000
126.00 126000
110.00 110000
160.00 160000
62.00 62000
15.00 15000
118.00 118000
220.00 220000
115.00 115000
87.00 87000
100.00 100000
58.00 58000
23.65 23650
109.00 109000
276.00 276000
155.00 155000
170.00 170000
100.00 100000
165.00 165000
123.00 123000
125.00 125000
28.37 28370
115.00 115000
70.00 70000
170.00 170000
125.00 125000
24.00 24000
350.00 350000

target bonus

Which column corresponds to bonus?

completeness |> filter(str_detect(tolower(variable), 'bonus')) |>  reactable_na()
variable 2025 2024 2023 2022
Annual Base Salary What is your annual BASE salary? Do not include bonus or stocks If you are paid hourly, multiply by 2080 to convert to yearly (assuming you work 40hr/week) 100% 0 0 0
Annual Target Bonus in PERCENTAGE (%) This should be stipulated prior to your employment. DO NOT include one-time bonuses (spot bonuses, signing bonuses, referral bonuses). 86% 0 0 0
Annual Equity/Stock Option Please enter the grant/market value of your long-term incentive. Use the annualized amount if distributed over several years. Enter 0 if none or not applicable. DO NOT include equity when signing on or one-time equity bonuses 100% 0 0 0
[Sign on] Bonus Value This is a one-time bonus you may have received when signing-on. This is not a bonus that reoccurs 41% 0 0 0
Any other type of annual bonus? Please describe. DO NOT include one-time bonuses (spot bonuses, signing bonuses, referral bonuses). 21% 0 0 0
How much of your bonus did you receive in the last cycle? e.g. maybe your target bonus was 10%, but you actually got 8% this year. So here you would write "8%" 57% 0 0 0
Compensation - Annual Target Bonus ($) 0 100% 100% 100%
Compensation - Sign on Bonus Value 0 50% 51% 37%
sal_clean_filt |>  colnames() |>  str_subset('bonus')
[1] "annual_target_bonus_in_percentage_percent_this_should_be_stipulated_prior_to_your_employment_do_not_include_one_time_bonuses_spot_bonuses_signing_bonuses_referral_bonuses"                                                                               
[2] "annual_equity_stock_option_please_enter_the_grant_market_value_of_your_long_term_incentive_use_the_annualized_amount_if_distributed_over_several_years_enter_0_if_none_or_not_applicable_do_not_include_equity_when_signing_on_or_one_time_equity_bonuses"
[3] "sign_on_bonus_value_this_is_a_one_time_bonus_you_may_have_received_when_signing_on_this_is_not_a_bonus_that_reoccurs"                                                                                                                                     
[4] "any_other_type_of_annual_bonus_please_describe_do_not_include_one_time_bonuses_spot_bonuses_signing_bonuses_referral_bonuses"                                                                                                                             
[5] "how_much_of_your_bonus_did_you_receive_in_the_last_cycle_e_g_maybe_your_target_bonus_was_10_percent_but_you_actually_got_8_percent_this_year_so_here_you_would_write_8_percent"                                                                           
[6] "compensation_annual_target_bonus"                                                                                                                                                                                                                         
[7] "compensation_sign_on_bonus_value"                                                                                                                                                                                                                         

2025: annual_target_bonus_in_percentage_percent_this_should_be_stipulated_prior_to_your_employment_do_not_include_one_time_bonuses_spot_bonuses_signing_bonuses_referral_bonuses
2024: compensation_annual_target_bonus

sal_clean_filt  |> 
  count(annual_target_bonus_in_percentage_percent_this_should_be_stipulated_prior_to_your_employment_do_not_include_one_time_bonuses_spot_bonuses_signing_bonuses_referral_bonuses)  |> 
  set_names(nm = c('bonus', 'n')) |>  
  gt()
bonus n
0 65
1 1
10 122
100 1
11 2
11.5 1
12 34
12.5 12
13 6
14 8
14.5 1
15 101
16 7
17 4
17.5 6
18 11
19 1
2 4
2.5 1
20 53
21 1
22 3
22.5 1
23 5
24 2
25 16
26 1
3 9
30 9
33 1
35 2
4 2
4.5 1
40 1
45 1
5 25
50 1
6 9
6.5 1
7 6
7.5 3
75 1
8 24
9 6
9.5 1
NA 3089

2025 looks good, combine it

leave it in the data, but rename to shorter: bonus_pct_2025

sal_clean_filt <- sal_clean_filt |> 
  rename(bonus_pct_2025 = annual_target_bonus_in_percentage_percent_this_should_be_stipulated_prior_to_your_employment_do_not_include_one_time_bonuses_spot_bonuses_signing_bonuses_referral_bonuses) |> 
  
  mutate(compensation_annual_target_bonus = case_when(
    !is.na(bonus_pct_2025) ~ as.numeric(bonus_pct_2025),
    TRUE ~ compensation_annual_target_bonus
  )) 

Look at the completeness of bonus now:

sal_clean_filt |> 
  count(.by = sheet_name, missing_bonus = is.na(compensation_annual_target_bonus)) |>   
  mutate(.by = .by, p = n / sum(n)) |> 
  pivot_wider(
    names_from = .by, values_from = c(n, p), names_vary = 'slowest'
  ) |> 
  gt() |> 
  fmt_percent(contains('p_')) |>
  data_color(
    columns = contains('p_'), 
    palette = c('grey', 'forestgreen'),
    na_color = 'white',
    domain = c(0, 1)) |> 
  tab_spanner_delim(columns = -missing_bonus, delim = '_', reverse = TRUE)
missing_bonus
2022
2023
2024
2025
n p n p n p n p
FALSE 303 52.70% 346 51.64% 912 52.14% 573 85.78%
TRUE 272 47.30% 324 48.36% 837 47.86% 95 14.22%

Before 2025:

is a mix of % and $ amount

I combine this with annual base to create two new variables: bonus_pct, bonus (raw)

My strategy:

  1. Remove special characters, except periods which represent decimals
  2. convert to numeric, which has two impacts:
  • “Not Applicable/None” entries will be converted to NAs
  • percentages will be converted from those values that are less than “50”
  • raw currency values will be simply those that are above 1000
  1. If there is a range of values, then I keep the higher value, since bonus is typically interpreted as the maximum bonus amount, depending on company and individual performance
sal_clean_filt |> 
  select(compensation_annual_base_salary_pay, 
         compensation_annual_target_bonus) |>  
  head(n = 20) |> gt() 
compensation_annual_base_salary_pay compensation_annual_target_bonus
150000 NA
150000 NA
123000 15.0
77000 NA
165000 15.0
126000 10.0
78000 NA
136000 12.0
180000 20.0
95000 12.0
132500 10.0
115000 NA
122500 12.5
159000 20.0
98000 0.0
167900 NA
190000 15.0
78400 10.0
28500 10.0
260000 25.0
sal_clean_filt <- sal_clean_filt |> 
  mutate(
    bonus_cleaned = compensation_annual_target_bonus |> 
      
      # remove special characters
      str_remove_all('[\\$,\\%]') |>
      
      # remove any non-digits at beginning or end of string
      str_remove('^[^\\d]+') |> 
      str_remove('[^\\d]+$') |> 
      
      # if there is text between numbers, 
      # split it as likely this represents a range
      str_split('[^\\d\\.]+') |> 
      
      # convert to numeric
      # if it is a range, keep the highest value
      map_dbl(\(x) x |> as.numeric() |> max()) 
  ) |> 
  
  mutate(bonus_pct = case_when(
    bonus_cleaned < 50 ~ bonus_cleaned/100,
    bonus_cleaned > 1000 ~ bonus_cleaned / salary_base,
    TRUE ~ 0
  ),
  bonus = case_when(
    bonus_cleaned < 50 ~ bonus_pct * salary_base,
    bonus_cleaned > 1000 ~ bonus_cleaned,
    TRUE ~ 0
  ))

Let’s look at the cleaned bonus data vs the original:

sal_clean_filt |> 
  
  # take 12 rows, 6 that are different, and then 6 not
  mutate(different = compensation_annual_target_bonus != bonus_cleaned) |> 
  slice_sample(n = 6, by = different) |>  
  
  count(salary_base, compensation_annual_target_bonus, 
        bonus_cleaned, bonus_pct, bonus)  |> 
  arrange(salary_base) |> 
  gt() |> 
  fmt_percent(bonus_pct) |> 
  fmt_currency(c(salary_base, bonus)) |> 
  sub_missing()
salary_base compensation_annual_target_bonus bonus_cleaned bonus_pct bonus n
$65,000.00 0.00% $0.00 1
$70,000.00 7000 7000 10.00% $7,000.00 1
$72,000.00 0.00% $0.00 1
$79,000.00 0.00% $0.00 1
$97,000.00 0.00% $0.00 1
$99,750.00 0.00% $0.00 1
$101,000.00 8080 8080 8.00% $8,080.00 1
$105,000.00 5000 5000 4.76% $5,000.00 1
$113,000.00 0.00% $0.00 1
$127,500.00 19125 19125 15.00% $19,125.00 1
$155,000.00 23000 23000 14.84% $23,000.00 1
$190,000.00 100000 5 5.00% $9,500.00 1
$200,000.00 30000 30000 15.00% $30,000.00 1
$275,000.00 100000 5 5.00% $13,750.00 1
$280,000.00 100000 5 5.00% $14,000.00 1
$300,000.00 100000 5 5.00% $15,000.00 1
$350,000.00 100000 5 5.00% $17,500.00 1
n_changed_bonus <- sal_clean_filt |> 
  filter(salary_base != compensation_annual_base_salary_pay) |>
  nrow() 

I converted bonus into % and raw $. This involved modifying 41 bonus data points.

filter

These are the responses that put a bonus amount that is nonsensical (> 100%)

I remove these responses completely

# bonus > 100% of base
sal_clean_filt |> 
  filter(bonus_pct >= 1)  |> 
  select(bonus_pct, bonus_pct_2025, bonus_cleaned, compensation_annual_base_salary_pay)
# A tibble: 8 × 4
  bonus_pct bonus_pct_2025 bonus_cleaned compensation_annual_base_salary_pay
      <dbl> <chr>                  <dbl>                               <dbl>
1      1    <NA>                   11100                               11100
2      1.2  <NA>                  169200                              141000
3      1.15 <NA>                   93000                               81000
4      1.71 <NA>                  240000                              140000
5      1    <NA>                  125000                              125000
6      1    <NA>                  130000                              130000
7      1    <NA>                   13500                               13500
8      1    <NA>                  140000                              140000
sal_clean_filt <- sal_clean_filt |> 
  filter(bonus_pct < 1) 

total comp

sal_clean_filt <- sal_clean_filt |> 
  mutate(salary_total = salary_base + bonus)

sal_clean_filt |>  count(is.na(salary_total))
# A tibble: 1 × 2
  `is.na(salary_total)`     n
  <lgl>                 <int>
1 FALSE                  3654
sal_clean_filt |>  count(is.na(salary_base))
# A tibble: 1 × 2
  `is.na(salary_base)`     n
  <lgl>                <int>
1 FALSE                 3654
sal_clean_filt |>  count(is.na(bonus))
# A tibble: 1 × 2
  `is.na(bonus)`     n
  <lgl>          <int>
1 FALSE           3654

equity

NOT CURRENTLY ATTEMPTED. Prioritize other data first.

Equity is a little bit complicated:

Some are reported as options, some are reported in 1000s e.g. “15k” vs “15000” Some are reported in $ amount Some say “it varies per year” There are a few that report %, is that amount percentage from base in $ or in options??

  1. We set “Not Applicable/None” to 0
  2. Set responses that report %, to NA (I don’t want to deal with this and it is a small number of responses)
  3. We then create two variables, equity_options, equity_currency_value
  • equity_currency_value takes those data points that start with a dollar sign $
  • equity_options is everything else
  • treat this the same as base salary:
  • clean and then:
  • where if it is less than 1000, multiple by 1000
  1. Convert everything to numeric
sal_clean_filt |> 
  pull(compensation_annual_equity_stock_option) |>  
  unique() |> 
  head(n = 20) 
 [1] NA                                       
 [2] "Not Applicable/None"                    
 [3] "70000"                                  
 [4] "5000"                                   
 [5] "Stock options awarded every year."      
 [6] "~2500 in options"                       
 [7] "$2,500"                                 
 [8] "20000"                                  
 [9] "Depends on market value (stock options)"
[10] "IDK … 10s of thousands"                 
[11] "$20,000"                                
[12] "85000"                                  
[13] "13000"                                  
[14] "40000"                                  
[15] "0"                                      
[16] "12000"                                  
[17] "15000"                                  
[18] "15k"                                    
[19] "29000"                                  
[20] "22000"                                  

Title

What variables correspond to job title?

completeness |> filter(str_detect(tolower(variable), 'title')) |>  reactable_na()
variable 2025 2024 2023 2022
What is your official job title? This should match the title that was present on your offer letter. Some examples: Research Associate, Scientist, Scientist I/II/III, Senior Scientist, Principal Scientist, Manager, Lab Technician, VP, Director etc. If your job title includes additional detail, please include e.g. Director of Bioinformatics 100% 0 0 0
Role / Title of current position 0 100% 100% 100%

2025: What is your official job title? This should match the title that was present on your offer letter. Some examples: Research Associate, Scientist, Scientist I/II/III, Senior Scientist, Principal Scientist, Manager, Lab Technician, VP, Director etc. If your job title includes additional detail, please include e.g. Director of Bioinformatics

2024: Role / Title of current position

rename 2025 variable to title_2025

And then merge with 2024 to: role_title_of_current_position

sal_clean_filt <- sal_clean_filt |> 
  rename(title_2025 = what_is_your_official_job_title_this_should_match_the_title_that_was_present_on_your_offer_letter_some_examples_research_associate_scientist_scientist_i_ii_iii_senior_scientist_principal_scientist_manager_lab_technician_vp_director_etc_if_your_job_title_includes_additional_detail_please_include_e_g_director_of_bioinformatics) |> 
  mutate(role_title_of_current_position = case_when(
    is.na(role_title_of_current_position) ~ title_2025,
    TRUE ~ role_title_of_current_position
  ))

pre-2025 cleaning:

Title is very important, but also very messy. Titles can have different meanings (compensation, experience, responsibilities) between companies, and location.

sal_clean_filt |> 
  count(role_title_of_current_position) |>
  slice_sample(n=10) |> 
  gt() 
role_title_of_current_position n
VP R&D 1
Associate Scientist, Bio-Analytics 1
Strategy Lead 1
Pathology Director 1
Senior Scientist II Process Development 1
Sr. Clinical Trials Associate 1
Lead 1
Associate Scientist 1 1
Staff Applications Scientist 1
Quality Associate 1

filtering

Remove some obvious rows

sal_clean_filt <- sal_clean_filt |> 
  filter(role_title_of_current_position != 'pimp')

Standardize titles

It’s a lot of work to standardize titles, so I focus on what I know best, scientist-relaeta Let me intialize some empty columns

sal_clean_filt <- sal_clean_filt |> 
  mutate(
    title_category = NA_character_, 
    title_general = NA_character_, 
    title_detail = NA_character_
  )

Scientist

It’s a lot of work to standardize titles, so I focus on what I know best, scientist

Scientist:

  • Research Associate
  • Associate Scientist
  • Scientist
  • Senior Scientist
  • Staff Scientist
  • Principal Scientist

I’m going to create two new variables

  • title_category - Set this to Scientist
  • title_general - A variable that includes general scientist + levels, e.g. Sci III, and Associate Sci
  • title_detail - Contains the other detail of job title, e.g. Research, Clinical
# all roles matching "Scientist"
sal_clean_filt |> 
  filter(str_detect(
    role_title_of_current_position, regex('Scientist', ignore_case = FALSE))) |> 
  count(role_title_of_current_position) |> head(25) |> gt()
role_title_of_current_position n
Advisor (Scientist 1) 1
Advisor - Quantitative Scientist 1
Analytical Development Scientist I 1
Analytical Scientist 1
Application Development Scientist 1
Application Scientist 1
Ass. Scientist 1
Assistant Scientist 2
Associate Clinical Scientist 1
Associate Clinical Trial Scientist 1
Associate Principal Scientist 9
Associate Process Scientist 1
Associate Research Scientist 3
Associate Scientist 97
Associate Scientist (R1) 1
Associate Scientist 1 1
Associate Scientist 2 2
Associate Scientist Doctoral Student 1
Associate Scientist I 5
Associate Scientist I/Chemist I 1
Associate Scientist II 18
Associate Scientist III 1
Associate Scientist, Bio-Analytics 1
Associate Scientist, QC Flow Cytometry 1
Associate Scientist, QC Microbiology 1

Try a different approach:

Hierarchical, see case_when

The order of the statements are impactful e.g.

  • If they have associate, they will associate scientists
  • If they have associate and “principal” they will be principal scientist
sal_clean_filt <- sal_clean_filt |>
  mutate(
    # title_category
    title_category =  
      ifelse(
        str_detect(
          role_title_of_current_position, regex('Scientist', ignore_case = TRUE)),
        'Scientist',
        title_category
      ),
    
    # scratch
    a = title_category == 'Scientist',
    b = role_title_of_current_position |> 
      str_replace_all('sr\\.?', 'Senior'),
    
    # title detail
    title_detail = ifelse(
      a,
      role_title_of_current_position |> 
        str_remove_all(
          regex('(Associate|Scientist|Principal|Senior)*', ignore_case = TRUE)) |> 
        str_replace_all('[:punct:]+', ' ') # |> 
      # str_replace_all('\\s{2,}', ' ')
      ,
      NA
    )
  ) |> 
  
  # title_gen most specific to most general
  mutate(title_general = case_when(
    
    a & str_detect(
      b, regex('Principal', ignore_case = TRUE)) ~ 'Principal Scientist',
    a & str_detect(
      b, regex('Senior', ignore_case = TRUE)) ~ 'Senior Scientist',
    a & str_detect(
      b, regex('Associate', ignore_case = TRUE)) ~ 'Associate Scientist',
    a ~ 'Scientist',
  )) |> 
  
  # clean
  select(-a, -b)

sal_clean_filt |>  
  count(role_title_of_current_position, title_general, title_detail) |>
  slice_sample(n = 20) |>  gt()
role_title_of_current_position title_general title_detail n
Director of Commercialization Analytics NA NA 1
Research technologist II NA NA 1
Senior Research Associate I NA NA 3
Director Regulatory NA NA 1
Regulatory strategy post-doc NA NA 1
Automation Engineer I NA NA 1
Lab Assistant (Accessioner) NA NA 1
Senior scientist 1 Senior Scientist 1 4
Senior Clinical Scientist Senior Scientist Clinical 2
Director of BD NA NA 1
Technical Service Manager NA NA 1
Director Marketing NA NA 1
Project engineer NA NA 1
Pharmacokineticist NA NA 1
Manager NA NA 35
Senior Bioinformatician NA NA 2
Quality Assurance Manager NA NA 1
Lead QA Technician NA NA 1
Senior Principal Scienist NA NA 1
Associate Director of Bioinformatics NA NA 2
sal_clean_filt |>  count(title_general) |> gt()
title_general n
Associate Scientist 173
Principal Scientist 186
Scientist 856
Senior Scientist 283
NA 2155
filter

remove some outliers upon inspection post deployment

# salary > 800k as scientist
remove <- sal_clean_filt |> 
  filter(
    year(mdy_hms(timestamp)) == 2024,
    month(mdy_hms(timestamp)) == 5,
    day(mdy_hms(timestamp)) == 22,
    role_title_of_current_position == 'Scientist'
  ) |> 
  pull(timestamp)

# removed
sal_clean_filt |>  filter(timestamp == remove)
# A tibble: 1 × 70
  sheet_name timestamp          which_country_do_you_wo…¹ what_city_do_you_wor…²
  <chr>      <chr>              <chr>                     <chr>                 
1 2024       5/22/2024 17:53:21 <NA>                      <NA>                  
# ℹ abbreviated names: ¹​which_country_do_you_work_in, ²​what_city_do_you_work_in
# ℹ 66 more variables: which_us_state_do_you_work_in <chr>,
#   which_canadian_province_do_you_work_in <chr>,
#   company_or_institution_name <chr>,
#   which_of_the_following_best_describes_your_company <chr>,
#   biotech_sub_industry <chr>, company_detail_approximate_company_size <chr>,
#   provide_a_review_and_rate_your_company_institution_and_experience <chr>, …
sal_clean_filt <- sal_clean_filt |>  filter(timestamp != remove)

Directors and VP

Same approach as for Scientist, but these are the levels:

sal_clean_filt |> 
  filter(str_detect(
    role_title_of_current_position, regex('(Dir(ector)*)|(AD)', ignore_case = TRUE))) |> 
  count(role_title_of_current_position) |>  slice_sample(n = 25) |> gt()
role_title_of_current_position n
Senior Director Operations 1
Clinical Development Director 1
Assoc Director Publications 1
Group Leader 1
IT Director 1
Group Lead, Principal Scientist 1
Executive Administrator 1
Pathology Director 1
Associate Director, Scientific Communications 1
Associate director 8
Associate Director of Alliance Management 1
Executive Director, Global Regulatory Operations 1
Global Director 1
Sr Scientist 2 (AD equivalent) 1
Associate Director, Clinical Science 1
Associate Director, Legal 1
Director of Alliance Management 1
Associate Director, Clinical Supplies 1
Product development lead 1
Dir, CLinical Programming 1
Director Biology 2
Director of Engineering 2
Senior Scientist /Team Lead 1
Senior Director Program Management 1
Director of Bioinformatics 1
sal_clean_filt <- sal_clean_filt |>
  mutate(
    
    # title category
    title_category =  
      ifelse(
        str_detect(
          role_title_of_current_position, regex('(Director|\\bAD\\b|\\bDir\\b)', ignore_case = TRUE)),
        'Director',
        title_category
      ),
    
    # scratch for concise coding
    a = title_category == 'Director',
    
    # preprocessed 
    b = role_title_of_current_position |> 
      str_replace_all('sr\\.?', 'Senior') |> 
      str_replace_all('\\bAD\\b', 'Associate Director') |> 
      str_replace_all('\\bAssoc\\.?\\b', 'Associate') |> 
      str_replace_all('\\bDir\\.?\\b', 'Director') 
    ,
    
    # title detail
    title_detail = ifelse(
      a,
      b |> 
        
        # remove the level part + director
        str_remove_all(
          regex('(Associate|Director|Senior|Executive)*', ignore_case = TRUE)) |> 
        str_replace_all('[:punct:]+', ' ') # |> 
      # str_replace_all('\\s{2,}', ' ')
      ,
      title_detail
    )) |> 
  
  # title_gen most specific to most general
  mutate(title_general = case_when(
    
    a & str_detect(
      b, regex('Executive', ignore_case = TRUE)) ~ 'Executive Director',
    a & str_detect(
      b, regex('Senior', ignore_case = TRUE)) ~ 'Senior Director',
    a & str_detect(
      b, regex('Associate', ignore_case = TRUE)) ~ 'Associate Director',
    a ~ 'Director',
    .default = title_general
  )) |> 
  
  # clean
  select(-a, -b) 
sal_clean_filt |> 
  count(
    role_title_of_current_position, title_category, 
    title_general, title_detail) |>
  slice_sample(n = 20) |>  
  gt()
role_title_of_current_position title_category title_general title_detail n
Process Engineer NA NA NA 6
Associate Director, Process Development Director Associate Director Process Development 1
Chemist III NA NA NA 1
FAS NA NA NA 1
Assoc Dir, Supply Director Associate Director Supply 1
Senior Associate Scientist Scientist Senior Scientist 18
Lead Analyst NA NA NA 1
Director, Clinical Sciences Lead Director Director Clinical Sciences Lead 1
Client Account Director Director Director Client Account 1
Process Development Sr. Associate NA NA NA 1
Vice President NA NA NA 1
Senior Manager, Commercial NA NA NA 1
Subject Matter Expert NA NA NA 1
Senior Program and Alliance Manager NA NA NA 1
Supervisor I NA NA NA 1
Bioinformatics Consultant NA NA NA 1
QA NA NA NA 1
Clinical Research Scientist Scientist Scientist Clinical Research 1
MSAT NA NA NA 1
Clinical Scientist II Scientist Scientist Clinical II 1
sal_clean_filt |>  
  filter(title_category == 'Director') |>  count(title_general) |> gt()
title_general n
Associate Director 193
Director 200
Executive Director 13
Senior Director 41
  • Associate Director
  • Director
  • Senior Director
  • Executive Director

VP

There’s not many VPs

sal_clean_filt |> 
  filter(str_detect(
    role_title_of_current_position, regex('(VP|Vice Principal)', ignore_case = TRUE))) |> 
  count(role_title_of_current_position) |> gt()
role_title_of_current_position n
SVP 1
SVP, clinical operations 1
Svp Marketing and BD 1
VP 9
VP / CIO 1
VP Finance 1
VP Medical Affairs 1
VP R&D 1
VP Regulatory 1
VP clin ops 1
VP medical affairs 1
VP of Digital 1
VP, Translational Genetics 1
Vp 1
sal_clean_filt <- sal_clean_filt |>
  mutate(
    title_category =  
      ifelse(
        str_detect(
          role_title_of_current_position, 
          regex('(VP|Vice Principal)', ignore_case = TRUE)),
        'VP',
        title_category
      ),
    
    # scratch for concise coding
    a = title_category == 'VP',
    
    # preprocessed 
    b = role_title_of_current_position |> 
      str_replace_all(regex('vp', ignore_case = TRUE), 'VP')
    ,
    
    # title detail
    title_detail = ifelse(
      a,
      b |> 
        
        # remove the level part + title_category
        str_remove_all(
          regex('S?VP', ignore_case = TRUE)) |> 
        str_replace_all('[:punct:]+', ' ') |> 
        str_replace_all('\\s{2,}', ' ') |> 
        str_trim()
      ,
      title_detail
    )) |> 
  
  # title_gen most specific to most general
  mutate(title_general = case_when(
    
    a & str_detect(
      b, regex('SVP', ignore_case = TRUE)) ~ 'SVP',
    a ~ 'VP',
    .default = title_general
  )) |> 
  
  # clean
  select(-a, -b)  

sal_clean_filt |> 
  filter(str_detect(
    role_title_of_current_position, regex('vp', ignore_case = TRUE))) |> 
  count(
    role_title_of_current_position, title_category, 
    title_general, title_detail) |>  
  gt()
role_title_of_current_position title_category title_general title_detail n
SVP VP SVP 1
SVP, clinical operations VP SVP clinical operations 1
Svp Marketing and BD VP SVP Marketing and BD 1
VP VP VP 9
VP / CIO VP VP CIO 1
VP Finance VP VP Finance 1
VP Medical Affairs VP VP Medical Affairs 1
VP R&D VP VP R D 1
VP Regulatory VP VP Regulatory 1
VP clin ops VP VP clin ops 1
VP medical affairs VP VP medical affairs 1
VP of Digital VP VP of Digital 1
VP, Translational Genetics VP VP Translational Genetics 1
Vp VP VP 1
sal_clean_filt |>  
  filter(title_category == 'VP') |>  count(title_general) |> gt()
title_general n
SVP 3
VP 19

Research Associate

sal_clean_filt |> 
  filter(str_detect(
    role_title_of_current_position, 
    regex('(Research Associate|\\bRA\\b)', ignore_case = TRUE)
  )) |> 
  count(role_title_of_current_position) |> gt()
role_title_of_current_position n
Advanced Research Associate 1
Bioinformatics Research Associate 1
Bioinformatics Senior Research Associate 1
Clinical Research Associate 2
Contract Research Associate 1
Manufacturing Research Associate I I 1
Principal Research Associate 9
Principal Research Associate II 1
Principal research associate 2
RA 2
RA 1 1
RA II 2
Research Associate 75
Research Associate 2 1
Research Associate (Engineer) 1
Research Associate 1 5
Research Associate 111 1
Research Associate 2 8
Research Associate I 31
Research Associate I - Protein Purification 1
Research Associate I, In Vivo 1
Research Associate II 56
Research Associate II, Contract 1
Research Associate III 11
Research Associate Ii 1
Research Associate Scientist 1
Research Associate lvl II 1
Research Associate, Process Development (Tech Dev Rotational Program) 1
Research associate 13
Research associate 2 1
Research associate I 3
Research associate II 5
Research associate Ii 1
SENIOR RESEARCH ASSOCIATE 1
Senior Clinical Research Associate I 1
Senior RA 1
Senior Research Associate 61
Senior Research Associate 1 3
Senior Research Associate 2 1
Senior Research Associate I 3
Senior Research Associate II 3
Senior Research Associate, Platform Development 1
Senior Research Associate, Strain Engineering 1
Senior Research Associates 1
Senior research associate 9
Senior research associate I 2
Sr Research Associate 5
Sr research associate 1
Sr. RA 2
Sr. Research Associate 16
Sr. Research Associate II 1
research associate 1
research associate I 1
senior research associate 1
sal_clean_filt <- sal_clean_filt |>
  mutate(
    # title_category
    title_category =  
      ifelse(
        str_detect(
          role_title_of_current_position, 
          regex('(Research Associate|\\bRA\\b)', ignore_case = TRUE)),
        'Research Associate',
        title_category
      ),
    
    # scratch
    a = title_category == 'Research Associate',
    b = role_title_of_current_position |> 
      str_replace_all('[Ss]r\\.?', 'Senior'),
    
    # title detail
    title_detail = ifelse(
      a,
      role_title_of_current_position |>
        str_remove_all(
          regex('(Research|Associate|Scientist|Senior|\\bRA\\b)*', 
                ignore_case = TRUE)) |> 
        str_replace_all('[:punct:]+', ' ')  |> 
        str_replace_all('\\s{2,}', ' ')
      ,
      title_detail
    )
  ) |> 
  
  # title_gen most specific to most general
  mutate(title_general = case_when(
    a & str_detect(
      b, regex('Senior', ignore_case = TRUE)) ~ 'Senior Research Associate',
    a ~ 'Research Associate',
    .default = title_general
  )) |> 
  
  # clean
  select(-a, -b)

sal_clean_filt |> 
  filter(str_detect(
    role_title_of_current_position, regex('(Research Associate|\\bRA\\b)', ignore_case = TRUE))) |> 
  count(
    role_title_of_current_position, title_category, 
    title_general, title_detail) |>  
  gt()
role_title_of_current_position title_category title_general title_detail n
Advanced Research Associate Research Associate Research Associate Advanced 1
Bioinformatics Research Associate Research Associate Research Associate Bioinformatics 1
Bioinformatics Senior Research Associate Research Associate Senior Research Associate Bioinformatics 1
Clinical Research Associate Research Associate Research Associate Clinical 2
Contract Research Associate Research Associate Research Associate Contract 1
Manufacturing Research Associate I I Research Associate Research Associate Manufacturing I I 1
Principal Research Associate Research Associate Research Associate Principal 9
Principal Research Associate II Research Associate Research Associate Principal II 1
Principal research associate Research Associate Research Associate Principal 2
RA Research Associate Research Associate 2
RA 1 Research Associate Research Associate 1 1
RA II Research Associate Research Associate II 2
Research Associate Research Associate Research Associate 75
Research Associate 2 Research Associate Research Associate 2 1
Research Associate (Engineer) Research Associate Research Associate Engineer 1
Research Associate 1 Research Associate Research Associate 1 5
Research Associate 111 Research Associate Research Associate 111 1
Research Associate 2 Research Associate Research Associate 2 8
Research Associate I Research Associate Research Associate I 31
Research Associate I - Protein Purification Research Associate Research Associate I Protein Purification 1
Research Associate I, In Vivo Research Associate Research Associate I In Vivo 1
Research Associate II Research Associate Research Associate II 56
Research Associate II, Contract Research Associate Research Associate II Contract 1
Research Associate III Research Associate Research Associate III 11
Research Associate Ii Research Associate Research Associate Ii 1
Research Associate Scientist Research Associate Research Associate 1
Research Associate lvl II Research Associate Research Associate lvl II 1
Research Associate, Process Development (Tech Dev Rotational Program) Research Associate Research Associate Process Development Tech Dev Rotational Program 1
Research associate Research Associate Research Associate 13
Research associate 2 Research Associate Research Associate 2 1
Research associate I Research Associate Research Associate I 3
Research associate II Research Associate Research Associate II 5
Research associate Ii Research Associate Research Associate Ii 1
SENIOR RESEARCH ASSOCIATE Research Associate Senior Research Associate 1
Senior Clinical Research Associate I Research Associate Senior Research Associate Clinical I 1
Senior RA Research Associate Senior Research Associate 1
Senior Research Associate Research Associate Senior Research Associate 61
Senior Research Associate 1 Research Associate Senior Research Associate 1 3
Senior Research Associate 2 Research Associate Senior Research Associate 2 1
Senior Research Associate I Research Associate Senior Research Associate I 3
Senior Research Associate II Research Associate Senior Research Associate II 3
Senior Research Associate, Platform Development Research Associate Senior Research Associate Platform Development 1
Senior Research Associate, Strain Engineering Research Associate Senior Research Associate Strain Engineering 1
Senior Research Associates Research Associate Senior Research Associate s 1
Senior research associate Research Associate Senior Research Associate 9
Senior research associate I Research Associate Senior Research Associate I 2
Sr Research Associate Research Associate Senior Research Associate Sr 5
Sr research associate Research Associate Senior Research Associate Sr 1
Sr. RA Research Associate Senior Research Associate Sr 2
Sr. Research Associate Research Associate Senior Research Associate Sr 16
Sr. Research Associate II Research Associate Senior Research Associate Sr II 1
research associate Research Associate Research Associate 1
research associate I Research Associate Research Associate I 1
senior research associate Research Associate Senior Research Associate 1
sal_clean_filt |>  
  filter(title_category == 'Research Associate') |>  count(title_general) |> gt()
title_general n
Research Associate 244
Senior Research Associate 115
filter

I filter out an outlier sample, looks like they incorrectly inputted bonus:

n_1 <- nrow(sal_clean_filt)
sal_clean_filt |> 
  dplyr::filter((
    str_detect(timestamp, '10/31/2023') & 
      role_title_of_current_position == 'Sr Clinical Research Associate')
  ) |> 
  gt::gt()
sal_clean_filt <- sal_clean_filt |> 
  dplyr::filter(!(
    str_detect(timestamp, '10/31/2023') & 
      role_title_of_current_position == 'Sr Clinical Research Associate')
  )
.diff <- nrow(sal_clean_filt) - n_1
stopifnot(.diff == -1)

Roles that have not been processed

Look at the remaining roles that I have not tried to process.

sal_clean_filt |>  
  filter(is.na(title_category)) |> 
  distinct(role_title_of_current_position) |>  
  reactable()

Experience

What variables correspond to years of experience?

completeness |> filter(str_detect(tolower(variable), 'experience')) |>  reactable_na()
variable 2025 2024 2023 2022
Provide a review and rate your company/institution and experience 35% 0 0 0
How many total years of experience in post-docs do you have? (If none, enter 0) 100% 0 0 0
Please enter your total years of experience in the field DO NOT INCLUDE years in BSc, Masters, PhD, or post-bacc/post-doc 100% 0 0 0
Years of Experience 0 100% 100% 100%

2025: Please enter your total years of experience in the field DO NOT INCLUDE years in BSc, Masters, PhD, or post-bacc/post-doc

2024: Years of Experience

rename 2025 variable to yoe_2025

And then merge with 2024 to: years_of_experience

sal_clean_filt <- sal_clean_filt |> 
  rename(yoe_2025 = please_enter_your_total_years_of_experience_in_the_field_do_not_include_years_in_b_sc_masters_ph_d_or_post_bacc_post_doc) |> 
  mutate(years_of_experience = case_when(
    is.na(years_of_experience) ~ yoe_2025,
    TRUE ~ years_of_experience
  ))

pre-2025 cleaning:

sal_clean_filt |> 
  count(years_of_experience) |> gt()
years_of_experience n
0 251
1 328
10 197
11 73
12 98
13 42
14 40
15 106
16 25
17 26
18 37
19 11
2 379
20 42
21 8
22 8
23 9
24 8
25 19
26 1
27 3
3 410
30 3
30 or more 1
32 2
4 361
45 1
5 321
6 256
7 251
8 212
9 123

only 1 will be converted to NA

n_before <- sum(is.na(sal_clean_filt$years_of_experience)) 
sal_clean_filt <- sal_clean_filt |> 
  mutate(years_of_experience = as.numeric(years_of_experience)) 
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `years_of_experience = as.numeric(years_of_experience)`.
Caused by warning:
! NAs introduced by coercion
n_after <- sum(is.na(sal_clean_filt$years_of_experience))
message(glue('# NAs\nbefore: {n_before}\nafter: {n_after}'))
# NAs
before: 0
after: 1

degree

What variables correspond to education?

completeness |> filter(str_detect(tolower(variable), '(degree|edu)')) |>  reactable_na()
variable 2025 2024 2023 2022
What degrees do you have? 0 100% 0 0
Healthcare Benefits (free text) Describe healthcare benefits - costs, deductibles, outfamily plan, insurance provider, total out of pocket, HSA contributions, etc 47% 0 0 0
Select the highest level of education that you have that's relevant to your occupation. If you have multiple (e.g. PhD + MD) please select "Other" and describe 100% 0 0 0
Highest achieved Formal Education 0 0 100% 100%

2025: Select the highest level of education that you have that's relevant to your occupation. If you have multiple (e.g. PhD + MD) please select "Other" and describe

2024: Select the highest level of education that you have that's relevant to your occupation. If you have multiple (e.g. PhD + MD) please select "Other" and describe

2023: Highest achieved Formal Education

rename 2025 variable to highest_degree_2025

And then merge with 2024 to: what_degrees_do_you_have

sal_clean_filt |>  colnames() |>  str_subset('education')
[1] "select_the_highest_level_of_education_that_you_have_thats_relevant_to_your_occupation_if_you_have_multiple_e_g_ph_d_md_please_select_other_and_describe"
[2] "highest_achieved_formal_education"                                                                                                                      
sal_clean_filt <- sal_clean_filt |> 
  rename(highest_degree_2025 = select_the_highest_level_of_education_that_you_have_thats_relevant_to_your_occupation_if_you_have_multiple_e_g_ph_d_md_please_select_other_and_describe) |> 
  mutate(what_degrees_do_you_have = case_when(
    #2025
    is.na(what_degrees_do_you_have) &
      is.na(highest_achieved_formal_education) ~ highest_degree_2025,
    
    # 2023 2022
    is.na(what_degrees_do_you_have) & 
      is.na(highest_degree_2025) ~ highest_achieved_formal_education,
    
    #2024
    TRUE ~ what_degrees_do_you_have
  ))

pre-2025 cleaning:

sal_clean_filt |> 
  count(what_degrees_do_you_have) |>  gt()
what_degrees_do_you_have n
Associates Degree 3
Associate’s Degree or Equivalent 30
Associate’s Degree or Equivalent, Bachelors or Equivalent 4
Bachelor's in pharmacy 1
Bachelors 216
Bachelors or Equivalent 1026
Bachelors or Equivalent, Masters or Equivalent 46
Bachelors or Equivalent, Masters or Equivalent, MD or Equivalent 2
Bachelors or Equivalent, Masters or Equivalent, PhD or Equivalent 16
Bachelors or Equivalent, PhD or Equivalent 10
Bachelors or Equivalent, PharmD or Equivalent 1
DVM + MS 2
High School 3
High School or Equivalent 20
High School or Equivalent, Associate’s Degree or Equivalent 2
High School or Equivalent, Associate’s Degree or Equivalent, Bachelors or Equivalent 20
High School or Equivalent, Associate’s Degree or Equivalent, Bachelors or Equivalent, Masters or Equivalent 12
High School or Equivalent, Associate’s Degree or Equivalent, Bachelors or Equivalent, Masters or Equivalent, PhD or Equivalent 2
High School or Equivalent, Associate’s Degree or Equivalent, Bachelors or Equivalent, PhD or Equivalent 2
High School or Equivalent, Bachelors or Equivalent 37
High School or Equivalent, Bachelors or Equivalent, Masters or Equivalent 40
High School or Equivalent, Bachelors or Equivalent, Masters or Equivalent, JD or Equivalent 2
High School or Equivalent, Bachelors or Equivalent, Masters or Equivalent, PhD or Equivalent 13
High School or Equivalent, Bachelors or Equivalent, PhD or Equivalent 17
High School or Equivalent, Bachelors or Equivalent, PhD or Equivalent, JD or Equivalent 1
High School or Equivalent, Bachelors or Equivalent, PhD or Equivalent, MD or Equivalent 3
High School or Equivalent, Bachelors or Equivalent, PharmD or Equivalent 1
High School or Equivalent, Masters or Equivalent 1
High School or Equivalent, Masters or Equivalent, PhD or Equivalent 1
High School or Equivalent, Masters or Equivalent, PharmD or Equivalent 1
High School or Equivalent, PhD or Equivalent 1
High School or Equivalent, PharmD or Equivalent 1
JD 2
JD + MBA 1
JD or Equivalent 1
M.D./PharmD/D.D.S. or Equivalent 27
MBA 1
MD 1
MD or Equivalent 10
MD, PhD 1
MS MPH PhD 1
Masters 163
Masters or Equivalent 572
Masters or Equivalent, PhD or Equivalent 8
Masters or Equivalent, PharmD or Equivalent 3
PhD 258
PhD + postdoc + PMP 1
PhD or Equivalent 1022
PhD or Equivalent, MD or Equivalent 3
PhD or Equivalent, PharmD or Equivalent 2
PharmD 11
PharmD + Masters in Clinical Research + MBA 1
PharmD PhD 1
PharmD or Equivalent 24
TAFE Diploma 1

contains all degrees, but let’s simplify to the highest degree:

e.g.

  • “High School or Equivalent, Bachelors or Equivalent, Masters or Equivalent, PhD or Equivalent”
  • “Bachelors or Equivalent, Masters or Equivalent, PhD or Equivalent”
  • “Bachelors or Equivalent, PhD or Equivalent”
  • “PhD or Equivalent”

should all just be “PhD”

Below code assumes highest degree is the last listed degree

sal_clean_filt <- sal_clean_filt |> 
  mutate(experience_highest_degree =  str_extract(
    what_degrees_do_you_have,
    '(?<=,|^)[^,]+$'
  ) |> 
    str_replace('M\\.D\\.', 'MD') |> 
    str_remove('or Equivalent') |> 
    str_replace('MD/Pharm.*', 'MD')
  ) 

# View Changes
sal_clean_filt |>  
  count(is.na(experience_highest_degree), is.na(what_degrees_do_you_have))
# A tibble: 1 × 3
  `is.na(experience_highest_degree)` `is.na(what_degrees_do_you_have)`     n
  <lgl>                              <lgl>                             <int>
1 FALSE                              FALSE                              3652
sal_clean_filt |>
  count(experience_highest_degree, what_degrees_do_you_have) |>  
  gt()
experience_highest_degree what_degrees_do_you_have n
Associate’s Degree High School or Equivalent, Associate’s Degree or Equivalent 2
Bachelors Associate’s Degree or Equivalent, Bachelors or Equivalent 4
Bachelors High School or Equivalent, Associate’s Degree or Equivalent, Bachelors or Equivalent 20
Bachelors High School or Equivalent, Bachelors or Equivalent 37
JD High School or Equivalent, Bachelors or Equivalent, Masters or Equivalent, JD or Equivalent 2
JD High School or Equivalent, Bachelors or Equivalent, PhD or Equivalent, JD or Equivalent 1
MD Bachelors or Equivalent, Masters or Equivalent, MD or Equivalent 2
MD High School or Equivalent, Bachelors or Equivalent, PhD or Equivalent, MD or Equivalent 3
MD PhD or Equivalent, MD or Equivalent 3
Masters Bachelors or Equivalent, Masters or Equivalent 46
Masters High School or Equivalent, Associate’s Degree or Equivalent, Bachelors or Equivalent, Masters or Equivalent 12
Masters High School or Equivalent, Bachelors or Equivalent, Masters or Equivalent 40
Masters High School or Equivalent, Masters or Equivalent 1
PhD MD, PhD 1
PhD Bachelors or Equivalent, Masters or Equivalent, PhD or Equivalent 16
PhD Bachelors or Equivalent, PhD or Equivalent 10
PhD High School or Equivalent, Associate’s Degree or Equivalent, Bachelors or Equivalent, Masters or Equivalent, PhD or Equivalent 2
PhD High School or Equivalent, Associate’s Degree or Equivalent, Bachelors or Equivalent, PhD or Equivalent 2
PhD High School or Equivalent, Bachelors or Equivalent, Masters or Equivalent, PhD or Equivalent 13
PhD High School or Equivalent, Bachelors or Equivalent, PhD or Equivalent 17
PhD High School or Equivalent, Masters or Equivalent, PhD or Equivalent 1
PhD High School or Equivalent, PhD or Equivalent 1
PhD Masters or Equivalent, PhD or Equivalent 8
PharmD Bachelors or Equivalent, PharmD or Equivalent 1
PharmD High School or Equivalent, Bachelors or Equivalent, PharmD or Equivalent 1
PharmD High School or Equivalent, Masters or Equivalent, PharmD or Equivalent 1
PharmD High School or Equivalent, PharmD or Equivalent 1
PharmD Masters or Equivalent, PharmD or Equivalent 3
PharmD PhD or Equivalent, PharmD or Equivalent 2
Associates Degree Associates Degree 3
Associate’s Degree Associate’s Degree or Equivalent 30
Bachelor's in pharmacy Bachelor's in pharmacy 1
Bachelors Bachelors 216
Bachelors Bachelors or Equivalent 1026
DVM + MS DVM + MS 2
High School High School 3
High School High School or Equivalent 20
JD JD 2
JD JD or Equivalent 1
JD + MBA JD + MBA 1
MBA MBA 1
MD M.D./PharmD/D.D.S. or Equivalent 27
MD MD 1
MD MD or Equivalent 10
MS MPH PhD MS MPH PhD 1
Masters Masters 163
Masters Masters or Equivalent 572
PhD PhD 258
PhD PhD or Equivalent 1022
PhD + postdoc + PMP PhD + postdoc + PMP 1
PharmD PharmD 11
PharmD PharmD or Equivalent 24
PharmD + Masters in Clinical Research + MBA PharmD + Masters in Clinical Research + MBA 1
PharmD PhD PharmD PhD 1
TAFE Diploma TAFE Diploma 1

Location

What variables correspond to location?

completeness |> filter(str_detect(tolower(variable), '(located|city|country|state|province)')) |>  reactable_na()
variable 2025 2024 2023 2022
Which country do you work in? 100% 0 0 0
What City do you work in? 10% 0 0 0
Which US state do you work in? 87% 0 0 0
Which Canadian Province do you work in? 3% 0 0 0
What country do you work in? 0 10% 100% 0
Where is the closest major city or hub? 0 10% 100% 100%
Where are you located? 0 90% 0 0
sal_clean_filt |>  colnames() |>  str_subset('(country|city|state|provin)')
[1] "which_country_do_you_work_in"          
[2] "what_city_do_you_work_in"              
[3] "which_us_state_do_you_work_in"         
[4] "which_canadian_province_do_you_work_in"
[5] "what_country_do_you_work_in"           
[6] "where_is_the_closest_major_city_or_hub"

2025: Which country do you work in?, What City do you work in?, Which US state do you work in?, Which Canadian Province do you work in?

2024: Where are you located?

<2023: What country do you work in?, Where is the closest major city or hub?

pre-2025 cleaning:

sal_clean_filt |> 
  count(where_is_the_closest_major_city_or_hub, where_are_you_located) |> 
  gt()
where_is_the_closest_major_city_or_hub where_are_you_located n
Alachua NA 1
Albany NA 1
Ames, IA NADC NA 1
Amsterdam NA 1
Ann Arbor NA 1
Athens NA 1
Atlanta NA 2
Atlanta, GA NA 1
Austin (Remote) NA 1
Austin, TX NA 2
Austria NA 1
BOSTON NA 1
Baltimore NA 2
Baltimore, MD NA 2
Baltimore/Washington DC NA 1
Bangalore NA 1
Basel NA 1
Bay Are NA 1
Bay Area NA 185
Bay Area - Mid Penninsula NA 1
Bay area NA 21
Belgium NA 1
Berkeley, CA NA 1
Berlin NA 1
Bloomington, IN, USA NA 1
Boca Raton NA 1
Bosto NA 1
Boston NA 356
Boston (company location) - company is WFH NA 1
Boston / Remote NA 1
Boston MA NA 1
Boston, MA NA 4
Boston, US NA 1
Boston/Cambridge NA 2
Bosyon NA 1
Boulder NA 1
Boulder (remote, company in Bay) NA 1
Boulder, CO NA 3
Brazil NA 1
Brighton, UK NA 1
Buffalo NA 1
CA NA 2
Calgary NA 1
California NA 2
Cambridge NA 7
Cambridge Massachusetts NA 1
Cambridge UK NA 1
Cambridge, MA NA 7
Cambridge, UK NA 3
Cambridge,MA NA 1
Canada NA 6
Canada, Ottawa NA 1
Carlsbad NA 1
Central Maine NA 1
Central/North Jersey NA 1
Chicago NA 18
Chicago IL NA 1
Chicago, IL NA 1
Chicagoland NA 1
Cincinnati, Ohio NA 1
Cleveland NA 2
College Station TX NA 1
Cologne NA 1
Colorado NA 1
Columbus NA 2
Columbus, OH NA 1
Columbus, Ohio NA 1
Copenhagen NA 1
DC NA 5
DC Area/I-270 NA 1
DC area NA 1
DMV NA 1
Dc NA 1
Denver NA 10
Detroit NA 1
Devens NA 1
Dublin NA 2
Dundee NA 1
Durham, NC NA 2
Edinburgh NA 1
Florida NA 1
France NA 1
Frankfurt am Main NA 1
Gaithersburg NA 1
Gaithersburg, Maryland NA 1
Georgia NA 2
Germany NA 1
Greater Boston NA 2
Groton NA 1
HOUSTON NA 1
Halifax, Nova Scotia NA 1
Hamburg NA 2
Hartford, CT NA 1
Hayward, CA NA 2
Houston NA 6
Houston, TX NA 5
Houston,TX NA 2
I-270 corridor NA 1
Illinois NA 1
Indianapolis NA 7
Indianapolis, IN NA 1
Ireland NA 3
Irvine, CA NA 1
Kansas City NA 4
Kansas City, MO NA 1
Kansas City, Missouri, United States NA 1
Knoxville, TN NA 1
LA NA 3
La Jolla NA 1
Limerick NA 1
Liverpool NA 1
Ljubljana NA 1
London NA 5
Los Angeles NA 34
Los Angeles (company is in Bay) NA 1
Los Angeles, CA NA 1
Los Ángeles NA 1
MVA NA 1
Macclesfield, UK NA 1
Madison NA 1
Madison, Wisconsin NA 1
Manchester NA 1
Maryland NA 6
Maryland/DC NA 1
Melbourne NA 1
Miami NA 3
Michigan, USA NA 1
Midwest NA 6
Milwaukee NA 1
Minneapolis NA 5
Minneapolis, mn NA 1
Minnesota NA 1
Montreal NA 3
Munich NA 2
N/A NA 1
NJ NA 5
NJ-Philadelphia NA 1
NJ-Philly-DE NA 1
NRW NA 1
NY NA 1
NY Metro NA 1
NY, NY NA 1
NYC NA 17
Near boston NA 1
Netherlands NA 1
New England, not Boston NA 1
New Haven NA 2
New Jersey NA 15
New Jersey, USA NA 1
New York NA 16
New York City NA 11
New York Metro Area NA 1
New York metropolitan area NA 1
Newark, CA NA 1
North Carolina NA 1
Nyc NA 1
Nyc metro NA 1
Oakland NA 1
Ohio NA 2
Oklahoma City NA 1
Orange County NA 1
Oxford NA 2
PA NA 1
Pacific NW NA 1
Pennsylvania NA 1
Philadelphia NA 59
Philadelphia Metro Area NA 1
Philadelphia metro NA 1
Philadelphia, PA NA 2
Phoenix NA 3
Pittsburgh NA 3
Portland NA 4
Portland Oregon NA 1
Portland, OR NA 3
Porto NA 1
Providence NA 1
RDU NA 2
RESEARCH TRIANGLE NA 1
RTP NA 12
RTP NC NA 1
RTP North Carolina NA 1
RTP, NC NA 3
RTP, North Carolina NA 1
RTP/Raleigh, NC NA 1
Raleigh NA 2
Raleigh, NC NA 1
Raleigh-Durham NA 1
Redwood City NA 1
Rehovot NA 1
Remote NA 3
Remote, HQ in San Diego NA 1
Remote, TX, US NA 1
Remote, office based in Bay Area NA 1
Research Triangle NA 41
Research Triangle Park NA 1
Research Triangle, NC NA 1
Ridgefield, CT NA 1
Rockville NA 1
Rockville, MD NA 2
SF Bay Area NA 5
Sacramento NA 1
Sacramento, CA NA 2
Saint Louis NA 1
Salt Lake City NA 7
Salt Lake City, UT NA 1
Salt lake city NA 1
San Antonio NA 1
San Antonio, texas NA 1
San Diego NA 111
San Diego, CA NA 3
San Francisco NA 18
San Francisco Bay Area NA 2
San Francisco CA NA 1
San Francisco ca NA 2
San diego NA 3
San francisco NA 2
Santa Barbara NA 1
Santa Barbara, CA NA 1
Saxony NA 1
Seattle NA 50
Seattle Area NA 1
Seattle WA NA 2
Seattle but I work remotely from FL NA 1
Seattle, WA NA 2
Shanghai NA 1
Shanghai/Chengdu NA 1
SoCal NA 2
South San Francisco NA 4
Southeast NA 1
St Louis NA 2
St. Louis NA 4
Stockholm NA 1
Switzerland NA 3
Syracuse NA 2
Szeged NA 1
Tampa NA 1
Thessaloniki NA 1
Thousand Oaks NA 2
Thousand Oaks (Los Angeles) NA 1
Thousand Oaks, CA NA 1
Toronto NA 16
Triangle NA 1
Tucson, Az. NA 1
U.K NA 2
U.K. NA 1
U.k. NA 1
UK NA 3
USA, ALBANY NA 1
Uppsala NA 1
Vancouver NA 6
Vancouver, Canada NA 1
Vienna NA 2
Waco NA 1
Waco, TX NA 1
Waltham NA 1
Washington D.C. Metro NA 1
Washington DC NA 7
Washington DC (Rockville MD) USA NA 1
Washington DC Capitol Region NA 1
Washington, DC NA 3
Wilmington,DE NA 1
Worcester MA NA 1
bay area NA 6
boston NA 2
great lakes / Midwest NA 1
nj NA 1
philadelphia NA 1
philly NA 2
san diego NA 2
seattle NA 1
NA Argentine 1
NA Australia 6
NA Austria 2
NA Belgium 9
NA Benelux 1
NA Boston 3
NA CO 1
NA Canada 49
NA Carolinas & Southeast (From NC to AR, South FL and LA) 83
NA Central VA 1
NA Chile 1
NA DC Metro Area (DC, VA, MD, DE) 79
NA Denmark 4
NA EU 2
NA Elkton, Virginia 1
NA FRANCE 1
NA Finland 1
NA France 7
NA Germany 27
NA India 1
NA Iraq 1
NA Kenya 1
NA Midwest (From OH to KS, North to ND) 113
NA NL 1
NA Netherlands 3
NA New England (MA, CT, RI, NH, VT, ME) 401
NA Other US Location (HI, AK, PR, etc.) 1
NA Pharma Central (NY, NJ, PA) 220
NA Poland 2
NA Remote - US 1
NA Research Triangle Park, NC 1
NA San Diego 1
NA Singapore 2
NA South & Mountain West (TX to AZ, North to MT) 58
NA Spain 5
NA Spain (Barcelona) 1
NA Sweden 2
NA Switzerland 10
NA Tennessee 1
NA United Kingdom and Ireland 62
NA West Coast (California & Pacific Northwest) 396
NA australia 1
NA denmark 1
NA india 1
NA russia 1
NA spain 1
NA NA 671
sal_clean_filt |> 
  count(
    where_are_you_located, what_country_do_you_work_in,
    where_is_the_closest_major_city_or_hub) |>  
  gt()
where_are_you_located what_country_do_you_work_in where_is_the_closest_major_city_or_hub n
Argentine NA NA 1
Australia NA NA 6
Austria NA NA 2
Belgium NA NA 9
Benelux NA NA 1
Boston NA NA 3
CO NA NA 1
Canada NA NA 49
Carolinas & Southeast (From NC to AR, South FL and LA) NA NA 83
Central VA NA NA 1
Chile NA NA 1
DC Metro Area (DC, VA, MD, DE) NA NA 79
Denmark NA NA 4
EU NA NA 2
Elkton, Virginia NA NA 1
FRANCE NA NA 1
Finland NA NA 1
France NA NA 7
Germany NA NA 27
India NA NA 1
Iraq NA NA 1
Kenya NA NA 1
Midwest (From OH to KS, North to ND) NA NA 113
NL NA NA 1
Netherlands NA NA 3
New England (MA, CT, RI, NH, VT, ME) NA NA 401
Other US Location (HI, AK, PR, etc.) NA NA 1
Pharma Central (NY, NJ, PA) NA NA 220
Poland NA NA 2
Remote - US NA NA 1
Research Triangle Park, NC NA NA 1
San Diego NA NA 1
Singapore NA NA 2
South & Mountain West (TX to AZ, North to MT) NA NA 58
Spain NA NA 5
Spain (Barcelona) NA NA 1
Sweden NA NA 2
Switzerland NA NA 10
Tennessee NA NA 1
United Kingdom and Ireland NA NA 62
West Coast (California & Pacific Northwest) NA NA 396
australia NA NA 1
denmark NA NA 1
india NA NA 1
russia NA NA 1
spain NA NA 1
NA America Bay Area 1
NA America Philadelphia 1
NA Australia Melbourne 1
NA Austria Vienna 2
NA Canada Montreal 2
NA Canada Toronto 11
NA Canada Vancouver 6
NA China Shanghai 1
NA China Shanghai/Chengdu 1
NA England London 1
NA Germany Berlin 1
NA Germany Cologne 1
NA Germany Frankfurt am Main 1
NA Germany Hamburg 2
NA Germany Munich 1
NA Germany N/A 1
NA Germany NRW 1
NA Germany Saxony 1
NA Greece Thessaloniki 1
NA Hungary Szeged 1
NA India Bangalore 1
NA Ireland Dublin 1
NA Ireland Limerick 1
NA Netherlands Amsterdam 1
NA Portugal Porto 1
NA Slovenia Ljubljana 1
NA Sweden Copenhagen 1
NA Sweden Stockholm 1
NA Sweden Uppsala 1
NA Switzerland Basel 1
NA U.S.A. Albany 1
NA UK Cambridge 1
NA UK Cambridge, UK 2
NA UK London 1
NA US Bay Area 9
NA US Bay area 1
NA US Boston 11
NA US Chicago 2
NA US Columbus 1
NA US DC 1
NA US DMV 1
NA US Denver 1
NA US Los Angeles 1
NA US Los Angeles, CA 1
NA US NJ 2
NA US NY Metro 1
NA US NYC 5
NA US New Jersey 1
NA US New York 1
NA US Philadelphia 2
NA US Remote, office based in Bay Area 1
NA US Research Triangle 3
NA US San Diego 5
NA US Seattle 1
NA US St. Louis 1
NA US Washington DC 1
NA USA Alachua 1
NA USA Austin (Remote) 1
NA USA BOSTON 1
NA USA Baltimore 1
NA USA Bay Area 54
NA USA Bay area 7
NA USA Bosto 1
NA USA Boston 80
NA USA Boston/Cambridge 1
NA USA Bosyon 1
NA USA Boulder 1
NA USA Buffalo 1
NA USA Chicago 5
NA USA Cleveland 1
NA USA Columbus 1
NA USA Columbus, OH 1
NA USA DC 1
NA USA DC Area/I-270 1
NA USA Denver 3
NA USA Georgia 1
NA USA Houston 2
NA USA I-270 corridor 1
NA USA Indianapolis 2
NA USA Kansas City 1
NA USA LA 1
NA USA La Jolla 1
NA USA Los Angeles 9
NA USA Miami 1
NA USA Milwaukee 1
NA USA Minneapolis 1
NA USA NYC 8
NA USA New Haven 1
NA USA New Jersey 2
NA USA New York 3
NA USA New York City 3
NA USA Pacific NW 1
NA USA Philadelphia 16
NA USA Portland 2
NA USA RESEARCH TRIANGLE 1
NA USA RTP 2
NA USA RTP, NC 1
NA USA Remote 1
NA USA Research Triangle 13
NA USA Rockville, MD 1
NA USA SF Bay Area 1
NA USA Salt Lake City 2
NA USA San Diego 26
NA USA San Francisco 4
NA USA San diego 1
NA USA Seattle 18
NA USA SoCal 1
NA USA Southeast 1
NA USA St Louis 1
NA USA Syracuse 1
NA USA Thousand Oaks 1
NA USA Washington DC 1
NA USA Washington, DC 1
NA USA philadelphia 1
NA United Kingdom Cambridge 2
NA United Kingdom Edinburgh 1
NA United Kingdom Liverpool 1
NA United Kingdom London 3
NA United Kingdom Manchester 1
NA United Kingdom Oxford 2
NA United Stated Bay Area 1
NA United Stated of Americ Bay Area 1
NA United States Ann Arbor 1
NA United States Atlanta 2
NA United States Atlanta, GA 1
NA United States Bay Area 44
NA United States Bay area 2
NA United States Boston 118
NA United States Boston (company location) - company is WFH 1
NA United States Boston / Remote 1
NA United States CA 1
NA United States California 1
NA United States Cambridge 2
NA United States Chicago 7
NA United States Cleveland 1
NA United States Columbus, Ohio 1
NA United States Denver 3
NA United States Detroit 1
NA United States Gaithersburg 1
NA United States Groton 1
NA United States Hartford, CT 1
NA United States Houston 2
NA United States Indianapolis 3
NA United States Kansas City 2
NA United States LA 1
NA United States Los Angeles 13
NA United States Maryland/DC 1
NA United States Miami 2
NA United States Midwest 4
NA United States Minneapolis 4
NA United States Minneapolis, mn 1
NA United States NJ 1
NA United States NYC 3
NA United States New Jersey 2
NA United States New York 6
NA United States New York City 5
NA United States Nyc 1
NA United States Oklahoma City 1
NA United States Philadelphia 23
NA United States Phoenix 1
NA United States Pittsburgh 1
NA United States Portland 2
NA United States Portland, OR 1
NA United States Providence 1
NA United States Redwood City 1
NA United States Research Triangle 20
NA United States SF Bay Area 1
NA United States Salt Lake City 3
NA United States San Antonio 1
NA United States San Diego 36
NA United States San Francisco 6
NA United States San Francisco Bay Area 1
NA United States San diego 1
NA United States Seattle 13
NA United States St Louis 1
NA United States St. Louis 1
NA United States Syracuse 1
NA United States Tampa 1
NA United States Washington DC 4
NA United States Washington, DC 2
NA United States Wilmington,DE 1
NA United States bay area 1
NA United States (Remote) New Jersey 1
NA United States - Florida Boca Raton 1
NA United States of America Bay area 1
NA United States of America Boston 5
NA United States of America Madison 1
NA United States of America Philadelphia 1
NA United States of America Research Triangle 3
NA United States of America Salt Lake City 1
NA United States of America San Diego 4
NA United States of america Boston 1
NA United states Bay Are 1
NA United states Bay area 2
NA United states Boston 1
NA United states Cambridge, MA 1
NA United states Chicago 1
NA United states Denver 1
NA United states Philadelphia 1
NA United states Research Triangle 2
NA United states Washington DC Capitol Region 1
NA Us Boston 1
NA Us Seattle 1
NA Usa Bay Area 2
NA Usa Boston 5
NA Usa Philadelphia 2
NA Usa Portland Oregon 1
NA Usa Salt lake city 1
NA Usa San Diego 1
NA Usa philly 2
NA united states DC 1
NA united states Philadelphia 1
NA united states Washington D.C. Metro 1
NA usa bay area 2
NA usa boston 2
NA usa great lakes / Midwest 1
NA usa san diego 1
NA usa seattle 1
NA NA Ames, IA NADC 1
NA NA Athens 1
NA NA Austin, TX 2
NA NA Austria 1
NA NA Baltimore 1
NA NA Baltimore, MD 2
NA NA Baltimore/Washington DC 1
NA NA Bay Area 73
NA NA Bay Area - Mid Penninsula 1
NA NA Bay area 8
NA NA Belgium 1
NA NA Berkeley, CA 1
NA NA Bloomington, IN, USA 1
NA NA Boston 134
NA NA Boston MA 1
NA NA Boston, MA 4
NA NA Boston, US 1
NA NA Boston/Cambridge 1
NA NA Boulder (remote, company in Bay) 1
NA NA Boulder, CO 3
NA NA Brazil 1
NA NA Brighton, UK 1
NA NA CA 1
NA NA Calgary 1
NA NA California 1
NA NA Cambridge 2
NA NA Cambridge Massachusetts 1
NA NA Cambridge UK 1
NA NA Cambridge, MA 6
NA NA Cambridge, UK 1
NA NA Cambridge,MA 1
NA NA Canada 6
NA NA Canada, Ottawa 1
NA NA Carlsbad 1
NA NA Central Maine 1
NA NA Central/North Jersey 1
NA NA Chicago 3
NA NA Chicago IL 1
NA NA Chicago, IL 1
NA NA Chicagoland 1
NA NA Cincinnati, Ohio 1
NA NA College Station TX 1
NA NA Colorado 1
NA NA DC 2
NA NA DC area 1
NA NA Dc 1
NA NA Denver 2
NA NA Devens 1
NA NA Dublin 1
NA NA Dundee 1
NA NA Durham, NC 2
NA NA Florida 1
NA NA France 1
NA NA Gaithersburg, Maryland 1
NA NA Georgia 1
NA NA Germany 1
NA NA Greater Boston 2
NA NA HOUSTON 1
NA NA Halifax, Nova Scotia 1
NA NA Hayward, CA 2
NA NA Houston 2
NA NA Houston, TX 5
NA NA Houston,TX 2
NA NA Illinois 1
NA NA Indianapolis 2
NA NA Indianapolis, IN 1
NA NA Ireland 3
NA NA Irvine, CA 1
NA NA Kansas City 1
NA NA Kansas City, MO 1
NA NA Kansas City, Missouri, United States 1
NA NA Knoxville, TN 1
NA NA LA 1
NA NA Los Angeles 11
NA NA Los Angeles (company is in Bay) 1
NA NA Los Ángeles 1
NA NA MVA 1
NA NA Macclesfield, UK 1
NA NA Madison, Wisconsin 1
NA NA Maryland 6
NA NA Michigan, USA 1
NA NA Midwest 2
NA NA Minnesota 1
NA NA Montreal 1
NA NA Munich 1
NA NA NJ 2
NA NA NJ-Philadelphia 1
NA NA NJ-Philly-DE 1
NA NA NY 1
NA NA NY, NY 1
NA NA NYC 1
NA NA Near boston 1
NA NA Netherlands 1
NA NA New England, not Boston 1
NA NA New Haven 1
NA NA New Jersey 9
NA NA New Jersey, USA 1
NA NA New York 6
NA NA New York City 3
NA NA New York Metro Area 1
NA NA New York metropolitan area 1
NA NA Newark, CA 1
NA NA North Carolina 1
NA NA Nyc metro 1
NA NA Oakland 1
NA NA Ohio 2
NA NA Orange County 1
NA NA PA 1
NA NA Pennsylvania 1
NA NA Philadelphia 12
NA NA Philadelphia Metro Area 1
NA NA Philadelphia metro 1
NA NA Philadelphia, PA 2
NA NA Phoenix 2
NA NA Pittsburgh 2
NA NA Portland, OR 2
NA NA RDU 2
NA NA RTP 10
NA NA RTP NC 1
NA NA RTP North Carolina 1
NA NA RTP, NC 2
NA NA RTP, North Carolina 1
NA NA RTP/Raleigh, NC 1
NA NA Raleigh 2
NA NA Raleigh, NC 1
NA NA Raleigh-Durham 1
NA NA Rehovot 1
NA NA Remote 2
NA NA Remote, HQ in San Diego 1
NA NA Remote, TX, US 1
NA NA Research Triangle Park 1
NA NA Research Triangle, NC 1
NA NA Ridgefield, CT 1
NA NA Rockville 1
NA NA Rockville, MD 1
NA NA SF Bay Area 3
NA NA Sacramento 1
NA NA Sacramento, CA 2
NA NA Saint Louis 1
NA NA Salt Lake City 1
NA NA Salt Lake City, UT 1
NA NA San Antonio, texas 1
NA NA San Diego 39
NA NA San Diego, CA 3
NA NA San Francisco 8
NA NA San Francisco Bay Area 1
NA NA San Francisco CA 1
NA NA San Francisco ca 2
NA NA San diego 1
NA NA San francisco 2
NA NA Santa Barbara 1
NA NA Santa Barbara, CA 1
NA NA Seattle 17
NA NA Seattle Area 1
NA NA Seattle WA 2
NA NA Seattle but I work remotely from FL 1
NA NA Seattle, WA 2
NA NA SoCal 1
NA NA South San Francisco 4
NA NA St. Louis 2
NA NA Switzerland 3
NA NA Thousand Oaks 1
NA NA Thousand Oaks (Los Angeles) 1
NA NA Thousand Oaks, CA 1
NA NA Toronto 5
NA NA Triangle 1
NA NA Tucson, Az. 1
NA NA U.K 2
NA NA U.K. 1
NA NA U.k. 1
NA NA UK 3
NA NA USA, ALBANY 1
NA NA Vancouver, Canada 1
NA NA Waco 1
NA NA Waco, TX 1
NA NA Waltham 1
NA NA Washington DC 1
NA NA Washington DC (Rockville MD) USA 1
NA NA Worcester MA 1
NA NA bay area 3
NA NA nj 1
NA NA san diego 1
NA NA NA 671

Location is quite messy data consisting of free-form responses spread between 3 variables:

  • where_are_you_located contains Countries (USA, Canada, etc.), states (CO, Carolinas, etc.)

This is not a required question so there are 174 responses that did not respond, and this also included an “other” freeform option.

  • Pharma Central (NY, NJ, PA)
  • New England (MA, CT, RI, NH, VT, ME)
  • DC Metro Area (DC, VA, MD, DE)
  • Carolinas & Southeast (From NC to AR, South FL and LA)
  • Midwest (From OH to KS, North to ND)
  • South & Mountain West (TX to AZ, North to MT)
  • West Coast (California & Pacific Northwest)
  • Other US Location (HI, AK, PR, etc.)
  • Canada
  • United Kingdom and Ireland
  • Germany
  • Other:

The 174 missing responses have values for these other two variables. I think this is due to the survey changing the question at some point, probably due to feedback.

  • what_country_do_you_work_in seems to all be countries, but not harmonized e.g.: Usa, usa, United States
  • where_is_the_closest_major_city_or_hub mixed bag: Bay area, Boston, san diego, Research Triangle,

Here I create one new location variable location_country, that, unlike the other variables, cover >90% of the data

USA <- c(
  "Pharma Central (NY, NJ, PA)",
  "New England (MA, CT, RI, NH, VT, ME)",
  "DC Metro Area (DC, VA, MD, DE)",
  "Carolinas & Southeast (From NC to AR, South FL and LA)",
  "Midwest (From OH to KS, North to ND)",
  "South & Mountain West (TX to AZ, North to MT)",
  "West Coast (California & Pacific Northwest)",
  "Other US Location (HI, AK, PR, etc.)"
)

usa_locations <- c(
  'baltimore',
  'boston',
  'bay area', 
  'baltimore',
  'boulder',
  'california',
  'cambridge',
  'carlsbad',
  'jersey', 'maine',
  'chicago',
  '\\bdc\\b',
  'denver',
  'durham',
  'hayward',
  'hungary',
  'indianapolis',
  'kansas',
  '\\bla\\b',
  'los angeles',
  'maryland',
  'midwest',
  'new york',
  '\\bnj\\b',
  '\\bny[c]?\\b',
  'ohio',
  'philadelphia',
  'phoenix',
  'pittsburgh',
  'raleigh',
  'san diego',
  'ridgefield',
  'rockville',
  'rtp\\b',
  'sacramento',
  'salt lake',
  'seattle',
  'san francisco',
  'socal',
  'united states',
  'u\\.s\\.a',
  'washington'
) 

usa_locations <- str_c('(', str_c(usa_locations, collapse = '|'), ')')

sal_clean_filt <- sal_clean_filt |> 
  mutate(
    
    # prep raw location data
    location_raw_data = case_when(
      !is.na(what_country_do_you_work_in) ~ what_country_do_you_work_in,
      !is.na(where_are_you_located) ~ where_are_you_located, 
      !is.na(where_is_the_closest_major_city_or_hub) ~ where_is_the_closest_major_city_or_hub,
      .default = where_is_the_closest_major_city_or_hub
    ) |> 
      tolower(),
    
    # matching
    location_country = case_when(
      
      ### 2025 data
      !is.na(which_country_do_you_work_in) ~ which_country_do_you_work_in |> 
        str_replace('United States', 'United States of America'),
      
      ### usa 
      location_raw_data %in% c(
        tolower(USA), 
        'co', 
        'san diego, ca', 
        'united states'
      ) ~ 'United States of America',
      
      location_raw_data |> 
        str_detect(regex('usa?', ignore_case = TRUE)) ~ 
        'United States of America', 
      
      location_raw_data |>
        str_detect(regex(usa_locations, ignore_case = TRUE), ) ~ 
        'United States of America',
      ### UK
      location_raw_data |> 
        str_detect('(u\\.k|dundee|ireland)') ~ 'United Kingdom',
      location_raw_data %in% c(
        'united kingdom and ireland', 'uk') ~ 'United Kingdom',
      
      ### other
      location_raw_data %in% 
        c('canada', 'united states of america',
          'belgium', 'france', 'spain', 'united kingdom', 'benelux', 'germany',
          'sweden', 'switzerland', 'denmark',
          'singapore', 'india', 'australia') ~ location_raw_data,
      
      location_raw_data |>  str_detect('toronto') ~ 'Canada',
      TRUE ~ NA
      
    ),
    location_country = str_to_title(location_country)
  )

sal_clean_filt |> 
  count(location_country, location_raw_data,
        where_are_you_located, what_country_do_you_work_in, 
        where_is_the_closest_major_city_or_hub) |> 
  arrange(location_country) |>  gt()
location_country location_raw_data where_are_you_located what_country_do_you_work_in where_is_the_closest_major_city_or_hub n
Australia NA NA NA NA 3
Belgium belgium Belgium NA NA 9
Belgium belgium NA NA Belgium 1
Belgium NA NA NA NA 1
Benelux benelux Benelux NA NA 1
Canada canada Canada NA NA 49
Canada canada NA Canada Montreal 2
Canada canada NA Canada Toronto 11
Canada canada NA Canada Vancouver 6
Canada canada NA NA Canada 6
Canada toronto NA NA Toronto 5
Canada NA NA NA NA 21
Denmark denmark Denmark NA NA 4
Denmark denmark denmark NA NA 1
Denmark NA NA NA NA 2
Egypt NA NA NA NA 1
France france FRANCE NA NA 1
France france France NA NA 7
France france NA NA France 1
France NA NA NA NA 2
Germany germany Germany NA NA 27
Germany germany NA Germany Berlin 1
Germany germany NA Germany Cologne 1
Germany germany NA Germany Frankfurt am Main 1
Germany germany NA Germany Hamburg 2
Germany germany NA Germany Munich 1
Germany germany NA Germany N/A 1
Germany germany NA Germany NRW 1
Germany germany NA Germany Saxony 1
Germany germany NA NA Germany 1
Germany NA NA NA NA 3
India india India NA NA 1
India india india NA NA 1
India india NA India Bangalore 1
India NA NA NA NA 2
Ireland NA NA NA NA 3
Italy NA NA NA NA 1
Lithuania NA NA NA NA 1
Netherlands NA NA NA NA 1
Singapore singapore Singapore NA NA 2
Singapore NA NA NA NA 1
Slovenia NA NA NA NA 1
Spain spain Spain NA NA 5
Spain spain spain NA NA 1
Spain NA NA NA NA 1
Sweden sweden Sweden NA NA 2
Sweden sweden NA Sweden Copenhagen 1
Sweden sweden NA Sweden Stockholm 1
Sweden sweden NA Sweden Uppsala 1
Sweden NA NA NA NA 1
Switzerland switzerland Switzerland NA NA 10
Switzerland switzerland NA Switzerland Basel 1
Switzerland switzerland NA NA Switzerland 3
Switzerland NA NA NA NA 13
United Kingdom dundee NA NA Dundee 1
United Kingdom ireland NA Ireland Dublin 1
United Kingdom ireland NA Ireland Limerick 1
United Kingdom ireland NA NA Ireland 3
United Kingdom u.k NA NA U.K 2
United Kingdom u.k. NA NA U.K. 1
United Kingdom u.k. NA NA U.k. 1
United Kingdom uk NA UK Cambridge 1
United Kingdom uk NA UK Cambridge, UK 2
United Kingdom uk NA UK London 1
United Kingdom uk NA NA UK 3
United Kingdom united kingdom NA United Kingdom Cambridge 2
United Kingdom united kingdom NA United Kingdom Edinburgh 1
United Kingdom united kingdom NA United Kingdom Liverpool 1
United Kingdom united kingdom NA United Kingdom London 3
United Kingdom united kingdom NA United Kingdom Manchester 1
United Kingdom united kingdom NA United Kingdom Oxford 2
United Kingdom united kingdom and ireland United Kingdom and Ireland NA NA 62
United Kingdom NA NA NA NA 29
United States Of America austin, tx NA NA Austin, TX 2
United States Of America australia Australia NA NA 6
United States Of America australia australia NA NA 1
United States Of America australia NA Australia Melbourne 1
United States Of America austria Austria NA NA 2
United States Of America austria NA Austria Vienna 2
United States Of America austria NA NA Austria 1
United States Of America baltimore NA NA Baltimore 1
United States Of America baltimore, md NA NA Baltimore, MD 2
United States Of America baltimore/washington dc NA NA Baltimore/Washington DC 1
United States Of America bay area NA NA Bay Area 73
United States Of America bay area NA NA Bay area 8
United States Of America bay area NA NA bay area 3
United States Of America bay area - mid penninsula NA NA Bay Area - Mid Penninsula 1
United States Of America bloomington, in, usa NA NA Bloomington, IN, USA 1
United States Of America boston Boston NA NA 3
United States Of America boston NA NA Boston 134
United States Of America boston ma NA NA Boston MA 1
United States Of America boston, ma NA NA Boston, MA 4
United States Of America boston, us NA NA Boston, US 1
United States Of America boston/cambridge NA NA Boston/Cambridge 1
United States Of America boulder (remote, company in bay) NA NA Boulder (remote, company in Bay) 1
United States Of America boulder, co NA NA Boulder, CO 3
United States Of America california NA NA California 1
United States Of America cambridge NA NA Cambridge 2
United States Of America cambridge massachusetts NA NA Cambridge Massachusetts 1
United States Of America cambridge uk NA NA Cambridge UK 1
United States Of America cambridge, ma NA NA Cambridge, MA 6
United States Of America cambridge, uk NA NA Cambridge, UK 1
United States Of America cambridge,ma NA NA Cambridge,MA 1
United States Of America carlsbad NA NA Carlsbad 1
United States Of America carolinas & southeast (from nc to ar, south fl and la) Carolinas & Southeast (From NC to AR, South FL and LA) NA NA 83
United States Of America central maine NA NA Central Maine 1
United States Of America central/north jersey NA NA Central/North Jersey 1
United States Of America chicago NA NA Chicago 3
United States Of America chicago il NA NA Chicago IL 1
United States Of America chicago, il NA NA Chicago, IL 1
United States Of America chicagoland NA NA Chicagoland 1
United States Of America cincinnati, ohio NA NA Cincinnati, Ohio 1
United States Of America co CO NA NA 1
United States Of America dc NA NA DC 2
United States Of America dc NA NA Dc 1
United States Of America dc area NA NA DC area 1
United States Of America dc metro area (dc, va, md, de) DC Metro Area (DC, VA, MD, DE) NA NA 79
United States Of America denver NA NA Denver 2
United States Of America durham, nc NA NA Durham, NC 2
United States Of America gaithersburg, maryland NA NA Gaithersburg, Maryland 1
United States Of America greater boston NA NA Greater Boston 2
United States Of America hayward, ca NA NA Hayward, CA 2
United States Of America houston NA NA HOUSTON 1
United States Of America houston NA NA Houston 2
United States Of America houston, tx NA NA Houston, TX 5
United States Of America houston,tx NA NA Houston,TX 2
United States Of America hungary NA Hungary Szeged 1
United States Of America indianapolis NA NA Indianapolis 2
United States Of America indianapolis, in NA NA Indianapolis, IN 1
United States Of America kansas city NA NA Kansas City 1
United States Of America kansas city, missouri, united states NA NA Kansas City, Missouri, United States 1
United States Of America kansas city, mo NA NA Kansas City, MO 1
United States Of America la NA NA LA 1
United States Of America los angeles NA NA Los Angeles 11
United States Of America los angeles (company is in bay) NA NA Los Angeles (company is in Bay) 1
United States Of America maryland NA NA Maryland 6
United States Of America michigan, usa NA NA Michigan, USA 1
United States Of America midwest NA NA Midwest 2
United States Of America midwest (from oh to ks, north to nd) Midwest (From OH to KS, North to ND) NA NA 113
United States Of America near boston NA NA Near boston 1
United States Of America new england (ma, ct, ri, nh, vt, me) New England (MA, CT, RI, NH, VT, ME) NA NA 401
United States Of America new england, not boston NA NA New England, not Boston 1
United States Of America new jersey NA NA New Jersey 9
United States Of America new jersey, usa NA NA New Jersey, USA 1
United States Of America new york NA NA New York 6
United States Of America new york city NA NA New York City 3
United States Of America new york metro area NA NA New York Metro Area 1
United States Of America new york metropolitan area NA NA New York metropolitan area 1
United States Of America nj NA NA NJ 2
United States Of America nj NA NA nj 1
United States Of America nj-philadelphia NA NA NJ-Philadelphia 1
United States Of America nj-philly-de NA NA NJ-Philly-DE 1
United States Of America ny NA NA NY 1
United States Of America ny, ny NA NA NY, NY 1
United States Of America nyc NA NA NYC 1
United States Of America nyc metro NA NA Nyc metro 1
United States Of America ohio NA NA Ohio 2
United States Of America other us location (hi, ak, pr, etc.) Other US Location (HI, AK, PR, etc.) NA NA 1
United States Of America pharma central (ny, nj, pa) Pharma Central (NY, NJ, PA) NA NA 220
United States Of America philadelphia NA NA Philadelphia 12
United States Of America philadelphia metro NA NA Philadelphia metro 1
United States Of America philadelphia metro area NA NA Philadelphia Metro Area 1
United States Of America philadelphia, pa NA NA Philadelphia, PA 2
United States Of America phoenix NA NA Phoenix 2
United States Of America pittsburgh NA NA Pittsburgh 2
United States Of America raleigh NA NA Raleigh 2
United States Of America raleigh, nc NA NA Raleigh, NC 1
United States Of America raleigh-durham NA NA Raleigh-Durham 1
United States Of America remote - us Remote - US NA NA 1
United States Of America remote, hq in san diego NA NA Remote, HQ in San Diego 1
United States Of America remote, tx, us NA NA Remote, TX, US 1
United States Of America ridgefield, ct NA NA Ridgefield, CT 1
United States Of America rockville NA NA Rockville 1
United States Of America rockville, md NA NA Rockville, MD 1
United States Of America rtp NA NA RTP 10
United States Of America rtp nc NA NA RTP NC 1
United States Of America rtp north carolina NA NA RTP North Carolina 1
United States Of America rtp, nc NA NA RTP, NC 2
United States Of America rtp, north carolina NA NA RTP, North Carolina 1
United States Of America rtp/raleigh, nc NA NA RTP/Raleigh, NC 1
United States Of America russia russia NA NA 1
United States Of America sacramento NA NA Sacramento 1
United States Of America sacramento, ca NA NA Sacramento, CA 2
United States Of America salt lake city NA NA Salt Lake City 1
United States Of America salt lake city, ut NA NA Salt Lake City, UT 1
United States Of America san diego San Diego NA NA 1
United States Of America san diego NA NA San Diego 39
United States Of America san diego NA NA San diego 1
United States Of America san diego NA NA san diego 1
United States Of America san diego, ca NA NA San Diego, CA 3
United States Of America san francisco NA NA San Francisco 8
United States Of America san francisco NA NA San francisco 2
United States Of America san francisco bay area NA NA San Francisco Bay Area 1
United States Of America san francisco ca NA NA San Francisco CA 1
United States Of America san francisco ca NA NA San Francisco ca 2
United States Of America seattle NA NA Seattle 17
United States Of America seattle area NA NA Seattle Area 1
United States Of America seattle but i work remotely from fl NA NA Seattle but I work remotely from FL 1
United States Of America seattle wa NA NA Seattle WA 2
United States Of America seattle, wa NA NA Seattle, WA 2
United States Of America sf bay area NA NA SF Bay Area 3
United States Of America socal NA NA SoCal 1
United States Of America south & mountain west (tx to az, north to mt) South & Mountain West (TX to AZ, North to MT) NA NA 58
United States Of America south san francisco NA NA South San Francisco 4
United States Of America thousand oaks NA NA Thousand Oaks 1
United States Of America thousand oaks (los angeles) NA NA Thousand Oaks (Los Angeles) 1
United States Of America thousand oaks, ca NA NA Thousand Oaks, CA 1
United States Of America u.s.a. NA U.S.A. Albany 1
United States Of America united states NA United States Ann Arbor 1
United States Of America united states NA United States Atlanta 2
United States Of America united states NA United States Atlanta, GA 1
United States Of America united states NA United States Bay Area 44
United States Of America united states NA United States Bay area 2
United States Of America united states NA United States Boston 118
United States Of America united states NA United States Boston (company location) - company is WFH 1
United States Of America united states NA United States Boston / Remote 1
United States Of America united states NA United States CA 1
United States Of America united states NA United States California 1
United States Of America united states NA United States Cambridge 2
United States Of America united states NA United States Chicago 7
United States Of America united states NA United States Cleveland 1
United States Of America united states NA United States Columbus, Ohio 1
United States Of America united states NA United States Denver 3
United States Of America united states NA United States Detroit 1
United States Of America united states NA United States Gaithersburg 1
United States Of America united states NA United States Groton 1
United States Of America united states NA United States Hartford, CT 1
United States Of America united states NA United States Houston 2
United States Of America united states NA United States Indianapolis 3
United States Of America united states NA United States Kansas City 2
United States Of America united states NA United States LA 1
United States Of America united states NA United States Los Angeles 13
United States Of America united states NA United States Maryland/DC 1
United States Of America united states NA United States Miami 2
United States Of America united states NA United States Midwest 4
United States Of America united states NA United States Minneapolis 4
United States Of America united states NA United States Minneapolis, mn 1
United States Of America united states NA United States NJ 1
United States Of America united states NA United States NYC 3
United States Of America united states NA United States New Jersey 2
United States Of America united states NA United States New York 6
United States Of America united states NA United States New York City 5
United States Of America united states NA United States Nyc 1
United States Of America united states NA United States Oklahoma City 1
United States Of America united states NA United States Philadelphia 23
United States Of America united states NA United States Phoenix 1
United States Of America united states NA United States Pittsburgh 1
United States Of America united states NA United States Portland 2
United States Of America united states NA United States Portland, OR 1
United States Of America united states NA United States Providence 1
United States Of America united states NA United States Redwood City 1
United States Of America united states NA United States Research Triangle 20
United States Of America united states NA United States SF Bay Area 1
United States Of America united states NA United States Salt Lake City 3
United States Of America united states NA United States San Antonio 1
United States Of America united states NA United States San Diego 36
United States Of America united states NA United States San Francisco 6
United States Of America united states NA United States San Francisco Bay Area 1
United States Of America united states NA United States San diego 1
United States Of America united states NA United States Seattle 13
United States Of America united states NA United States St Louis 1
United States Of America united states NA United States St. Louis 1
United States Of America united states NA United States Syracuse 1
United States Of America united states NA United States Tampa 1
United States Of America united states NA United States Washington DC 4
United States Of America united states NA United States Washington, DC 2
United States Of America united states NA United States Wilmington,DE 1
United States Of America united states NA United States bay area 1
United States Of America united states NA United states Bay Are 1
United States Of America united states NA United states Bay area 2
United States Of America united states NA United states Boston 1
United States Of America united states NA United states Cambridge, MA 1
United States Of America united states NA United states Chicago 1
United States Of America united states NA United states Denver 1
United States Of America united states NA United states Philadelphia 1
United States Of America united states NA United states Research Triangle 2
United States Of America united states NA United states Washington DC Capitol Region 1
United States Of America united states NA united states DC 1
United States Of America united states NA united states Philadelphia 1
United States Of America united states NA united states Washington D.C. Metro 1
United States Of America united states (remote) NA United States (Remote) New Jersey 1
United States Of America united states - florida NA United States - Florida Boca Raton 1
United States Of America united states of america NA United States of America Bay area 1
United States Of America united states of america NA United States of America Boston 5
United States Of America united states of america NA United States of America Madison 1
United States Of America united states of america NA United States of America Philadelphia 1
United States Of America united states of america NA United States of America Research Triangle 3
United States Of America united states of america NA United States of America Salt Lake City 1
United States Of America united states of america NA United States of America San Diego 4
United States Of America united states of america NA United States of america Boston 1
United States Of America us NA US Bay Area 9
United States Of America us NA US Bay area 1
United States Of America us NA US Boston 11
United States Of America us NA US Chicago 2
United States Of America us NA US Columbus 1
United States Of America us NA US DC 1
United States Of America us NA US DMV 1
United States Of America us NA US Denver 1
United States Of America us NA US Los Angeles 1
United States Of America us NA US Los Angeles, CA 1
United States Of America us NA US NJ 2
United States Of America us NA US NY Metro 1
United States Of America us NA US NYC 5
United States Of America us NA US New Jersey 1
United States Of America us NA US New York 1
United States Of America us NA US Philadelphia 2
United States Of America us NA US Remote, office based in Bay Area 1
United States Of America us NA US Research Triangle 3
United States Of America us NA US San Diego 5
United States Of America us NA US Seattle 1
United States Of America us NA US St. Louis 1
United States Of America us NA US Washington DC 1
United States Of America us NA Us Boston 1
United States Of America us NA Us Seattle 1
United States Of America usa NA USA Alachua 1
United States Of America usa NA USA Austin (Remote) 1
United States Of America usa NA USA BOSTON 1
United States Of America usa NA USA Baltimore 1
United States Of America usa NA USA Bay Area 54
United States Of America usa NA USA Bay area 7
United States Of America usa NA USA Bosto 1
United States Of America usa NA USA Boston 80
United States Of America usa NA USA Boston/Cambridge 1
United States Of America usa NA USA Bosyon 1
United States Of America usa NA USA Boulder 1
United States Of America usa NA USA Buffalo 1
United States Of America usa NA USA Chicago 5
United States Of America usa NA USA Cleveland 1
United States Of America usa NA USA Columbus 1
United States Of America usa NA USA Columbus, OH 1
United States Of America usa NA USA DC 1
United States Of America usa NA USA DC Area/I-270 1
United States Of America usa NA USA Denver 3
United States Of America usa NA USA Georgia 1
United States Of America usa NA USA Houston 2
United States Of America usa NA USA I-270 corridor 1
United States Of America usa NA USA Indianapolis 2
United States Of America usa NA USA Kansas City 1
United States Of America usa NA USA LA 1
United States Of America usa NA USA La Jolla 1
United States Of America usa NA USA Los Angeles 9
United States Of America usa NA USA Miami 1
United States Of America usa NA USA Milwaukee 1
United States Of America usa NA USA Minneapolis 1
United States Of America usa NA USA NYC 8
United States Of America usa NA USA New Haven 1
United States Of America usa NA USA New Jersey 2
United States Of America usa NA USA New York 3
United States Of America usa NA USA New York City 3
United States Of America usa NA USA Pacific NW 1
United States Of America usa NA USA Philadelphia 16
United States Of America usa NA USA Portland 2
United States Of America usa NA USA RESEARCH TRIANGLE 1
United States Of America usa NA USA RTP 2
United States Of America usa NA USA RTP, NC 1
United States Of America usa NA USA Remote 1
United States Of America usa NA USA Research Triangle 13
United States Of America usa NA USA Rockville, MD 1
United States Of America usa NA USA SF Bay Area 1
United States Of America usa NA USA Salt Lake City 2
United States Of America usa NA USA San Diego 26
United States Of America usa NA USA San Francisco 4
United States Of America usa NA USA San diego 1
United States Of America usa NA USA Seattle 18
United States Of America usa NA USA SoCal 1
United States Of America usa NA USA Southeast 1
United States Of America usa NA USA St Louis 1
United States Of America usa NA USA Syracuse 1
United States Of America usa NA USA Thousand Oaks 1
United States Of America usa NA USA Washington DC 1
United States Of America usa NA USA Washington, DC 1
United States Of America usa NA USA philadelphia 1
United States Of America usa NA Usa Bay Area 2
United States Of America usa NA Usa Boston 5
United States Of America usa NA Usa Philadelphia 2
United States Of America usa NA Usa Portland Oregon 1
United States Of America usa NA Usa Salt lake city 1
United States Of America usa NA Usa San Diego 1
United States Of America usa NA Usa philly 2
United States Of America usa NA usa bay area 2
United States Of America usa NA usa boston 2
United States Of America usa NA usa great lakes / Midwest 1
United States Of America usa NA usa san diego 1
United States Of America usa NA usa seattle 1
United States Of America usa, albany NA NA USA, ALBANY 1
United States Of America washington dc NA NA Washington DC 1
United States Of America washington dc (rockville md) usa NA NA Washington DC (Rockville MD) USA 1
United States Of America west coast (california & pacific northwest) West Coast (California & Pacific Northwest) NA NA 396
United States Of America NA NA NA NA 581
NA america NA America Bay Area 1
NA america NA America Philadelphia 1
NA ames, ia nadc NA NA Ames, IA NADC 1
NA argentine Argentine NA NA 1
NA athens NA NA Athens 1
NA berkeley, ca NA NA Berkeley, CA 1
NA brazil NA NA Brazil 1
NA brighton, uk NA NA Brighton, UK 1
NA ca NA NA CA 1
NA calgary NA NA Calgary 1
NA canada, ottawa NA NA Canada, Ottawa 1
NA central va Central VA NA NA 1
NA chile Chile NA NA 1
NA china NA China Shanghai 1
NA china NA China Shanghai/Chengdu 1
NA college station tx NA NA College Station TX 1
NA colorado NA NA Colorado 1
NA devens NA NA Devens 1
NA dublin NA NA Dublin 1
NA elkton, virginia Elkton, Virginia NA NA 1
NA england NA England London 1
NA eu EU NA NA 2
NA finland Finland NA NA 1
NA florida NA NA Florida 1
NA georgia NA NA Georgia 1
NA greece NA Greece Thessaloniki 1
NA halifax, nova scotia NA NA Halifax, Nova Scotia 1
NA illinois NA NA Illinois 1
NA iraq Iraq NA NA 1
NA irvine, ca NA NA Irvine, CA 1
NA kenya Kenya NA NA 1
NA knoxville, tn NA NA Knoxville, TN 1
NA los ángeles NA NA Los Ángeles 1
NA macclesfield, uk NA NA Macclesfield, UK 1
NA madison, wisconsin NA NA Madison, Wisconsin 1
NA minnesota NA NA Minnesota 1
NA montreal NA NA Montreal 1
NA munich NA NA Munich 1
NA mva NA NA MVA 1
NA netherlands Netherlands NA NA 3
NA netherlands NA Netherlands Amsterdam 1
NA netherlands NA NA Netherlands 1
NA new haven NA NA New Haven 1
NA newark, ca NA NA Newark, CA 1
NA nl NL NA NA 1
NA north carolina NA NA North Carolina 1
NA oakland NA NA Oakland 1
NA orange county NA NA Orange County 1
NA pa NA NA PA 1
NA pennsylvania NA NA Pennsylvania 1
NA poland Poland NA NA 2
NA portland, or NA NA Portland, OR 2
NA portugal NA Portugal Porto 1
NA rdu NA NA RDU 2
NA rehovot NA NA Rehovot 1
NA remote NA NA Remote 2
NA research triangle park NA NA Research Triangle Park 1
NA research triangle park, nc Research Triangle Park, NC NA NA 1
NA research triangle, nc NA NA Research Triangle, NC 1
NA saint louis NA NA Saint Louis 1
NA san antonio, texas NA NA San Antonio, texas 1
NA santa barbara NA NA Santa Barbara 1
NA santa barbara, ca NA NA Santa Barbara, CA 1
NA slovenia NA Slovenia Ljubljana 1
NA spain (barcelona) Spain (Barcelona) NA NA 1
NA st. louis NA NA St. Louis 2
NA tennessee Tennessee NA NA 1
NA triangle NA NA Triangle 1
NA tucson, az. NA NA Tucson, Az. 1
NA united stated NA United Stated Bay Area 1
NA united stated of americ NA United Stated of Americ Bay Area 1
NA vancouver, canada NA NA Vancouver, Canada 1
NA waco NA NA Waco 1
NA waco, tx NA NA Waco, TX 1
NA waltham NA NA Waltham 1
NA worcester ma NA NA Worcester MA 1
NA NA NA NA NA 3

location granular

for selecting within country:

sal_clean_filt <- sal_clean_filt |> 
  mutate(
    location_granular = 
      coalesce(
        which_us_state_do_you_work_in,
        which_canadian_province_do_you_work_in,
        where_is_the_closest_major_city_or_hub, where_are_you_located)
  )  
sal_clean_filt |> 
  count(
    location_granular, 
    
        which_us_state_do_you_work_in,
        which_canadian_province_do_you_work_in,
    where_is_the_closest_major_city_or_hub, 
    where_are_you_located) |>  gt() 
location_granular which_us_state_do_you_work_in which_canadian_province_do_you_work_in where_is_the_closest_major_city_or_hub where_are_you_located n
Alabama Alabama NA NA NA 2
Alachua NA NA Alachua NA 1
Albany NA NA Albany NA 1
Ames, IA NADC NA NA Ames, IA NADC NA 1
Amsterdam NA NA Amsterdam NA 1
Ann Arbor NA NA Ann Arbor NA 1
Argentine NA NA NA Argentine 1
Arizona Arizona NA NA NA 1
Athens NA NA Athens NA 1
Atlanta NA NA Atlanta NA 2
Atlanta, GA NA NA Atlanta, GA NA 1
Austin (Remote) NA NA Austin (Remote) NA 1
Austin, TX NA NA Austin, TX NA 2
Australia NA NA NA Australia 6
Austria NA NA Austria NA 1
Austria NA NA NA Austria 2
BOSTON NA NA BOSTON NA 1
Baltimore NA NA Baltimore NA 2
Baltimore, MD NA NA Baltimore, MD NA 2
Baltimore/Washington DC NA NA Baltimore/Washington DC NA 1
Bangalore NA NA Bangalore NA 1
Basel NA NA Basel NA 1
Bay Are NA NA Bay Are NA 1
Bay Area NA NA Bay Area NA 185
Bay Area - Mid Penninsula NA NA Bay Area - Mid Penninsula NA 1
Bay area NA NA Bay area NA 21
Belgium NA NA Belgium NA 1
Belgium NA NA NA Belgium 9
Benelux NA NA NA Benelux 1
Berkeley, CA NA NA Berkeley, CA NA 1
Berlin NA NA Berlin NA 1
Bloomington, IN, USA NA NA Bloomington, IN, USA NA 1
Boca Raton NA NA Boca Raton NA 1
Bosto NA NA Bosto NA 1
Boston NA NA Boston NA 356
Boston NA NA NA Boston 3
Boston (company location) - company is WFH NA NA Boston (company location) - company is WFH NA 1
Boston / Remote NA NA Boston / Remote NA 1
Boston MA NA NA Boston MA NA 1
Boston, MA NA NA Boston, MA NA 4
Boston, US NA NA Boston, US NA 1
Boston/Cambridge NA NA Boston/Cambridge NA 2
Bosyon NA NA Bosyon NA 1
Boulder NA NA Boulder NA 1
Boulder (remote, company in Bay) NA NA Boulder (remote, company in Bay) NA 1
Boulder, CO NA NA Boulder, CO NA 3
Brazil NA NA Brazil NA 1
Brighton, UK NA NA Brighton, UK NA 1
British Columbia NA British Columbia NA NA 7
Buffalo NA NA Buffalo NA 1
CA NA NA CA NA 2
CO NA NA NA CO 1
Calgary NA NA Calgary NA 1
California California NA NA NA 152
California NA NA California NA 2
Cambridge NA NA Cambridge NA 7
Cambridge Massachusetts NA NA Cambridge Massachusetts NA 1
Cambridge UK NA NA Cambridge UK NA 1
Cambridge, MA NA NA Cambridge, MA NA 7
Cambridge, UK NA NA Cambridge, UK NA 3
Cambridge,MA NA NA Cambridge,MA NA 1
Canada NA NA Canada NA 6
Canada NA NA NA Canada 49
Canada, Ottawa NA NA Canada, Ottawa NA 1
Carlsbad NA NA Carlsbad NA 1
Carolinas & Southeast (From NC to AR, South FL and LA) NA NA NA Carolinas & Southeast (From NC to AR, South FL and LA) 83
Central Maine NA NA Central Maine NA 1
Central VA NA NA NA Central VA 1
Central/North Jersey NA NA Central/North Jersey NA 1
Chicago NA NA Chicago NA 18
Chicago IL NA NA Chicago IL NA 1
Chicago, IL NA NA Chicago, IL NA 1
Chicagoland NA NA Chicagoland NA 1
Chile NA NA NA Chile 1
Cincinnati, Ohio NA NA Cincinnati, Ohio NA 1
Cleveland NA NA Cleveland NA 2
College Station TX NA NA College Station TX NA 1
Cologne NA NA Cologne NA 1
Colorado Colorado NA NA NA 6
Colorado NA NA Colorado NA 1
Columbus NA NA Columbus NA 2
Columbus, OH NA NA Columbus, OH NA 1
Columbus, Ohio NA NA Columbus, Ohio NA 1
Connecticut Connecticut NA NA NA 4
Copenhagen NA NA Copenhagen NA 1
DC NA NA DC NA 5
DC Area/I-270 NA NA DC Area/I-270 NA 1
DC Metro Area (DC, VA, MD, DE) NA NA NA DC Metro Area (DC, VA, MD, DE) 79
DC area NA NA DC area NA 1
DMV NA NA DMV NA 1
Dc NA NA Dc NA 1
Delaware Delaware NA NA NA 2
Denmark NA NA NA Denmark 4
Denver NA NA Denver NA 10
Detroit NA NA Detroit NA 1
Devens NA NA Devens NA 1
Dublin NA NA Dublin NA 2
Dundee NA NA Dundee NA 1
Durham, NC NA NA Durham, NC NA 2
EU NA NA NA EU 2
Edinburgh NA NA Edinburgh NA 1
Elkton, Virginia NA NA NA Elkton, Virginia 1
FRANCE NA NA NA FRANCE 1
Finland NA NA NA Finland 1
Florida NA NA Florida NA 1
France NA NA France NA 1
France NA NA NA France 7
Frankfurt am Main NA NA Frankfurt am Main NA 1
Gaithersburg NA NA Gaithersburg NA 1
Gaithersburg, Maryland NA NA Gaithersburg, Maryland NA 1
Georgia Georgia NA NA NA 4
Georgia NA NA Georgia NA 2
Germany NA NA Germany NA 1
Germany NA NA NA Germany 27
Greater Boston NA NA Greater Boston NA 2
Groton NA NA Groton NA 1
HOUSTON NA NA HOUSTON NA 1
Halifax, Nova Scotia NA NA Halifax, Nova Scotia NA 1
Hamburg NA NA Hamburg NA 2
Hartford, CT NA NA Hartford, CT NA 1
Hayward, CA NA NA Hayward, CA NA 2
Houston NA NA Houston NA 6
Houston, TX NA NA Houston, TX NA 5
Houston,TX NA NA Houston,TX NA 2
I-270 corridor NA NA I-270 corridor NA 1
Idaho Idaho NA NA NA 1
Illinois Illinois NA NA NA 8
Illinois NA NA Illinois NA 1
India NA NA NA India 1
Indiana Indiana NA NA NA 6
Indianapolis NA NA Indianapolis NA 7
Indianapolis, IN NA NA Indianapolis, IN NA 1
Iraq NA NA NA Iraq 1
Ireland NA NA Ireland NA 3
Irvine, CA NA NA Irvine, CA NA 1
Kansas City NA NA Kansas City NA 4
Kansas City, MO NA NA Kansas City, MO NA 1
Kansas City, Missouri, United States NA NA Kansas City, Missouri, United States NA 1
Kenya NA NA NA Kenya 1
Knoxville, TN NA NA Knoxville, TN NA 1
LA NA NA LA NA 3
La Jolla NA NA La Jolla NA 1
Limerick NA NA Limerick NA 1
Liverpool NA NA Liverpool NA 1
Ljubljana NA NA Ljubljana NA 1
London NA NA London NA 5
Los Angeles NA NA Los Angeles NA 34
Los Angeles (company is in Bay) NA NA Los Angeles (company is in Bay) NA 1
Los Angeles, CA NA NA Los Angeles, CA NA 1
Los Ángeles NA NA Los Ángeles NA 1
MVA NA NA MVA NA 1
Macclesfield, UK NA NA Macclesfield, UK NA 1
Madison NA NA Madison NA 1
Madison, Wisconsin NA NA Madison, Wisconsin NA 1
Manchester NA NA Manchester NA 1
Manitoba NA Manitoba NA NA 1
Maryland Maryland NA NA NA 25
Maryland NA NA Maryland NA 6
Maryland/DC NA NA Maryland/DC NA 1
Massachusetts Massachusetts NA NA NA 166
Melbourne NA NA Melbourne NA 1
Miami NA NA Miami NA 3
Michigan Michigan NA NA NA 6
Michigan, USA NA NA Michigan, USA NA 1
Midwest NA NA Midwest NA 6
Midwest (From OH to KS, North to ND) NA NA NA Midwest (From OH to KS, North to ND) 113
Milwaukee NA NA Milwaukee NA 1
Minneapolis NA NA Minneapolis NA 5
Minneapolis, mn NA NA Minneapolis, mn NA 1
Minnesota Minnesota NA NA NA 3
Minnesota NA NA Minnesota NA 1
Mississippi Mississippi NA NA NA 1
Missouri Missouri NA NA NA 10
Montreal NA NA Montreal NA 3
Munich NA NA Munich NA 2
N/A NA NA N/A NA 1
NJ NA NA NJ NA 5
NJ-Philadelphia NA NA NJ-Philadelphia NA 1
NJ-Philly-DE NA NA NJ-Philly-DE NA 1
NL NA NA NA NL 1
NRW NA NA NRW NA 1
NY NA NA NY NA 1
NY Metro NA NA NY Metro NA 1
NY, NY NA NA NY, NY NA 1
NYC NA NA NYC NA 17
Near boston NA NA Near boston NA 1
Nebraska Nebraska NA NA NA 1
Netherlands NA NA Netherlands NA 1
Netherlands NA NA NA Netherlands 3
Nevada Nevada NA NA NA 1
New England (MA, CT, RI, NH, VT, ME) NA NA NA New England (MA, CT, RI, NH, VT, ME) 401
New England, not Boston NA NA New England, not Boston NA 1
New Hampshire New Hampshire NA NA NA 2
New Haven NA NA New Haven NA 2
New Jersey New Jersey NA NA NA 28
New Jersey NA NA New Jersey NA 15
New Jersey, USA NA NA New Jersey, USA NA 1
New Mexico New Mexico NA NA NA 1
New York New York NA NA NA 29
New York NA NA New York NA 16
New York City NA NA New York City NA 11
New York Metro Area NA NA New York Metro Area NA 1
New York metropolitan area NA NA New York metropolitan area NA 1
Newark, CA NA NA Newark, CA NA 1
North Carolina North Carolina NA NA NA 32
North Carolina NA NA North Carolina NA 1
North Dakota North Dakota NA NA NA 1
Nyc NA NA Nyc NA 1
Nyc metro NA NA Nyc metro NA 1
Oakland NA NA Oakland NA 1
Ohio Ohio NA NA NA 6
Ohio NA NA Ohio NA 2
Oklahoma Oklahoma NA NA NA 1
Oklahoma City NA NA Oklahoma City NA 1
Ontario NA Ontario NA NA 7
Orange County NA NA Orange County NA 1
Oregon Oregon NA NA NA 3
Other US Location (HI, AK, PR, etc.) NA NA NA Other US Location (HI, AK, PR, etc.) 1
Oxford NA NA Oxford NA 2
PA NA NA PA NA 1
Pacific NW NA NA Pacific NW NA 1
Pennsylvania Pennsylvania NA NA NA 41
Pennsylvania NA NA Pennsylvania NA 1
Pharma Central (NY, NJ, PA) NA NA NA Pharma Central (NY, NJ, PA) 220
Philadelphia NA NA Philadelphia NA 59
Philadelphia Metro Area NA NA Philadelphia Metro Area NA 1
Philadelphia metro NA NA Philadelphia metro NA 1
Philadelphia, PA NA NA Philadelphia, PA NA 2
Phoenix NA NA Phoenix NA 3
Pittsburgh NA NA Pittsburgh NA 3
Poland NA NA NA Poland 2
Portland NA NA Portland NA 4
Portland Oregon NA NA Portland Oregon NA 1
Portland, OR NA NA Portland, OR NA 3
Porto NA NA Porto NA 1
Providence NA NA Providence NA 1
Quebec NA Quebec NA NA 6
RDU NA NA RDU NA 2
RESEARCH TRIANGLE NA NA RESEARCH TRIANGLE NA 1
RTP NA NA RTP NA 12
RTP NC NA NA RTP NC NA 1
RTP North Carolina NA NA RTP North Carolina NA 1
RTP, NC NA NA RTP, NC NA 3
RTP, North Carolina NA NA RTP, North Carolina NA 1
RTP/Raleigh, NC NA NA RTP/Raleigh, NC NA 1
Raleigh NA NA Raleigh NA 2
Raleigh, NC NA NA Raleigh, NC NA 1
Raleigh-Durham NA NA Raleigh-Durham NA 1
Redwood City NA NA Redwood City NA 1
Rehovot NA NA Rehovot NA 1
Remote NA NA Remote NA 3
Remote - US NA NA NA Remote - US 1
Remote, HQ in San Diego NA NA Remote, HQ in San Diego NA 1
Remote, TX, US NA NA Remote, TX, US NA 1
Remote, office based in Bay Area NA NA Remote, office based in Bay Area NA 1
Research Triangle NA NA Research Triangle NA 41
Research Triangle Park NA NA Research Triangle Park NA 1
Research Triangle Park, NC NA NA NA Research Triangle Park, NC 1
Research Triangle, NC NA NA Research Triangle, NC NA 1
Rhode Island Rhode Island NA NA NA 1
Ridgefield, CT NA NA Ridgefield, CT NA 1
Rockville NA NA Rockville NA 1
Rockville, MD NA NA Rockville, MD NA 2
SF Bay Area NA NA SF Bay Area NA 5
Sacramento NA NA Sacramento NA 1
Sacramento, CA NA NA Sacramento, CA NA 2
Saint Louis NA NA Saint Louis NA 1
Salt Lake City NA NA Salt Lake City NA 7
Salt Lake City, UT NA NA Salt Lake City, UT NA 1
Salt lake city NA NA Salt lake city NA 1
San Antonio NA NA San Antonio NA 1
San Antonio, texas NA NA San Antonio, texas NA 1
San Diego NA NA San Diego NA 111
San Diego NA NA NA San Diego 1
San Diego, CA NA NA San Diego, CA NA 3
San Francisco NA NA San Francisco NA 18
San Francisco Bay Area NA NA San Francisco Bay Area NA 2
San Francisco CA NA NA San Francisco CA NA 1
San Francisco ca NA NA San Francisco ca NA 2
San diego NA NA San diego NA 3
San francisco NA NA San francisco NA 2
Santa Barbara NA NA Santa Barbara NA 1
Santa Barbara, CA NA NA Santa Barbara, CA NA 1
Saxony NA NA Saxony NA 1
Seattle NA NA Seattle NA 50
Seattle Area NA NA Seattle Area NA 1
Seattle WA NA NA Seattle WA NA 2
Seattle but I work remotely from FL NA NA Seattle but I work remotely from FL NA 1
Seattle, WA NA NA Seattle, WA NA 2
Shanghai NA NA Shanghai NA 1
Shanghai/Chengdu NA NA Shanghai/Chengdu NA 1
Singapore NA NA NA Singapore 2
SoCal NA NA SoCal NA 2
South & Mountain West (TX to AZ, North to MT) NA NA NA South & Mountain West (TX to AZ, North to MT) 58
South San Francisco NA NA South San Francisco NA 4
Southeast NA NA Southeast NA 1
Spain NA NA NA Spain 5
Spain (Barcelona) NA NA NA Spain (Barcelona) 1
St Louis NA NA St Louis NA 2
St. Louis NA NA St. Louis NA 4
Stockholm NA NA Stockholm NA 1
Sweden NA NA NA Sweden 2
Switzerland NA NA Switzerland NA 3
Switzerland NA NA NA Switzerland 10
Syracuse NA NA Syracuse NA 2
Szeged NA NA Szeged NA 1
Tampa NA NA Tampa NA 1
Tennessee Tennessee NA NA NA 1
Tennessee NA NA NA Tennessee 1
Texas Texas NA NA NA 7
Thessaloniki NA NA Thessaloniki NA 1
Thousand Oaks NA NA Thousand Oaks NA 2
Thousand Oaks (Los Angeles) NA NA Thousand Oaks (Los Angeles) NA 1
Thousand Oaks, CA NA NA Thousand Oaks, CA NA 1
Toronto NA NA Toronto NA 16
Triangle NA NA Triangle NA 1
Tucson, Az. NA NA Tucson, Az. NA 1
U.K NA NA U.K NA 2
U.K. NA NA U.K. NA 1
U.k. NA NA U.k. NA 1
UK NA NA UK NA 3
USA, ALBANY NA NA USA, ALBANY NA 1
United Kingdom and Ireland NA NA NA United Kingdom and Ireland 62
Uppsala NA NA Uppsala NA 1
Utah Utah NA NA NA 3
Vancouver NA NA Vancouver NA 6
Vancouver, Canada NA NA Vancouver, Canada NA 1
Vienna NA NA Vienna NA 2
Virginia Virginia NA NA NA 3
Waco NA NA Waco NA 1
Waco, TX NA NA Waco, TX NA 1
Waltham NA NA Waltham NA 1
Washington Washington NA NA NA 18
Washington D.C. Metro NA NA Washington D.C. Metro NA 1
Washington DC NA NA Washington DC NA 7
Washington DC (Rockville MD) USA NA NA Washington DC (Rockville MD) USA NA 1
Washington DC Capitol Region NA NA Washington DC Capitol Region NA 1
Washington, DC NA NA Washington, DC NA 3
West Coast (California & Pacific Northwest) NA NA NA West Coast (California & Pacific Northwest) 396
Wilmington,DE NA NA Wilmington,DE NA 1
Wisconsin Wisconsin NA NA NA 5
Worcester MA NA NA Worcester MA NA 1
australia NA NA NA australia 1
bay area NA NA bay area NA 6
boston NA NA boston NA 2
denmark NA NA NA denmark 1
great lakes / Midwest NA NA great lakes / Midwest NA 1
india NA NA NA india 1
nj NA NA nj NA 1
philadelphia NA NA philadelphia NA 1
philly NA NA philly NA 2
russia NA NA NA russia 1
san diego NA NA san diego NA 2
seattle NA NA seattle NA 1
spain NA NA NA spain 1
NA NA NA NA NA 69

Completeness

Compare the missingness of original location variables vs the cleaned location_country variable:

sal_clean_filt |> 
  summarize(
    across(c(
      location_country,
      location_granular,
      where_are_you_located, 
      what_country_do_you_work_in, 
      where_is_the_closest_major_city_or_hub), 
      ~ percent_na(.x))
  ) |> 
  pivot_longer(everything(), names_to = 'column', values_to = '% missing')
# A tibble: 5 × 2
  column                                 `% missing`
  <chr>                                        <dbl>
1 location_country                            0.0238
2 location_granular                           0.0189
3 where_are_you_located                       0.571 
4 what_country_do_you_work_in                 0.770 
5 where_is_the_closest_major_city_or_hub      0.613 

time

sal_clean_filt <- sal_clean_filt |> 
  mutate(timestamp = mdy_hms(timestamp),
         year = year(timestamp),
         month = month(timestamp),
         day = day(timestamp)) |> 
  select(timestamp, year:day, everything())

write out the data

sal_clean_filt |>
  mutate(date = str_extract(as.character(timestamp), "^[-\\w]+")) |> 
  select(
    date, 
    location_country, 
    title_category, title_general, title_detail,
    salary_base, bonus_pct, bonus,
    
    # need to fix these
    years_of_experience, what_degrees_do_you_have,
    
    # the original data
    where_are_you_located, what_country_do_you_work_in,
    role_title_of_current_position,
    everything(),
    -timestamp, -bonus_cleaned, -year, -month, -day
  ) |> 
  write_csv(here::here(outdir , 'salary_results_cleaned.csv'))