library(readr) # import library to read csv
library(dplyr) # import library to do left joins
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr) # import library for reshaping data
final_maternal_mortality_data <- read_csv("final_maternal_mortality_data.csv", show_col_types = FALSE) # read final_maternal_mortality_data.csv
final_infant_mortality_data <- read_csv("final_infant_mortality_data.csv", show_col_types = FALSE) # read final_infant_mortality_data.csv
final_life_expectancy_data <- read_csv("final_life_expectancy_data.csv", show_col_types = FALSE) # read final_life_expectancy_data.csv
# Exercise 1
# join all three tables based on name
merged_data <- final_life_expectancy_data %>%
left_join(final_infant_mortality_data, by = c("name", "region")) %>% # join infant mortality data
left_join(final_maternal_mortality_data, by = c("name", "region")) # join maternal mortality data
# Clean and standardize column names
merged_data_final <- merged_data %>%
select(name, region, years, `deaths/1000`, `deaths/100000`) %>% # select only needed columns
rename(
life_expectancy_years = years, # rename for clarity
infant_mortality_per_1000 = `deaths/1000`, # rename for clarity
maternal_mortality_per_100000 = `deaths/100000` # rename for clarity
)
write_csv(merged_data_final, "merged_cia_data.csv") # save the merged dataset
# Verification
merged_cia_data_csv <- read_csv("merged_cia_data.csv", show_col_types = FALSE) # read merged_cia_data.csv
cat("Exercise 1 Verification:\n") # eye candy
## Exercise 1 Verification:
str(merged_cia_data_csv) # print summary of df
## spc_tbl_ [227 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ name : chr [1:227] "Monaco" "Singapore" "Macau" "Japan" ...
## $ region : chr [1:227] "Europe" "East and Southeast Asia" "East and Southeast Asia" "East and Southeast Asia" ...
## $ life_expectancy_years : num [1:227] 89.8 86.7 85.3 85.2 84.2 84.2 84 84 83.9 83.8 ...
## $ infant_mortality_per_1000 : num [1:227] 1.7 NA 4.3 1.8 3.8 6.1 2.4 1.6 3 3.3 ...
## $ maternal_mortality_per_100000: num [1:227] 5 6 NA 3 12 8 NA 3 5 11 ...
## - attr(*, "spec")=
## .. cols(
## .. name = col_character(),
## .. region = col_character(),
## .. life_expectancy_years = col_double(),
## .. infant_mortality_per_1000 = col_double(),
## .. maternal_mortality_per_100000 = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Exercise 2: Aggregate by region
# Read the merged data
merged_cia_data_csv <- read_csv("merged_cia_data.csv", show_col_types = FALSE) # load the merged dataset
# Group by region and calculate mean
aggregated_data <- merged_cia_data_csv %>%
group_by(region) %>% # group countries by their region
summarise(
avg_life_expectancy = round(mean(life_expectancy_years, na.rm = TRUE), 2), # calculate mean life expectancy
avg_infant_mortality = round(mean(infant_mortality_per_1000, na.rm = TRUE), 2), # calculate mean infant mortality
avg_maternal_mortality = round(mean(maternal_mortality_per_100000, na.rm = TRUE), 2), # calculate mean maternal mortality
)
write_csv(aggregated_data, "aggregated_by_region.csv") # save the aggregated dataset
# Verification
aggregated_data_csv <- read_csv("aggregated_by_region.csv", show_col_types = FALSE) # read aggregated_by_region.csv
cat("\nExercise 2 Verification:\n")
##
## Exercise 2 Verification:
str(aggregated_data_csv) # print summary of df
## spc_tbl_ [10 × 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ region : chr [1:10] "Africa" "Australia and Oceania" "Central America and the Caribbean" "Central Asia" ...
## $ avg_life_expectancy : num [1:10] 67.3 76.4 77.8 73.8 76.8 ...
## $ avg_infant_mortality : num [1:10] 39.2 12.6 11.5 17.1 14.9 ...
## $ avg_maternal_mortality: num [1:10] 292.1 102 69.2 18.1 77.7 ...
## - attr(*, "spec")=
## .. cols(
## .. region = col_character(),
## .. avg_life_expectancy = col_double(),
## .. avg_infant_mortality = col_double(),
## .. avg_maternal_mortality = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Exercise 3: Reshape to long format
# Read the merged data
merged_cia_data_csv <- read_csv("merged_cia_data.csv", show_col_types = FALSE) # load the merged dataset
# Reshape from wide to long format
long_data <- merged_cia_data_csv %>%
select(-region) %>% # remove region column
pivot_longer(
cols = c(life_expectancy_years, infant_mortality_per_1000, maternal_mortality_per_100000), # columns to pivot
names_to = "indicator", # new column for indicator names
values_to = "value" # new column for values
)
write_csv(long_data, "long_format_cia_data.csv") # save the long format dataset
# Verification
long_format_cia_data_csv <- read_csv("long_format_cia_data.csv", show_col_types = FALSE) # read long_format_cia_data.csv
cat("\nExercise 3 Verification:\n")
##
## Exercise 3 Verification:
str(long_format_cia_data_csv) # print summary of df
## spc_tbl_ [681 × 3] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ name : chr [1:681] "Monaco" "Monaco" "Monaco" "Singapore" ...
## $ indicator: chr [1:681] "life_expectancy_years" "infant_mortality_per_1000" "maternal_mortality_per_100000" "life_expectancy_years" ...
## $ value : num [1:681] 89.8 1.7 5 86.7 NA 6 85.3 4.3 NA 85.2 ...
## - attr(*, "spec")=
## .. cols(
## .. name = col_character(),
## .. indicator = col_character(),
## .. value = col_double()
## .. )
## - attr(*, "problems")=<externalptr>