Use cases • ineptR

library(ineptR)
library(dplyr)
library(magrittr)
library(stringr)
library(purrr)

The main page shows how to use the function get_ine_data() to get the complete dataset from a given indicator.
However, some use cases may require only a subset of the available data. To tackle such situations, the function get_ine_data() accepts parameters where the user can specify the desired values for each dimension.
These parameters come in the form “dimN” (e.g. dim1, dim2, …), up to the number of available dimensions for the selected indicator (use get_dim_info() to see the available dimensions).

Some example use cases are provided below.

Cross-sectional analysis (Only a subset of the data)

Suppose you want to analyse only the most recent population data. The indicator “0008273” has population data. Let’s check what dimensions are available and explore the indicator:

# Indicator 0008273: Resident population (No.) by Place of residence
# (NUTS - 2013), Sex and Age group; Annual
indicator <- "0008273"

# List the indicator's dimensions in English (lang defaults to PT)
indicator %>% get_dim_info(lang = "EN")
#> # A tibble: 4 × 4
#>   dim_num abrv                             versao nota_dsg                      
#>   <chr>   <chr>                            <chr>  <chr>                         
#> 1 1       Data reference period            XXXXX  "2021, Provisional Resident P…
#> 2 2       Place of residence (NUTS - 2013) 03505  "Resident Population Estimate…
#> 3 3       Sex                              00305   NA                           
#> 4 4       Age group                        00708   NA

There are four dimensions available for this indicator.

Dimension 1 is always the time dimension.
Dimension 2 is always the place dimension.
Other dimensions vary by indicator. In this case dimension 3 is Sex and dimension 4 is Age group.

What is the most recent time period?

Alternative 1: Using the get_metadata() function.

# Fetch the indicator's metadata in English and display it
indicator_metadata <- indicator %>%
  get_metadata(lang = "EN")
print(indicator_metadata)
#> $IndicadorCod
#> [1] "0008273"
#> 
#> $IndicadorNome
#> [1] "Resident population (No.) by Place of residence (NUTS - 2013), Sex and Age group; Annual - Statistics Portugal, Annual estimates of resident population"
#> 
#> $Periodic
#> [1] "Annual"
#> 
#> $PrimeiroPeriodo
#> [1] "2011"
#> 
#> $UltimoPeriodo
#> [1] "2022"
#> 
#> $UnidadeMedida
#> [1] "Number (No.)"
#> 
#> $Potencia10
#> [1] "0"
#> 
#> $PrecisaoDecimal
#> [1] "0"
#> 
#> $Lingua
#> [1] "EN"
#> 
#> $DataUltimaAtualizacao
#> [1] "2023-06-15"
#> 
#> $DataExtracao
#> [1] "2023-09-07T14:17:20.213+01:00"

# The last available period is stored as text; convert it to an integer
most_recent_year <- as.integer(indicator_metadata[["UltimoPeriodo"]])
print(most_recent_year)
#> [1] 2022

This function quickly shows the first and last time periods.
However, it does not provide the id codes for each possible value, which we will need further along in our analysis.

Alternative 2: Using the get_dim_values() function.

# Retrieve the values of all dimensions as a single data frame
dimension_values <- indicator %>%
  get_dim_values(lang = "EN")

# Inspect the columns and their types
dimension_values %>% str()
#> tibble [378 × 7] (S3: tbl_df/tbl/data.frame)
#>  $ dim_num    : chr [1:378] "1" "1" "1" "1" ...
#>  $ cat_id     : chr [1:378] "S7A2011" "S7A2012" "S7A2013" "S7A2014" ...
#>  $ categ_cod  : chr [1:378] "S7A2011" "S7A2012" "S7A2013" "S7A2014" ...
#>  $ categ_dsg  : chr [1:378] "2011" "2012" "2013" "2014" ...
#>  $ categ_ord  : chr [1:378] "20110101" "20120101" "20130101" "20140101" ...
#>  $ categ_nivel: chr [1:378] "1" "1" "1" "1" ...
#>  $ value_id   : chr [1:378] "Dim_Num1_S7A2011" "Dim_Num1_S7A2012" "Dim_Num1_S7A2013" "Dim_Num1_S7A2014" ...

# Preview the first rows of the dimension values
dimension_values %>% head()
#> # A tibble: 6 × 7
#>   dim_num cat_id  categ_cod categ_dsg categ_ord categ_nivel value_id        
#>   <chr>   <chr>   <chr>     <chr>     <chr>     <chr>       <chr>           
#> 1 1       S7A2011 S7A2011   2011      20110101  1           Dim_Num1_S7A2011
#> 2 1       S7A2012 S7A2012   2012      20120101  1           Dim_Num1_S7A2012
#> 3 1       S7A2013 S7A2013   2013      20130101  1           Dim_Num1_S7A2013
#> 4 1       S7A2014 S7A2014   2014      20140101  1           Dim_Num1_S7A2014
#> 5 1       S7A2015 S7A2015   2015      20150101  1           Dim_Num1_S7A2015
#> 6 1       S7A2016 S7A2016   2016      20160101  1           Dim_Num1_S7A2016

# Most recent period: keep only the time dimension (dimension 1), extract the
# period labels as a vector and take the largest one.
# dim_num is a character column, so it is compared against "1" rather than
# relying on implicit numeric-to-character coercion, and pull() returns a plain
# vector so that no data frame has to be coerced with as.integer().
most_recent_year <- dimension_values %>%
  filter(dim_num == "1") %>%
  pull(categ_dsg) %>%
  as.integer() %>%
  max()

The most recent value is 2022


# Split the table into one tibble per dimension to inspect each in turn
group_split(dimension_values, dim_num)
#> <list_of<
#>   tbl_df<
#>     dim_num    : character
#>     cat_id     : character
#>     categ_cod  : character
#>     categ_dsg  : character
#>     categ_ord  : character
#>     categ_nivel: character
#>     value_id   : character
#>   >
#> >[4]>
#> [[1]]
#> # A tibble: 12 × 7
#>    dim_num cat_id  categ_cod categ_dsg categ_ord categ_nivel value_id        
#>    <chr>   <chr>   <chr>     <chr>     <chr>     <chr>       <chr>           
#>  1 1       S7A2011 S7A2011   2011      20110101  1           Dim_Num1_S7A2011
#>  2 1       S7A2012 S7A2012   2012      20120101  1           Dim_Num1_S7A2012
#>  3 1       S7A2013 S7A2013   2013      20130101  1           Dim_Num1_S7A2013
#>  4 1       S7A2014 S7A2014   2014      20140101  1           Dim_Num1_S7A2014
#>  5 1       S7A2015 S7A2015   2015      20150101  1           Dim_Num1_S7A2015
#>  6 1       S7A2016 S7A2016   2016      20160101  1           Dim_Num1_S7A2016
#>  7 1       S7A2017 S7A2017   2017      20170101  1           Dim_Num1_S7A2017
#>  8 1       S7A2018 S7A2018   2018      20180101  1           Dim_Num1_S7A2018
#>  9 1       S7A2019 S7A2019   2019      20190101  1           Dim_Num1_S7A2019
#> 10 1       S7A2020 S7A2020   2020      20200101  1           Dim_Num1_S7A2020
#> 11 1       S7A2021 S7A2021   2021      20210101  1           Dim_Num1_S7A2021
#> 12 1       S7A2022 S7A2022   2022      20220101  1           Dim_Num1_S7A2022
#> 
#> [[2]]
#> # A tibble: 344 × 7
#>    dim_num cat_id  categ_cod categ_dsg         categ_ord categ_nivel value_id   
#>    <chr>   <chr>   <chr>     <chr>             <chr>     <chr>       <chr>      
#>  1 2       PT      PT        Portugal          1         1           Dim_Num2_PT
#>  2 2       1       1         Continente        2         2           Dim_Num2_1 
#>  3 2       11      11        Norte             3         3           Dim_Num2_11
#>  4 2       111     111       Alto Minho        4         4           Dim_Num2_1…
#>  5 2       1111601 1111601   Arcos de Valdevez 5         5           Dim_Num2_1…
#>  6 2       1111602 1111602   Caminha           42        5           Dim_Num2_1…
#>  7 2       1111603 1111603   Melgaço           57        5           Dim_Num2_1…
#>  8 2       1111604 1111604   Monção            71        5           Dim_Num2_1…
#>  9 2       1111605 1111605   Paredes de Coura  96        5           Dim_Num2_1…
#> 10 2       1111606 1111606   Ponte da Barca    113       5           Dim_Num2_1…
#> # ℹ 334 more rows
#> 
#> [[3]]
#> # A tibble: 3 × 7
#>   dim_num cat_id categ_cod categ_dsg categ_ord categ_nivel value_id  
#>   <chr>   <chr>  <chr>     <chr>     <chr>     <chr>       <chr>     
#> 1 3       T      T         MF        1         1           Dim_Num3_T
#> 2 3       1      1         M         2         2           Dim_Num3_1
#> 3 3       2      2         F         3         2           Dim_Num3_2
#> 
#> [[4]]
#> # A tibble: 19 × 7
#>    dim_num cat_id categ_cod categ_dsg         categ_ord categ_nivel value_id   
#>    <chr>   <chr>  <chr>     <chr>             <chr>     <chr>       <chr>      
#>  1 4       T      T         Total             10        1           Dim_Num4_T 
#>  2 4       11     11        0 - 4 years       20        2           Dim_Num4_11
#>  3 4       12     12        5 - 9 years       80        2           Dim_Num4_12
#>  4 4       21     21        10 - 14 years     140       2           Dim_Num4_21
#>  5 4       22     22        15 - 19 years     200       2           Dim_Num4_22
#>  6 4       31     31        20 - 24 years     260       2           Dim_Num4_31
#>  7 4       32     32        25 - 29 years     320       2           Dim_Num4_32
#>  8 4       41     41        30 - 34 years     380       2           Dim_Num4_41
#>  9 4       42     42        35 - 39 years     440       2           Dim_Num4_42
#> 10 4       51     51        40 - 44 years     500       2           Dim_Num4_51
#> 11 4       52     52        45 - 49 years     560       2           Dim_Num4_52
#> 12 4       61     61        50 - 54 years     620       2           Dim_Num4_61
#> 13 4       62     62        55 - 59 years     680       2           Dim_Num4_62
#> 14 4       71     71        60 - 64 years     740       2           Dim_Num4_71
#> 15 4       72     72        65 - 69 years     800       2           Dim_Num4_72
#> 16 4       81     81        70 - 74 years     860       2           Dim_Num4_81
#> 17 4       82     82        75 - 79 years     920       2           Dim_Num4_82
#> 18 4       91     91        80 - 84 years     980       2           Dim_Num4_91
#> 19 4       92     92        85 and more years 1040      2           Dim_Num4_92

The advantage of this approach is that it shows the categ_cod variable, which we can pass to the dimN parameters of the get_ine_data() function to obtain only a subset of the available data.

We see 344 geographic units, including NUTS 1, NUTS 2, NUTS 3 and municipalities as the most disaggregated level. MF, M and F sexes are available. Five-year age groups are available.

Now suppose we want to compare population size across NUTS 3, for all available sexes and age groups. (Note that all NUTS 3 have a 3-character categ_cod).
The get_ine_data() function conveniently allows us to only request the desired data:


# Get the id of the latest time period.
# The lookup is restricted to the time dimension (dimension 1) so that a
# category in another dimension whose label happens to equal the year cannot
# be matched by accident. categ_dsg is a character column, so the year is
# compared as character explicitly.
latest_year_id <- dimension_values %>%
  filter(dim_num == "1", categ_dsg == as.character(most_recent_year)) %>%
  pull(categ_cod)

latest_year_id
#> [1] "S7A2022"

# Get the id of all NUTS 3 regions.
# NUTS 3 regions have a 3-character code. The filter is restricted to the
# place dimension (dimension 2) so that 3-character codes belonging to any
# other dimension are never mistaken for NUTS 3 ids.
nuts3_id <- dimension_values %>%
  filter(dim_num == "2", str_length(categ_cod) == 3) %>%
  pull(categ_cod)

nuts3_id
#>  [1] "111" "112" "119" "11A" "11B" "11C" "11D" "11E" "16B" "16D" "16E" "16F"
#> [13] "16G" "16H" "16I" "16J" "170" "181" "184" "185" "186" "187" "150" "200"
#> [25] "300"

# Request only the selected period (dim1) and the selected places (dim2)
indicator_output <- get_ine_data(
  indicator,
  lang = "EN",
  dim1 = latest_year_id,
  dim2 = nuts3_id
)

Important:
The values passed to get_ine_data() are a subset of the categ_cod values that we obtained with the get_dim_values() function.
In this example we passed a subset of values only to dim1 and dim2, however we can use this approach to any combination of the available dimensions.

Example with a large and cumbersome dataset

The indicator Deaths (No.) by Place of residence (NUTS - 2013), Sex, Age group and Death cause (European short-list); Annual (6), with ID “0008206” is a very large dataset:

# Switch to the deaths indicator and list its dimensions in English
indicator <- "0008206"
indicator %>% get_dim_info(lang = "EN")
#> # A tibble: 5 × 4
#>   dim_num abrv                              versao nota_dsg                     
#>   <chr>   <chr>                             <chr>  <chr>                        
#> 1 1       Data reference period             XXXXX  "International Classificatio…
#> 2 2       Place of residence (NUTS - 2013)  03513  "From January 1st, 2015 came…
#> 3 3       Sex                               00305  "The total number of deaths …
#> 4 4       Age group (decennial)             03633  "In some situations of previ…
#> 5 5       Death cause (European short-list) 00204  "The \"European short-list f…

# Count how many values each dimension has
unique_values <- indicator %>%
  get_dim_values(lang = "EN") %>%
  count(dim_num, name = "unique_values")

print(unique_values)
#> # A tibble: 5 × 2
#>   dim_num unique_values
#>   <chr>           <int>
#> 1 1                  31
#> 2 2                 354
#> 3 3                   3
#> 4 4                  21
#> 5 5                  66

With the limit of 40k records per API call, extracting all values (31 × 354 × 3 × 21 × 66 ≈ 45.6 million possible combinations) would require more than 1,140 API calls in the best-case scenario. This would take many hours and might result in out-of-memory errors or API timeouts. In such situations the recommended approach is to iterate over one of the dimensions (e.g. the time dimension), and store the data obtained at each iteration.