Processing StatCan Data

Setup

Parameters

```python
from pyprojroot import here

LABOUR_DATA_FILE = here() / "data" / "14100355.csv"
```
Libraries
```python
import polars as pl
import polars.selectors as cs
from mizani.bounds import squish
from pyprojroot import here
from great_tables import GT, md, html
from plotnine import *

from labourcan.data_processing import read_labourcan
```
Read data

`read_labourcan` returns a polars DataFrame with:

- Unnecessary metadata columns removed
- Filtered to seasonally adjusted estimates only
- Additional `YEAR`, `MONTH`, and `DATE_YMD` columns extracted from `REF_DATE`
- Rows sorted chronologically by year and month
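The real implementation lives in `labourcan.data_processing`; purely as orientation, a hypothetical sketch of the steps those bullets describe might look like this (the exact set of metadata columns dropped is an assumption based on typical StatCan table layouts):

```python
import polars as pl
import polars.selectors as cs


def read_labourcan_sketch(path):
    # illustrative only -- not the actual labourcan implementation
    return (
        pl.read_csv(path)
        # drop typical StatCan metadata columns (assumed set)
        .drop(cs.matches("DGUID|SCALAR|VECTOR|COORDINATE|STATUS|SYMBOL|TERMINATED|DECIMALS"))
        # keep seasonally adjusted estimates only
        .filter(pl.col("Data type") == "Seasonally adjusted")
        # REF_DATE looks like "1976-01"; derive YEAR, MONTH, and DATE_YMD from it
        .with_columns(
            YEAR=pl.col("REF_DATE").str.slice(0, 4).cast(pl.Int32),
            MONTH=pl.col("REF_DATE").str.slice(5, 2).cast(pl.Int32),
            DATE_YMD=(pl.col("REF_DATE") + "-01").str.to_date("%Y-%m-%d"),
        )
        .sort(["YEAR", "MONTH"])
    )
```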
```python
labour = read_labourcan(LABOUR_DATA_FILE)
labour.glimpse()
```
```
Rows: 12252
Columns: 10
$ REF_DATE <str> '1976-01', '1976-01', '1976-01', '1976-01', '1976-01', '1976-01', '1976-01', '1976-01', '1976-01', '1976-01'
$ GEO <str> 'Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'Canada'
$ Industry <str> 'Total employed, all industries', 'Goods-producing sector', 'Agriculture [111-112, 1100, 1151-1152]', 'Forestry, fishing, mining, quarrying, oil and gas [21, 113-114, 1153, 2100]', 'Utilities [22]', 'Construction [23]', 'Manufacturing [31-33]', 'Services-producing sector', 'Wholesale and retail trade [41, 44-45]', 'Transportation and warehousing [48-49]'
$ Statistics <str> 'Estimate', 'Estimate', 'Estimate', 'Estimate', 'Estimate', 'Estimate', 'Estimate', 'Estimate', 'Estimate', 'Estimate'
$ Data type <str> 'Seasonally adjusted', 'Seasonally adjusted', 'Seasonally adjusted', 'Seasonally adjusted', 'Seasonally adjusted', 'Seasonally adjusted', 'Seasonally adjusted', 'Seasonally adjusted', 'Seasonally adjusted', 'Seasonally adjusted'
$ UOM <str> 'Persons in thousands', 'Persons in thousands', 'Persons in thousands', 'Persons in thousands', 'Persons in thousands', 'Persons in thousands', 'Persons in thousands', 'Persons in thousands', 'Persons in thousands', 'Persons in thousands'
$ VALUE <f64> 9636.7, 3312.5, 463.6, 244.2, 110.4, 654.9, 1839.5, 6324.1, 1592.9, 573.2
$ YEAR <i32> 1976, 1976, 1976, 1976, 1976, 1976, 1976, 1976, 1976, 1976
$ MONTH <i32> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ DATE_YMD <date> 1976-01-01, 1976-01-01, 1976-01-01, 1976-01-01, 1976-01-01, 1976-01-01, 1976-01-01, 1976-01-01, 1976-01-01, 1976-01-01
```
Process data

For our graphic, we need to make this variable: the signed centered rank of the % change in the number of jobs.

"Signed" refers to the rank being computed over negative and positive % changes separately, and in opposite directions (descending for negative values, ascending for positive values). This means the highest negative value (closest to 0) gets rank -1, and the lowest positive value (closest to 0) gets rank +1.
For example:

- Input is a vector of % changes, which can be positive or negative: [-0.01, -0.02, 0.01, 0.02]
- Output: [-1, -2, 1, 2]
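To make the convention concrete before the polars implementation later in this section, here is a minimal pure-Python sketch (a hypothetical helper, not part of labourcan; it ignores ties and nulls):

```python
def signed_centered_rank(values):
    # rank negatives and positives separately, starting from the values
    # closest to zero: closest negative -> -1, closest positive -> +1
    negatives = sorted((v for v in values if v < 0), reverse=True)
    positives = sorted(v for v in values if v > 0)
    ranks = {v: -(i + 1) for i, v in enumerate(negatives)}
    ranks |= {v: i + 1 for i, v in enumerate(positives)}
    return [ranks.get(v, 0) for v in values]  # zeros get rank 0


print(signed_centered_rank([-0.01, -0.02, 0.01, 0.02]))  # [-1, -2, 1, 2]
```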
% Change per month

First, compute the % change from the previous month. This needs to be done over different subsets of the data:

- Industry
- Geolocation
- Labour Force Characteristic (if provided)
- Gender
- Age group

In the seasonally adjusted dataset, only Industry and Geolocation are provided; the Labour Force Characteristic is fixed at total employment, Gender at both, and Age group at all.
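Before running this on the real data, here is a toy illustration (made-up numbers, not the StatCan data) of how `shift(1).over()` computes the lag within each group:

```python
import polars as pl

toy = pl.DataFrame(
    {
        "Industry": ["A", "A", "A", "B", "B", "B"],
        "MONTH": [1, 2, 3, 1, 2, 3],
        "VALUE": [100.0, 110.0, 99.0, 50.0, 50.0, 55.0],
    }
)
# each industry's first month has a null lag, so its % change is null
print(
    toy.with_columns(LAGGED_VALUE=pl.col("VALUE").shift(1).over("Industry")).with_columns(
        PDIFF=(pl.col("VALUE") - pl.col("LAGGED_VALUE")) / pl.col("LAGGED_VALUE")
    )
)
```

The real pipeline below does the same thing within each Industry of the labour data.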
```python
labour_processed = (
    # if we sort ascending by time, then the lag value is the month before
    labour.sort(["Industry", "YEAR", "MONTH"])
    .with_columns(
        LAGGED_VALUE=pl.col("VALUE")
        .shift(1)
        .over(["Industry"])
    )
    # compute percent difference
    .with_columns((pl.col("VALUE") - pl.col("LAGGED_VALUE")).alias("DIFF"))
    .with_columns((pl.col("DIFF") / pl.col("LAGGED_VALUE")).alias("PDIFF"))
    .select(
        pl.col("Industry"),
        cs.matches("Labour force characteristics"),
        pl.col("DATE_YMD"),
        pl.col("YEAR"),
        pl.col("MONTH"),
        cs.matches("VALUE"),
        cs.matches("DIFF"),
    )
    .sort(["Industry", "YEAR", "MONTH", "PDIFF"])
)
labour_processed.glimpse()
```
```
Rows: 12252
Columns: 8
$ Industry <str> 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]'
$ DATE_YMD <date> 1976-01-01, 1976-02-01, 1976-03-01, 1976-04-01, 1976-05-01, 1976-06-01, 1976-07-01, 1976-08-01, 1976-09-01, 1976-10-01
$ YEAR <i32> 1976, 1976, 1976, 1976, 1976, 1976, 1976, 1976, 1976, 1976
$ MONTH <i32> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
$ VALUE <f64> 391.9, 395.1, 399.7, 399.7, 407.7, 411.6, 417.6, 423.7, 423.5, 428.9
$ LAGGED_VALUE <f64> None, 391.9, 395.1, 399.7, 399.7, 407.7, 411.6, 417.6, 423.7, 423.5
$ DIFF <f64> None, 3.2000000000000455, 4.599999999999966, 0.0, 8.0, 3.900000000000034, 6.0, 6.099999999999966, -0.19999999999998863, 5.399999999999977
$ PDIFF <f64> None, 0.008165348303138672, 0.011642622120981943, 0.0, 0.020015011258443835, 0.009565857247976537, 0.014577259475218658, 0.014607279693486507, -0.0004720320981826496, 0.012750885478158152
```
Signed Centered Rank

Now we can compute the signed centered rank.

Define a `centered_rank_expr` function which takes a polars column expression and returns an expression, meaning it can be used in a polars `with_columns` call, which is nice because it can take advantage of polars' lazy-evaluation optimizations.

Below is the definition and a test case.
```python
def centered_rank_expr(col):
    """
    - Largest negative value gets rank -1
    - Smallest positive value gets rank +1
    - Zero gets rank 0
    """
    return (
        pl.when(col < 0)
        .then(
            # negate the descending rank, then shift up by the total # of +ve
            # values, so the negative value closest to 0 lands at -1
            (col.rank(method="ordinal", descending=True) * -1) + (col > 0).sum()
        )
        .when(col == 0)
        .then(pl.lit(0))
        .when(col > 0)
        # ascending rank, shifted down by the total # of -ve values
        .then(col.rank(method="ordinal") - (col < 0).sum())
        .otherwise(pl.lit(None))
    )
```
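Because it returns an expression, `centered_rank_expr` also drops into the lazy API unchanged; a small illustration, assuming the `labour_processed` frame built above:

```python
ranked = (
    labour_processed.lazy()
    .with_columns(rank=centered_rank_expr(pl.col("PDIFF")).over(["YEAR", "MONTH"]))
    .collect()
)
```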
```python
# test it on this subset of data
test_series = (
    labour_processed  # .filter(pl.col("Labour force characteristics") == "Employment")
    .with_columns(pl.col("PDIFF").round(decimals=4))
    .filter(pl.col("YEAR") == 2025, pl.col("MONTH") == 1)
    .select(pl.col("PDIFF"))
    .sample(n=10, seed=1)
    .select("PDIFF")
)

test_series.with_columns(centered_rank_expr(pl.col("PDIFF")).alias("rank")).sort(
    "PDIFF"
)
```
| PDIFF   | rank |
|---------|------|
| f64     | i64  |
| -0.0336 | -5   |
| -0.0207 | -4   |
| -0.0177 | -3   |
| -0.0101 | -2   |
| -0.003  | -1   |
| 0.0006  | 1    |
| 0.0109  | 2    |
| 0.0122  | 3    |
| 0.0179  | 4    |
| 0.044   | 5    |
Looks good, so now we can apply it to the data:

```python
labour_processed = labour_processed.with_columns(
    centered_rank_across_industry=centered_rank_expr(pl.col("PDIFF")).over(
        ["YEAR", "MONTH"]
    )
)
labour_processed.glimpse()
```
```
Rows: 12252
Columns: 9
$ Industry <str> 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]'
$ DATE_YMD <date> 1976-01-01, 1976-02-01, 1976-03-01, 1976-04-01, 1976-05-01, 1976-06-01, 1976-07-01, 1976-08-01, 1976-09-01, 1976-10-01
$ YEAR <i32> 1976, 1976, 1976, 1976, 1976, 1976, 1976, 1976, 1976, 1976
$ MONTH <i32> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
$ VALUE <f64> 391.9, 395.1, 399.7, 399.7, 407.7, 411.6, 417.6, 423.7, 423.5, 428.9
$ LAGGED_VALUE <f64> None, 391.9, 395.1, 399.7, 399.7, 407.7, 411.6, 417.6, 423.7, 423.5
$ DIFF <f64> None, 3.2000000000000455, 4.599999999999966, 0.0, 8.0, 3.900000000000034, 6.0, 6.099999999999966, -0.19999999999998863, 5.399999999999977
$ PDIFF <f64> None, 0.008165348303138672, 0.011642622120981943, 0.0, 0.020015011258443835, 0.009565857247976537, 0.014577259475218658, 0.014607279693486507, -0.0004720320981826496, 0.012750885478158152
$ centered_rank_across_industry <i64> None, 8, 11, 0, 8, 8, 9, 8, -1, 5
```
Check the output visually for one year, month by month:

```python
# check one year, month by month
(
    labour_processed.with_columns(pl.col("PDIFF").round(decimals=4))
    .filter(pl.col("YEAR") == 2025)
    .sort(["YEAR", "MONTH", "PDIFF"])
    .select(["YEAR", "MONTH", "Industry", "VALUE", "DIFF", "PDIFF", cs.matches("rank")])
)
```
| YEAR | MONTH | Industry | VALUE | DIFF | PDIFF | centered_rank_across_industry |
|---|---|---|---|---|---|---|
| i32 | i32 | str | f64 | f64 | f64 | i64 |
| 2025 | 1 | "Wholesale trade [41]" | 689.4 | -24.0 | -0.0336 | -8 |
| 2025 | 1 | "Utilities [22]" | 155.8 | -3.3 | -0.0207 | -7 |
| 2025 | 1 | "Other services (except public … | 771.0 | -13.9 | -0.0177 | -6 |
| 2025 | 1 | "Forestry, fishing, mining, qua… | 337.1 | -5.6 | -0.0163 | -5 |
| 2025 | 1 | "Business, building and other s… | 726.4 | -7.4 | -0.0101 | -4 |
| … | … | … | … | … | … | … |
| 2025 | 8 | "Accommodation and food service… | 1177.1 | 9.2 | 0.0079 | 5 |
| 2025 | 8 | "Construction [23]" | 1636.3 | 17.1 | 0.0106 | 6 |
| 2025 | 8 | "Agriculture [111-112, 1100, 11… | 217.7 | 4.8 | 0.0225 | 7 |
| 2025 | 8 | "Utilities [22]" | 163.5 | 4.7 | 0.0296 | 8 |
| 2025 | 8 | "Wholesale trade [41]" | 731.1 | 27.6 | 0.0392 | 9 |