Processing StatCan Data
Setup
Parameters
from pyprojroot import here

LABOUR_DATA_FILE = here() / "data" / "14100355.csv"
Libraries
import polars as pl
import polars.selectors as cs
from mizani.bounds import squish
from pyprojroot import here
from great_tables import GT, md, html
from plotnine import *
from labourcan.data_processing import read_labourcan
Read data
read_labourcan returns a polars dataframe with columns:
- Unnecessary metadata columns removed
- Filtered to seasonally adjusted estimates only
- Additional `YEAR`, `MONTH`, and `DATE_YMD` columns extracted from `REF_DATE`
- Sorted chronologically by year and month
# Load the pre-processed StatCan labour dataframe (see the read_labourcan
# contract described above: metadata dropped, seasonally adjusted only,
# YEAR/MONTH/DATE_YMD added, sorted chronologically).
labour = read_labourcan(LABOUR_DATA_FILE)
labour.glimpse()
Rows: 12252
Columns: 10
$ REF_DATE <str> '1976-01', '1976-01', '1976-01', '1976-01', '1976-01', '1976-01', '1976-01', '1976-01', '1976-01', '1976-01'
$ GEO <str> 'Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'Canada'
$ Industry <str> 'Total employed, all industries', 'Goods-producing sector', 'Agriculture [111-112, 1100, 1151-1152]', 'Forestry, fishing, mining, quarrying, oil and gas [21, 113-114, 1153, 2100]', 'Utilities [22]', 'Construction [23]', 'Manufacturing [31-33]', 'Services-producing sector', 'Wholesale and retail trade [41, 44-45]', 'Transportation and warehousing [48-49]'
$ Statistics <str> 'Estimate', 'Estimate', 'Estimate', 'Estimate', 'Estimate', 'Estimate', 'Estimate', 'Estimate', 'Estimate', 'Estimate'
$ Data type <str> 'Seasonally adjusted', 'Seasonally adjusted', 'Seasonally adjusted', 'Seasonally adjusted', 'Seasonally adjusted', 'Seasonally adjusted', 'Seasonally adjusted', 'Seasonally adjusted', 'Seasonally adjusted', 'Seasonally adjusted'
$ UOM <str> 'Persons in thousands', 'Persons in thousands', 'Persons in thousands', 'Persons in thousands', 'Persons in thousands', 'Persons in thousands', 'Persons in thousands', 'Persons in thousands', 'Persons in thousands', 'Persons in thousands'
$ VALUE <f64> 9636.7, 3312.5, 463.6, 244.2, 110.4, 654.9, 1839.5, 6324.1, 1592.9, 573.2
$ YEAR <i32> 1976, 1976, 1976, 1976, 1976, 1976, 1976, 1976, 1976, 1976
$ MONTH <i32> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ DATE_YMD <date> 1976-01-01, 1976-01-01, 1976-01-01, 1976-01-01, 1976-01-01, 1976-01-01, 1976-01-01, 1976-01-01, 1976-01-01, 1976-01-01
Process data
For our graphic, we need to make this variable: signed centered rank of the % change in # of jobs.
"Signed" means the rank is effectively computed over negative and positive % changes separately, and in opposite directions (descending for negative, ascending for positive).
This means the highest negative value (closest to 0) gets rank = -1, and the lowest positive value (closest to 0) gets rank = 1.
For example:
- Input is vector of % change which can be positive or negative:
[-0.01, -0.02, 0.01, 0.02] - Output:
[-1, -2, 1, 2]
% Change per month
First, compute % change from previous month. This needs to be done over different subsets of data:
- Industry
- Geolocation
- Labour Force Characteristic (If provided)
- Gender
- Age group
In the seasonally adjusted dataset, only Industry and Geolocation are provided. The LFC is total employment, the Gender is both, and Age group is all.
# Month-over-month change per series. Per the notes above, % change must be
# computed within each (Geolocation, Industry) subset; the seasonally
# adjusted file is Canada-only, but including GEO in the grouping keeps the
# lag correct if other geographies ever appear in the input.
labour_processed = (
    # Sorting ascending in time makes shift(1) the previous month's value.
    labour.sort(["GEO", "Industry", "YEAR", "MONTH"])
    .with_columns(
        LAGGED_VALUE=pl.col("VALUE").shift(1).over(["GEO", "Industry"])
    )
    # Absolute and percent difference from the previous month.
    # First month of each series has a null lag, so DIFF/PDIFF are null there.
    .with_columns((pl.col("VALUE") - pl.col("LAGGED_VALUE")).alias("DIFF"))
    .with_columns((pl.col("DIFF") / pl.col("LAGGED_VALUE")).alias("PDIFF"))
    .select(
        pl.col("Industry"),
        # keep LFC only when the input provides it (absent in this dataset)
        cs.matches("Labour force characteristics"),
        pl.col("DATE_YMD"),
        pl.col("YEAR"),
        pl.col("MONTH"),
        cs.matches("VALUE"),  # VALUE and LAGGED_VALUE
        cs.matches("DIFF"),   # DIFF and PDIFF
    )
    .sort(["Industry", "YEAR", "MONTH", "PDIFF"])
)
labour_processed.glimpse()
Rows: 12252
Columns: 8
$ Industry <str> 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]'
$ DATE_YMD <date> 1976-01-01, 1976-02-01, 1976-03-01, 1976-04-01, 1976-05-01, 1976-06-01, 1976-07-01, 1976-08-01, 1976-09-01, 1976-10-01
$ YEAR <i32> 1976, 1976, 1976, 1976, 1976, 1976, 1976, 1976, 1976, 1976
$ MONTH <i32> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
$ VALUE <f64> 391.9, 395.1, 399.7, 399.7, 407.7, 411.6, 417.6, 423.7, 423.5, 428.9
$ LAGGED_VALUE <f64> None, 391.9, 395.1, 399.7, 399.7, 407.7, 411.6, 417.6, 423.7, 423.5
$ DIFF <f64> None, 3.2000000000000455, 4.599999999999966, 0.0, 8.0, 3.900000000000034, 6.0, 6.099999999999966, -0.19999999999998863, 5.399999999999977
$ PDIFF <f64> None, 0.008165348303138672, 0.011642622120981943, 0.0, 0.020015011258443835, 0.009565857247976537, 0.014577259475218658, 0.014607279693486507, -0.0004720320981826496, 0.012750885478158152
Signed Centered Rank
Now we can compute the signed centered rank.
Define centered_rank_expr function which takes a polars series and returns an expression, meaning it can be used in a polars with_columns call, which is nice because it can take advantage of polars lazy-evaluation optimization.
Below is the definition and a test-case.
def centered_rank_expr(col):
    """
    Signed centered rank of a polars expression *col*.

    - Negative values are ranked descending: the negative value closest to
      zero gets rank -1, the next -2, and so on.
    - Positive values are ranked ascending: the positive value closest to
      zero gets rank +1, the next +2, and so on.
    - Zero gets rank 0; nulls stay null.

    Returns an expression, so it composes inside `with_columns` / `over`
    and benefits from polars' lazy-evaluation optimization.
    """
    return (
        pl.when(col < 0)
        .then(
            # Descending ordinal rank of a negative value counts every value
            # >= it — all positives AND all zeros — so the offset must be
            # (col >= 0).sum(). Using (col > 0).sum() shifted every negative
            # rank by the number of zeros in the window.
            (col.rank(method="ordinal", descending=True) * -1) + (col >= 0).sum()
        )
        .when(col == 0)
        .then(pl.lit(0))
        .when(col > 0)
        # Symmetrically, ascending rank counts all negatives AND zeros below,
        # so subtract (col <= 0).sum(), not just the negative count.
        .then(col.rank(method="ordinal") - (col <= 0).sum())
        # col is null: propagate null rather than inventing a rank
        .otherwise(pl.lit(None))
    )
# test it on this subset of data
# Test it on a small, reproducible subset: the rounded % changes for one
# month, sampled with a fixed seed. (The original had a redundant second
# select of "PDIFF" after the sample, and a dead commented-out filter.)
test_series = (
    labour_processed
    .with_columns(pl.col("PDIFF").round(decimals=4))
    .filter(pl.col("YEAR") == 2025, pl.col("MONTH") == 1)
    .select(pl.col("PDIFF"))
    .sample(n=10, seed=1)
)
test_series.with_columns(centered_rank_expr(pl.col("PDIFF")).alias("rank")).sort(
    "PDIFF"
)
| PDIFF | rank |
|---|---|
| f64 | i64 |
| -0.0336 | -5 |
| -0.0207 | -4 |
| -0.0177 | -3 |
| -0.0101 | -2 |
| -0.003 | -1 |
| 0.0006 | 1 |
| 0.0109 | 2 |
| 0.0122 | 3 |
| 0.0179 | 4 |
| 0.044 | 5 |
Looks good, so now we can apply to the data:
# Apply the signed centered rank to the full dataset: rank each month's
# percent changes across industries, windowed per (YEAR, MONTH).
labour_processed = labour_processed.with_columns(
    centered_rank_expr(pl.col("PDIFF"))
    .over(["YEAR", "MONTH"])
    .alias("centered_rank_across_industry")
)
labour_processed.glimpse()
Rows: 12252
Columns: 9
$ Industry <str> 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]'
$ DATE_YMD <date> 1976-01-01, 1976-02-01, 1976-03-01, 1976-04-01, 1976-05-01, 1976-06-01, 1976-07-01, 1976-08-01, 1976-09-01, 1976-10-01
$ YEAR <i32> 1976, 1976, 1976, 1976, 1976, 1976, 1976, 1976, 1976, 1976
$ MONTH <i32> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
$ VALUE <f64> 391.9, 395.1, 399.7, 399.7, 407.7, 411.6, 417.6, 423.7, 423.5, 428.9
$ LAGGED_VALUE <f64> None, 391.9, 395.1, 399.7, 399.7, 407.7, 411.6, 417.6, 423.7, 423.5
$ DIFF <f64> None, 3.2000000000000455, 4.599999999999966, 0.0, 8.0, 3.900000000000034, 6.0, 6.099999999999966, -0.19999999999998863, 5.399999999999977
$ PDIFF <f64> None, 0.008165348303138672, 0.011642622120981943, 0.0, 0.020015011258443835, 0.009565857247976537, 0.014577259475218658, 0.014607279693486507, -0.0004720320981826496, 0.012750885478158152
$ centered_rank_across_industry <i64> None, 8, 11, 0, 8, 8, 9, 8, -1, 5
Check output visually for 1 year 1 month
# Visual sanity check: all months of 2025, sorted within each month by
# PDIFF, so ranks should read -k..-1 then 1..k down each month's rows.
# check 1 year 1 month
(
labour_processed
.with_columns(pl.col("PDIFF").round(decimals=4))
.filter(pl.col("YEAR") == 2025)
.sort(["YEAR", "MONTH", "PDIFF"])
.select(["YEAR", "MONTH", "Industry", "VALUE", "DIFF", "PDIFF", cs.matches("rank")])
)
| YEAR | MONTH | Industry | VALUE | DIFF | PDIFF | centered_rank_across_industry |
|---|---|---|---|---|---|---|
| i32 | i32 | str | f64 | f64 | f64 | i64 |
| 2025 | 1 | "Wholesale trade [41]" | 689.4 | -24.0 | -0.0336 | -8 |
| 2025 | 1 | "Utilities [22]" | 155.8 | -3.3 | -0.0207 | -7 |
| 2025 | 1 | "Other services (except public … | 771.0 | -13.9 | -0.0177 | -6 |
| 2025 | 1 | "Forestry, fishing, mining, qua… | 337.1 | -5.6 | -0.0163 | -5 |
| 2025 | 1 | "Business, building and other s… | 726.4 | -7.4 | -0.0101 | -4 |
| … | … | … | … | … | … | … |
| 2025 | 8 | "Accommodation and food service… | 1177.1 | 9.2 | 0.0079 | 5 |
| 2025 | 8 | "Construction [23]" | 1636.3 | 17.1 | 0.0106 | 6 |
| 2025 | 8 | "Agriculture [111-112, 1100, 11… | 217.7 | 4.8 | 0.0225 | 7 |
| 2025 | 8 | "Utilities [22]" | 163.5 | 4.7 | 0.0296 | 8 |
| 2025 | 8 | "Wholesale trade [41]" | 731.1 | 27.6 | 0.0392 | 9 |