from pyprojroot import here

Processing StatCan Data
Setup
Parameters
LABOUR_DATA_FILE = here() / "data" / "14100355.csv"

Libraries
import polars as pl
import polars.selectors as cs
from mizani.bounds import squish
from pyprojroot import here
from great_tables import GT, md, html
from plotnine import *
from labourcan.data_processing import read_labourcan

Read data
read_labourcan returns a polars dataframe with columns:
- Unnecessary metadata columns removed
- Filtered to seasonally adjusted estimates only
- Additional `YEAR`, `MONTH`, and `DATE_YMD` columns extracted from `REF_DATE`
- Sorted chronologically by year and month
# Load the StatCan labour file; read_labourcan returns the pre-cleaned,
# seasonally-adjusted-only frame described above (YEAR/MONTH/DATE_YMD added,
# sorted chronologically).
labour = read_labourcan(LABOUR_DATA_FILE)
labour.glimpse()

Rows: 12252
Columns: 10
$ REF_DATE    <str> '1976-01', '1976-01', '1976-01', '1976-01', '1976-01', '1976-01', '1976-01', '1976-01', '1976-01', '1976-01'
$ GEO         <str> 'Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'Canada'
$ Industry    <str> 'Total employed, all industries', 'Goods-producing sector', 'Agriculture [111-112, 1100, 1151-1152]', 'Forestry, fishing, mining, quarrying, oil and gas [21, 113-114, 1153, 2100]', 'Utilities [22]', 'Construction [23]', 'Manufacturing [31-33]', 'Services-producing sector', 'Wholesale and retail trade [41, 44-45]', 'Transportation and warehousing [48-49]'
$ Statistics  <str> 'Estimate', 'Estimate', 'Estimate', 'Estimate', 'Estimate', 'Estimate', 'Estimate', 'Estimate', 'Estimate', 'Estimate'
$ Data type   <str> 'Seasonally adjusted', 'Seasonally adjusted', 'Seasonally adjusted', 'Seasonally adjusted', 'Seasonally adjusted', 'Seasonally adjusted', 'Seasonally adjusted', 'Seasonally adjusted', 'Seasonally adjusted', 'Seasonally adjusted'
$ UOM         <str> 'Persons in thousands', 'Persons in thousands', 'Persons in thousands', 'Persons in thousands', 'Persons in thousands', 'Persons in thousands', 'Persons in thousands', 'Persons in thousands', 'Persons in thousands', 'Persons in thousands'
$ VALUE       <f64> 9636.7, 3312.5, 463.6, 244.2, 110.4, 654.9, 1839.5, 6324.1, 1592.9, 573.2
$ YEAR        <i32> 1976, 1976, 1976, 1976, 1976, 1976, 1976, 1976, 1976, 1976
$ MONTH       <i32> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ DATE_YMD   <date> 1976-01-01, 1976-01-01, 1976-01-01, 1976-01-01, 1976-01-01, 1976-01-01, 1976-01-01, 1976-01-01, 1976-01-01, 1976-01-01
Process data
For our graphic, we need to make this variable: signed centered rank of % change in # of jobs
Signed refers to the rank being computed over negative and positive % changes separately, in opposite directions (descending for negative, ascending for positive).
Meaning the highest negative value (closest to 0) is rank = -1, and the lowest positive value (closest to 0) is rank = 1
For example:
- Input is vector of % change which can be positive or negative: [-0.01, -0.02, 0.01, 0.02]
- Output: [-1, -2, 1, 2]
% Change per month
First, compute % change from previous month. This needs to be done over different subsets of data:
- Industry
- Geolocation
- Labour Force Characteristic (If provided)
- Gender
- Age group
In the seasonally adjusted dataset, only Industry and Geolocation are provided. The LFC is total employment, the Gender is both, and Age group is all.
labour_processed = (
    # sort ascending by time so that shift(1) picks up the previous month
    labour.sort(["Industry", "GEO", "YEAR", "MONTH"])
    .with_columns(
        # previous month's level, computed per Industry x GEO series.
        # GEO is included in the window so that, if the file ever holds more
        # than one geography, lags never bleed across geographies (with a
        # single GEO this is identical to partitioning by Industry alone).
        LAGGED_VALUE=pl.col("VALUE")
        .shift(1)
        .over(["Industry", "GEO"])
    )
    # month-over-month change: absolute difference, then relative to prior month
    .with_columns((pl.col("VALUE") - pl.col("LAGGED_VALUE")).alias("DIFF"))
    .with_columns((pl.col("DIFF") / pl.col("LAGGED_VALUE")).alias("PDIFF"))
    # NOTE(review): GEO is dropped below; fine while the data holds a single
    # geography — confirm before loading multi-geography extracts.
    .select(
        pl.col("Industry"),
        cs.matches("Labour force characteristics"),
        pl.col("DATE_YMD"),
        pl.col("YEAR"),
        pl.col("MONTH"),
        cs.matches("VALUE"),  # VALUE and LAGGED_VALUE
        cs.matches("DIFF"),   # DIFF and PDIFF
    )
    .sort(["Industry", "YEAR", "MONTH", "PDIFF"])
)
labour_processed.glimpse()

Rows: 12252
Columns: 8
$ Industry      <str> 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]'
$ DATE_YMD     <date> 1976-01-01, 1976-02-01, 1976-03-01, 1976-04-01, 1976-05-01, 1976-06-01, 1976-07-01, 1976-08-01, 1976-09-01, 1976-10-01
$ YEAR          <i32> 1976, 1976, 1976, 1976, 1976, 1976, 1976, 1976, 1976, 1976
$ MONTH         <i32> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
$ VALUE         <f64> 391.9, 395.1, 399.7, 399.7, 407.7, 411.6, 417.6, 423.7, 423.5, 428.9
$ LAGGED_VALUE  <f64> None, 391.9, 395.1, 399.7, 399.7, 407.7, 411.6, 417.6, 423.7, 423.5
$ DIFF          <f64> None, 3.2000000000000455, 4.599999999999966, 0.0, 8.0, 3.900000000000034, 6.0, 6.099999999999966, -0.19999999999998863, 5.399999999999977
$ PDIFF         <f64> None, 0.008165348303138672, 0.011642622120981943, 0.0, 0.020015011258443835, 0.009565857247976537, 0.014577259475218658, 0.014607279693486507, -0.0004720320981826496, 0.012750885478158152
Signed Centered Rank
Now we can compute the signed centered rank.
Define a centered_rank_expr function which takes a polars expression and returns an expression, meaning it can be used in a polars with_columns call, which is nice because it can take advantage of polars lazy-evaluation optimization.
Below is the definition and a test-case.
def centered_rank_expr(col: pl.Expr) -> pl.Expr:
    """
    Signed, centered rank of a polars expression.

    - Negative value closest to zero gets rank -1, next -2, ...
    - Positive value closest to zero gets rank +1, next +2, ...
    - Exact zeros get rank 0
    - Nulls stay null (a null comparison falls through to `otherwise`)

    Ties are broken by position (`method="ordinal"`).
    """
    return (
        pl.when(col < 0)
        .then(
            # descending ordinal rank counts down from the largest value;
            # adding the number of NON-NEGATIVE entries (positives AND zeros)
            # lands the negative value closest to zero at exactly -1
            (col.rank(method="ordinal", descending=True) * -1) + (col >= 0).sum()
        )
        .when(col == 0)
        .then(pl.lit(0))
        .when(col > 0)
        .then(
            # ascending ordinal rank minus the number of NON-POSITIVE entries
            # (negatives AND zeros) lands the positive value closest to zero
            # at exactly +1
            col.rank(method="ordinal") - (col <= 0).sum()
        )
        .otherwise(pl.lit(None))
    )
# test it on this subset of data: 10 industries sampled from a single month
# (seed fixed for reproducibility; rounding is for readable output only)
test_series = (
    labour_processed
    .with_columns(pl.col("PDIFF").round(decimals=4))
    .filter(pl.col("YEAR") == 2025, pl.col("MONTH") == 1)
    .select("PDIFF")
    .sample(n=10, seed=1)
)
# Rank the sampled PDIFF values and sort by PDIFF so the sign-symmetric
# ranking (-5..-1 then 1..5) is easy to eyeball in the output below.
test_series.with_columns(centered_rank_expr(pl.col("PDIFF")).alias("rank")).sort(
    "PDIFF"
)

| PDIFF | rank |
|---|---|
| f64 | i64 | 
| -0.0336 | -5 | 
| -0.0207 | -4 | 
| -0.0177 | -3 | 
| -0.0101 | -2 | 
| -0.003 | -1 | 
| 0.0006 | 1 | 
| 0.0109 | 2 | 
| 0.0122 | 3 | 
| 0.0179 | 4 | 
| 0.044 | 5 | 
Looks good, so now we can apply to the data:
# Rank industries within each calendar month by the signed centered rank
# of their month-over-month % change.
labour_processed = labour_processed.with_columns(
    centered_rank_expr(pl.col("PDIFF"))
    .over(["YEAR", "MONTH"])
    .alias("centered_rank_across_industry")
)
labour_processed.glimpse()

Rows: 12252
Columns: 9
$ Industry                       <str> 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]', 'Accommodation and food services [72]'
$ DATE_YMD                      <date> 1976-01-01, 1976-02-01, 1976-03-01, 1976-04-01, 1976-05-01, 1976-06-01, 1976-07-01, 1976-08-01, 1976-09-01, 1976-10-01
$ YEAR                           <i32> 1976, 1976, 1976, 1976, 1976, 1976, 1976, 1976, 1976, 1976
$ MONTH                          <i32> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
$ VALUE                          <f64> 391.9, 395.1, 399.7, 399.7, 407.7, 411.6, 417.6, 423.7, 423.5, 428.9
$ LAGGED_VALUE                   <f64> None, 391.9, 395.1, 399.7, 399.7, 407.7, 411.6, 417.6, 423.7, 423.5
$ DIFF                           <f64> None, 3.2000000000000455, 4.599999999999966, 0.0, 8.0, 3.900000000000034, 6.0, 6.099999999999966, -0.19999999999998863, 5.399999999999977
$ PDIFF                          <f64> None, 0.008165348303138672, 0.011642622120981943, 0.0, 0.020015011258443835, 0.009565857247976537, 0.014577259475218658, 0.014607279693486507, -0.0004720320981826496, 0.012750885478158152
$ centered_rank_across_industry  <i64> None, 8, 11, 0, 8, 8, 9, 8, -1, 5
Check output visually for 1 year 1 month
# Visual spot-check over all 2025 months: within each month, sorted PDIFF
# should read as consecutive ranks ... -2, -1, (0,) 1, 2 ...
# check 1 year 1 month
(
    labour_processed
    # rounding is display-only; ranks were computed on the unrounded PDIFF
    .with_columns(pl.col("PDIFF").round(decimals=4))
    .filter(pl.col("YEAR") == 2025)
    .sort(["YEAR", "MONTH", "PDIFF"])
    .select(["YEAR", "MONTH", "Industry", "VALUE", "DIFF", "PDIFF", cs.matches("rank")])
)

| YEAR | MONTH | Industry | VALUE | DIFF | PDIFF | centered_rank_across_industry |
|---|---|---|---|---|---|---|
| i32 | i32 | str | f64 | f64 | f64 | i64 | 
| 2025 | 1 | "Wholesale trade [41]" | 689.4 | -24.0 | -0.0336 | -8 | 
| 2025 | 1 | "Utilities [22]" | 155.8 | -3.3 | -0.0207 | -7 | 
| 2025 | 1 | "Other services (except public … | 771.0 | -13.9 | -0.0177 | -6 | 
| 2025 | 1 | "Forestry, fishing, mining, qua… | 337.1 | -5.6 | -0.0163 | -5 | 
| 2025 | 1 | "Business, building and other s… | 726.4 | -7.4 | -0.0101 | -4 | 
| … | … | … | … | … | … | … | 
| 2025 | 8 | "Accommodation and food service… | 1177.1 | 9.2 | 0.0079 | 5 | 
| 2025 | 8 | "Construction [23]" | 1636.3 | 17.1 | 0.0106 | 6 | 
| 2025 | 8 | "Agriculture [111-112, 1100, 11… | 217.7 | 4.8 | 0.0225 | 7 | 
| 2025 | 8 | "Utilities [22]" | 163.5 | 4.7 | 0.0296 | 8 | 
| 2025 | 8 | "Wholesale trade [41]" | 731.1 | 27.6 | 0.0392 | 9 |