From 8df02c97e840cf84f3d52a7df7917bb48931775a Mon Sep 17 00:00:00 2001 From: Jeroen Date: Mon, 14 Jul 2025 20:49:41 +0200 Subject: [PATCH] Removing duplicated job changes - note, this is to cater for a data error which we should resolve --- .../tap_spreadsheets_anywhere/job_change.sql | 16 ++++++++++++---- .../models/tap_spreadsheets_anywhere/worker.sql | 14 +++++++++++++- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/transform/models/tap_spreadsheets_anywhere/job_change.sql b/transform/models/tap_spreadsheets_anywhere/job_change.sql index 7f07fc1..1189a51 100644 --- a/transform/models/tap_spreadsheets_anywhere/job_change.sql +++ b/transform/models/tap_spreadsheets_anywhere/job_change.sql @@ -17,7 +17,15 @@ latest_departments AS ( FROM {{ ref('department') }} ), --- note: Positions ID is not unique, hence removed the deduplication logic +-- note: Positions ID is not unique, hence removed the full deduplication logic +-- however, we had positions with same start date while both having primary position set to true, hence only selecting a random one for now (temp workaround) +deduplicated_positions AS ( + SELECT DISTINCT ON (assigned_employee_id, assigned_employee_effective_date) + * + FROM {{ source('tap_spreadsheets_anywhere', 'positions') }} + WHERE _sdc_deleted_at IS NULL + ORDER BY assigned_employee_id, assigned_employee_effective_date, _sdc_received_at DESC +), transformed_worker AS ( SELECT worker_hris_id, id @@ -33,11 +41,11 @@ position_details AS ( w.contracting_company AS new_contracting_company, d.path::ltree AS new_department_path, d.manager_id::BIGINT AS new_manager_id, - p.assigned_employee_effective_date::DATE AS new_job_effective_date, - p.fte_utilized_by_employee_in_this_position * 100 AS new_fte_percentage, + COALESCE(p.assigned_employee_effective_date, w.original_hire_date)::DATE AS new_job_effective_date, -- Use original hire date if position effective date is not available (this is the case when there is no position record) + COALESCE(p.fte_utilized_by_employee_in_this_position * 100, 100) AS new_fte_percentage, -- Default to 100% if not specified tw.id as worker_id FROM transformed_worker tw - LEFT JOIN {{ source('tap_spreadsheets_anywhere', 'positions') }} p + LEFT JOIN deduplicated_positions p ON p.assigned_employee_id = tw.worker_hris_id LEFT JOIN latest_departments d ON p.assigned_unit_id = d.department_hris_id diff --git a/transform/models/tap_spreadsheets_anywhere/worker.sql b/transform/models/tap_spreadsheets_anywhere/worker.sql index 0050393..aad1e19 100644 --- a/transform/models/tap_spreadsheets_anywhere/worker.sql +++ b/transform/models/tap_spreadsheets_anywhere/worker.sql @@ -9,6 +9,7 @@ WITH latest_workers AS ( SELECT DISTINCT ON (user_id) * FROM {{ source('tap_spreadsheets_anywhere', 'workers') }} WHERE _sdc_deleted_at IS NULL + AND user_id IS NOT NULL and user_id != '' -- Skipping empty user_ids ORDER BY user_id, _sdc_received_at DESC ), @@ -23,6 +24,15 @@ latest_positions AS ( ORDER BY assigned_employee_id, _sdc_received_at DESC, assigned_employee_effective_date DESC ), +-- Get the first position for each worker, to set the hire date +-- This is to fix the "original hire date" column of the All users report containing dates before the first position +first_positions AS ( + SELECT DISTINCT ON (assigned_employee_id) * + FROM {{ source('tap_spreadsheets_anywhere', 'positions') }} + WHERE _sdc_deleted_at IS NULL + ORDER BY assigned_employee_id, _sdc_received_at DESC, assigned_employee_effective_date ASC +), + joined_data AS ( SELECT w.user_id, @@ -41,7 +51,7 @@ joined_data AS ( NULL::VARCHAR AS address_country, NULL::VARCHAR AS phone_number, NULL::VARCHAR AS driver_license, - w.original_hire_date::DATE AS employment_start, + COALESCE(fp.assigned_employee_effective_date, w.original_hire_date)::DATE AS employment_start, w.user_type AS employment_type, w.user_contract_type AS employment_contract_type, w.contracting_company AS employment_contracting_company, @@ -76,6 +86,8 @@ joined_data AS ( FROM latest_workers w LEFT JOIN latest_positions p ON w.user_id = p.assigned_employee_id + LEFT JOIN first_positions fp + ON w.user_id = fp.assigned_employee_id LEFT JOIN {{ ref('department') }} d -- Source = Department from Positions report, only relevant for active workers ON p.assigned_unit_id = d.department_hris_id LEFT JOIN {{ ref('department') }} d2 -- Source = Business Unit from All Users report, only relevant for inactive workers