# lakehouse-personio/tap-personio/tap_personio/streams.py
"""Stream type classes for tap-personio."""
from __future__ import annotations
import typing as t
from importlib import resources
from singer_sdk import typing as th # JSON Schema typing helpers
from tap_personio.client import PersonioStream
from typing import List, Optional
SCHEMAS_DIR = resources.files(__package__) / "schemas"


class PersonsStream(PersonioStream):
    name = "persons"
    path = "/v2/persons"
    primary_keys = ["id"]
    replication_key = "updated_at"
    is_sorted = False
    schema_filepath = SCHEMAS_DIR / "persons.json"

    def get_child_context(self, record: dict, context: Optional[dict]) -> dict:
        """Return a context dictionary for child streams."""
        return {
            "person_id": record["id"],
        }


class EmploymentsStream(PersonioStream):
    name = "employments"
    path = "/v2/persons/{person_id}/employments"
    primary_keys = ["id"]
    replication_key = "updated_at"
    is_sorted = False
    schema_filepath = SCHEMAS_DIR / "employments.json"
    # EmploymentsStream is invoked once per parent Person record.
    parent_stream_type = PersonsStream
    # A person's "updated_at" may not be incremented when their employments
    # change, so don't skip child syncs based on the parent's replication key.
    ignore_parent_replication_key = True
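
    # Example of how the SDK wires the parent context into the child URL
    # (illustrative IDs only): a parent record {"id": "123"} produces the
    # context {"person_id": "123"} via PersonsStream.get_child_context, and
    # the child path above is formatted to /v2/persons/123/employments.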


# Fetches the org units referenced by employment records, using the IDs
# received from the EmploymentsStream. Also fetches the parent chain for
# each org unit, to ensure we have the full hierarchy.
class OrgUnitStream(PersonioStream):
    name = "org-units"
    path = "/v2/org-units/{id}"
    schema_filepath = SCHEMAS_DIR / "org-units.json"
    primary_keys = ["id"]

    def get_url_params(self, context, next_page_token):
        """Add the include_parent_chain parameter to all requests."""
        params = super().get_url_params(context, next_page_token)
        params["include_parent_chain"] = "true"
        return params
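
    # Resulting request shape (illustrative; base URL omitted):
    #   GET /v2/org-units/{id}?include_parent_chain=true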

    def __init__(self, tap):
        super().__init__(tap)
        # Not used yet; org unit IDs are gathered fresh on each sync below.
        self._org_unit_ids = None

    def get_records(self, context):
        """Fetch each required org unit and yield it plus its parent chain."""
        initial_org_unit_ids = self._get_required_org_unit_ids()
        processed_ids = set()
        for org_unit_id in initial_org_unit_ids:
            try:
                response_data = self._fetch_org_unit(org_unit_id)
                # Extract and yield all org units from this response
                yield from self._extract_org_units(response_data, processed_ids)
            except Exception as e:
                self.logger.error(f"Failed to fetch org unit {org_unit_id}: {e}")
                continue

    def _extract_org_units(self, response_data, processed_ids):
        """Extract all org units from a single API response."""
        all_org_units = []
        # Add the main org unit (everything except the embedded parent chain)
        main_org_unit = {k: v for k, v in response_data.items() if k != "parent_chain"}
        all_org_units.append(main_org_unit)
        # Add all parent org units
        all_org_units.extend(response_data.get("parent_chain", []))
        # Yield only org units not yet seen in this sync
        for org_unit in all_org_units:
            org_unit_id = org_unit["id"]
            if org_unit_id not in processed_ids:
                processed_ids.add(org_unit_id)
                yield org_unit
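
    # Example (illustrative payload only): a response of
    #   {"id": "ou-3", "parent_chain": [{"id": "ou-1"}]}
    # yields the "ou-3" record followed by "ou-1", skipping any ID that was
    # already seen earlier in the sync.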

    def _fetch_org_unit(self, org_unit_id):
        context = {"id": org_unit_id}
        # prepare_request builds the URL and params (incl. include_parent_chain)
        # and attaches auth; request_decorator adds the SDK's retry/backoff.
        prepared_request = self.prepare_request(context, next_page_token=None)
        response = self.request_decorator(self._request)(prepared_request, context)
        return response.json()

    def _get_required_org_unit_ids(self):
        """Collect the org unit IDs referenced by employment records."""
        # This could read from tap state, a file, or re-scan employment data.
        # EmploymentsStream is a child stream, so it needs a parent context
        # (the person_id) before its URL can be built.
        persons_stream = PersonsStream(self._tap)
        employments_stream = EmploymentsStream(self._tap)
        org_unit_ids = set()
        for person in persons_stream.get_records(None):
            child_context = persons_stream.get_child_context(person, None)
            for record in employments_stream.get_records(child_context):
                if record.get("org_unit_id"):
                    org_unit_ids.add(record["org_unit_id"])
        return org_unit_ids


class LegalEntitiesStream(PersonioStream):
    name = "legal-entities"
    path = "/v2/legal-entities"
    primary_keys = ["id"]
    replication_key = "updated_at"
    is_sorted = False
    schema_filepath = SCHEMAS_DIR / "legal-entities.json"
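

# For reference, a minimal sketch of how these streams would typically be
# registered with the tap (hypothetical module/class names; the project's
# actual tap.py may differ):
#
#     from singer_sdk import Tap
#     from tap_personio import streams
#
#     class TapPersonio(Tap):
#         name = "tap-personio"
#
#         def discover_streams(self):
#             return [
#                 streams.PersonsStream(self),
#                 streams.EmploymentsStream(self),
#                 streams.OrgUnitStream(self),
#                 streams.LegalEntitiesStream(self),
#             ]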