# lakehouse-personio/tap-personio/tap_personio/streams.py
"""Stream type classes for tap-personio."""
from __future__ import annotations
import typing as t
from importlib import resources
from singer_sdk import typing as th # JSON Schema typing helpers
from tap_personio.client import PersonioStream
from typing import List, Optional
SCHEMAS_DIR = resources.files(__package__) / "schemas"


class PersonsStream(PersonioStream):
    name = "persons"
    path = "/v2/persons"
    primary_keys = ["id"]
    replication_key = "updated_at"
    is_sorted = False
    schema_filepath = SCHEMAS_DIR / "persons.json"

    def get_child_context(self, record: dict, context: Optional[dict]) -> dict:
        """Return a context dictionary for child streams."""
        return {
            "person_id": record["id"],
        }


class EmploymentsStream(PersonioStream):
    name = "employments"
    path = "/v2/persons/{person_id}/employments"
    primary_keys = ["id"]
    replication_key = "updated_at"
    is_sorted = False
    schema_filepath = SCHEMAS_DIR / "employments.json"
    # EmploymentsStream is invoked once per parent Person record.
    parent_stream_type = PersonsStream
    # A person's "updated_at" may not be incremented when their employments
    # change, so don't skip child syncs based on the parent's replication key.
    ignore_parent_replication_key = True
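
    # Example of how the SDK wires the parent context into the child URL
    # (illustrative IDs only): a parent record {"id": "123"} produces the
    # context {"person_id": "123"} via PersonsStream.get_child_context, and
    # the child path above is formatted to /v2/persons/123/employments.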


# Fetches the org units referenced by employment records, using the IDs
# received from the EmploymentsStream. Also fetches the parent chain for
# each org unit, to ensure we have the full hierarchy.
class OrgUnitStream(PersonioStream):
    name = "org-units"
    path = "/v2/org-units/{id}"
    schema_filepath = SCHEMAS_DIR / "org-units.json"
    primary_keys = ["id"]

    def get_url_params(self, context, next_page_token):
        """Add the include_parent_chain parameter to all requests."""
        params = super().get_url_params(context, next_page_token)
        params["include_parent_chain"] = "true"
        return params
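
    # Resulting request shape (illustrative; base URL omitted):
    #   GET /v2/org-units/{id}?include_parent_chain=true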

    def __init__(self, tap):
        super().__init__(tap)
        # Not used yet; org unit IDs are gathered fresh on each sync below.
        self._org_unit_ids = None

    def get_records(self, context):
        """Fetch each required org unit and yield it plus its parent chain."""
        initial_org_unit_ids = self._get_required_org_unit_ids()
        processed_ids = set()
        for org_unit_id in initial_org_unit_ids:
            try:
                response_data = self._fetch_org_unit(org_unit_id)
                # Extract and yield all org units from this response
                yield from self._extract_org_units(response_data, processed_ids)
            except Exception as e:
                self.logger.error(f"Failed to fetch org unit {org_unit_id}: {e}")
                continue

    def _extract_org_units(self, response_data, processed_ids):
        """Extract all org units from a single API response."""
        all_org_units = []
        # Add the main org unit (everything except the embedded parent chain)
        main_org_unit = {k: v for k, v in response_data.items() if k != "parent_chain"}
        all_org_units.append(main_org_unit)
        # Add all parent org units
        all_org_units.extend(response_data.get("parent_chain", []))
        # Yield only org units not yet seen in this sync
        for org_unit in all_org_units:
            org_unit_id = org_unit["id"]
            if org_unit_id not in processed_ids:
                processed_ids.add(org_unit_id)
                yield org_unit
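
    # Example (illustrative payload only): a response of
    #   {"id": "ou-3", "parent_chain": [{"id": "ou-1"}]}
    # yields the "ou-3" record followed by "ou-1", skipping any ID that was
    # already seen earlier in the sync.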

    def _fetch_org_unit(self, org_unit_id):
        context = {"id": org_unit_id}
        # prepare_request builds the URL and params (incl. include_parent_chain)
        # and attaches auth; request_decorator adds the SDK's retry/backoff.
        prepared_request = self.prepare_request(context, next_page_token=None)
        response = self.request_decorator(self._request)(prepared_request, context)
        return response.json()

    def _get_required_org_unit_ids(self):
        """Collect the org unit IDs referenced by employment records."""
        # This could read from tap state, a file, or re-scan employment data.
        # EmploymentsStream is a child stream, so it needs a parent context
        # (the person_id) before its URL can be built.
        persons_stream = PersonsStream(self._tap)
        employments_stream = EmploymentsStream(self._tap)
        org_unit_ids = set()
        for person in persons_stream.get_records(None):
            child_context = persons_stream.get_child_context(person, None)
            for record in employments_stream.get_records(child_context):
                if record.get("org_unit_id"):
                    org_unit_ids.add(record["org_unit_id"])
        return org_unit_ids


class LegalEntitiesStream(PersonioStream):
    name = "legal-entities"
    path = "/v2/legal-entities"
    primary_keys = ["id"]
    replication_key = "updated_at"
    is_sorted = False
    schema_filepath = SCHEMAS_DIR / "legal-entities.json"
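

# For reference, a minimal sketch of how these streams would typically be
# registered with the tap (hypothetical module/class names; the project's
# actual tap.py may differ):
#
#     from singer_sdk import Tap
#     from tap_personio import streams
#
#     class TapPersonio(Tap):
#         name = "tap-personio"
#
#         def discover_streams(self):
#             return [
#                 streams.PersonsStream(self),
#                 streams.EmploymentsStream(self),
#                 streams.OrgUnitStream(self),
#                 streams.LegalEntitiesStream(self),
#             ]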