"""Stream type classes for tap-personio."""

from __future__ import annotations

from importlib import resources
from typing import Optional

from tap_personio.client import PersonioStream

SCHEMAS_DIR = resources.files(__package__) / "schemas"


class PersonsStream(PersonioStream):
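    """Person records from the Personio /v2/persons endpoint."""
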
    name = "persons"
    path = "/v2/persons"
    primary_keys = ["id"]
    replication_key = "updated_at"
    is_sorted = False
    schema_filepath = SCHEMAS_DIR / "persons.json"

    def get_child_context(self, record: dict, context: Optional[dict]) -> dict:
        """Return a context dictionary for child streams."""
        return {
            "person_id": record["id"],
        }


class EmploymentsStream(PersonioStream):
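    """Employment records, fetched once per parent person."""
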
    name = "employments"
    path = "/v2/persons/{person_id}/employments"
    primary_keys = ["id"]
    replication_key = "updated_at"
    is_sorted = False
    schema_filepath = SCHEMAS_DIR / "employments.json"

    # EmploymentsStream is invoked once per parent person record.
    parent_stream_type = PersonsStream

    # Assume the parent person's "updated_at" is not incremented when an
    # employment changes, so don't filter children by the parent's
    # replication key.
    ignore_parent_replication_keys = True


# Fetches org units by ID, using the IDs collected from the EmploymentsStream.
# Each request also returns the unit's parent chain, so the full org hierarchy
# is captured even for ancestor units that never appear on an employment.
class OrgUnitStream(PersonioStream):
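    """Org unit records, each including its full parent chain."""
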
    name = "org-units"
    path = "/v2/org-units/{id}"
    schema_filepath = SCHEMAS_DIR / "org-units.json"
    primary_keys = ["id"]

    def get_url_params(self, context, next_page_token):
        """Add the include_parent_chain parameter to all requests."""
        params = super().get_url_params(context, next_page_token)
        params["include_parent_chain"] = "true"
        return params

    def __init__(self, tap):
        super().__init__(tap)
        # Cache for the org unit IDs collected from employment records;
        # populated lazily by _get_required_org_unit_ids().
        self._org_unit_ids = None

    def get_records(self, context):
        """Fetch each required org unit and yield it with its parent chain."""
        initial_org_unit_ids = self._get_required_org_unit_ids()
        processed_ids = set()

        for org_unit_id in initial_org_unit_ids:
            try:
                response_data = self._fetch_org_unit(org_unit_id)

                # Extract and yield all org units from this response
                yield from self._extract_org_units(response_data, processed_ids)

            except Exception as e:
                self.logger.error(f"Failed to fetch org unit {org_unit_id}: {e}")
                continue

    def _extract_org_units(self, response_data, processed_ids):
        """Extract all org units from a single API response."""
        all_org_units = []

        # Add the main org unit (everything except the embedded parent chain)
        main_org_unit = {k: v for k, v in response_data.items() if k != "parent_chain"}
        all_org_units.append(main_org_unit)

        # Add all parent org units
        all_org_units.extend(response_data.get("parent_chain", []))

        # Yield only org units that have not been emitted yet
        for org_unit in all_org_units:
            org_unit_id = org_unit["id"]
            if org_unit_id not in processed_ids:
                processed_ids.add(org_unit_id)
                yield org_unit

    def _fetch_org_unit(self, org_unit_id):
        """Request a single org unit and return the parsed JSON body."""
        context = {"id": org_unit_id}
        # prepare_request builds the full prepared request (URL, params,
        # headers, auth) from the stream config, including include_parent_chain.
        prepared_request = self.prepare_request(context, next_page_token=None)
        response = self.request_decorator(self._request)(prepared_request, context)
        return response.json()
    def _get_required_org_unit_ids(self):
        """Collect the org unit IDs referenced by employment records."""
        # This could read from tap state or a file; here we re-scan the
        # employment data once and cache the result on the instance.
        if self._org_unit_ids is not None:
            return self._org_unit_ids

        persons_stream = PersonsStream(self._tap)
        employment_stream = EmploymentsStream(self._tap)
        org_unit_ids = set()

        # EmploymentsStream is a child stream, so it needs a parent context
        # ({"person_id": ...}) to fill in its URL template.
        for person in persons_stream.get_records(None):
            child_context = persons_stream.get_child_context(person, None)
            for record in employment_stream.get_records(child_context):
                if record.get("org_unit_id"):
                    org_unit_ids.add(record["org_unit_id"])

        self._org_unit_ids = org_unit_ids
        return org_unit_ids


class LegalEntitiesStream(PersonioStream):
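    """Legal entity records from the Personio /v2/legal-entities endpoint."""
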
    name = "legal-entities"
    path = "/v2/legal-entities"
    primary_keys = ["id"]
    replication_key = "updated_at"
    is_sorted = False
    schema_filepath = SCHEMAS_DIR / "legal-entities.json"
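

# For reference, a minimal sketch of how these stream classes would typically
# be registered in the tap. The TapPersonio class name and tap_personio/tap.py
# location are assumptions based on the usual singer_sdk cookiecutter layout:
#
#     def discover_streams(self) -> list[PersonioStream]:
#         """Return a list of discovered streams."""
#         return [
#             PersonsStream(self),
#             EmploymentsStream(self),
#             OrgUnitStream(self),
#             LegalEntitiesStream(self),
#         ]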