Add Department extract, load and transform logic (lacking manager and cost center logic)
2025-06-12 15:27:53 +02:00
parent 0738d16c2d
commit 33b22855a6
10 changed files with 655 additions and 0 deletions

@@ -3,5 +3,53 @@ default_environment: dev
project_id: 65319107-1c78-4167-b3db-9d61a3ba981e
environments:
- name: dev
config:
plugins:
extractors:
- name: tap-spreadsheets-anywhere
config:
tables:
- path: file://C:/Users/vdsje/OneDrive/LakeHouse/Sarens/Data/2025-06-11/
name: departments
pattern: Applicable Organizations.xlsx
start_date: '2000-01-01T00:00:00Z'
key_properties:
- id
format: excel
worksheet_name: AO
select:
- departments.*
- '!departments._*'
loaders:
- name: target-postgres
config:
database: lakehouse_sarens
host: localhost
load_method: upsert
user: lakehouse_sarens
default_target_schema: public
utilities:
- name: dbt-postgres
config:
host: localhost
dbname: lakehouse_sarens
user: lakehouse_sarens
port: 5432
schema: public
- name: staging
- name: prod
plugins:
extractors:
- name: tap-spreadsheets-anywhere
variant: ets
pip_url: git+https://github.com/ets/tap-spreadsheets-anywhere.git
loaders:
- name: target-postgres
variant: meltanolabs
pip_url: meltanolabs-target-postgres
utilities:
- name: dbt-postgres
variant: dbt-labs
pip_url: dbt-core dbt-postgres meltano-dbt-ext~=0.3.0
env:
DBT_CLEAN_PROJECT_FILES_ONLY: 'false'
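
With this configuration in place, the whole pipeline can be run end to end. A minimal invocation sketch (assuming the dev environment selected by default_environment above; meltano run and the utility:command syntax are standard Meltano CLI):

    meltano run tap-spreadsheets-anywhere target-postgres dbt-postgres:run

The extractor reads the departments worksheet, target-postgres upserts it into the public schema, and dbt then builds the transform models.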

@@ -0,0 +1,30 @@
{
"plugin_type": "extractors",
"name": "tap-spreadsheets-anywhere",
"namespace": "tap_spreadsheets_anywhere",
"variant": "ets",
"label": "Spreadsheets Anywhere",
"docs": "https://hub.meltano.com/extractors/tap-spreadsheets-anywhere--ets",
"repo": "https://github.com/ets/tap-spreadsheets-anywhere",
"pip_url": "git+https://github.com/ets/tap-spreadsheets-anywhere.git",
"description": "Data extractor for CSV and Excel files from any smart_open supported transport (S3, SFTP, localhost, etc...)",
"logo_url": "https://hub.meltano.com/assets/logos/extractors/spreadsheets-anywhere.png",
"capabilities": [
"catalog",
"discover",
"state"
],
"settings_group_validation": [
[
"tables"
]
],
"settings": [
{
"name": "tables",
"kind": "array",
"label": "Tables",
"description": "An array holding json objects that each describe a set of targeted source files.\n\nEach object in the 'tables' array describes one or more CSV or Excel spreadsheet files that adhere to the same schema and are meant to be tapped as the source for a Singer-based data flow.\nThe available keys are:\n\n- path: A string describing the transport and bucket/root directory holding the targeted source files.\n- name: A string describing the \"table\" (aka Singer stream) into which the source data should be loaded.\n- search_prefix: (optional) This is an optional prefix to apply after the bucket that will be used to filter files in the listing request from the targeted system. This prefix potentially reduces the number of files returned from the listing request.\n- pattern: This is an escaped regular expression that the tap will use to filter the listing result set returned from the listing request. This pattern potentially reduces the number of listed files that are considered as sources for the declared table. It's a bit strange, since this is an escaped string inside of an escaped string, any backslashes in the RegEx will need to be double-escaped.\n- start_date: This is the datetime that the tap will use to filter files, based on the modified timestamp of the file.\n- key_properties: These are the \"primary keys\" of the CSV files, to be used by the target for deduplication and primary key definitions downstream in the destination.\n- format: Must be either 'csv', 'json', 'excel', or 'detect'. Note that csv can be further customized with delimiter and quotechar variables below.\n- invalid_format_action: (optional) By default, the tap will raise an exception if a source file can not be read . Set this key to \"ignore\" to skip such source files and continue the run.\n- field_names: (optional) An array holding the names of the columns in the targeted files. If not supplied, the first row of each file must hold the desired values.\n- universal_newlines: (optional) Should the source file parsers honor universal newlines). Setting this to false will instruct the parser to only consider '\\n' as a valid newline identifier.\n- sample_rate: (optional) The sampling rate to apply when reading a source file for sampling in discovery mode. A sampling rate of 1 will sample every line. A sampling rate of 10 (the default) will sample every 10th line.\n- max_sampling_read: (optional) How many lines of the source file should be sampled when in discovery mode attempting to infer a schema. The default is 1000 samples.\n- max_sampled_files: (optional) The maximum number of files in the targeted set that will be sampled. The default is 5.\n- max_records_per_run: (optional) The maximum number of records that should be written to this stream in a single sync run. The default is unlimited.\n- prefer_number_vs_integer: (optional) If the discovery mode sampling process sees only integer values for a field, should number be used anyway so that floats are not considered errors? The default is false but true can help in situations where floats only appear rarely in sources and may not be detected through discovery sampling.\n- selected: (optional) Should this table be synced. Defaults to true. Setting to false will skip this table on a sync run.\n- worksheet_name: (optional) the worksheet name to pull from in the targeted xls file(s). Only required when format is excel\n- delimiter: (optional) the delimiter to use when format is 'csv', for example ','. 
You can leave delimiter blank or set it to 'detect' to leverage the csv \"Sniffer\" for auto-detecting delimiter.\n- quotechar: (optional) the character used to surround values that may contain delimiters - defaults to a double quote '\"' when not using the auto detecting feature. That is, delimiter is defined and not equal to `detect`.\n- json_path: (optional) the JSON key under which the list of objets to use is located. Defaults to None, corresponding to an array at the top level of the JSON tree.\n\nFor example:\n\n```yaml\nconfig:\n tables:\n - path: s3://my-s3-bucket\n name: target_table_name\n pattern: subfolder/common_prefix.*\n start_date: 2017-05-01T00:00:00Z\n key_properties: []\n format: csv\n delimiter: \"|\"\n quotechar: '\"'\n universal_newlines: false\n sample_rate: 10\n max_sampling_read: 2000\n max_sampled_files: 3\n prefer_number_vs_integer: true\n selected: true\n```\n\nSee the `Common Config Examples` section below for more examples or see the [repo README](https://github.com/ets/tap-spreadsheets-anywhere) for more details.\n"
}
]
}
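
One subtlety in the pattern setting above deserves a concrete illustration: the value is a regular expression embedded in a config string, so any regex backslash has to be escaped once more at the string level. A hypothetical pattern (not from this commit) matching only files ending in .xlsx:

    # in meltano.yml — single-quoted YAML passes the backslash through unchanged
    pattern: 'departments_.*\.xlsx'

    # in a JSON config the backslash itself must be doubled
    "pattern": "departments_.*\\.xlsx"

The pattern used in this commit, Applicable Organizations.xlsx, works unescaped because the regex dot simply matches the literal dot as well.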

@@ -0,0 +1,288 @@
{
"plugin_type": "loaders",
"name": "target-postgres",
"namespace": "target_postgres",
"variant": "meltanolabs",
"label": "Postgres",
"docs": "https://hub.meltano.com/loaders/target-postgres--meltanolabs",
"repo": "https://github.com/MeltanoLabs/target-postgres",
"pip_url": "meltanolabs-target-postgres",
"executable": "target-postgres",
"description": "PostgreSQL database loader",
"logo_url": "https://hub.meltano.com/assets/logos/loaders/postgres.png",
"capabilities": [
"about",
"activate-version",
"hard-delete",
"schema-flattening",
"stream-maps"
],
"settings_group_validation": [
[]
],
"settings": [
{
"name": "activate_version",
"kind": "boolean",
"value": true,
"label": "Activate Version",
"description": "If set to false, the tap will ignore activate version messages. If set to true, add_record_metadata must be set to true as well."
},
{
"name": "add_record_metadata",
"kind": "boolean",
"value": true,
"label": "Add Record Metadata",
"description": "Note that this must be enabled for activate_version to work!This adds _sdc_extracted_at, _sdc_batched_at, and more to every table. See https://sdk.meltano.com/en/latest/implementation/record_metadata.html for more information."
},
{
"name": "batch_size_rows",
"kind": "integer",
"label": "Batch Size Rows",
"description": "Maximum number of rows in each batch."
},
{
"name": "database",
"kind": "string",
"label": "Database",
"description": "Database name."
},
{
"name": "default_target_schema",
"kind": "string",
"value": "$MELTANO_EXTRACT__LOAD_SCHEMA",
"label": "Default Target Schema",
"description": "Postgres schema to send data to, example: tap-clickup"
},
{
"name": "dialect+driver",
"kind": "string",
"value": "postgresql+psycopg",
"label": "Dialect+Driver",
"description": "DEPRECATED. Dialect+driver see https://docs.sqlalchemy.org/en/20/core/engines.html. Generally just leave this alone."
},
{
"name": "faker_config.locale",
"kind": "array",
"label": "Faker Locale",
"description": "One or more LCID locale strings to produce localized output for: https://faker.readthedocs.io/en/master/#localization"
},
{
"name": "faker_config.seed",
"kind": "string",
"label": "Faker Seed",
"description": "Value to seed the Faker generator for deterministic output: https://faker.readthedocs.io/en/master/#seeding-the-generator"
},
{
"name": "flattening_enabled",
"kind": "boolean",
"label": "Enable Schema Flattening",
"description": "'True' to enable schema flattening and automatically expand nested properties."
},
{
"name": "flattening_max_depth",
"kind": "integer",
"label": "Max Flattening Depth",
"description": "The max depth to flatten schemas."
},
{
"name": "hard_delete",
"kind": "boolean",
"value": false,
"label": "Hard Delete",
"description": "When activate version is sent from a tap this specefies if we should delete the records that don't match, or mark them with a date in the `_sdc_deleted_at` column. This config option is ignored if `activate_version` is set to false."
},
{
"name": "host",
"kind": "string",
"label": "Host",
"description": "Hostname for postgres instance."
},
{
"name": "interpret_content_encoding",
"kind": "boolean",
"value": false,
"label": "Interpret Content Encoding",
"description": "If set to true, the target will interpret the content encoding of the schema to determine how to store the data. Using this option may result in a more efficient storage of the data but may also result in an error if the data is not encoded as expected."
},
{
"name": "load_method",
"kind": "options",
"value": "append-only",
"label": "Load Method",
"description": "The method to use when loading data into the destination. `append-only` will always write all input records whether that records already exists or not. `upsert` will update existing records and insert new records. `overwrite` will delete all existing records and insert all input records.",
"options": [
{
"label": "Append Only",
"value": "append-only"
},
{
"label": "Upsert",
"value": "upsert"
},
{
"label": "Overwrite",
"value": "overwrite"
}
]
},
{
"name": "password",
"kind": "string",
"label": "Password",
"description": "Password used to authenticate.",
"sensitive": true
},
{
"name": "port",
"kind": "integer",
"value": 5432,
"label": "Port",
"description": "The port on which postgres is awaiting connections."
},
{
"name": "process_activate_version_messages",
"kind": "boolean",
"value": true,
"label": "Process `ACTIVATE_VERSION` messages",
"description": "Whether to process `ACTIVATE_VERSION` messages."
},
{
"name": "sanitize_null_text_characters",
"kind": "boolean",
"value": false,
"label": "Sanitize Null Text Characters",
"description": "If set to true, the target will sanitize null characters in char/text/varchar fields, as they are not supported by Postgres. See [postgres documentation](https://www.postgresql.org/docs/current/functions-string.html) for more information about chr(0) not being supported."
},
{
"name": "sqlalchemy_url",
"kind": "string",
"label": "SQLAlchemy URL",
"description": "DEPRECATED. SQLAlchemy connection string. This will override using host, user, password, port, dialect, and all ssl settings. Note that you must escape password special characters properly. See https://docs.sqlalchemy.org/en/20/core/engines.html#escaping-special-characters-such-as-signs-in-passwords"
},
{
"name": "ssh_tunnel.enable",
"kind": "boolean",
"value": false,
"label": "SSH Tunnel Enable",
"description": "Enable an ssh tunnel (also known as bastion host), see the other ssh_tunnel.* properties for more details"
},
{
"name": "ssh_tunnel.host",
"kind": "string",
"label": "SSH Tunnel Host",
"description": "Host of the bastion host, this is the host we'll connect to via ssh"
},
{
"name": "ssh_tunnel.port",
"kind": "integer",
"value": 22,
"label": "SSH Tunnel Port",
"description": "Port to connect to bastion host"
},
{
"name": "ssh_tunnel.private_key",
"kind": "string",
"label": "SSH Tunnel Private Key",
"description": "Private Key for authentication to the bastion host",
"sensitive": true
},
{
"name": "ssh_tunnel.private_key_password",
"kind": "string",
"label": "SSH Tunnel Private Key Password",
"description": "Private Key Password, leave None if no password is set",
"sensitive": true
},
{
"name": "ssh_tunnel.username",
"kind": "string",
"label": "SSH Tunnel Username",
"description": "Username to connect to bastion host"
},
{
"name": "ssl_certificate_authority",
"kind": "string",
"value": "~/.postgresql/root.crl",
"label": "SSL Certificate Authority",
"description": "The certificate authority that should be used to verify the server's identity. Can be provided either as the certificate itself (in .env) or as a filepath to the certificate."
},
{
"name": "ssl_client_certificate",
"kind": "string",
"value": "~/.postgresql/postgresql.crt",
"label": "SSL Client Certificate",
"description": "The certificate that should be used to verify your identity to the server. Can be provided either as the certificate itself (in .env) or as a filepath to the certificate."
},
{
"name": "ssl_client_certificate_enable",
"kind": "boolean",
"value": false,
"label": "SSL Client Certificate Enable",
"description": "Whether or not to provide client-side certificates as a method of authentication to the server. Use ssl_client_certificate and ssl_client_private_key for further customization. To use SSL to verify the server's identity, use ssl_enable instead."
},
{
"name": "ssl_client_private_key",
"kind": "string",
"value": "~/.postgresql/postgresql.key",
"label": "SSL Client Private Key",
"description": "The private key for the certificate you provided. Can be provided either as the certificate itself (in .env) or as a filepath to the certificate.",
"sensitive": true
},
{
"name": "ssl_enable",
"kind": "boolean",
"value": false,
"label": "SSL Enable",
"description": "Whether or not to use ssl to verify the server's identity. Use ssl_certificate_authority and ssl_mode for further customization. To use a client certificate to authenticate yourself to the server, use ssl_client_certificate_enable instead."
},
{
"name": "ssl_mode",
"kind": "string",
"value": "verify-full",
"label": "SSL Mode",
"description": "SSL Protection method, see [postgres documentation](https://www.postgresql.org/docs/current/libpq-ssl.html#LIBPQ-SSL-PROTECTION) for more information. Must be one of disable, allow, prefer, require, verify-ca, or verify-full."
},
{
"name": "ssl_storage_directory",
"kind": "string",
"value": ".secrets",
"label": "SSL Storage Directory",
"description": "The folder in which to store SSL certificates provided as raw values. When a certificate/key is provided as a raw value instead of as a filepath, it must be written to a file before it can be used. This configuration option determines where that file is created."
},
{
"name": "stream_map_config",
"kind": "object",
"label": "User Stream Map Configuration",
"description": "User-defined config values to be used within map expressions."
},
{
"name": "stream_maps",
"kind": "object",
"label": "Stream Maps",
"description": "Config object for stream maps capability. For more information check out [Stream Maps](https://sdk.meltano.com/en/latest/stream_maps.html)."
},
{
"name": "use_copy",
"kind": "boolean",
"value": false,
"label": "Use COPY",
"description": "Use the COPY command to insert data. This is usually faster than INSERT statements. This option is only available for the postgresql+psycopg dialect+driver."
},
{
"name": "user",
"kind": "string",
"label": "User",
"description": "User name used to authenticate."
},
{
"name": "validate_records",
"kind": "boolean",
"value": true,
"label": "Validate Records",
"description": "Whether to validate the schema of the incoming streams."
}
],
"dialect": "postgres",
"target_schema": "$TARGET_POSTGRES_SCHEMA"
}
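
The load_method options above decide how repeated syncs behave, and the choice interacts with the tap's key_properties. This project's meltano.yml selects upsert, which needs primary keys on the stream — here the id column declared by tap-spreadsheets-anywhere. A sketch restating that slice of the config:

    loaders:
      - name: target-postgres
        config:
          load_method: upsert  # update rows whose key already exists, insert the rest
          # append-only (the default) would duplicate rows on re-runs;
          # overwrite would delete all existing rows before loading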

@@ -0,0 +1,172 @@
{
"plugin_type": "utilities",
"name": "dbt-postgres",
"namespace": "dbt_postgres",
"variant": "dbt-labs",
"label": "dbt PostgreSQL",
"docs": "https://hub.meltano.com/utilities/dbt-postgres--dbt-labs",
"repo": "https://github.com/dbt-labs/dbt-core",
"pip_url": "dbt-core dbt-postgres meltano-dbt-ext~=0.3.0",
"executable": "dbt_invoker",
"logo_url": "https://hub.meltano.com/assets/logos/utilities/dbt.png",
"settings": [
{
"name": "dbname",
"aliases": [
"database"
],
"kind": "string",
"label": "Database",
"description": "The db to connect to.\n"
},
{
"name": "host",
"kind": "string",
"label": "Host",
"description": "The postgres host to connect to.\n"
},
{
"name": "keepalives_idle",
"kind": "integer",
"label": "Keep Alives Idle",
"description": "Seconds between TCP keepalive packets.\n"
},
{
"name": "password",
"kind": "string",
"label": "Password",
"description": "The password to connect with.\n",
"sensitive": true
},
{
"name": "port",
"kind": "integer",
"label": "Port",
"description": "The port to connect to.\n"
},
{
"name": "profiles_dir",
"env": "DBT_PROFILES_DIR",
"value": "$MELTANO_PROJECT_ROOT/transform/profiles/postgres",
"label": "Profiles Directory"
},
{
"name": "project_dir",
"env": "DBT_PROJECT_DIR",
"value": "$MELTANO_PROJECT_ROOT/transform",
"label": "Projects Directory"
},
{
"name": "role",
"kind": "string",
"label": "Role",
"description": "Role for dbt to assume when executing queries.\n"
},
{
"name": "schema",
"kind": "string",
"label": "Schema",
"description": "The schema to use.\n"
},
{
"name": "search_path",
"kind": "string",
"label": "Search Path",
"description": "Overrides the default search path.\n"
},
{
"name": "skip_pre_invoke",
"env": "DBT_EXT_SKIP_PRE_INVOKE",
"kind": "boolean",
"value": false,
"label": "Skip Pre-invoke",
"description": "Whether to skip pre-invoke hooks which automatically run dbt clean and deps"
},
{
"name": "sslmode",
"kind": "array",
"label": "SSL Mode",
"description": "SSL Mode used to connect to the database.\n"
},
{
"name": "target_path",
"env": "DBT_TARGET_PATH",
"kind": "string",
"value": "$MELTANO_PROJECT_ROOT/.meltano/transformers/dbt/target",
"label": "Target Path"
},
{
"name": "type",
"env": "DBT_EXT_TYPE",
"value": "postgres",
"label": "dbt Profile type"
},
{
"name": "user",
"kind": "string",
"label": "User",
"description": "The user to connect as.\n"
}
],
"commands": {
"build": {
"args": "build",
"description": "Will run your models, tests, snapshots and seeds in DAG order."
},
"clean": {
"args": "clean",
"description": "Delete all folders in the clean-targets list (usually the dbt_modules and target directories.)"
},
"compile": {
"args": "compile",
"description": "Generates executable SQL from source model, test, and analysis files. Compiled SQL files are written to the target/ directory."
},
"debug": {
"args": "debug",
"description": "Debug your DBT project and warehouse connection."
},
"deps": {
"args": "deps",
"description": "Pull the most recent version of the dependencies listed in packages.yml"
},
"describe": {
"args": "describe",
"description": "Describe the",
"executable": "dbt_extension"
},
"docs-generate": {
"args": "docs generate",
"description": "Generate documentation for your project."
},
"docs-serve": {
"args": "docs serve",
"description": "Serve documentation for your project. Make sure you ran `docs-generate` first."
},
"freshness": {
"args": "source freshness",
"description": "Check the freshness of your source data."
},
"initialize": {
"args": "initialize",
"description": "Initialize a new dbt project. This will create a dbt_project.yml file, a profiles.yml file, and models directory.\n",
"executable": "dbt_extension"
},
"run": {
"args": "run",
"description": "Compile SQL and execute against the current target database."
},
"seed": {
"args": "seed",
"description": "Load data from csv files into your data warehouse."
},
"snapshot": {
"args": "snapshot",
"description": "Execute snapshots defined in your project."
},
"test": {
"args": "test",
"description": "Runs tests on data in deployed models."
}
},
"ext_repo": "https://github.com/meltano/dbt-ext"
}
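
Each entry in the commands block becomes invocable through Meltano. A few hedged usage examples (assuming the plugin name dbt-postgres from this project's meltano.yml):

    meltano invoke dbt-postgres:deps    # pull packages.yml dependencies
    meltano invoke dbt-postgres:run     # build the models
    meltano invoke dbt-postgres:test    # run data tests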

transform/.gitignore

@@ -0,0 +1,3 @@
target/
dbt_packages/
logs/

transform/dbt_project.yml

@@ -0,0 +1,29 @@
name: my_meltano_project
version: '1.0'
profile: meltano
config-version: 2
require-dbt-version: [">=1.0.0", "<2.0.0"]
flags:
send_anonymous_usage_stats: False
use_colors: True
model-paths:
- models
analysis-paths:
- analysis
test-paths:
- tests
seed-paths:
- data
macro-paths:
- macros
snapshot-paths:
- snapshots
target-path: ../.meltano/transformers/dbt/target
log-path: logs
packages-install-path: dbt_packages
clean-targets:
- ../.meltano/transformers/dbt/target
- dbt_packages
- logs
models:
my_meltano_project: null
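
The trailing models: my_meltano_project: null block leaves all model configuration to the individual .sql files. If project-wide defaults were wanted later, this is where they would go; a hypothetical example (not part of this commit):

    models:
      my_meltano_project:
        +materialized: view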


@@ -0,0 +1,32 @@
{{
config(
materialized='table'
)
}}
with recursive department_tree as (
-- Anchor: top-level departments (children of the 'Sarens Group' root)
select
id as department_hris_id,
applicable_organization as name,
null::text as cost_center,
null::text as manager_id,
id::text as path -- start path with own ID
from {{ source('tap_spreadsheets_everywhere', 'departments') }}
where parent_id = 'Sarens Group'
union all
-- Recursive part: join child departments
select
d.id as department_hris_id,
d.applicable_organization as name,
null::text as cost_center,
null::text as manager_id,
dt.path || '.' || d.id::text as path
from {{ source('tap_spreadsheets_everywhere', 'departments') }} d
join department_tree dt
on d.parent_id = dt.department_hris_id
)
select * from department_tree
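
The path column is a materialized-path encoding of the department hierarchy: each row carries the dot-separated chain of ids from the root down to itself, so subtree lookups reduce to a prefix match. A hedged usage sketch (the relation name departments is assumed here, since the model's file name is not visible in this diff):

    -- every department in the subtree rooted at the (hypothetical) id 42
    select *
    from departments
    where path = '42'
       or path like '42.%'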

@@ -0,0 +1,7 @@
version: 2
sources:
- name: tap_spreadsheets_everywhere
schema: public
tables:
- name: departments

@@ -0,0 +1,46 @@
# Postgres config reference:
meltano:
target: "{{ env_var('MELTANO_ENVIRONMENT', 'dev') }}"
outputs:
dev:
type: postgres
host: "{{ env_var('DBT_POSTGRES_HOST') }}"
user: "{{ env_var('DBT_POSTGRES_USER') }}"
password: "{{ env_var('DBT_POSTGRES_PASSWORD') }}"
port: "{{ env_var('DBT_POSTGRES_PORT') | int }}"
dbname: "{{ env_var('DBT_POSTGRES_DBNAME', '') }}"
schema: "{{ env_var('DBT_POSTGRES_SCHEMA') }}"
threads: 2
keepalives_idle: 0 # default 0, indicating the system default
connect_timeout: 10 # default 10 seconds
search_path: "{{ env_var('DBT_POSTGRES_SEARCH_PATH', '') }}" # optional, override the default postgres search_path
role: "{{ env_var('DBT_POSTGRES_ROLE', '') }}" # optional, set the role dbt assumes when executing queries
# sslmode: "{{ env_var('DBT_POSTGRES_SSLMODE', '').split() }}" # optional, set the sslmode used to connect to the database
staging:
type: postgres
host: "{{ env_var('DBT_POSTGRES_HOST') }}"
user: "{{ env_var('DBT_POSTGRES_USER') }}"
password: "{{ env_var('DBT_POSTGRES_PASSWORD') }}"
port: "{{ env_var('DBT_POSTGRES_PORT') | int }}"
dbname: "{{ env_var('DBT_POSTGRES_DBNAME', '') }}"
schema: "{{ env_var('DBT_POSTGRES_SCHEMA') }}"
threads: 4
keepalives_idle: 0 # default 0, indicating the system default
connect_timeout: 10 # default 10 seconds
search_path: "{{ env_var('DBT_POSTGRES_SEARCH_PATH', '') }}" # optional, override the default postgres search_path
role: "{{ env_var('DBT_POSTGRES_ROLE', '') }}" # optional, set the role dbt assumes when executing queries
# sslmode: "{{ env_var('DBT_POSTGRES_SSLMODE', '').split() }}" # optional, set the sslmode used to connect to the database
prod:
type: postgres
host: "{{ env_var('DBT_POSTGRES_HOST') }}"
user: "{{ env_var('DBT_POSTGRES_USER') }}"
password: "{{ env_var('DBT_POSTGRES_PASSWORD') }}"
port: "{{ env_var('DBT_POSTGRES_PORT') | int }}"
dbname: "{{ env_var('DBT_POSTGRES_DBNAME', '') }}"
schema: "{{ env_var('DBT_POSTGRES_SCHEMA') }}"
threads: 6
keepalives_idle: 0 # default 0, indicating the system default
connect_timeout: 10 # default 10 seconds
search_path: "{{ env_var('DBT_POSTGRES_SEARCH_PATH', '') }}" # optional, override the default postgres search_path
role: "{{ env_var('DBT_POSTGRES_ROLE', '') }}" # optional, set the role dbt assumes when executing queries
# sslmode: "{{ env_var('DBT_POSTGRES_SSLMODE', '').split() }}" # optional, set the sslmode used to connect to the database
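
All connection values in this profile resolve from the environment. Meltano derives DBT_POSTGRES_* variables from the dbt-postgres utility settings in meltano.yml (host, dbname, user, port, schema), so under this commit only the password remains to be supplied out of band — a hedged sketch of the expected .env entry:

    DBT_POSTGRES_PASSWORD=change-me  # assumed to live in the project's .env, never committed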