
Data Lakehouse Developer Experience DX

2025-07-20 · อ. บอม — SiamCafe.net · 10,363 words

Lakehouse Developer Experience

Developer experience (DX) in a data lakehouse spans the whole workflow: local development, data discovery via a catalog, query speed, CI/CD, documentation, and self-service analytics. Each area either adds friction or adds productivity.

DX Area | Bad DX | Good DX | Tool
Local Dev | Must deploy before you can test anything | Run pipelines on your own machine | Docker, DuckDB, dbt
Data Discovery | Ask on Slack every time; nobody knows which table | Search the catalog yourself | Unity Catalog, DataHub
Query Speed | Wait 10 minutes per query | Results in 5 seconds | Photon, Starburst, DuckDB
CI/CD | Manual deploys, SSH in and run by hand | PR → test → deploy automatically | GitHub Actions, dbt Cloud
Documentation | No docs; read the code | Auto-generated schema + lineage | dbt docs, DataHub
Self-service | Analysts wait for engineers for everything | Analysts query via SQL themselves | Redash, Metabase, Superset
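As a rough illustration only (the scoring is hypothetical, not from any standard), the table above can be read as a checklist and tallied in the same dataclass style the listings below use:

```python
from dataclasses import dataclass

# Hypothetical DX checklist based on the table above.
@dataclass
class DXArea:
    name: str
    good: bool  # does the team have the "Good DX" version of this area?

areas = [
    DXArea("Local Dev", True),
    DXArea("Data Discovery", False),
    DXArea("Query Speed", True),
    DXArea("CI/CD", False),
    DXArea("Documentation", False),
    DXArea("Self-service", True),
]

score = sum(a.good for a in areas)
print(f"DX score: {score}/{len(areas)}")
for a in areas:
    status = "OK" if a.good else "needs work"
    print(f"  [{a.name}] {status}")
```

A low score points at the rows of the table worth fixing first.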

Local Development Stack

# === Local Dev Setup ===

# docker-compose.yml for local lakehouse
# version: "3.8"
# services:
#   spark:
#     image: bitnami/spark:3.5
#     ports: ["8080:8080", "4040:4040"]
#     volumes: ["./data:/data", "./notebooks:/notebooks"]
#   minio:
#     image: minio/minio
#     command: server /data --console-address ":9001"
#     ports: ["9000:9000", "9001:9001"]
#     environment:
#       MINIO_ROOT_USER: admin
#       MINIO_ROOT_PASSWORD: password
#   metastore:
#     image: apache/hive:4.0.0
#     ports: ["9083:9083"]

# DuckDB for fast local queries
# pip install duckdb
# import duckdb
# conn = duckdb.connect()
# conn.execute("INSTALL delta; LOAD delta;")
# df = conn.sql("SELECT * FROM delta_scan('/data/lakehouse/sales')")
# df.show()

# dbt local run
# dbt init my_lakehouse
# dbt run --select staging.*
# dbt test --select staging.*
# dbt docs generate && dbt docs serve

from dataclasses import dataclass

@dataclass
class LocalTool:
    tool: str
    purpose: str
    install: str
    speed: str
    production_parity: str

tools = [
    LocalTool("DuckDB",
        "Fast local SQL on Parquet/Delta files",
        "pip install duckdb",
        "Query GB data in seconds on laptop",
        "High — same SQL, reads same file formats"),
    LocalTool("dbt Core",
        "SQL transform pipeline, test, document",
        "pip install dbt-core dbt-duckdb",
        "Seconds for local models",
        "High — same models deploy to production"),
    LocalTool("Docker Compose",
        "Run Spark, MinIO, Metastore locally",
        "docker compose up -d",
        "Minutes to start, then fast",
        "Medium — simulates production but smaller"),
    LocalTool("Jupyter + PySpark",
        "Interactive exploration and prototyping",
        "pip install jupyterlab pyspark",
        "Interactive, cell-by-cell",
        "High — same PySpark code runs in prod"),
    LocalTool("SQLFluff",
        "SQL linter, enforce style, catch errors",
        "pip install sqlfluff",
        "Seconds to lint",
        "Same rules in CI/CD pipeline"),
]

print("=== Local Dev Tools ===")
for t in tools:
    print(f"  [{t.tool}] {t.purpose}")
    print(f"    Install: {t.install}")
    print(f"    Speed: {t.speed}")
    print(f"    Prod Parity: {t.production_parity}")
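One way to picture the "production parity" column is a thin target switch: the exact same SQL string runs against a local engine or the production one. A minimal sketch, using sqlite3 as a stand-in for both engines so it runs anywhere (in a real setup "local" would be DuckDB and "production" the warehouse; the engine map and seed data are illustrative):

```python
import sqlite3

# Hypothetical target switch: same SQL, different engine per target.
ENGINES = {
    "local": lambda: sqlite3.connect(":memory:"),
    "production": lambda: sqlite3.connect(":memory:"),  # stand-in
}

def run_sql(sql: str, target: str = "local"):
    conn = ENGINES[target]()
    # Seed a tiny sample table so the query has something to read.
    conn.execute("CREATE TABLE sales (region TEXT, amount REAL)")
    conn.executemany("INSERT INTO sales VALUES (?, ?)",
                     [("north", 100.0), ("south", 250.0)])
    return conn.execute(sql).fetchall()

rows = run_sql("SELECT region, SUM(amount) FROM sales GROUP BY region")
print(rows)
```

Because the SQL text never changes between targets, anything that passes locally is the same statement that ships.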

CI/CD Pipeline

# === CI/CD for Data Lakehouse ===

# .github/workflows/data-pipeline.yml
# name: Data Pipeline CI/CD
# on:
#   pull_request:
#     paths: ['models/**', 'tests/**']
#   push:
#     branches: [main]
# jobs:
#   lint:
#     runs-on: ubuntu-latest
#     steps:
#       - uses: actions/checkout@v4
#       - run: pip install sqlfluff
#       - run: sqlfluff lint models/
#   test:
#     runs-on: ubuntu-latest
#     steps:
#       - uses: actions/checkout@v4
#       - run: pip install dbt-core dbt-duckdb
#       - run: dbt deps
#       - run: dbt build --target ci
#   deploy-staging:
#     if: github.event_name == 'push'
#     needs: [lint, test]
#     runs-on: ubuntu-latest
#     steps:
#       - uses: actions/checkout@v4
#       - run: pip install dbt-core dbt-duckdb
#       - run: dbt run --target staging
#       - run: dbt test --target staging
#   deploy-production:
#     needs: [deploy-staging]
#     environment: production
#     runs-on: ubuntu-latest
#     steps:
#       - uses: actions/checkout@v4
#       - run: pip install dbt-core dbt-duckdb
#       - run: dbt run --target production
#       - run: dbt test --target production

@dataclass
class CICDStage:
    stage: str
    trigger: str
    actions: str
    duration: str
    fail_action: str

stages = [
    CICDStage("Lint", "Every PR",
        "SQLFluff lint, YAML validate, schema check",
        "30 sec", "Block merge, fix lint errors"),
    CICDStage("Unit Test", "Every PR",
        "dbt test, data contract test, custom assertions",
        "2-5 min", "Block merge, fix failing tests"),
    CICDStage("Integration Test", "Every PR",
        "Run models on sample data, check output",
        "5-10 min", "Block merge, review data issues"),
    CICDStage("Deploy Staging", "Merge to main",
        "dbt run + test on staging environment",
        "10-30 min", "Alert team, investigate before prod"),
    CICDStage("Deploy Production", "After staging pass",
        "dbt run + test on production, notify team",
        "10-60 min", "Rollback, alert on-call, investigate"),
]

print("=== CI/CD Stages ===")
for s in stages:
    print(f"  [{s.stage}] Trigger: {s.trigger}")
    print(f"    Actions: {s.actions}")
    print(f"    Duration: {s.duration}")
    print(f"    On Fail: {s.fail_action}")
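Each stage in the table gates the next: a failure stops the pipeline before anything later runs. That ordering can be sketched as a simple runner (the stage functions and their pass/fail results are illustrative stand-ins for the real commands):

```python
# Hypothetical gated pipeline: stop at the first failing stage,
# mirroring the "block merge" / "investigate before prod" rules above.
def lint():    return True   # stand-in for: sqlfluff lint models/
def unit():    return True   # stand-in for: dbt build --target ci
def staging(): return False  # stand-in for: dbt run --target staging
def prod():    return True   # never reached if staging fails

pipeline = [("Lint", lint), ("Unit Test", unit),
            ("Deploy Staging", staging), ("Deploy Production", prod)]

def run(stages):
    for name, step in stages:
        if not step():
            print(f"FAIL at {name}, stopping pipeline")
            return name
        print(f"PASS {name}")
    return None

failed = run(pipeline)
```

Here the simulated staging failure means production deploy never runs, which is exactly the guarantee the `needs:` chain in the workflow above provides.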

Self-service Analytics

# === Self-service Platform ===

@dataclass
class SelfServiceLayer:
    layer: str
    audience: str
    tool: str
    access: str
    governance: str

layers = [
    SelfServiceLayer("SQL Workspace",
        "Data Analyst, Business Analyst",
        "Redash, Metabase, Superset, Databricks SQL",
        "SQL queries via a web UI, nothing to install",
        "Read-only access, row-level security"),
    SelfServiceLayer("Notebook",
        "Data Scientist, ML Engineer",
        "Jupyter, Databricks Notebook, Zeppelin",
        "Python, R, and SQL via notebooks",
        "Cluster access control, data masking"),
    SelfServiceLayer("Dashboard",
        "Business User, Manager, Executive",
        "Looker, Tableau, Power BI, Superset",
        "Click-based, no code, scheduled refresh",
        "Dashboard-level permission, export control"),
    SelfServiceLayer("Data API",
        "Application Developer, Frontend",
        "REST API, GraphQL, gRPC",
        "API Key, OAuth, rate limiting",
        "API gateway, usage tracking, SLA"),
    SelfServiceLayer("Data Catalog",
        "Everyone",
        "Unity Catalog, DataHub, Atlan",
        "Search, browse, request access",
        "Tag-based access, approval workflow"),
]

print("=== Self-service Layers ===")
for layer in layers:
    print(f"  [{layer.layer}] Audience: {layer.audience}")
    print(f"    Tool: {layer.tool}")
    print(f"    Access: {layer.access}")
    print(f"    Governance: {layer.governance}")
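The "row-level security" item in the governance column can be illustrated with a tiny filter: each role only ever sees the rows its policy allows. A minimal sketch (real platforms enforce this in the engine itself; the roles, regions, and policy table here are hypothetical):

```python
# Hypothetical row-level security: filter rows by the user's allowed regions.
rows = [
    {"region": "north", "revenue": 100},
    {"region": "south", "revenue": 250},
]

# role -> set of regions that role may read (illustrative policy)
POLICY = {"analyst_north": {"north"}, "admin": {"north", "south"}}

def query(user_role: str):
    allowed = POLICY.get(user_role, set())  # unknown roles see nothing
    return [r for r in rows if r["region"] in allowed]

print(query("analyst_north"))  # only the north row
```

The key property is that the filter is applied by the platform, not left to each dashboard or notebook to remember.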

Tips

What is developer experience (DX) in a lakehouse?

DX is the developer's experience across the whole path from code to insight: local development, data discovery, query speed, CI/CD, documentation, and self-service. Good DX removes friction, shortens time-to-insight, and raises productivity.

How do you set up local development?

Run the stack locally with Docker Compose (Spark, MinIO), query Parquet and Delta files with DuckDB, develop dbt Core models locally, and prototype in Jupyter with PySpark on sample data. Work in Git branches, automate common tasks with a Makefile, and lint SQL with pre-commit and SQLFluff.
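The pre-commit plus SQLFluff combination can be wired up with a config along these lines (the `rev` pin is illustrative; check the current SQLFluff release before using it):

```yaml
# .pre-commit-config.yaml -- rev shown is illustrative, pin a real release
repos:
  - repo: https://github.com/sqlfluff/sqlfluff
    rev: 3.0.0
    hooks:
      - id: sqlfluff-lint
      - id: sqlfluff-fix
```

With this in place, `pre-commit install` makes every commit lint the changed SQL files before they reach CI.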

How does a data catalog help?

A catalog lets you search tables and columns with descriptions, tags, schemas, and data types; trace lineage through each transform; and see quality metrics, owners, and usage. Common choices are Unity Catalog, DataHub, Amundsen, and Atlan.
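The searchable-catalog idea (find tables by name, description, tag, or owner) can be pictured as a tiny in-memory index, in the same dataclass style as the listings above; the entries and fields here are all illustrative:

```python
from dataclasses import dataclass, field

# Hypothetical mini-catalog: search tables by name, description, tag, or owner.
@dataclass
class CatalogEntry:
    table: str
    description: str
    owner: str
    tags: list = field(default_factory=list)

catalog = [
    CatalogEntry("sales_daily", "Daily sales rollup", "data-eng", ["sales", "gold"]),
    CatalogEntry("raw_events", "Raw clickstream events", "platform", ["bronze"]),
]

def search(term: str):
    term = term.lower()
    return [e for e in catalog
            if term in e.table or term in e.description.lower()
            or term in e.tags or term == e.owner]

print([e.table for e in search("sales")])  # ['sales_daily']
```

Real catalogs add lineage and access workflows on top, but the core win is the same: answering "which table?" without asking on Slack.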

How do you build a CI/CD pipeline?

Keep SQL and dbt models in Git. Every PR goes through review plus CI (lint, tests); CD then deploys to staging and on to production. Use blue-green deploys, feature flags, and rollbacks for safety, and manage infrastructure as code with Terraform and GitHub Actions.
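Blue-green applies to tables as well as services: readers query a view, a deploy builds the new table alongside the old one, and the cutover just repoints the view; rollback is the same swap in reverse. A minimal sketch with sqlite3 so it runs anywhere (a lakehouse would do this with its own SQL engine, and the table and value names are illustrative):

```python
import sqlite3

conn = sqlite3.connect(":memory:")

# Two physical copies: "blue" is live, "green" is the new build.
conn.execute("CREATE TABLE sales_blue (amount REAL)")
conn.execute("INSERT INTO sales_blue VALUES (100)")
conn.execute("CREATE TABLE sales_green (amount REAL)")
conn.execute("INSERT INTO sales_green VALUES (999)")

# Readers only ever query the view, never the physical tables.
conn.execute("CREATE VIEW sales AS SELECT * FROM sales_blue")
before = conn.execute("SELECT amount FROM sales").fetchone()[0]

# Deploy = swap the view to the new build; rollback = swap it back.
conn.execute("DROP VIEW sales")
conn.execute("CREATE VIEW sales AS SELECT * FROM sales_green")
after = conn.execute("SELECT amount FROM sales").fetchone()[0]
print(before, after)
```

Because readers never see a half-built table, the cutover is safe to do even while dashboards are querying.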

Summary

Good lakehouse DX combines fast local development (DuckDB, dbt), a searchable data catalog, automated CI/CD, and self-service analytics: the result is faster queries, higher team productivity, and a safer path to production.

📖 Related articles

PagerDuty Incident Developer Experience DX →
Go Fiber Developer Experience DX →
Ansible Collection Developer Experience DX →
Data Lakehouse Network Segmentation →
Data Lakehouse DevOps Culture →

📚 View all articles →