SiamCafe · Blog
Databricks Unity Catalog กับ Interview
บทความ

Databricks Unity Catalog กับ Interview

เผยแพร่ 28 พฤษภาคม 2569

Databricks Unity Catalog

Databricks Unity Catalog กับ Interview

Unity Catalog คือระบบ Centralized Governance ของ Databricks Lakehouse สำหรับจัดการ Data Assets ทั้ง Tables, Views, Volumes, Models และ Functions ผ่าน 3-Level Namespace (Catalog.Schema.Table) พร้อม Fine-grained Access Control, Data Lineage และ Audit Logs

การเตรียมตัว Interview สาย Data Engineering ควรครอบคลุมหัวข้อ Delta Lake, Spark Optimization, Medallion Architecture, ETL Patterns, Data Quality และ Cost Optimization

Unity Catalog Architecture

# unity_catalog.py — Databricks Unity Catalog
from dataclasses import dataclass, field
from typing import List, Dict, Optional

@dataclass
class Table:
    """One table entry in the in-memory catalog model.

    ``name``, ``table_type`` and ``format`` are required; ``columns`` and
    ``owner`` default to an empty list and an empty string.
    """
    # Table name as shown by UnityCatalog.show_hierarchy().
    name: str
    table_type: str  # MANAGED, EXTERNAL
    format: str  # DELTA, PARQUET, CSV
    # Column descriptors; the demo script in this file passes dicts with
    # "name" and "type" keys. Each instance gets its own fresh list.
    columns: List[Dict[str, str]] = field(default_factory=list)
    # Owning principal; empty string when not specified.
    owner: str = ""

@dataclass
class Schema:
    """A named schema holding a list of Table objects."""
    # Schema name; UnityCatalog.create_table() matches on this value.
    name: str
    # Tables registered under this schema; each instance gets its own list.
    tables: List[Table] = field(default_factory=list)
    # Owning principal; empty string when not specified.
    owner: str = ""

@dataclass
class Catalog:
    """A named catalog holding a list of Schema objects."""
    # Catalog name; UnityCatalog looks catalogs up by this value.
    name: str
    # Schemas registered under this catalog; each instance gets its own list.
    schemas: List[Schema] = field(default_factory=list)
    # Owning principal; empty string when not specified.
    owner: str = ""

class UnityCatalog:
    """In-memory model of a catalog -> schema -> table hierarchy plus grants.

    This is a teaching model only: it keeps plain Python lists and never
    contacts a Databricks workspace.

    Attributes:
        catalogs: Catalog objects in creation order.
        grants: One dict per grant() call, with the keys "principal",
            "privilege" and "target".
    """

    def __init__(self):
        self.catalogs: List[Catalog] = []
        self.grants: List[dict] = []

    def _find_catalog(self, name: str) -> "Optional[Catalog]":
        """Return the first catalog whose name equals *name*, else None."""
        return next((c for c in self.catalogs if c.name == name), None)

    def create_catalog(self, name: str, owner: str) -> "Catalog":
        """Create a catalog, register it and return it.

        Names are not checked for uniqueness; lookups use the first match.
        """
        cat = Catalog(name=name, owner=owner)
        self.catalogs.append(cat)
        return cat

    def create_schema(
        self, catalog_name: str, schema_name: str, owner: str
    ) -> "Optional[Schema]":
        """Create a schema inside an existing catalog.

        Returns:
            The new Schema, or None when no catalog is called *catalog_name*.
        """
        cat = self._find_catalog(catalog_name)
        if cat is None:
            return None
        schema = Schema(name=schema_name, owner=owner)
        cat.schemas.append(schema)
        return schema

    def create_table(
        self, catalog: str, schema: str, table: "Table"
    ) -> "Optional[Table]":
        """Register *table* under ``catalog.schema``.

        Returns:
            *table* when it was registered, or None when the catalog or the
            schema does not exist. Nothing is raised for a missing parent, so
            callers that ignore the result behave as before; the return value
            lets a caller detect that the table was not stored.
        """
        cat = self._find_catalog(catalog)
        if cat is None:
            return None
        sch = next((s for s in cat.schemas if s.name == schema), None)
        if sch is None:
            return None
        sch.tables.append(table)
        return table

    def grant(self, principal: str, privilege: str, target: str):
        """Record a grant; the values are stored as given, without validation."""
        self.grants.append({
            "principal": principal,
            "privilege": privilege,
            "target": target,
        })

    def show_hierarchy(self):
        """Print every catalog with its schemas and tables to stdout."""
        print(f"\n{'='*55}")
        print("Unity Catalog Hierarchy")
        print(f"{'='*55}")
        for cat in self.catalogs:
            print(f"\n  Catalog: {cat.name} (Owner: {cat.owner})")
            for schema in cat.schemas:
                print(f"    Schema: {schema.name}")
                for table in schema.tables:
                    print(f"      Table: {table.name} [{table.table_type}] ({table.format})")

# -- SQL Commands --
# CREATE CATALOG production;
# CREATE SCHEMA production.bronze;
# CREATE SCHEMA production.silver;
# CREATE SCHEMA production.gold;
#
# CREATE TABLE production.bronze.raw_events (
#   event_id STRING,
#   event_type STRING,
#   user_id STRING,
#   timestamp TIMESTAMP,
#   payload STRING
# ) USING DELTA;
#
# -- Access Control
# GRANT USE CATALOG ON CATALOG production TO `data-engineers`;
# GRANT USE SCHEMA ON SCHEMA production.bronze TO `data-engineers`;
# GRANT SELECT ON TABLE production.bronze.raw_events TO `data-analysts`;
# GRANT MODIFY ON TABLE production.silver.users TO `etl-service`;
#
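# -- Column-level Security
# -- Unity Catalog has no column-list GRANT on a table; expose only the allowed
# -- columns through a view (or attach a column mask) and grant on that instead.
# CREATE VIEW production.bronze.raw_events_limited AS
#   SELECT user_id, event_type FROM production.bronze.raw_events;
# GRANT SELECT ON VIEW production.bronze.raw_events_limited TO `limited-access`;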

# Build the demo hierarchy: one "production" catalog with one schema per
# medallion layer, each created with its owning group.
uc = UnityCatalog()
cat = uc.create_catalog("production", "admin")
for layer, layer_owner in (
    ("bronze", "data-engineers"),
    ("silver", "data-engineers"),
    ("gold", "data-analysts"),
):
    uc.create_schema("production", layer, layer_owner)

# Raw tables for the bronze layer; each declares a single sample column.
tables = [
    Table(
        name="raw_events",
        table_type="MANAGED",
        format="DELTA",
        columns=[{"name": "event_id", "type": "STRING"}],
    ),
    Table(
        name="raw_users",
        table_type="MANAGED",
        format="DELTA",
        columns=[{"name": "user_id", "type": "STRING"}],
    ),
]
for bronze_table in tables:
    uc.create_table("production", "bronze", bronze_table)

# One table each for the silver and gold layers (no columns declared).
for layer, table_name in (("silver", "clean_events"), ("gold", "daily_metrics")):
    uc.create_table("production", layer, Table(table_name, "MANAGED", "DELTA"))

uc.grant("data-engineers", "USE CATALOG", "production")
uc.grant("data-analysts", "SELECT", "production.gold.*")

uc.show_hierarchy()

# List the recorded grants in GRANT-statement form.
print("\n  Grants:")
for entry in uc.grants:
    print("    GRANT {privilege} ON {target} TO {principal}".format(**entry))

Interview Questions

Databricks Unity Catalog กับ Interview
# interview_questions.py — Databricks Interview Prep
# Interview question bank: topic -> {"questions": [...], "key_concepts": "..."}.
interview = {
    "Delta Lake": {
        "questions": [
            "Delta Lake คืออะไร ต่างจาก Parquet อย่างไร",
            "ACID Transactions ใน Delta Lake ทำงานอย่างไร",
            "Time Travel คืออะไร ใช้อย่างไร",
            "Z-Ordering คืออะไร เมื่อไหร่ควรใช้",
            "VACUUM ทำอะไร ตั้ง Retention อย่างไร",
            "OPTIMIZE ทำอะไร ต่างจาก ZORDER อย่างไร",
        ],
        "key_concepts": "Transaction Log, ACID, Schema Evolution, CDF",
    },
    "Unity Catalog": {
        "questions": [
            "Unity Catalog คืออะไร ต่างจาก Hive Metastore อย่างไร",
            "3-Level Namespace คืออะไร",
            "Fine-grained Access Control ตั้งค่าอย่างไร",
            "Data Lineage ใช้อย่างไร",
            "External Tables vs Managed Tables",
            "Column-level Security ทำอย่างไร",
        ],
        "key_concepts": "RBAC, Lineage, Audit, Governance",
    },
    "Spark Optimization": {
        "questions": [
            "Spark Partitioning ตั้งค่าอย่างไร",
            "Broadcast Join ใช้เมื่อไหร่",
            "Shuffle คืออะไร ลดอย่างไร",
            "Caching vs Persist ต่างกันอย่างไร",
            "AQE (Adaptive Query Execution) คืออะไร",
            "Skew Join Optimization ทำอย่างไร",
        ],
        "key_concepts": "Partitioning, Shuffle, AQE, Catalyst",
    },
    "Medallion Architecture": {
        "questions": [
            "Medallion Architecture คืออะไร",
            "Bronze, Silver, Gold แตกต่างกันอย่างไร",
            "Data Quality ตรวจสอบที่ Layer ไหน",
            "SCD Type 2 ทำที่ Layer ไหน อย่างไร",
            "Streaming + Batch ใน Medallion ทำอย่างไร",
        ],
        "key_concepts": "Bronze (Raw), Silver (Clean), Gold (Business)",
    },
}

# Print one section per topic: heading, key concepts, then each question.
print("Databricks Interview Questions:")
for section, details in interview.items():
    report = [
        f"\n  [{section}]",
        f"    Key Concepts: {details['key_concepts']}",
    ]
    report.extend(f"    Q: {question}" for question in details["questions"])
    print("\n".join(report))

PySpark Code Examples

# pyspark_examples.py — PySpark for Interview
# from pyspark.sql import SparkSession
# from pyspark.sql.functions import col, count, sum, avg, when, lit
# from delta.tables import DeltaTable

# spark = SparkSession.builder.appName("interview").getOrCreate()

# 1. Delta Lake Operations
# -- Create Delta Table
# CREATE TABLE production.bronze.events
# USING DELTA
# PARTITIONED BY (event_date)
# AS SELECT * FROM raw_events;

# -- Time Travel
# SELECT * FROM production.bronze.events VERSION AS OF 5;
# SELECT * FROM production.bronze.events TIMESTAMP AS OF '2024-01-15';

# -- MERGE (Upsert)
# MERGE INTO production.silver.users AS target
# USING staging.new_users AS source
# ON target.user_id = source.user_id
# WHEN MATCHED THEN UPDATE SET *
# WHEN NOT MATCHED THEN INSERT *;

# -- Z-Ordering
# OPTIMIZE production.silver.events ZORDER BY (user_id, event_type);

# -- VACUUM
# VACUUM production.bronze.events RETAIN 168 HOURS;

# 2. PySpark Optimization
# df = spark.read.table("production.bronze.events")
#
# # Broadcast Join (Small Table < 10MB)
# from pyspark.sql.functions import broadcast
# small_df = spark.read.table("production.silver.dim_users")
# result = df.join(broadcast(small_df), "user_id")
#
# # Partitioning
# df.repartition(200, "event_date") \
#   .write.mode("overwrite") \
#   .partitionBy("event_date") \
#   .saveAsTable("production.silver.events")
#
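# # Caching
# from pyspark import StorageLevel
# df.cache()  # DataFrame default is memory-and-disk (MEMORY_ONLY is the RDD default)
# df.persist(StorageLevel.MEMORY_AND_DISK)  # persist() lets you choose the level explicitly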
#
# # AQE
# spark.conf.set("spark.sql.adaptive.enabled", "true")
# spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "true")
# spark.conf.set("spark.sql.adaptive.skewJoin.enabled", "true")

# Certification Path
# Certification overview: certificate name -> topics, difficulty, prep time, cost.
certs = {
    "Databricks Certified Data Engineer Associate": {
        "topics": "Delta Lake, ELT, Workflows, Unity Catalog Basics",
        "difficulty": "ปานกลาง",
        "prep_time": "2-4 สัปดาห์",
        "cost": "$200",
    },
    "Databricks Certified Data Engineer Professional": {
        "topics": "Advanced Delta, Streaming, Optimization, Production",
        "difficulty": "ยาก",
        "prep_time": "4-8 สัปดาห์",
        "cost": "$300",
    },
    "Databricks Certified ML Professional": {
        "topics": "MLflow, Feature Store, Model Serving, AutoML",
        "difficulty": "ยาก",
        "prep_time": "4-8 สัปดาห์",
        "cost": "$300",
    },
}

# Print each certificate as a heading followed by its "field: value" lines.
print("\nDatabricks Certifications:")
for title, details in certs.items():
    print(f"\n  [{title}]")
    for label, text in details.items():
        print("    " + label + ": " + text)

เคล็ดลับ

  • Hands-on: ฝึกบน Databricks Community Edition ฟรี
  • Delta Lake: เข้าใจ Transaction Log, ACID, Time Travel ลึก
  • Unity Catalog: รู้ 3-Level Namespace, RBAC, Lineage
  • Medallion: อธิบาย Bronze/Silver/Gold ได้ชัดเจน
  • Optimization: รู้ Broadcast Join, AQE, Z-Ordering
  • Certification: สอบ Associate ก่อน แล้วค่อย Professional

Unity Catalog คืออะไร

Unity Catalog คือระบบ Centralized Governance ของ Databricks Lakehouse ที่ใช้จัดการ Data Assets เช่น Tables และ Views ผ่าน 3-Level Namespace (Catalog.Schema.Table) พร้อม Fine-grained Access Control, Data Lineage และ Audit Logs