Databricks Unity Catalog
Unity Catalog provides centralized governance for Databricks Lakehouse data assets — tables, views, volumes, models, and functions — through a 3-level namespace (catalog.schema.table), fine-grained access control, data lineage, and audit logs.
Interview Preparation Data Engineering Delta Lake Spark Optimization Medallion Architecture ETL Patterns Data Quality Cost Optimization
Unity Catalog Architecture
# unity_catalog.py — Databricks Unity Catalog
from dataclasses import dataclass, field
from typing import List, Dict, Optional
@dataclass
class Table:
    """A table registered under a Unity Catalog schema."""
    name: str
    table_type: str  # MANAGED, EXTERNAL
    format: str  # DELTA, PARQUET, CSV
    # Column descriptors, e.g. [{"name": "event_id", "type": "STRING"}]
    columns: List[Dict[str, str]] = field(default_factory=list)
    owner: str = ""
@dataclass
class Schema:
    """A schema (database) inside a catalog, holding its tables."""
    name: str
    tables: List[Table] = field(default_factory=list)
    owner: str = ""
@dataclass
class Catalog:
    """Top level of the 3-level namespace; holds schemas."""
    name: str
    schemas: List[Schema] = field(default_factory=list)
    owner: str = ""
class UnityCatalog:
    """In-memory model of Unity Catalog's catalog.schema.table hierarchy
    plus a flat record of privilege grants (bookkeeping only — nothing is
    enforced)."""

    def __init__(self):
        self.catalogs: List[Catalog] = []
        self.grants: List[dict] = []

    def _find_catalog(self, name: str) -> Optional[Catalog]:
        """Return the catalog named *name*, or None if it does not exist."""
        return next((c for c in self.catalogs if c.name == name), None)

    def create_catalog(self, name: str, owner: str) -> Catalog:
        """Register a new catalog and return it."""
        cat = Catalog(name=name, owner=owner)
        self.catalogs.append(cat)
        return cat

    def create_schema(self, catalog_name: str, schema_name: str, owner: str):
        """Create a schema inside an existing catalog and return it.

        Raises:
            ValueError: if the catalog does not exist. (The previous
                implementation silently returned None, hiding the error.)
        """
        cat = self._find_catalog(catalog_name)
        if cat is None:
            raise ValueError(f"catalog not found: {catalog_name}")
        schema = Schema(name=schema_name, owner=owner)
        cat.schemas.append(schema)
        return schema

    def create_table(self, catalog: str, schema: str, table: Table):
        """Attach *table* under catalog.schema.

        Raises:
            ValueError: if the catalog or schema does not exist. (The
                previous implementation silently dropped the table.)
        """
        cat = self._find_catalog(catalog)
        if cat is None:
            raise ValueError(f"catalog not found: {catalog}")
        sch = next((s for s in cat.schemas if s.name == schema), None)
        if sch is None:
            raise ValueError(f"schema not found: {catalog}.{schema}")
        sch.tables.append(table)

    def grant(self, principal: str, privilege: str, target: str):
        """Record a privilege grant for later display; no enforcement."""
        self.grants.append({
            "principal": principal,
            "privilege": privilege,
            "target": target,
        })

    def show_hierarchy(self):
        """Print the catalog -> schema -> table tree to stdout."""
        print(f"\n{'='*55}")
        print(f"Unity Catalog Hierarchy")
        print(f"{'='*55}")
        for cat in self.catalogs:
            print(f"\n Catalog: {cat.name} (Owner: {cat.owner})")
            for schema in cat.schemas:
                print(f" Schema: {schema.name}")
                for table in schema.tables:
                    print(f" Table: {table.name} [{table.table_type}] ({table.format})")
# -- SQL Commands --
# CREATE CATALOG production;
# CREATE SCHEMA production.bronze;
# CREATE SCHEMA production.silver;
# CREATE SCHEMA production.gold;
#
# CREATE TABLE production.bronze.raw_events (
# event_id STRING,
# event_type STRING,
# user_id STRING,
# timestamp TIMESTAMP,
# payload STRING
# ) USING DELTA;
#
# -- Access Control
# GRANT USE CATALOG ON CATALOG production TO `data-engineers`;
# GRANT USE SCHEMA ON SCHEMA production.bronze TO `data-engineers`;
# GRANT SELECT ON TABLE production.bronze.raw_events TO `data-analysts`;
# GRANT MODIFY ON TABLE production.silver.users TO `etl-service`;
#
# -- Column-level Security
# GRANT SELECT (user_id, event_type) ON TABLE production.bronze.raw_events TO `limited-access`;
# Demo: build a "production" catalog with medallion-layer schemas and tables.
uc = UnityCatalog()
cat = uc.create_catalog("production", "admin")

for layer, layer_owner in (
    ("bronze", "data-engineers"),
    ("silver", "data-engineers"),
    ("gold", "data-analysts"),
):
    uc.create_schema("production", layer, layer_owner)

# Raw (bronze) tables land first.
tables = [
    Table("raw_events", "MANAGED", "DELTA", [{"name": "event_id", "type": "STRING"}]),
    Table("raw_users", "MANAGED", "DELTA", [{"name": "user_id", "type": "STRING"}]),
]
for raw_table in tables:
    uc.create_table("production", "bronze", raw_table)

# Curated layers.
uc.create_table("production", "silver", Table("clean_events", "MANAGED", "DELTA"))
uc.create_table("production", "gold", Table("daily_metrics", "MANAGED", "DELTA"))

uc.grant("data-engineers", "USE CATALOG", "production")
uc.grant("data-analysts", "SELECT", "production.gold.*")

uc.show_hierarchy()

print(f"\n Grants:")
for entry in uc.grants:
    print(f" GRANT {entry['privilege']} ON {entry['target']} TO {entry['principal']}")
Interview Questions
# interview_questions.py — Databricks Interview Prep
# Interview prep material: topic -> Thai question list + English key concepts.
# Consumed verbatim by the printing loop below; values are display data.
interview = {
    "Delta Lake": {
        "questions": [
            "Delta Lake คืออะไร ต่างจาก Parquet อย่างไร",
            "ACID Transactions ใน Delta Lake ทำงานอย่างไร",
            "Time Travel คืออะไร ใช้อย่างไร",
            "Z-Ordering คืออะไร เมื่อไหร่ควรใช้",
            "VACUUM ทำอะไร ตั้ง Retention อย่างไร",
            "OPTIMIZE ทำอะไร ต่างจาก ZORDER อย่างไร",
        ],
        "key_concepts": "Transaction Log, ACID, Schema Evolution, CDF",
    },
    "Unity Catalog": {
        "questions": [
            "Unity Catalog คืออะไร ต่างจาก Hive Metastore อย่างไร",
            "3-Level Namespace คืออะไร",
            "Fine-grained Access Control ตั้งค่าอย่างไร",
            "Data Lineage ใช้อย่างไร",
            "External Tables vs Managed Tables",
            "Column-level Security ทำอย่างไร",
        ],
        "key_concepts": "RBAC, Lineage, Audit, Governance",
    },
    "Spark Optimization": {
        "questions": [
            "Spark Partitioning ตั้งค่าอย่างไร",
            "Broadcast Join ใช้เมื่อไหร่",
            "Shuffle คืออะไร ลดอย่างไร",
            "Caching vs Persist ต่างกันอย่างไร",
            "AQE (Adaptive Query Execution) คืออะไร",
            "Skew Join Optimization ทำอย่างไร",
        ],
        "key_concepts": "Partitioning, Shuffle, AQE, Catalyst",
    },
    "Medallion Architecture": {
        "questions": [
            "Medallion Architecture คืออะไร",
            "Bronze, Silver, Gold แตกต่างกันอย่างไร",
            "Data Quality ตรวจสอบที่ Layer ไหน",
            "SCD Type 2 ทำที่ Layer ไหน อย่างไร",
            "Streaming + Batch ใน Medallion ทำอย่างไร",
        ],
        "key_concepts": "Bronze (Raw), Silver (Clean), Gold (Business)",
    },
}
# Dump every topic with its key concepts followed by its question list.
print("Databricks Interview Questions:")
for topic_name, topic_info in interview.items():
    print(f"\n [{topic_name}]")
    print(f" Key Concepts: {topic_info['key_concepts']}")
    for question in topic_info["questions"]:
        print(f" Q: {question}")
PySpark Code Examples
# pyspark_examples.py — PySpark for Interview
# from pyspark.sql import SparkSession
# from pyspark.sql.functions import col, count, sum, avg, when, lit
# from delta.tables import DeltaTable
# spark = SparkSession.builder.appName("interview").getOrCreate()
# 1. Delta Lake Operations
# -- Create Delta Table
# CREATE TABLE production.bronze.events
# USING DELTA
# PARTITIONED BY (event_date)
# AS SELECT * FROM raw_events;
# -- Time Travel
# SELECT * FROM production.bronze.events VERSION AS OF 5;
# SELECT * FROM production.bronze.events TIMESTAMP AS OF '2024-01-15';
# -- MERGE (Upsert)
# MERGE INTO production.silver.users AS target
# USING staging.new_users AS source
# ON target.user_id = source.user_id
# WHEN MATCHED THEN UPDATE SET *
# WHEN NOT MATCHED THEN INSERT *;
# -- Z-Ordering
# OPTIMIZE production.silver.events ZORDER BY (user_id, event_type);
# -- VACUUM
# VACUUM production.bronze.events RETAIN 168 HOURS;
# 2. PySpark Optimization
# df = spark.read.table("production.bronze.events")
#
# # Broadcast Join (Small Table < 10MB)
# from pyspark.sql.functions import broadcast
# small_df = spark.read.table("production.silver.dim_users")
# result = df.join(broadcast(small_df), "user_id")
#
# # Partitioning
# df.repartition(200, "event_date") \
# .write.mode("overwrite") \
# .partitionBy("event_date") \
# .saveAsTable("production.silver.events")
#
# # Caching
# df.cache() # MEMORY_ONLY
# from pyspark import StorageLevel
# df.persist(StorageLevel.MEMORY_AND_DISK)
#
# # AQE
# spark.conf.set("spark.sql.adaptive.enabled", "true")
# spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "true")
# spark.conf.set("spark.sql.adaptive.skewJoin.enabled", "true")
# Certification Path
# Certification name -> study metadata (difficulty/prep time in Thai).
# Display data only; printed by the loop below.
certs = {
    "Databricks Certified Data Engineer Associate": {
        "topics": "Delta Lake, ELT, Workflows, Unity Catalog Basics",
        "difficulty": "ปานกลาง",
        "prep_time": "2-4 สัปดาห์",
        "cost": "$200",
    },
    "Databricks Certified Data Engineer Professional": {
        "topics": "Advanced Delta, Streaming, Optimization, Production",
        "difficulty": "ยาก",
        "prep_time": "4-8 สัปดาห์",
        "cost": "$300",
    },
    "Databricks Certified ML Professional": {
        "topics": "MLflow, Feature Store, Model Serving, AutoML",
        "difficulty": "ยาก",
        "prep_time": "4-8 สัปดาห์",
        "cost": "$300",
    },
}
# Print each certification with all of its metadata fields.
print("\nDatabricks Certifications:")
for cert_name, cert_info in certs.items():
    print(f"\n [{cert_name}]")
    for field_name, field_value in cert_info.items():
        print(f" {field_name}: {field_value}")
เคล็ดลับ
- Hands-on: ฝึกบน Databricks Community Edition ฟรี
- Delta Lake: เข้าใจ Transaction Log, ACID, Time Travel ลึก
- Unity Catalog: รู้ 3-Level Namespace, RBAC, Lineage
- Medallion: อธิบาย Bronze/Silver/Gold ได้ชัดเจน
- Optimization: รู้ Broadcast Join, AQE, Z-Ordering
- Certification: สอบ Associate ก่อน แล้วค่อย Professional
Unity Catalog คืออะไร
Centralized Governance Databricks Lakehouse Data Assets Tables Views 3-Level Namespace Catalog.Schema.Table Fine-grained Access Control Data Lineage Audit Logs
Unity Catalog ต่างจาก Hive Metastore อย่างไร
Hive Metastore เป็นระบบ Legacy ที่ผูกกับ Workspace เดียวและไม่มี Fine-grained Access Control ส่วน Unity Catalog ใช้งานร่วมกันได้ข้าม Workspaces และรองรับ RBAC, Column-level/Row-level Security, Data Lineage, Audit Logs และ 3-Level Namespace
สัมภาษณ์ Data Engineer ถามอะไรเกี่ยวกับ Databricks
Delta Lake ACID Time Travel Z-Ordering Unity Catalog Access Control Lineage Spark Optimization Partitioning Caching Broadcast Join Medallion Architecture ETL
เตรียมสัมภาษณ์ Databricks อย่างไร
ศึกษา Delta Lake Unity Catalog Spark SQL ฝึก Hands-on Community Edition ฟรี สอบ Certified Data Engineer Associate Medallion Architecture PySpark SQL Mock Interview
สรุป
Databricks Unity Catalog ให้ Centralized Governance ผ่าน 3-Level Namespace, RBAC และ Data Lineage; Delta Lake ให้ ACID Transactions และ Time Travel; ควบคู่กับ Spark Optimization และ Medallion Architecture เป็นหัวข้อหลักสำหรับการเตรียมสัมภาษณ์และสอบ Certification โดยฝึก Hands-on ได้ฟรีบน Community Edition
