ai

Databricks Unity Catalog กับ Interview

Databricks Unity Catalog กับ Interview

Databricks Unity Catalog

Databricks Unity Catalog กับ Interview

Unity Catalog is the centralized governance layer of the Databricks Lakehouse. It manages data assets (tables, views, volumes, models, and functions) under a 3-level namespace (catalog.schema.table) and provides fine-grained access control, data lineage, and audit logs.

เนื้อหาเกี่ยวข้อง — ทำความเข้าใจ Databricks Unity Catalog Capacity Planning

Interview preparation for data engineering roles covers Delta Lake, Spark optimization, Medallion Architecture, ETL patterns, data quality, and cost optimization.

เนื้อหาเกี่ยวข้อง — ดูเพิ่มเติมเรื่อง SonarQube Analysis Metric Collection

Unity Catalog Architecture

# unity_catalog.py — Databricks Unity Catalog

from dataclasses import dataclass, field

from typing import List, Dict, Optional



@dataclass
class Table:
    """A single table entry stored inside a schema.

    Attributes:
        name: Table name.
        table_type: Kind of table, e.g. "MANAGED" or "EXTERNAL" (not validated).
        format: Storage format, e.g. "DELTA", "PARQUET" or "CSV" (not validated).
        columns: Column descriptors as string-to-string dicts; each instance
            gets its own empty list by default.
        owner: Owning principal; empty string when unspecified.
    """

    # Required, positional fields -- order is part of the constructor signature.
    name: str
    table_type: str
    format: str

    # Optional fields.
    columns: List[Dict[str, str]] = field(default_factory=list)
    owner: str = ""



@dataclass
class Schema:
    """A named group of tables that lives inside a catalog.

    Attributes:
        name: Schema name.
        tables: Tables registered in this schema; each instance gets its own
            empty list by default.
        owner: Owning principal; empty string when unspecified.
    """

    # Required, positional field.
    name: str

    # Optional fields.
    tables: List[Table] = field(default_factory=list)
    owner: str = ""



@dataclass
class Catalog:
    """Top level of the catalog.schema.table hierarchy.

    Attributes:
        name: Catalog name.
        schemas: Schemas registered in this catalog; each instance gets its
            own empty list by default.
        owner: Owning principal; empty string when unspecified.
    """

    # Required, positional field.
    name: str

    # Optional fields.
    schemas: List[Schema] = field(default_factory=list)
    owner: str = ""



class UnityCatalog:
    """In-memory model of a Unity Catalog hierarchy and its grants.

    Keeps a list of ``Catalog`` objects (each holding schemas and tables) and a
    flat list of grant records. Everything lives in this object only.
    """

    def __init__(self):
        self.catalogs: List[Catalog] = []
        # Each grant is a dict with "principal", "privilege" and "target" keys.
        self.grants: List[Dict[str, str]] = []

    def _find_catalog(self, catalog_name: str) -> Optional[Catalog]:
        """Return the first catalog named *catalog_name*, or None."""
        return next((c for c in self.catalogs if c.name == catalog_name), None)

    def _find_schema(self, catalog_name: str, schema_name: str) -> Optional[Schema]:
        """Return the first matching schema inside the named catalog, or None."""
        cat = self._find_catalog(catalog_name)
        if cat is None:
            return None
        return next((s for s in cat.schemas if s.name == schema_name), None)

    def create_catalog(self, name: str, owner: str) -> Catalog:
        """Create a catalog, register it, and return it.

        No uniqueness check is performed; lookups use the first catalog with a
        given name.
        """
        cat = Catalog(name=name, owner=owner)
        self.catalogs.append(cat)
        return cat

    def create_schema(self, catalog_name: str, schema_name: str, owner: str) -> Optional[Schema]:
        """Create a schema inside an existing catalog.

        Returns:
            The new ``Schema``, or ``None`` when *catalog_name* is unknown.
        """
        cat = self._find_catalog(catalog_name)
        if cat is None:
            return None
        schema = Schema(name=schema_name, owner=owner)
        cat.schemas.append(schema)
        return schema

    def create_table(self, catalog: str, schema: str, table: Table) -> Optional[Table]:
        """Register *table* under ``catalog.schema``.

        Returns:
            The registered ``Table``, or ``None`` when the catalog or schema
            does not exist. Previously nothing was returned, so a failed
            registration was indistinguishable from a successful one.
        """
        sch = self._find_schema(catalog, schema)
        if sch is None:
            return None
        sch.tables.append(table)
        return table

    def grant(self, principal: str, privilege: str, target: str) -> None:
        """Record a grant of *privilege* on *target* to *principal*.

        The values are stored as given; *target* is not checked against the
        registered catalogs.
        """
        self.grants.append({
            "principal": principal,
            "privilege": privilege,
            "target": target,
        })

    def show_hierarchy(self) -> None:
        """Print every catalog with its schemas and tables to stdout."""
        rule = "=" * 55
        print(f"\n{rule}")
        print("Unity Catalog Hierarchy")
        print(rule)
        for cat in self.catalogs:
            print(f"\n  Catalog: {cat.name} (Owner: {cat.owner})")
            for schema in cat.schemas:
                print(f"    Schema: {schema.name}")
                for table in schema.tables:
                    print(f"      Table: {table.name} [{table.table_type}] ({table.format})")



# -- SQL Commands --

# CREATE CATALOG production;

# CREATE SCHEMA production.bronze;

# CREATE SCHEMA production.silver;

# CREATE SCHEMA production.gold;

#

# CREATE TABLE production.bronze.raw_events (

#   event_id STRING,

#   event_type STRING,

#   user_id STRING,

#   timestamp TIMESTAMP,

#   payload STRING

# ) USING DELTA;

#

# -- Access Control

# GRANT USE CATALOG ON CATALOG production TO `data-engineers`;

# GRANT USE SCHEMA ON SCHEMA production.bronze TO `data-engineers`;

# GRANT SELECT ON TABLE production.bronze.raw_events TO `data-analysts`;

# GRANT MODIFY ON TABLE production.silver.users TO `etl-service`;

#

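# -- Column-level Security

# -- NOTE: Unity Catalog's GRANT does not accept a column list, so the statement below is illustrative only.

# -- To restrict columns, use a column mask or a dynamic view and grant SELECT on that instead.

# GRANT SELECT (user_id, event_type) ON TABLE production.bronze.raw_events TO `limited-access`;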



# Demo: build a "production" catalog with bronze/silver/gold schemas,
# register a few tables, record two grants, then print everything.
uc = UnityCatalog()
cat = uc.create_catalog("production", "admin")

# Schemas are created in bronze -> silver -> gold order.
for layer, layer_owner in (
    ("bronze", "data-engineers"),
    ("silver", "data-engineers"),
    ("gold", "data-analysts"),
):
    uc.create_schema("production", layer, layer_owner)

# Raw tables land in the bronze schema.
tables = [
    Table(
        name="raw_events",
        table_type="MANAGED",
        format="DELTA",
        columns=[{"name": "event_id", "type": "STRING"}],
    ),
    Table(
        name="raw_users",
        table_type="MANAGED",
        format="DELTA",
        columns=[{"name": "user_id", "type": "STRING"}],
    ),
]
for t in tables:
    uc.create_table("production", "bronze", t)

# One table each for the silver and gold schemas.
uc.create_table("production", "silver", Table("clean_events", "MANAGED", "DELTA"))
uc.create_table("production", "gold", Table("daily_metrics", "MANAGED", "DELTA"))

uc.grant("data-engineers", "USE CATALOG", "production")
uc.grant("data-analysts", "SELECT", "production.gold.*")

uc.show_hierarchy()

print("\n  Grants:")
for g in uc.grants:
    print("    GRANT {privilege} ON {target} TO {principal}".format(**g))

Interview Questions

Databricks Unity Catalog กับ Interview
# interview_questions.py — Databricks Interview Prep

# Interview question bank: topic -> {"questions": [...], "key_concepts": "..."}.
interview = {
    "Delta Lake": {
        "questions": [
            "Delta Lake คืออะไร ต่างจาก Parquet อย่างไร",
            "ACID Transactions ใน Delta Lake ทำงานอย่างไร",
            "Time Travel คืออะไร ใช้อย่างไร",
            "Z-Ordering คืออะไร เมื่อไหร่ควรใช้",
            "VACUUM ทำอะไร ตั้ง Retention อย่างไร",
            "OPTIMIZE ทำอะไร ต่างจาก ZORDER อย่างไร",
        ],
        "key_concepts": "Transaction Log, ACID, Schema Evolution, CDF",
    },
    "Unity Catalog": {
        "questions": [
            "Unity Catalog คืออะไร ต่างจาก Hive Metastore อย่างไร",
            "3-Level Namespace คืออะไร",
            "Fine-grained Access Control ตั้งค่าอย่างไร",
            "Data Lineage ใช้อย่างไร",
            "External Tables vs Managed Tables",
            "Column-level Security ทำอย่างไร",
        ],
        "key_concepts": "RBAC, Lineage, Audit, Governance",
    },
    "Spark Optimization": {
        "questions": [
            "Spark Partitioning ตั้งค่าอย่างไร",
            "Broadcast Join ใช้เมื่อไหร่",
            "Shuffle คืออะไร ลดอย่างไร",
            "Caching vs Persist ต่างกันอย่างไร",
            "AQE (Adaptive Query Execution) คืออะไร",
            "Skew Join Optimization ทำอย่างไร",
        ],
        "key_concepts": "Partitioning, Shuffle, AQE, Catalyst",
    },
    "Medallion Architecture": {
        "questions": [
            "Medallion Architecture คืออะไร",
            "Bronze, Silver, Gold แตกต่างกันอย่างไร",
            "Data Quality ตรวจสอบที่ Layer ไหน",
            "SCD Type 2 ทำที่ Layer ไหน อย่างไร",
            "Streaming + Batch ใน Medallion ทำอย่างไร",
        ],
        "key_concepts": "Bronze (Raw), Silver (Clean), Gold (Business)",
    },
}

# Print each topic: heading, key concepts, then one line per question.
print("Databricks Interview Questions:")
for topic in interview:
    info = interview[topic]
    print(f"\n  [{topic}]\n    Key Concepts: {info['key_concepts']}")
    for q in info["questions"]:
        print("    Q: " + q)

PySpark Code Examples

# pyspark_examples.py — PySpark for Interview

# from pyspark.sql import SparkSession

# from pyspark.sql.functions import col, count, sum, avg, when, lit

# from delta.tables import DeltaTable



# spark = SparkSession.builder.appName("interview").getOrCreate()



# 1. Delta Lake Operations

# -- Create Delta Table

# CREATE TABLE production.bronze.events

# USING DELTA

# PARTITIONED BY (event_date)

# AS SELECT * FROM raw_events;



# -- Time Travel

# SELECT * FROM production.bronze.events VERSION AS OF 5;

# SELECT * FROM production.bronze.events TIMESTAMP AS OF '2024-01-15';



# -- MERGE (Upsert)

# MERGE INTO production.silver.users AS target

# USING staging.new_users AS source

# ON target.user_id = source.user_id

# WHEN MATCHED THEN UPDATE SET *

# WHEN NOT MATCHED THEN INSERT *;



# -- Z-Ordering

# OPTIMIZE production.silver.events ZORDER BY (user_id, event_type);



# -- VACUUM

# VACUUM production.bronze.events RETAIN 168 HOURS;



# 2. PySpark Optimization

# df = spark.read.table("production.bronze.events")

#

# # Broadcast Join (Small Table < 10MB)

# from pyspark.sql.functions import broadcast

# small_df = spark.read.table("production.silver.dim_users")

# result = df.join(broadcast(small_df), "user_id")

#

# # Partitioning

# df.repartition(200, "event_date") \

#   .write.mode("overwrite") \

#   .partitionBy("event_date") \

#   .saveAsTable("production.silver.events")

#

# # Caching

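# df.cache()  # DataFrame default storage level is MEMORY_AND_DISK, not MEMORY_ONLY (that is the RDD default)

# df.persist(StorageLevel.MEMORY_AND_DISK)  # requires: from pyspark import StorageLevel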

#

# # AQE

# spark.conf.set("spark.sql.adaptive.enabled", "true")

# spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "true")

# spark.conf.set("spark.sql.adaptive.skewJoin.enabled", "true")



# Certification Path

# Certification overview: certificate name -> topics, difficulty, prep time, cost.
certs = {
    "Databricks Certified Data Engineer Associate": {
        "topics": "Delta Lake, ELT, Workflows, Unity Catalog Basics",
        "difficulty": "ปานกลาง",
        "prep_time": "2-4 สัปดาห์",
        "cost": "$200",
    },
    "Databricks Certified Data Engineer Professional": {
        "topics": "Advanced Delta, Streaming, Optimization, Production",
        "difficulty": "ยาก",
        "prep_time": "4-8 สัปดาห์",
        "cost": "$300",
    },
    "Databricks Certified ML Professional": {
        "topics": "MLflow, Feature Store, Model Serving, AutoML",
        "difficulty": "ยาก",
        "prep_time": "4-8 สัปดาห์",
        "cost": "$300",
    },
}

# Print each certificate as a heading followed by its "key: value" details.
print("\nDatabricks Certifications:")
for cert in certs:
    info = certs[cert]
    print("\n  [" + cert + "]")
    for key, value in info.items():
        print("    " + key + ": " + value)

เคล็ดลับ

  • Hands-on: ฝึกบน Databricks Community Edition ฟรี
  • Delta Lake: เข้าใจ Transaction Log, ACID, Time Travel ลึก
  • Unity Catalog: รู้ 3-Level Namespace, RBAC, Lineage
  • Medallion: อธิบาย Bronze/Silver/Gold ได้ชัดเจน
  • Optimization: รู้ Broadcast Join, AQE, Z-Ordering
  • Certification: สอบ Associate ก่อน แล้วค่อย Professional

Unity Catalog คืออะไร

Unity Catalog is the centralized governance layer of the Databricks Lakehouse. It organizes data assets such as tables and views under a 3-level namespace (catalog.schema.table) and provides fine-grained access control, data lineage, and audit logs.

แนะนำเพิ่มเติม — เรียนเทรดกับ iCafeForex

เนื้อหาเกี่ยวข้อง — ดูเพิ่มเติมเรื่อง สมาคมสโมสรนักลงทุน — ข้อมูลครบถ้วน 2026

XM Legend · เทรดเดอร์ & ผู้สอน Forex 13 ปี

ผู้ก่อตั้ง SiamCafe ตั้งแต่ปี 1997 · เทรดเดอร์สาย Forex มากกว่า 13 ปี ได้รับการยกย่องเป็น XM Legend · แบ่งปันความรู้ Forex, ไอที, AI และการเทรด จากประสบการณ์จริงในตลาดจริง