Skip to content

介绍

DuckDB 是一个 嵌入式的、面向分析型查询(OLAP)的关系型数据库,你可以把它理解为:

✅ “数据库界的 SQLite + 数据仓库能力”

✅ 在应用程序内直接运行,不需要独立服务

✅ 读写 Parquet/CSV 非常强

✅ SQL 兼容性好,性能接近甚至超过部分专用分析系统

本地分析

数据科学

ETL/查询大型文件

内嵌在应用程序做快速数据分析

替代 pandas 加速数据处理

python示例


#!/usr/bin/env python
# -*- coding:utf-8 -*-
import logging
from pathlib import Path
import tempfile
import duckdb


logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(name)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s",
    datefmt="%H:%M:%S",
)
logger = logging.getLogger(__name__)
temp_directory = tempfile.gettempdir()

base_path = str(Path(temp_directory) / "duckdb")
if not Path(base_path).exists():
    Path(base_path).mkdir(parents=True, exist_ok=True)

conn = duckdb.connect()

df = duckdb.sql(
    f"SELECT ip,count(ip) FROM '{str(Path(base_path)/'request_log.parquet')}'  group by ip order by count(ip) desc"
)
logger.info(df)
# duckdb.install_extension("mysql")
# duckdb.load_extension("mysql")
HOST = ""
USER = ""
PORT = 3306
DATABASE = ""
PASSWORD = ""
ALIAS = "mysqldb"
# with duckdb.connect() as conn:
#     conn.install_extension("mysql")
#     conn.load_extension("mysql")
#     conn.sql("CALL mysql_clear_cache();")
#     conn.sql(
#         f" ATTACH 'host={HOST} user={USER} port={PORT} database={DATABASE} password={PASSWORD}' AS {ALIAS} (TYPE mysql, READ_ONLY);"
#     )
#     conn.sql(f" use {ALIAS} ;")

#     rows = conn.sql("SHOW TABLES").fetchall()
#     logger.info(rows)
#     for row in rows:
#         table_name = row[0]
#         logger.info(table_name)
#         conn.sql(
#             f"COPY {ALIAS}.{table_name} TO '{str(Path(base_path)/table_name)}.parquet';"
#         )

更新于:

note