import duckdb

# Path to local datalake
datalake_path = "F:/datalake/AABBCCDD/"

# Create table from datalake
table_gnss_speed = duckdb.read_parquet(f"{datalake_path}/CAN2_GnssSpeed/**/*.parquet")

# Query table using standard SQL
result_gnss_speed = duckdb.sql("SELECT * FROM table_gnss_speed")

# Display summary
result_gnss_speed.show()

# Aggregate functions
result_gnss_speed_agg = duckdb.sql("SELECT COUNT(Speed), AVG(Speed), MAX(Speed), MIN(Speed) FROM table_gnss_speed")

# Subset by date range
result_gnss_speed_time_range = duckdb.sql("SELECT * FROM table_gnss_speed WHERE t BETWEEN '2022-04-24 08:00:00' AND '2022-04-24 08:50:00'")

# Subset by "sample" (1000 points)
result_gnss_speed_sample = duckdb.sql("SELECT * FROM table_gnss_speed USING SAMPLE 1000 ORDER BY t")

# Fetch sample subset as Python list (loads sample subset into memory)
result_gnss_speed_sample_list = result_gnss_speed_sample.fetchall()

# Fetch sample subset as Numpy array (loads sample subset into memory)
result_gnss_speed_sample_numpy = result_gnss_speed_sample.fetchnumpy()

# Fetch sample subset as Pandas Dataframe (loads sample subset into memory)
result_gnss_speed_sample_df = result_gnss_speed_sample.fetchdf()