|
|
|
@ -0,0 +1,933 @@ |
|
|
|
{ |
|
|
|
"cells": [ |
|
|
|
{ |
|
|
|
"cell_type": "markdown", |
|
|
|
"id": "7f1f5a4a-ae34-4a27-9efa-399edc0e384a", |
|
|
|
"metadata": { |
|
|
|
"tags": [] |
|
|
|
}, |
|
|
|
"source": [ |
|
|
|
"## Benchmark: ClickHouse Vs. InfluxDB Vs. Postgresql Vs. Parquet \n", |
|
|
|
"\n", |
|
|
|
"-----\n", |
|
|
|
"\n", |
|
|
|
"#### How to use:\n", |
|
|
|
"* Rename the file \"properties-model.ini\" to \"properties.ini\"\n", |
|
|
|
"* Fill with your own credentials\n", |
|
|
|
"----\n", |
|
|
|
"\n", |
|
|
|
"The purpose of this work is to compare read/write speed for a medium-sized dataset (9 columns and 50,000 rows) across the following data stores:\n", |
|
|
|
"* ClickHouse\n", |
|
|
|
"* InfluxDB\n", |
|
|
|
"* Postgresql\n", |
|
|
|
"* Parquet (in a S3 Minio Storage)\n", |
|
|
|
"* DuckDB with Polars\n", |
|
|
|
"* MongoDB\n", |
|
|
|
"* Kdb+\n", |
|
|
|
"\n", |
|
|
|
" - [ ] Clickhouse read\n", |
|
|
|
" \n", |
|
|
|
"Deve-se relevar:\n", |
|
|
|
"é uma \"cold-storage\" ou \"freeze-storage\"\n", |
|
|
|
"influxdb: alta leitura e tem a vantagem da indexação para visualização de dados em gráficos\n", |
|
|
|
"\n", |
|
|
|
"notas: \n", |
|
|
|
"* comparar tamanho do csv com parquet" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "markdown", |
|
|
|
"id": "6bb26ce7-1e84-4665-accd-916bb977f95d", |
|
|
|
"metadata": { |
|
|
|
"tags": [] |
|
|
|
}, |
|
|
|
"source": [ |
|
|
|
"### Imports " |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 74, |
|
|
|
"id": "ab6c6c81-6ac1-4668-a79b-a9a0341fb35a", |
|
|
|
"metadata": { |
|
|
|
"tags": [] |
|
|
|
}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"data": { |
|
|
|
"text/plain": [ |
|
|
|
"False" |
|
|
|
] |
|
|
|
}, |
|
|
|
"execution_count": 74, |
|
|
|
"metadata": {}, |
|
|
|
"output_type": "execute_result" |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"import configparser\n", |



"import io\n", |



"from datetime import datetime\n", |



"\n", |



"import influxdb_client\n", |



"import pandas as pd\n", |



"from clickhouse_driver import Client\n", |



"from dotenv import load_dotenv\n", |



"from minio import Minio\n", |



"from minio.error import S3Error\n", |



"from pymongo import MongoClient\n", |



"from pytz import timezone\n", |



"from sqlalchemy import create_engine\n", |
|
|
|
"\n", |
|
|
|
"load_dotenv()\n", |
|
|
|
"\n", |
|
|
|
"\n", |
|
|
|
"# import io\n", |
|
|
|
"# import time\n", |
|
|
|
"# import numpy as np\n", |
|
|
|
"# import clickhouse_connect\n", |
|
|
|
"# pip install python-dotenv\n", |
|
|
|
"# import psycopg2\n", |
|
|
|
"# import os\n", |
|
|
|
"# import pyarrow as pa\n", |
|
|
|
"# import pyarrow.parquet as pq\n", |
|
|
|
"# import s3fs\n", |
|
|
|
"# from friendly.jupyter import Friendly\n", |
|
|
|
"# from minio.error import S3Error\n", |
|
|
|
"# from pyarrow import Table\n", |
|
|
|
"# import os\n", |
|
|
|
"# from influxdb_client import InfluxDBClient, Point, WritePrecision\n", |
|
|
|
"# from influxdb_client.client.write_api import SYNCHRONOUS\n", |
|
|
|
"# Friendly.dark()" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "01d88282-32a1-404f-92da-488a23302fd0", |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"# teset" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "55c3cd57-0996-4723-beb5-8f3196c96009", |
|
|
|
"metadata": { |
|
|
|
"tags": [] |
|
|
|
}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"# Variables\n", |
|
|
|
"dbname = \"EURUSDtest\"" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "968403e3-2e5e-4834-b969-be4600e2963a", |
|
|
|
"metadata": { |
|
|
|
"tags": [] |
|
|
|
}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"arq = configparser.RawConfigParser()\n", |
|
|
|
"arq.read(\"properties.ini\")\n", |
|
|
|
"ClickHouseUser = arq.get(\"CLICKHOUSE\", \"user\")\n", |
|
|
|
"ClickHouseKey = arq.get(\"CLICKHOUSE\", \"key\")\n", |
|
|
|
"ClickHouseUrl = arq.get(\"CLICKHOUSE\", \"url\")\n", |
|
|
|
"\n", |
|
|
|
"InfluxDBUser = arq.get(\"INFLUXDB\", \"user\")\n", |
|
|
|
"InfluxDBKey = arq.get(\"INFLUXDB\", \"key\")\n", |
|
|
|
"InfluxDBUrl = arq.get(\"INFLUXDB\", \"url\")\n", |
|
|
|
"InfluxDBBucket = arq.get(\"INFLUXDB\", \"bucket\")\n", |
|
|
|
"\n", |
|
|
|
"PostgresqlUser = arq.get(\"POSTGRESQL\", \"user\")\n", |
|
|
|
"PostgresqlKey = arq.get(\"POSTGRESQL\", \"key\")\n", |
|
|
|
"PostgresqlUrl = arq.get(\"POSTGRESQL\", \"url\")\n", |
|
|
|
"PostgresqlDB = arq.get(\"POSTGRESQL\", \"database\")\n", |
|
|
|
"\n", |
|
|
|
"S3MinioUser = arq.get(\"S3MINIO\", \"user\")\n", |
|
|
|
"S3MinioKey = arq.get(\"S3MINIO\", \"key\")\n", |
|
|
|
"S3MinioUrl = arq.get(\"S3MINIO\", \"url\")\n", |
|
|
|
"S3MinioRegion = arq.get(\"S3MINIO\", \"region\")\n", |
|
|
|
"\n", |
|
|
|
"MongoUser = arq.get(\"MONGODB\", \"user\")\n", |
|
|
|
"MongoKey = arq.get(\"MONGODB\", \"key\")\n", |
|
|
|
"MongoUrl = arq.get(\"MONGODB\", \"url\")" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "3634a4ec-04c2-4f1e-8659-5d22eb17a254", |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"%%time\n", |
|
|
|
"# Load Dataset\n", |
|
|
|
"df = pd.read_csv(\"out.csv\", index_col=0)" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "7e7c46b6-90ee-4ca3-8b5a-553b09ece913", |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"# df.head()" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "76199f91-31d6-416b-9f15-5d435b3792c9", |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"df[\"from\"] = pd.to_datetime(df[\"from\"], unit=\"s\")\n", |
|
|
|
"df[\"to\"] = pd.to_datetime(df[\"to\"], unit=\"s\")\n", |
|
|
|
"# Optional use when not transformed yet\n", |
|
|
|
"# Transform Datetime" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "markdown", |
|
|
|
"id": "274cc026-2f48-4e38-b80f-b1a9ff982060", |
|
|
|
"metadata": { |
|
|
|
"jp-MarkdownHeadingCollapsed": true, |
|
|
|
"tags": [] |
|
|
|
}, |
|
|
|
"source": [ |
|
|
|
"#### Funções\n", |
|
|
|
"\n", |
|
|
|
"-> Class" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "27de1ec8-4de1-440a-b555-b4a46c5ef7ce", |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"def timestamp2dataHora(x, timezone_=\"America/Sao_Paulo\"):\n", |
|
|
|
" d = datetime.fromtimestamp(x, tz=timezone(timezone_))\n", |
|
|
|
" return d" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "markdown", |
|
|
|
"id": "4a8d5703-9bc9-4d38-83ff-457159304d58", |
|
|
|
"metadata": { |
|
|
|
"jp-MarkdownHeadingCollapsed": true, |
|
|
|
"tags": [] |
|
|
|
}, |
|
|
|
"source": [ |
|
|
|
"### ClickHouse" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "9cf86669-7722-4a2c-895c-51f0a5eebefc", |
|
|
|
"metadata": { |
|
|
|
"tags": [] |
|
|
|
}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"# NOTE: the official client uses an HTTP driver; this example uses the recommended\n", |



"# third-party clickhouse_driver library instead, which talks over TCP.\n", |
|
|
|
"client = Client(\n", |
|
|
|
" host=ClickHouseUrl,\n", |
|
|
|
" user=ClickHouseUser,\n", |
|
|
|
" password=ClickHouseKey,\n", |
|
|
|
" settings={\"use_numpy\": True},\n", |
|
|
|
")" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "a0a1f67b-2e63-462e-be66-d322d99837ea", |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"# Create Tables in ClickHouse\n", |
|
|
|
"# !! TODO: change the column types !!\n", |



"# ENGINE: a 'Memory' table disappears when the server is restarted\n", |
|
|
|
"client.execute(\n", |
|
|
|
" \"CREATE TABLE IF NOT EXISTS {} (id UInt32,\"\n", |
|
|
|
" \"from DateTime, at UInt64, to DateTime, open Float64,\"\n", |
|
|
|
" \"close Float64, min Float64, max Float64, volume UInt32)\"\n", |
|
|
|
" \"ENGINE MergeTree ORDER BY to\".format(dbname)\n", |
|
|
|
")" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "3a029a09-46f4-43c3-b3df-cfbed33fb0dc", |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"%%time\n", |
|
|
|
"# Write dataframe to db\n", |
|
|
|
"client.insert_dataframe(\"INSERT INTO {} VALUES\".format(dbname), df)" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "17251288-2442-43ee-98f2-ca680c3c4f13", |
|
|
|
"metadata": { |
|
|
|
"tags": [] |
|
|
|
}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"%%time\n", |
|
|
|
"client.query_dataframe(\"SELECT * FROM default.{}\".format(dbname)) # LIMIT 10000" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "51497522-bd6c-44a8-aaea-ec5dda30b95b", |
|
|
|
"metadata": { |
|
|
|
"tags": [] |
|
|
|
}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"%%time\n", |
|
|
|
"df = pd.DataFrame(client.query_dataframe(\"SELECT * FROM default.{}\".format(dbname)))" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "markdown", |
|
|
|
"id": "1d389546-911f-43f7-aad1-49f7bcc83503", |
|
|
|
"metadata": { |
|
|
|
"tags": [] |
|
|
|
}, |
|
|
|
"source": [ |
|
|
|
"### InfluxDB\n" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "c3e7ebfd-76f1-4ac4-9833-312eb1a531af", |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"client = influxdb_client.InfluxDBClient(\n", |
|
|
|
" url=InfluxDBUrl, token=InfluxDBKey, org=InfluxDBUser\n", |
|
|
|
")" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "cbf61f12-830b-4c57-804a-2257d8b3599a", |
|
|
|
"metadata": { |
|
|
|
"tags": [] |
|
|
|
}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"# Read data from CSV without index and parse 'from' as date.\n", |
|
|
|
"df = pd.read_csv(\"out.csv\", sep=\",\", index_col=False, parse_dates=[\"from\"])\n", |
|
|
|
"# Set 'from' field as index of dataframe # test other indexes\n", |
|
|
|
"df.set_index(\"from\", inplace=True)" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "54342a28-ba2b-4ade-a692-00566b53a639", |
|
|
|
"metadata": { |
|
|
|
"tags": [] |
|
|
|
}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"df.head()" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "f861fab2-f1b1-49dd-b758-12d10aef3462", |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"%%time\n", |
|
|
|
"# writing... it took a while... but it worked\n", |
|
|
|
"with client.write_api() as writer:\n", |
|
|
|
" writer.write(\n", |
|
|
|
" bucket=InfluxDBBucket,\n", |
|
|
|
" record=df,\n", |
|
|
|
" data_frame_measurement_name=\"id\",\n", |
|
|
|
" data_frame_tag_columns=[\"volume\"],\n", |
|
|
|
" )" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "0bb2563d-68e2-4ff4-8842-70ac730dc6b1", |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"# data\n", |
|
|
|
"# |> pivot(\n", |
|
|
|
"# rowKey:[\"_time\"],\n", |
|
|
|
"# columnKey: [\"_field\"],\n", |
|
|
|
"# valueColumn: \"_value\"\n", |
|
|
|
"# )" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "bb1596f9-4cee-4642-803a-ee61c9dddf64", |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"# Read" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "markdown", |
|
|
|
"id": "b9ddfdc6-c899-4f6c-9b4e-8ec6ab6d7e05", |
|
|
|
"metadata": { |
|
|
|
"jp-MarkdownHeadingCollapsed": true, |
|
|
|
"tags": [] |
|
|
|
}, |
|
|
|
"source": [ |
|
|
|
"### Postgresql" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "16cd8eb7-333d-43fd-88e0-ee983645d3fd", |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"# Connect / Create Tables\n", |
|
|
|
"engine = create_engine(\n", |
|
|
|
" \"postgresql+psycopg2://{}:{}@{}:5432/{}\".format(\n", |
|
|
|
" PostgresqlUser, PostgresqlKey, PostgresqlUrl, PostgresqlDB\n", |
|
|
|
" )\n", |
|
|
|
")" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "be31f3a0-b7ed-48e6-9b65-dc16319fb8d1", |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"# Drop old table and create new empty table\n", |
|
|
|
"df.head(0).to_sql(\"comparedbs\", engine, if_exists=\"replace\", index=False)" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "a7883c4d-4609-4380-8a45-246b7ca2f9c5", |
|
|
|
"metadata": { |
|
|
|
"tags": [] |
|
|
|
}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"%%time\n", |
|
|
|
"# Write\n", |
|
|
|
"conn = engine.raw_connection()\n", |
|
|
|
"cur = conn.cursor()\n", |
|
|
|
"output = io.StringIO()\n", |
|
|
|
"df.to_csv(output, sep=\"\\t\", header=False, index=False)\n", |
|
|
|
"output.seek(0)\n", |
|
|
|
"contents = output.getvalue()\n", |
|
|
|
"\n", |
|
|
|
"cur.copy_from(output, \"comparedbs\") # , null=\"\") # null values become ''\n", |
|
|
|
"conn.commit()\n", |
|
|
|
"cur.close()\n", |
|
|
|
"conn.close()" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "e37a93e1-fc0e-4d27-9e16-dca6c8aea324", |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"# Read" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "markdown", |
|
|
|
"id": "f9e0393d-7d1d-406a-a068-9dbf4968e977", |
|
|
|
"metadata": { |
|
|
|
"jp-MarkdownHeadingCollapsed": true, |
|
|
|
"tags": [] |
|
|
|
}, |
|
|
|
"source": [ |
|
|
|
"### S3 Parquet" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "98cf93c9-cb63-436c-809b-ef3ff4c3d8a5", |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"# fs = s3fs.S3FileSystem(\n", |
|
|
|
"# anon=False,\n", |
|
|
|
"# use_ssl=False,\n", |
|
|
|
"# client_kwargs={\n", |
|
|
|
"# \"region_name\": S3MinioRegion,\n", |
|
|
|
"# \"endpoint_url\": S3MinioUrl,\n", |
|
|
|
"# \"aws_access_key_id\": S3MinioUser,\n", |



"# \"aws_secret_access_key\": S3MinioKey,\n", |
|
|
|
"# \"verify\": False,\n", |
|
|
|
"# },\n", |
|
|
|
"# )" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 72, |
|
|
|
"id": "60a990e2-4607-4654-84ec-17d4985adae2", |
|
|
|
"metadata": { |
|
|
|
"tags": [] |
|
|
|
}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"# try it without a function to see whether it gets faster\n", |



"# check that the files in the git folder are on the SSD\n", |
|
|
|
"def main():\n", |
|
|
|
" # Create a client with the MinIO server playground, its access key\n", |
|
|
|
" # and secret key.\n", |
|
|
|
" client = Minio(\n", |
|
|
|
" S3MinioUrl,\n", |
|
|
|
" secure=False,\n", |
|
|
|
" region=S3MinioRegion,\n", |
|
|
|
" access_key=\"MatMPA7NyHltz7DQ\",\n", |
|
|
|
" secret_key=\"SO1IG5iBPSjNPZanYUaHCLcoSbjphLCP\",\n", |
|
|
|
" )\n", |
|
|
|
"\n", |
|
|
|
" # Make 'asiatrip' bucket if not exist.\n", |
|
|
|
" found = client.bucket_exists(\"data\")\n", |
|
|
|
" if not found:\n", |
|
|
|
" client.make_bucket(\"data\")\n", |
|
|
|
" else:\n", |
|
|
|
" print(\"Bucket 'data' already exists\")\n", |
|
|
|
"\n", |
|
|
|
" # Upload '/home/user/Photos/asiaphotos.zip' as object name\n", |
|
|
|
" # 'asiaphotos-2015.zip' to bucket 'asiatrip'.\n", |
|
|
|
" client.fput_object(\n", |
|
|
|
" \"data\",\n", |
|
|
|
" \"data.parquet\",\n", |
|
|
|
" \"data/data.parquet\",\n", |
|
|
|
" )\n", |
|
|
|
" # print(\n", |
|
|
|
" # \"'data/data.parquet' is successfully uploaded as \"\n", |
|
|
|
" # \"object 'data.parquet' to bucket 'data'.\"\n", |
|
|
|
" # )" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 73, |
|
|
|
"id": "390918c8-c88f-404a-96c4-685d578fdad0", |
|
|
|
"metadata": { |
|
|
|
"tags": [] |
|
|
|
}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"Bucket 'data' already exists\n", |
|
|
|
"CPU times: user 610 ms, sys: 133 ms, total: 743 ms\n", |
|
|
|
"Wall time: 4.05 s\n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"%%time\n", |
|
|
|
"df.to_parquet(\"data/data.parquet\")\n", |
|
|
|
"if __name__ == \"__main__\":\n", |
|
|
|
" try:\n", |
|
|
|
" main()\n", |
|
|
|
" except S3Error as exc:\n", |
|
|
|
" print(\"error occurred.\", exc)" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 71, |
|
|
|
"id": "a9e07143-8c11-4b68-a869-c3922cda9092", |
|
|
|
"metadata": { |
|
|
|
"tags": [] |
|
|
|
}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"data": { |
|
|
|
"text/html": [ |
|
|
|
"<div>\n", |
|
|
|
"<style scoped>\n", |
|
|
|
" .dataframe tbody tr th:only-of-type {\n", |
|
|
|
" vertical-align: middle;\n", |
|
|
|
" }\n", |
|
|
|
"\n", |
|
|
|
" .dataframe tbody tr th {\n", |
|
|
|
" vertical-align: top;\n", |
|
|
|
" }\n", |
|
|
|
"\n", |
|
|
|
" .dataframe thead th {\n", |
|
|
|
" text-align: right;\n", |
|
|
|
" }\n", |
|
|
|
"</style>\n", |
|
|
|
"<table border=\"1\" class=\"dataframe\">\n", |
|
|
|
" <thead>\n", |
|
|
|
" <tr style=\"text-align: right;\">\n", |
|
|
|
" <th></th>\n", |
|
|
|
" <th>Unnamed: 0</th>\n", |
|
|
|
" <th>id</th>\n", |
|
|
|
" <th>at</th>\n", |
|
|
|
" <th>to</th>\n", |
|
|
|
" <th>open</th>\n", |
|
|
|
" <th>close</th>\n", |
|
|
|
" <th>min</th>\n", |
|
|
|
" <th>max</th>\n", |
|
|
|
" <th>volume</th>\n", |
|
|
|
" </tr>\n", |
|
|
|
" <tr>\n", |
|
|
|
" <th>from</th>\n", |
|
|
|
" <th></th>\n", |
|
|
|
" <th></th>\n", |
|
|
|
" <th></th>\n", |
|
|
|
" <th></th>\n", |
|
|
|
" <th></th>\n", |
|
|
|
" <th></th>\n", |
|
|
|
" <th></th>\n", |
|
|
|
" <th></th>\n", |
|
|
|
" <th></th>\n", |
|
|
|
" </tr>\n", |
|
|
|
" </thead>\n", |
|
|
|
" <tbody>\n", |
|
|
|
" <tr>\n", |
|
|
|
" <th>2023-01-02 15:58:45</th>\n", |
|
|
|
" <td>0</td>\n", |
|
|
|
" <td>7730801</td>\n", |
|
|
|
" <td>1672675140000000000</td>\n", |
|
|
|
" <td>2023-01-02 15:59:00</td>\n", |
|
|
|
" <td>1.065995</td>\n", |
|
|
|
" <td>1.066035</td>\n", |
|
|
|
" <td>1.065930</td>\n", |
|
|
|
" <td>1.066070</td>\n", |
|
|
|
" <td>57</td>\n", |
|
|
|
" </tr>\n", |
|
|
|
" <tr>\n", |
|
|
|
" <th>2023-01-02 15:59:00</th>\n", |
|
|
|
" <td>1</td>\n", |
|
|
|
" <td>7730802</td>\n", |
|
|
|
" <td>1672675155000000000</td>\n", |
|
|
|
" <td>2023-01-02 15:59:15</td>\n", |
|
|
|
" <td>1.066055</td>\n", |
|
|
|
" <td>1.066085</td>\n", |
|
|
|
" <td>1.066005</td>\n", |
|
|
|
" <td>1.066115</td>\n", |
|
|
|
" <td>52</td>\n", |
|
|
|
" </tr>\n", |
|
|
|
" <tr>\n", |
|
|
|
" <th>2023-01-02 15:59:15</th>\n", |
|
|
|
" <td>2</td>\n", |
|
|
|
" <td>7730803</td>\n", |
|
|
|
" <td>1672675170000000000</td>\n", |
|
|
|
" <td>2023-01-02 15:59:30</td>\n", |
|
|
|
" <td>1.066080</td>\n", |
|
|
|
" <td>1.066025</td>\n", |
|
|
|
" <td>1.066025</td>\n", |
|
|
|
" <td>1.066110</td>\n", |
|
|
|
" <td>57</td>\n", |
|
|
|
" </tr>\n", |
|
|
|
" <tr>\n", |
|
|
|
" <th>2023-01-02 15:59:30</th>\n", |
|
|
|
" <td>3</td>\n", |
|
|
|
" <td>7730804</td>\n", |
|
|
|
" <td>1672675185000000000</td>\n", |
|
|
|
" <td>2023-01-02 15:59:45</td>\n", |
|
|
|
" <td>1.065980</td>\n", |
|
|
|
" <td>1.065985</td>\n", |
|
|
|
" <td>1.065885</td>\n", |
|
|
|
" <td>1.066045</td>\n", |
|
|
|
" <td>64</td>\n", |
|
|
|
" </tr>\n", |
|
|
|
" <tr>\n", |
|
|
|
" <th>2023-01-02 15:59:45</th>\n", |
|
|
|
" <td>4</td>\n", |
|
|
|
" <td>7730805</td>\n", |
|
|
|
" <td>1672675200000000000</td>\n", |
|
|
|
" <td>2023-01-02 16:00:00</td>\n", |
|
|
|
" <td>1.065975</td>\n", |
|
|
|
" <td>1.066055</td>\n", |
|
|
|
" <td>1.065830</td>\n", |
|
|
|
" <td>1.066055</td>\n", |
|
|
|
" <td>50</td>\n", |
|
|
|
" </tr>\n", |
|
|
|
" </tbody>\n", |
|
|
|
"</table>\n", |
|
|
|
"</div>" |
|
|
|
], |
|
|
|
"text/plain": [ |
|
|
|
" Unnamed: 0 id at \n", |
|
|
|
"from \n", |
|
|
|
"2023-01-02 15:58:45 0 7730801 1672675140000000000 \\\n", |
|
|
|
"2023-01-02 15:59:00 1 7730802 1672675155000000000 \n", |
|
|
|
"2023-01-02 15:59:15 2 7730803 1672675170000000000 \n", |
|
|
|
"2023-01-02 15:59:30 3 7730804 1672675185000000000 \n", |
|
|
|
"2023-01-02 15:59:45 4 7730805 1672675200000000000 \n", |
|
|
|
"\n", |
|
|
|
" to open close min \n", |
|
|
|
"from \n", |
|
|
|
"2023-01-02 15:58:45 2023-01-02 15:59:00 1.065995 1.066035 1.065930 \\\n", |
|
|
|
"2023-01-02 15:59:00 2023-01-02 15:59:15 1.066055 1.066085 1.066005 \n", |
|
|
|
"2023-01-02 15:59:15 2023-01-02 15:59:30 1.066080 1.066025 1.066025 \n", |
|
|
|
"2023-01-02 15:59:30 2023-01-02 15:59:45 1.065980 1.065985 1.065885 \n", |
|
|
|
"2023-01-02 15:59:45 2023-01-02 16:00:00 1.065975 1.066055 1.065830 \n", |
|
|
|
"\n", |
|
|
|
" max volume \n", |
|
|
|
"from \n", |
|
|
|
"2023-01-02 15:58:45 1.066070 57 \n", |
|
|
|
"2023-01-02 15:59:00 1.066115 52 \n", |
|
|
|
"2023-01-02 15:59:15 1.066110 57 \n", |
|
|
|
"2023-01-02 15:59:30 1.066045 64 \n", |
|
|
|
"2023-01-02 15:59:45 1.066055 50 " |
|
|
|
] |
|
|
|
}, |
|
|
|
"execution_count": 71, |
|
|
|
"metadata": {}, |
|
|
|
"output_type": "execute_result" |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"pq = pd.read_parquet(\"data/data.parquet\", engine=\"pyarrow\")\n", |
|
|
|
"pq.head()" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "62b5ecf6-1178-4824-9c97-91522abcde93", |
|
|
|
"metadata": { |
|
|
|
"tags": [] |
|
|
|
}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"# # from friendly.jupyter import Friendly\n", |
|
|
|
"# path_to_s3_object = \"http://192.168.1.125:9000/obsidian/sample.parquet\"\n", |
|
|
|
"# # df = to_df(data)\n", |
|
|
|
"# df.to_parquet(\"data/data.parquet\")" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "55dd51a3-7387-467c-95f0-6c282c4135eb", |
|
|
|
"metadata": { |
|
|
|
"tags": [] |
|
|
|
}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"# with fs.open(\"obsidian/data.parquet\", \"wb\") as f:\n", |
|
|
|
"# df.to_parquet(f)" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "0c351614-8373-4822-a423-20412c92a6eb", |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"# s3_filepath = \"obsidian/data.parquet\"\n", |
|
|
|
"\n", |
|
|
|
"# pq.write_to_dataset(\n", |
|
|
|
"# Table.from_pandas(df),\n", |
|
|
|
"# s3_filepath,\n", |
|
|
|
"# filesystem=fs,\n", |
|
|
|
"# use_dictionary=True,\n", |
|
|
|
"# compression=\"snappy\",\n", |
|
|
|
"# version=\"2.4\",\n", |
|
|
|
"# )" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "480cc01f-3239-4f7d-b20a-70c17d59d6f6", |
|
|
|
"metadata": { |
|
|
|
"tags": [] |
|
|
|
}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"# pq.write_to_dataset(\n", |
|
|
|
"# Table.from_pandas(df),\n", |
|
|
|
"# path_to_s3_object,\n", |
|
|
|
"# filesystem=fs,\n", |
|
|
|
"# use_dictionary=True,\n", |
|
|
|
"# compression=\"snappy\",\n", |
|
|
|
"# version=\"2.0\",\n", |
|
|
|
"# )" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "c33f70ac-cfcf-4024-af86-c08bcc60d9a5", |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"# path_to_s3_object = \"s3://sample-bucket/path/to/sample.parquet\"\n", |
|
|
|
"\n", |
|
|
|
"# data = [\n", |
|
|
|
"# {\n", |
|
|
|
"# \"hoge\": 1,\n", |
|
|
|
"# \"foo\": \"blah\",\n", |
|
|
|
"# },\n", |
|
|
|
"# {\n", |
|
|
|
"# \"boo\": \"test\",\n", |
|
|
|
"# \"bar\": 123,\n", |
|
|
|
"# },\n", |
|
|
|
"# ]\n", |
|
|
|
"# df = to_df(data)\n", |
|
|
|
"# pq.write_to_dataset(\n", |
|
|
|
"# Table.from_pandas(df),\n", |
|
|
|
"# path_to_s3_object,\n", |
|
|
|
"# filesystem=fs,\n", |
|
|
|
"# use_dictionary=True,\n", |
|
|
|
"# compression=\"snappy\",\n", |
|
|
|
"# version=\"2.0\",\n", |
|
|
|
"# )" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "markdown", |
|
|
|
"id": "50d1fc58-89a7-4507-aff0-6e943656cfe0", |
|
|
|
"metadata": {}, |
|
|
|
"source": [ |
|
|
|
"### MongoDB" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "81a4a33d-5914-45d8-af4e-2b0aabd2ac38", |
|
|
|
"metadata": { |
|
|
|
"tags": [] |
|
|
|
}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"client = MongoClient(MongoUrl);" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "3e634d34-ad62-432e-aa0c-07cd4b7556e2", |
|
|
|
"metadata": { |
|
|
|
"tags": [] |
|
|
|
}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"DB = client[\"collection_name\"]" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "dd871028-41c3-4f0d-aefc-f2ea4ee866e7", |
|
|
|
"metadata": { |
|
|
|
"scrolled": true, |
|
|
|
"tags": [] |
|
|
|
}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"db = client[\"test\"]" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "markdown", |
|
|
|
"id": "97405e42-61dc-42c7-8220-237a312c0ec7", |
|
|
|
"metadata": {}, |
|
|
|
"source": [ |
|
|
|
"### DuckDB" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "bbcdb883-d6dc-46db-88db-4c90b84522ba", |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "markdown", |
|
|
|
"id": "4409cc89-ed14-4313-ac89-65b826038533", |
|
|
|
"metadata": {}, |
|
|
|
"source": [ |
|
|
|
"### Kdb+" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "14f63810-1943-4e28-9bce-2148be6be02d", |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"id": "3bf5e29b-fd38-4ec6-b583-f53e504073ab", |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [] |
|
|
|
} |
|
|
|
], |
|
|
|
"metadata": { |
|
|
|
"kernelspec": { |
|
|
|
"display_name": "Python 3 (ipykernel)", |
|
|
|
"language": "python", |
|
|
|
"name": "python3" |
|
|
|
}, |
|
|
|
"language_info": { |
|
|
|
"codemirror_mode": { |
|
|
|
"name": "ipython", |
|
|
|
"version": 3 |
|
|
|
}, |
|
|
|
"file_extension": ".py", |
|
|
|
"mimetype": "text/x-python", |
|
|
|
"name": "python", |
|
|
|
"nbconvert_exporter": "python", |
|
|
|
"pygments_lexer": "ipython3", |
|
|
|
"version": "3.10.11" |
|
|
|
}, |
|
|
|
"widgets": { |
|
|
|
"application/vnd.jupyter.widget-state+json": { |
|
|
|
"state": {}, |
|
|
|
"version_major": 2, |
|
|
|
"version_minor": 0 |
|
|
|
} |
|
|
|
} |
|
|
|
}, |
|
|
|
"nbformat": 4, |
|
|
|
"nbformat_minor": 5 |
|
|
|
} |