From a48b8c43dac17dcb868d52229a482c5c6bb159b0 Mon Sep 17 00:00:00 2001 From: flashlan Date: Sat, 10 Jun 2023 00:53:25 -0300 Subject: [PATCH] changed code to functions and add time execution benchmarks on CilickHouse --- compareDBs.ipynb | 558 +++++++++++++++++++++++++---------------------- 1 file changed, 301 insertions(+), 257 deletions(-) diff --git a/compareDBs.ipynb b/compareDBs.ipynb index 5ab1d11..ae4e834 100644 --- a/compareDBs.ipynb +++ b/compareDBs.ipynb @@ -47,25 +47,16 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": null, "id": "ab6c6c81-6ac1-4668-a79b-a9a0341fb35a", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "False" - ] - }, - "execution_count": 68, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import configparser\n", + "import time\n", + "import timeit\n", "from datetime import datetime\n", "\n", "import duckdb\n", @@ -98,7 +89,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 42, "id": "968403e3-2e5e-4834-b969-be4600e2963a", "metadata": { "tags": [] @@ -138,19 +129,10 @@ "metadata": {}, "outputs": [], "source": [ - "%%time\n", + "# %%time\n", "# Load Dataset\n", - "df = pd.read_csv(\"out.csv\", index_col=0)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7e7c46b6-90ee-4ca3-8b5a-553b09ece913", - "metadata": {}, - "outputs": [], - "source": [ - "# df.head()" + "df = pd.read_csv(\"out.csv\", index_col=0)\n", + "df.head()" ] }, { @@ -202,83 +184,287 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "9cf86669-7722-4a2c-895c-51f0a5eebefc", + "execution_count": 44, + "id": "c3202bbb-2655-45b2-b166-9f45a3ef854c", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'Database created'" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# !! O client oficial usa um driver http, nesse exemplo vamos usar a biblioteca\n", "# de terceirtos clickhouse_driver recomendada, por sua vez que usa tcp.\n", - "client = Client(\n", - " host=ClickHouseUrl,\n", - " user=ClickHouseUser,\n", - " password=ClickHouseKey,\n", - " settings={\"use_numpy\": True},\n", - ")" + "def cHouseConnect():\n", + " client = Client(\n", + " host=ClickHouseUrl,\n", + " user=ClickHouseUser,\n", + " password=ClickHouseKey,\n", + " settings={\"use_numpy\": True},\n", + " )\n", + " return client\n", + "\n", + "\n", + "# Create Tables in ClickHouse\n", + "# !! ALTERAR TIPOS !!\n", + "# ENGINE: 'Memory' desaparece quando server é reiniciado\n", + "def cHouseCreateDb(databasename):\n", + " client = cHouseConnect()\n", + " client.execute(\n", + " \"CREATE TABLE IF NOT EXISTS {} (id UInt32,\"\n", + " \"from DateTime, at UInt64, to DateTime, open Float64,\"\n", + " \"close Float64, min Float64, max Float64, volume UInt32)\"\n", + " \"ENGINE MergeTree ORDER BY to\".format(databasename)\n", + " )\n", + " client.disconnect()\n", + " return \"Database created\"\n", + "\n", + "\n", + "# Write dataframe to db\n", + "def cHouseInsertDf(dbName, dataframe):\n", + " client = cHouseConnect()\n", + " client.insert_dataframe(\"INSERT INTO {} VALUES\".format(dbName), dataframe)\n", + " client.disconnect()\n", + " return \" dataframe {} inserted in clickhouse database\".format(dataframe)\n", + "\n", + "\n", + "def cHouseQueryDf(databaseName):\n", + " client = cHouseConnect()\n", + " dfQuery = client.query_dataframe(\n", + " \"SELECT * FROM default.{}\".format(databaseName)\n", + " ) # LIMIT 10000\n", + " client.disconnect()\n", + " return dfQuery\n", + "\n", + "\n", + "cHouseCreateDb(dbname)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "a0a1f67b-2e63-462e-be66-d322d99837ea", + "execution_count": 47, + "id": "cc4865b3-a1bc-4a35-9624-15334754b3a1", "metadata": {}, "outputs": [], "source": [ - "# Create Tables in ClickHouse\n", - "# !! ALTERAR TIPOS !!\n", - "# ENGINE: 'Memory' desaparece quando server é reiniciado\n", - "client.execute(\n", - " \"CREATE TABLE IF NOT EXISTS {} (id UInt32,\"\n", - " \"from DateTime, at UInt64, to DateTime, open Float64,\"\n", - " \"close Float64, min Float64, max Float64, volume UInt32)\"\n", - " \"ENGINE MergeTree ORDER BY to\".format(dbname)\n", - ")" + "# Insert to db and benchmark time\n", + "start = timeit.default_timer()\n", + "cHouseInsertDf(dbname, df)\n", + "stop = timeit.default_timer()\n", + "cHouse_write_execution_time = stop - start" ] }, { "cell_type": "code", - "execution_count": null, - "id": "3a029a09-46f4-43c3-b3df-cfbed33fb0dc", + "execution_count": 48, + "id": "1fac82c1-2d04-44ef-893a-dc13b755e6d8", "metadata": {}, "outputs": [], "source": [ - "%%time\n", - "# Write dataframe to db\n", - "client.insert_dataframe(\"INSERT INTO {} VALUES\".format(dbname), df)" + "# read from db and benchmark time\n", + "start = timeit.default_timer()\n", + "dfCh = cHouseQueryDf(dbname)\n", + "stop = timeit.default_timer()\n", + "cHouse_read_execution_time = stop - start" ] }, { "cell_type": "code", - "execution_count": null, - "id": "17251288-2442-43ee-98f2-ca680c3c4f13", + "execution_count": 49, + "id": "597ae7bd-2eea-44d7-b379-f0eb7e745c15", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idfromattoopencloseminmaxvolume
077308012023-01-02 15:58:4516726751400000000002023-01-02 15:59:001.0659951.0660351.0659301.06607057
177308012023-01-02 15:58:4516726751400000000002023-01-02 15:59:001.0659951.0660351.0659301.06607057
277308022023-01-02 15:59:0016726751550000000002023-01-02 15:59:151.0660551.0660851.0660051.06611552
377308022023-01-02 15:59:0016726751550000000002023-01-02 15:59:151.0660551.0660851.0660051.06611552
477308032023-01-02 15:59:1516726751700000000002023-01-02 15:59:301.0660801.0660251.0660251.06611057
\n", + "
" + ], + "text/plain": [ + " id from at to \\\n", + "0 7730801 2023-01-02 15:58:45 1672675140000000000 2023-01-02 15:59:00 \n", + "1 7730801 2023-01-02 15:58:45 1672675140000000000 2023-01-02 15:59:00 \n", + "2 7730802 2023-01-02 15:59:00 1672675155000000000 2023-01-02 15:59:15 \n", + "3 7730802 2023-01-02 15:59:00 1672675155000000000 2023-01-02 15:59:15 \n", + "4 7730803 2023-01-02 15:59:15 1672675170000000000 2023-01-02 15:59:30 \n", + "\n", + " open close min max volume \n", + "0 1.065995 1.066035 1.065930 1.066070 57 \n", + "1 1.065995 1.066035 1.065930 1.066070 57 \n", + "2 1.066055 1.066085 1.066005 1.066115 52 \n", + "3 1.066055 1.066085 1.066005 1.066115 52 \n", + "4 1.066080 1.066025 1.066025 1.066110 57 " + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "%%time\n", - "client.query_dataframe(\"SELECT * FROM default.{}\".format(dbname)) # LIMIT 10000" + "dfCh.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "86794e47-611f-4ca8-a7e8-07e71afafe67", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5.175396532999002\n" + ] + } + ], + "source": [ + "print(cHouse_read_execution_time)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "e7926062-8e84-4d3f-90a9-32807ce4f3d4", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "6.163630739996734\n" + ] + } + ], + "source": [ + "print(cHouse_write_execution_time)" ] }, { "cell_type": "code", "execution_count": null, - "id": "51497522-bd6c-44a8-aaea-ec5dda30b95b", + "id": "8faa5683-a204-461d-80c3-67644aa714ce", "metadata": { "tags": [] }, "outputs": [], "source": [ - "# %%time\n", - "# df = pd.DataFrame(client.query_dataframe(\"SELECT * FROM default.{}\".format(dbname)))" + "%%time\n", + "dfCh = cHouseQueryDf(dbname)" ] }, { "cell_type": "markdown", "id": "1d389546-911f-43f7-aad1-49f7bcc83503", "metadata": { + "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ @@ -427,6 +613,14 @@ "conn.close()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "73de4294-1284-49b0-b31e-45db6e835877", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, @@ -434,7 +628,36 @@ "metadata": {}, "outputs": [], "source": [ - "# Read" + "start = time.time()\n", + "# %%time\n", + "# Read\n", + "df = pd.read_sql_query('select * from \"comparedbs\"', con=engine)\n", + "end = time.time()\n", + "postgresql_read_time = exec_time(start, end)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d1b7480-5bc7-4f08-8cf3-b9590802d8f7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "print(postgresql_read_time)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6acb2959-3255-43bd-aea5-9ef70acc8902", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df.head()" ] }, { @@ -674,7 +897,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": null, "id": "14f63810-1943-4e28-9bce-2148be6be02d", "metadata": {}, "outputs": [], @@ -687,7 +910,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": null, "id": "8ff6c090-7e02-435a-a179-f2aab81da972", "metadata": {}, "outputs": [], @@ -698,7 +921,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": null, "id": "b4eb8ab9-81e8-4732-8cf7-51f0981d3d57", "metadata": { "tags": [] @@ -712,19 +935,10 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": null, "id": "97cb6b5b-65a5-46a0-a4ee-e5c535a716ab", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 925 ms, sys: 40 ms, total: 965 ms\n", - "Wall time: 1.43 s\n" - ] - } - ], + "outputs": [], "source": [ "%%time\n", "# send df to kd+ in memory bank\n", @@ -733,21 +947,10 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": null, "id": "c2ed2d51-bc8e-4207-892a-35fc55d43570", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "b':/home/sandman/q/tab1'" - ] - }, - "execution_count": 76, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# write to on disk table\n", "q.sendSync(\"`:/home/sandman/q/tab1 set t\")" @@ -755,21 +958,12 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": null, "id": "9c055a95-f73f-43a3-8fbd-61e42235117e", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 1.94 ms, sys: 1 µs, total: 1.94 ms\n", - "Wall time: 426 ms\n" - ] - } - ], + "outputs": [], "source": [ "%%time\n", "# read from on disk table\n", @@ -778,7 +972,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": null, "id": "9760de38-9f04-4322-bfff-c7ee12d5dee5", "metadata": { "tags": [] @@ -790,21 +984,12 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": null, "id": "c06c9222-c69d-4872-9d21-052281a013e2", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 1.08 s, sys: 116 ms, total: 1.2 s\n", - "Wall time: 1.27 s\n" - ] - } - ], + "outputs": [], "source": [ "%%time\n", "# load to variable df2\n", @@ -813,7 +998,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": null, "id": "8815f01c-fd0a-4f94-ab7f-f8ede84ba4e7", "metadata": { "tags": [] @@ -825,144 +1010,12 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": null, "id": "e6ed3927-4395-45cd-9a28-88c5db01f2e5", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 1.25 s, sys: 132 ms, total: 1.39 s\n", - "Wall time: 1.46 s\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Unnamed: 0idfromattoopencloseminmaxvolume
007730801b'2023-01-02 15:58:45'1672675140000000000b'2023-01-02 15:59:00'1.0659951.0660351.0659301.06607057
117730802b'2023-01-02 15:59:00'1672675155000000000b'2023-01-02 15:59:15'1.0660551.0660851.0660051.06611552
227730803b'2023-01-02 15:59:15'1672675170000000000b'2023-01-02 15:59:30'1.0660801.0660251.0660251.06611057
337730804b'2023-01-02 15:59:30'1672675185000000000b'2023-01-02 15:59:45'1.0659801.0659851.0658851.06604564
447730805b'2023-01-02 15:59:45'1672675200000000000b'2023-01-02 16:00:00'1.0659751.0660551.0658301.06605550
\n", - "
" - ], - "text/plain": [ - " Unnamed: 0 id from at \n", - "0 0 7730801 b'2023-01-02 15:58:45' 1672675140000000000 \\\n", - "1 1 7730802 b'2023-01-02 15:59:00' 1672675155000000000 \n", - "2 2 7730803 b'2023-01-02 15:59:15' 1672675170000000000 \n", - "3 3 7730804 b'2023-01-02 15:59:30' 1672675185000000000 \n", - "4 4 7730805 b'2023-01-02 15:59:45' 1672675200000000000 \n", - "\n", - " to open close min max volume \n", - "0 b'2023-01-02 15:59:00' 1.065995 1.066035 1.065930 1.066070 57 \n", - "1 b'2023-01-02 15:59:15' 1.066055 1.066085 1.066005 1.066115 52 \n", - "2 b'2023-01-02 15:59:30' 1.066080 1.066025 1.066025 1.066110 57 \n", - "3 b'2023-01-02 15:59:45' 1.065980 1.065985 1.065885 1.066045 64 \n", - "4 b'2023-01-02 16:00:00' 1.065975 1.066055 1.065830 1.066055 50 " - ] - }, - "execution_count": 82, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "%%time\n", "# converto to dataframe\n", @@ -972,21 +1025,12 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": null, "id": "0fc7f16b-6c39-4ebe-88d2-ff857e30ab62", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 1.11 s, sys: 116 ms, total: 1.23 s\n", - "Wall time: 1.3 s\n" - ] - } - ], + "outputs": [], "source": [ "%%time\n", "# select\n", @@ -995,7 +1039,7 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": null, "id": "c88646ca-3d25-4a85-80b5-f9e559f568dd", "metadata": { "tags": []