{ "cells": [ { "cell_type": "code", "execution_count": 2, "id": "8d7e4358", "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "markdown", "id": "c5b2042e", "metadata": {}, "source": [ "```bash\n", "pip install sqlalchemy psycopg2-binary \n", "```" ] }, { "cell_type": "code", "execution_count": 3, "id": "a1096f72", "metadata": {}, "outputs": [], "source": [ "from sqlalchemy import create_engine" ] }, { "cell_type": "code", "execution_count": 4, "id": "f569baa5", "metadata": {}, "outputs": [], "source": [ "engine = create_engine('postgresql://root:root@localhost:5432/ny_taxi')\n" ] }, { "cell_type": "markdown", "id": "df36c834", "metadata": {}, "source": [ "If you cannot connect to ny_taxi, use `postgres` database instead\n", "\n", "```python\n", "engine = create_engine('postgresql://root:root@localhost:5432/postgres')\n", "```" ] }, { "cell_type": "code", "execution_count": 5, "id": "523f5cf5", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "engine.connect()" ] }, { "cell_type": "code", "execution_count": 6, "id": "9f5a61a1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
number
01
\n", "
" ], "text/plain": [ " number\n", "0 1" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "query = \"\"\"\n", "SELECT 1 as number;\n", "\"\"\"\n", "\n", "pd.read_sql(query, con=engine)" ] }, { "cell_type": "code", "execution_count": 7, "id": "e1c509ea", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
schemanametablenametableownertablespacehasindexeshasruleshastriggersrowsecurity
0publicyellow_tripdata_triprootNoneFalseFalseFalseFalse
1publicyellow_tripdatarootNoneFalseFalseFalseFalse
2publicyellow_taxi_datarootNoneTrueFalseFalseFalse
3publiczonesrootNoneTrueFalseFalseFalse
\n", "
" ], "text/plain": [ " schemaname tablename tableowner tablespace hasindexes \\\n", "0 public yellow_tripdata_trip root None False \n", "1 public yellow_tripdata root None False \n", "2 public yellow_taxi_data root None True \n", "3 public zones root None True \n", "\n", " hasrules hastriggers rowsecurity \n", "0 False False False \n", "1 False False False \n", "2 False False False \n", "3 False False False " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "query = \"\"\"\n", "SELECT *\n", "FROM pg_catalog.pg_tables\n", "WHERE schemaname != 'pg_catalog' AND \n", " schemaname != 'information_schema';\n", "\"\"\"\n", "\n", "pd.read_sql(query, con=engine)" ] }, { "cell_type": "code", "execution_count": 8, "id": "2743d8e2", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Dockerfile\n", "InsertData.png\n", "README.md\n", "docker-compose.yaml\n", "ingest_data.py\n", "pg-test-connection.ipynb\n", "pipeline.py\n", "\u001b[1m\u001b[36mpostgres_data\u001b[m\u001b[m\n", "upload-data.ipynb\n" ] } ], "source": [ "%%bash\n", "ls\n" ] }, { "cell_type": "code", "execution_count": 4, "id": "3b06e4ab", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
VendorIDtpep_pickup_datetimetpep_dropoff_datetimepassenger_counttrip_distanceRatecodeIDstore_and_fwd_flagPULocationIDDOLocationIDpayment_typefare_amountextramta_taxtip_amounttolls_amountimprovement_surchargetotal_amountcongestion_surchargeAirport_fee
022024-01-01 00:57:552024-01-01 01:17:431.01.721.0N18679217.71.00.50.000.01.022.702.50.0
112024-01-01 00:03:002024-01-01 00:09:361.01.801.0N140236110.03.50.53.750.01.018.752.50.0
212024-01-01 00:17:062024-01-01 00:35:011.04.701.0N23679123.33.50.53.000.01.031.302.50.0
312024-01-01 00:36:382024-01-01 00:44:561.01.401.0N79211110.03.50.52.000.01.017.002.50.0
412024-01-01 00:46:512024-01-01 00:52:571.00.801.0N21114817.93.50.53.200.01.016.102.50.0
\n", "
" ], "text/plain": [ " VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count \\\n", "0 2 2024-01-01 00:57:55 2024-01-01 01:17:43 1.0 \n", "1 1 2024-01-01 00:03:00 2024-01-01 00:09:36 1.0 \n", "2 1 2024-01-01 00:17:06 2024-01-01 00:35:01 1.0 \n", "3 1 2024-01-01 00:36:38 2024-01-01 00:44:56 1.0 \n", "4 1 2024-01-01 00:46:51 2024-01-01 00:52:57 1.0 \n", "\n", " trip_distance RatecodeID store_and_fwd_flag PULocationID DOLocationID \\\n", "0 1.72 1.0 N 186 79 \n", "1 1.80 1.0 N 140 236 \n", "2 4.70 1.0 N 236 79 \n", "3 1.40 1.0 N 79 211 \n", "4 0.80 1.0 N 211 148 \n", "\n", " payment_type fare_amount extra mta_tax tip_amount tolls_amount \\\n", "0 2 17.7 1.0 0.5 0.00 0.0 \n", "1 1 10.0 3.5 0.5 3.75 0.0 \n", "2 1 23.3 3.5 0.5 3.00 0.0 \n", "3 1 10.0 3.5 0.5 2.00 0.0 \n", "4 1 7.9 3.5 0.5 3.20 0.0 \n", "\n", " improvement_surcharge total_amount congestion_surcharge Airport_fee \n", "0 1.0 22.70 2.5 0.0 \n", "1 1.0 18.75 2.5 0.0 \n", "2 1.0 31.30 2.5 0.0 \n", "3 1.0 17.00 2.5 0.0 \n", "4 1.0 16.10 2.5 0.0 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "# Read the Parquet file\n", "df = pd.read_parquet('../data/yellow_tripdata_2024_01.parquet')\n", "\n", "df.head()\n", "# df.info()\n", "\n", "\n", "# Save as CSV file\n", "# df.to_csv('../data/yellow_tripdata_2024_01.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 10, "id": "a1d1f971", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('../data/yellow_tripdata_2024-01.csv', nrows=100)" ] }, { "cell_type": "code", "execution_count": 5, "id": "f9e76d7f", "metadata": {}, "outputs": [], "source": [ "df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)\n", "df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)" ] }, { "cell_type": "code", "execution_count": 9, "id": "c5aaff6e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 2964624 entries, 0 to 2964623\n", "Data columns (total 19 columns):\n", " # Column Dtype \n", "--- ------ ----- \n", " 0 VendorID int32 \n", " 1 tpep_pickup_datetime datetime64[ns]\n", " 2 tpep_dropoff_datetime datetime64[ns]\n", " 3 passenger_count float64 \n", " 4 trip_distance float64 \n", " 5 RatecodeID float64 \n", " 6 store_and_fwd_flag object \n", " 7 PULocationID int32 \n", " 8 DOLocationID int32 \n", " 9 payment_type int64 \n", " 10 fare_amount float64 \n", " 11 extra float64 \n", " 12 mta_tax float64 \n", " 13 tip_amount float64 \n", " 14 tolls_amount float64 \n", " 15 improvement_surcharge float64 \n", " 16 total_amount float64 \n", " 17 congestion_surcharge float64 \n", " 18 Airport_fee float64 \n", "dtypes: datetime64[ns](2), float64(12), int32(3), int64(1), object(1)\n", "memory usage: 395.8+ MB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 6, "id": "c146a7b1", "metadata": {}, "outputs": [], "source": [ "df.to_parquet('../data/yellow_tripdata_2024_01_cleansed.parquet', index=False)" ] }, { "cell_type": "code", "execution_count": 10, "id": "65a6a392", "metadata": {}, "outputs": [], "source": [ "df.to_csv('../data/yellow_tripdata_2024_01_cleansed.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 13, "id": "f711ab67", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "100" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.to_sql(name='yellow_taxi_trips', con=engine, index=False)" ] }, { "cell_type": "code", "execution_count": 14, "id": "9549987c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
VendorIDtpep_pickup_datetimetpep_dropoff_datetimepassenger_counttrip_distanceRatecodeIDstore_and_fwd_flagPULocationIDDOLocationIDpayment_typefare_amountextramta_taxtip_amounttolls_amountimprovement_surchargetotal_amountcongestion_surchargeAirport_fee
022024-01-01 00:57:552024-01-01 01:17:431.01.721.0N18679217.71.00.50.000.01.022.702.50.00
112024-01-01 00:03:002024-01-01 00:09:361.01.801.0N140236110.03.50.53.750.01.018.752.50.00
212024-01-01 00:17:062024-01-01 00:35:011.04.701.0N23679123.33.50.53.000.01.031.302.50.00
312024-01-01 00:36:382024-01-01 00:44:561.01.401.0N79211110.03.50.52.000.01.017.002.50.00
412024-01-01 00:46:512024-01-01 00:52:571.00.801.0N21114817.93.50.53.200.01.016.102.50.00
512024-01-01 00:54:082024-01-01 01:26:311.04.701.0N148141129.63.50.56.900.01.041.502.50.00
622024-01-01 00:49:442024-01-01 01:15:472.010.821.0N138181145.76.00.510.000.01.064.950.01.75
712024-01-01 00:30:402024-01-01 00:58:400.03.001.0N246231225.43.50.50.000.01.030.402.50.00
822024-01-01 00:26:012024-01-01 00:54:121.05.441.0N161261231.01.00.50.000.01.036.002.50.00
922024-01-01 00:28:082024-01-01 00:29:161.00.041.0N11311323.01.00.50.000.01.08.002.50.00
\n", "
" ], "text/plain": [ " VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count \\\n", "0 2 2024-01-01 00:57:55 2024-01-01 01:17:43 1.0 \n", "1 1 2024-01-01 00:03:00 2024-01-01 00:09:36 1.0 \n", "2 1 2024-01-01 00:17:06 2024-01-01 00:35:01 1.0 \n", "3 1 2024-01-01 00:36:38 2024-01-01 00:44:56 1.0 \n", "4 1 2024-01-01 00:46:51 2024-01-01 00:52:57 1.0 \n", "5 1 2024-01-01 00:54:08 2024-01-01 01:26:31 1.0 \n", "6 2 2024-01-01 00:49:44 2024-01-01 01:15:47 2.0 \n", "7 1 2024-01-01 00:30:40 2024-01-01 00:58:40 0.0 \n", "8 2 2024-01-01 00:26:01 2024-01-01 00:54:12 1.0 \n", "9 2 2024-01-01 00:28:08 2024-01-01 00:29:16 1.0 \n", "\n", " trip_distance RatecodeID store_and_fwd_flag PULocationID DOLocationID \\\n", "0 1.72 1.0 N 186 79 \n", "1 1.80 1.0 N 140 236 \n", "2 4.70 1.0 N 236 79 \n", "3 1.40 1.0 N 79 211 \n", "4 0.80 1.0 N 211 148 \n", "5 4.70 1.0 N 148 141 \n", "6 10.82 1.0 N 138 181 \n", "7 3.00 1.0 N 246 231 \n", "8 5.44 1.0 N 161 261 \n", "9 0.04 1.0 N 113 113 \n", "\n", " payment_type fare_amount extra mta_tax tip_amount tolls_amount \\\n", "0 2 17.7 1.0 0.5 0.00 0.0 \n", "1 1 10.0 3.5 0.5 3.75 0.0 \n", "2 1 23.3 3.5 0.5 3.00 0.0 \n", "3 1 10.0 3.5 0.5 2.00 0.0 \n", "4 1 7.9 3.5 0.5 3.20 0.0 \n", "5 1 29.6 3.5 0.5 6.90 0.0 \n", "6 1 45.7 6.0 0.5 10.00 0.0 \n", "7 2 25.4 3.5 0.5 0.00 0.0 \n", "8 2 31.0 1.0 0.5 0.00 0.0 \n", "9 2 3.0 1.0 0.5 0.00 0.0 \n", "\n", " improvement_surcharge total_amount congestion_surcharge Airport_fee \n", "0 1.0 22.70 2.5 0.00 \n", "1 1.0 18.75 2.5 0.00 \n", "2 1.0 31.30 2.5 0.00 \n", "3 1.0 17.00 2.5 0.00 \n", "4 1.0 16.10 2.5 0.00 \n", "5 1.0 41.50 2.5 0.00 \n", "6 1.0 64.95 0.0 1.75 \n", "7 1.0 30.40 2.5 0.00 \n", "8 1.0 36.00 2.5 0.00 \n", "9 1.0 8.00 2.5 0.00 " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "query = \"\"\"\n", "SELECT * FROM yellow_taxi_trips LIMIT 10\n", "\"\"\"\n", "\n", "pd.read_sql(query, con=engine)" ] }, { "cell_type": "markdown", "id": "bcd61c63", "metadata": {}, "source": [ "```sql\n", "SELECT *\n", "FROM pg_catalog.pg_tables\n", "WHERE schemaname != 'pg_catalog' AND \n", " schemaname != 'information_schema';\n", "```\n", "\n", "Source: https://www.postgresqltutorial.com/postgresql-show-tables/" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.10" } }, "nbformat": 4, "nbformat_minor": 5 }