TonicAI · isra17 · Nov 30, 2022
diff --git a/README.md b/README.md
@@ -14,17 +14,25 @@ Our open-source tool can subset databases up to 10GB, but it will struggle with
 
 # Installation
 
-Five steps to install, assuming Python 3.5+:
+Six steps to install, assuming Python 3.6+:
 
-1. Download the required Python modules. You can use [`pip`](https://pypi.org/project/pip/) for easy installation. The required modules are `toposort`, `psycopg2-binary`, and `mysql-connector-python`.
+1. [Install Poetry](https://python-poetry.org/docs/#installation)
+
+2. Install Postgres and/or MySQL database tools. For Postgres we need the `pg_dump` and `psql` tools; they need to be on your `$PATH`, or you can point to them with `$POSTGRES_PATH`. For MySQL we need the `mysqldump` and `mysql` tools; they need to be on your `$PATH`, or you can point to them with `$MYSQL_PATH`.
+
+3. Clone project locally:
 ```
-$ pip install toposort
-$ pip install psycopg2-binary
-$ pip install mysql-connector-python
+$ git clone https://github.com/TonicAI/condenser.git
+$ cd condenser
 ```
-2. Install Postgres and/or MySQL database tools. For Postgres we need `pg_dump` and `psql` tools; they need to be on your `$PATH` or point to them with `$POSTGRES_PATH`. For MySQL we need `mysqldump` and `mysql`, they can be on your `$PATH` or point to them with `$MYSQL_PATH`.
-3. Download this repo. You can clone the repo or Download it as a zip. Scroll up, it's the green button that says "Clone or download".
-4. Setup your configuration and save it in `config.json`. The provided `config.json.example` has the skeleton of what you need to provide: source and destination database connection details, as well as subsetting goals in `initial_targets`. Here's an example that will collect 10% of a table named `public.target_table`.
+
+4. Install project:
+```
+$ poetry shell
+$ poetry install -E postgres # Or use -E mysql
+```
+
+5. Set up your configuration and save it in `config.json`. The provided `config.json.example` has the skeleton of what you need to provide: source and destination database connection details, as well as subsetting goals in `initial_targets`. Here's an example that will collect 10% of a table named `public.target_table`.
 ```
 "initial_targets": [
     {
@@ -35,7 +43,7 @@ $ pip install mysql-connector-python
 ```
 There may be more required configuration depending on your database, but simple databases should be easy. See the Config section for more details, and `config.json.example_all` for all of the options in a single config file.
 
-5. Run! `$ python direct_subset.py`
+6. Run! `$ poetry run subset`
 
 # Config
 
@@ -80,15 +88,11 @@ Below we describe the use of all configuration parameters, but the best place to
 Almost all the configuration is in the `config.json` file, so running is as simple as
 
 ```
-$ python direct_subset.py
+$ poetry run subset
 ```
 
 Two commandline arguements are supported:
 
 `-v`: Verbose output. Useful for performance debugging. Lists almost every query made, and it's speed.
 
 `--no-constraints`: For Postgres this will not add constraints found in the source database to the destination database. This option has no effect for MySQL.
-
-# Requirements
-
-Reference the requirements.txt file for a list of required python packages.  Also, please note that Python 3.5+ is required.
diff --git a/condenser/__init__.py b/condenser/__init__.py
diff --git a/config_reader.py → condenser/config_reader.py b/config_reader.py → condenser/config_reader.py
diff --git a/database_helper.py → condenser/database_helper.py b/database_helper.py → condenser/database_helper.py
@@ -1,9 +1,9 @@
-import config_reader
+from condenser import config_reader
 
 def get_specific_helper():
     if config_reader.get_db_type() == 'postgres':
-        import psql_database_helper
+        from condenser import psql_database_helper
         return psql_database_helper
     else:
-        import mysql_database_helper
+        from condenser import mysql_database_helper
         return mysql_database_helper
diff --git a/db_connect.py → condenser/db_connect.py b/db_connect.py → condenser/db_connect.py
@@ -1,5 +1,4 @@
-import config_reader
-import psycopg2, mysql.connector
+from condenser import config_reader
 import os, pathlib, re, urllib, subprocess, os.path, json, getpass, time, sys, datetime
 
 class DbConnect:
@@ -74,6 +73,7 @@ def __enter__(self):
 # method across MySQL and Postgres. This one is for Postgres
 class PsqlConnection(DbConnection):
     def __init__(self,  connect, read_repeatable):
+        import psycopg2
         connection_string = 'dbname=\'{0}\' user=\'{1}\' password=\'{2}\' host={3} port={4}'.format(connect.db_name, connect.user, connect.password, connect.host, connect.port)
 
         if connect.ssl_mode :
@@ -91,6 +91,7 @@ def cursor(self, name=None, withhold=False):
 # method across MySQL and Postgres. This one is for MySQL
 class MySqlConnection(DbConnection):
     def __init__(self,  connect, read_repeatable):
+        import mysql.connector
         DbConnection.__init__(self, mysql.connector.connect(host=connect.host, port=connect.port, user=connect.user, password=connect.password, database=connect.db_name))
 
         self.db_name = connect.db_name

diff --git a/direct_subset.py → condenser/direct_subset.py b/direct_subset.py → condenser/direct_subset.py
@@ -1,23 +1,23 @@
+from condenser import config_reader, result_tabulator
+from condenser.subset import Subset
+from condenser.db_connect import DbConnect
+from condenser.subset_utils import print_progress
+from condenser import database_helper
 import uuid, sys
-import config_reader, result_tabulator
 import time
-from subset import Subset
-from psql_database_creator import PsqlDatabaseCreator
-from mysql_database_creator import MySqlDatabaseCreator
-from db_connect import DbConnect
-from subset_utils import print_progress
-import database_helper
 
 def db_creator(db_type, source, dest):
     if db_type == 'postgres':
+        from condenser.psql_database_creator import PsqlDatabaseCreator
         return PsqlDatabaseCreator(source, dest, False)
     elif db_type == 'mysql':
+        from condenser.mysql_database_creator import MySqlDatabaseCreator
         return MySqlDatabaseCreator(source, dest)
     else:
         raise ValueError('unknown db_type ' + db_type)
 
 
-if __name__ == '__main__':
+def run():
     if "--stdin" in sys.argv:
         config_reader.initialize(sys.stdin)
     else:
@@ -48,7 +48,7 @@ def db_creator(db_type, source, dest):
             print_progress(sql, idx+1, len(config_reader.get_pre_constraint_sql()))
             db_helper.run_query(sql, destination_dbc.get_db_connection())
         print("Completed pre constraint SQL calls in {}s".format(time.time()-start_time))
-        
+
 
         print("Adding database constraints")
         if "--no-constraints" not in sys.argv:
@@ -65,4 +65,5 @@ def db_creator(db_type, source, dest):
     finally:
         subsetter.unprep_temp_dbs()
 
-
+if __name__ == '__main__':
+    run()
diff --git a/mysql_database_creator.py → condenser/mysql_database_creator.py b/mysql_database_creator.py → condenser/mysql_database_creator.py
@@ -75,7 +75,7 @@ def connection_args(connect):
 
 # This is just for unit testing the creation and tear down processes
 if __name__ == '__main__':
-    import config_reader, db_connect
+    from condenser import config_reader, db_connect
     config_reader.initialize()
     src_connect = db_connect.DbConnect(config_reader.get_source_db_connection_info(), 'mysql')
     dest_connect = db_connect.DbConnect(config_reader.get_destination_db_connection_info(), 'mysql')

diff --git a/mysql_database_helper.py → condenser/mysql_database_helper.py b/mysql_database_helper.py → condenser/mysql_database_helper.py
@@ -1,7 +1,7 @@
 import os, uuid, csv
-import config_reader
+from condenser import config_reader
 from pathlib import Path
-from subset_utils import columns_joined, columns_tupled, quoter, schema_name, table_name, fully_qualified_table, redact_relationships
+from condenser.subset_utils import columns_joined, columns_tupled, quoter, schema_name, table_name, fully_qualified_table, redact_relationships
 
 system_schemas_str = ','.join(['\'' + schema + '\'' for schema in  ['information_schema', 'performance_schema', 'sys', 'mysql', 'innodb','tmp']])
 temp_db = 'tonic_subset_temp_db_398dhjr23'

diff --git a/psql_database_creator.py → condenser/psql_database_creator.py b/psql_database_creator.py → condenser/psql_database_creator.py
@@ -1,6 +1,6 @@
 import os, urllib, subprocess
-from db_connect import DbConnect
-import database_helper
+from condenser.db_connect import DbConnect
+from condenser import database_helper
 
 class PsqlDatabaseCreator:
     def __init__(self, source_dbc, destination_dbc, use_existing_dump = False):

diff --git a/psql_database_helper.py → condenser/psql_database_helper.py b/psql_database_helper.py → condenser/psql_database_helper.py
@@ -1,8 +1,8 @@
 import os, uuid, csv
-import config_reader
+from condenser import config_reader
 from pathlib import Path
 from psycopg2.extras import execute_values, register_default_json, register_default_jsonb
-from subset_utils import columns_joined, columns_tupled, schema_name, table_name, fully_qualified_table, redact_relationships, quoter
+from condenser.subset_utils import columns_joined, columns_tupled, schema_name, table_name, fully_qualified_table, redact_relationships, quoter
 
 register_default_json(loads=lambda x: str(x))
 register_default_jsonb(loads=lambda x: str(x))

diff --git a/result_tabulator.py → condenser/result_tabulator.py b/result_tabulator.py → condenser/result_tabulator.py
@@ -1,6 +1,6 @@
-import config_reader
-import database_helper
-from db_connect import MySqlConnection
+from condenser import config_reader
+from condenser import database_helper
+from condenser.db_connect import MySqlConnection
 
 
 def tabulate(source_dbc, destination_dbc, tables):

diff --git a/subset.py → condenser/subset.py b/subset.py → condenser/subset.py
@@ -1,7 +1,7 @@
-from topo_orderer import get_topological_order_by_tables
-from subset_utils import UnionFind, schema_name, table_name, find, compute_disconnected_tables, compute_downstream_tables, compute_upstream_tables, columns_joined, columns_tupled, columns_to_copy, quoter, fully_qualified_table, print_progress, mysql_db_name_hack, upstream_filter_match, redact_relationships
-import database_helper
-import config_reader
+from condenser.topo_orderer import get_topological_order_by_tables
+from condenser.subset_utils import UnionFind, schema_name, table_name, find, compute_disconnected_tables, compute_downstream_tables, compute_upstream_tables, columns_joined, columns_tupled, columns_to_copy, quoter, fully_qualified_table, print_progress, mysql_db_name_hack, upstream_filter_match, redact_relationships
+from condenser import database_helper
+from condenser import config_reader
 import shutil, os, uuid, time, itertools
 
 #

diff --git a/subset_utils.py → condenser/subset_utils.py b/subset_utils.py → condenser/subset_utils.py
@@ -1,6 +1,6 @@
-import config_reader
-import database_helper
-from db_connect import MySqlConnection
+from condenser import config_reader
+from condenser import database_helper
+from condenser.db_connect import MySqlConnection
 
 # this function generally copies all columns as is, but if the table has been selected as
 # breaking a dependency cycle, then it will insert NULLs instead of that table's foreign keys

diff --git a/topo_orderer.py → condenser/topo_orderer.py b/topo_orderer.py → condenser/topo_orderer.py
@@ -1,5 +1,5 @@
 from toposort import toposort, toposort_flatten
-import config_reader
+from condenser import config_reader
 
 def get_topological_order_by_tables(relationships, tables):
     topsort_input =  __prepare_topsort_input(relationships, tables)