Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 18 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,25 @@ Our open-source tool can subset databases up to 10GB, but it will struggle with

# Installation

Five steps to install, assuming Python 3.5+:
Six steps to install, assuming Python 3.6+:

1. Download the required Python modules. You can use [`pip`](https://pypi.org/project/pip/) for easy installation. The required modules are `toposort`, `psycopg2-binary`, and `mysql-connector-python`.
1. [Install Poetry](https://python-poetry.org/docs/#installation)

2. Install Postgres and/or MySQL database tools. For Postgres we need `pg_dump` and `psql` tools; they need to be on your `$PATH` or point to them with `$POSTGRES_PATH`. For MySQL we need `mysqldump` and `mysql`, they can be on your `$PATH` or point to them with `$MYSQL_PATH`.

3. Clone project locally:
```
$ pip install toposort
$ pip install psycopg2-binary
$ pip install mysql-connector-python
$ git clone https://github.com/TonicAI/condenser.git
$ cd condenser
```
2. Install Postgres and/or MySQL database tools. For Postgres we need `pg_dump` and `psql` tools; they need to be on your `$PATH` or point to them with `$POSTGRES_PATH`. For MySQL we need `mysqldump` and `mysql`, they can be on your `$PATH` or point to them with `$MYSQL_PATH`.
3. Download this repo. You can clone the repo or Download it as a zip. Scroll up, it's the green button that says "Clone or download".
4. Setup your configuration and save it in `config.json`. The provided `config.json.example` has the skeleton of what you need to provide: source and destination database connection details, as well as subsetting goals in `initial_targets`. Here's an example that will collect 10% of a table named `public.target_table`.

4. Install project:
```
$ poetry shell
$ poetry install -E postgres # Or use -E mysql
```

5. Set up your configuration and save it in `config.json`. The provided `config.json.example` has the skeleton of what you need to provide: source and destination database connection details, as well as subsetting goals in `initial_targets`. Here's an example that will collect 10% of a table named `public.target_table`.
```
"initial_targets": [
{
Expand All @@ -35,7 +43,7 @@ $ pip install mysql-connector-python
```
There may be more required configuration depending on your database, but simple databases should be easy. See the Config section for more details, and `config.json.example_all` for all of the options in a single config file.

5. Run! `$ python direct_subset.py`
6. Run! `$ poetry run subset`

# Config

Expand Down Expand Up @@ -80,15 +88,11 @@ Below we describe the use of all configuration parameters, but the best place to
Almost all the configuration is in the `config.json` file, so running is as simple as

```
$ python direct_subset.py
$ poetry run subset
```

Two command-line arguments are supported:

`-v`: Verbose output. Useful for performance debugging. Lists almost every query made, and its speed.

`--no-constraints`: For Postgres this will not add constraints found in the source database to the destination database. This option has no effect for MySQL.

# Requirements

Reference the requirements.txt file for a list of required python packages. Also, please note that Python 3.5+ is required.
Empty file added condenser/__init__.py
Empty file.
File renamed without changes.
6 changes: 3 additions & 3 deletions database_helper.py → condenser/database_helper.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import config_reader
from condenser import config_reader

def get_specific_helper():
if config_reader.get_db_type() == 'postgres':
import psql_database_helper
from condenser import psql_database_helper
return psql_database_helper
else:
import mysql_database_helper
from condenser import mysql_database_helper
return mysql_database_helper
5 changes: 3 additions & 2 deletions db_connect.py → condenser/db_connect.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import config_reader
import psycopg2, mysql.connector
from condenser import config_reader
import os, pathlib, re, urllib, subprocess, os.path, json, getpass, time, sys, datetime

class DbConnect:
Expand Down Expand Up @@ -74,6 +73,7 @@ def __enter__(self):
# method across MySQL and Postgres. This one is for Postgres
class PsqlConnection(DbConnection):
def __init__(self, connect, read_repeatable):
import psycopg2
connection_string = 'dbname=\'{0}\' user=\'{1}\' password=\'{2}\' host={3} port={4}'.format(connect.db_name, connect.user, connect.password, connect.host, connect.port)

if connect.ssl_mode :
Expand All @@ -91,6 +91,7 @@ def cursor(self, name=None, withhold=False):
# method across MySQL and Postgres. This one is for MySQL
class MySqlConnection(DbConnection):
def __init__(self, connect, read_repeatable):
import mysql.connector
DbConnection.__init__(self, mysql.connector.connect(host=connect.host, port=connect.port, user=connect.user, password=connect.password, database=connect.db_name))

self.db_name = connect.db_name
Expand Down
21 changes: 11 additions & 10 deletions direct_subset.py → condenser/direct_subset.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
from condenser import config_reader, result_tabulator
from condenser.subset import Subset
from condenser.db_connect import DbConnect
from condenser.subset_utils import print_progress
from condenser import database_helper
import uuid, sys
import config_reader, result_tabulator
import time
from subset import Subset
from psql_database_creator import PsqlDatabaseCreator
from mysql_database_creator import MySqlDatabaseCreator
from db_connect import DbConnect
from subset_utils import print_progress
import database_helper

def db_creator(db_type, source, dest):
    """Return the database creator object that matches ``db_type``.

    The backend-specific creator modules are imported inside their own
    branch, so only the module for the selected backend is imported.

    Args:
        db_type: Backend name; ``'postgres'`` and ``'mysql'`` are supported.
        source: Source connection object passed through to the creator.
        dest: Destination connection object passed through to the creator.

    Returns:
        A ``PsqlDatabaseCreator`` for ``'postgres'`` or a
        ``MySqlDatabaseCreator`` for ``'mysql'``.

    Raises:
        ValueError: If ``db_type`` is anything other than the two supported
            backend names (including ``None`` or other non-string values).
    """
    if db_type == 'postgres':
        from condenser.psql_database_creator import PsqlDatabaseCreator
        return PsqlDatabaseCreator(source, dest, False)
    elif db_type == 'mysql':
        from condenser.mysql_database_creator import MySqlDatabaseCreator
        return MySqlDatabaseCreator(source, dest)
    else:
        # str() leaves the message unchanged for string inputs, but makes sure
        # a non-string db_type (e.g. None) still surfaces as the documented
        # ValueError instead of a TypeError from str + non-str concatenation.
        raise ValueError('unknown db_type ' + str(db_type))


if __name__ == '__main__':
def run():
if "--stdin" in sys.argv:
config_reader.initialize(sys.stdin)
else:
Expand Down Expand Up @@ -48,7 +48,7 @@ def db_creator(db_type, source, dest):
print_progress(sql, idx+1, len(config_reader.get_pre_constraint_sql()))
db_helper.run_query(sql, destination_dbc.get_db_connection())
print("Completed pre constraint SQL calls in {}s".format(time.time()-start_time))


print("Adding database constraints")
if "--no-constraints" not in sys.argv:
Expand All @@ -65,4 +65,5 @@ def db_creator(db_type, source, dest):
finally:
subsetter.unprep_temp_dbs()


if __name__ == '__main__':
run()
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def connection_args(connect):

# This is just for unit testing the creation and tear down processes
if __name__ == '__main__':
import config_reader, db_connect
from condenser import config_reader, db_connect
config_reader.initialize()
src_connect = db_connect.DbConnect(config_reader.get_source_db_connection_info(), 'mysql')
dest_connect = db_connect.DbConnect(config_reader.get_destination_db_connection_info(), 'mysql')
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os, uuid, csv
import config_reader
from condenser import config_reader
from pathlib import Path
from subset_utils import columns_joined, columns_tupled, quoter, schema_name, table_name, fully_qualified_table, redact_relationships
from condenser.subset_utils import columns_joined, columns_tupled, quoter, schema_name, table_name, fully_qualified_table, redact_relationships

system_schemas_str = ','.join(['\'' + schema + '\'' for schema in ['information_schema', 'performance_schema', 'sys', 'mysql', 'innodb','tmp']])
temp_db = 'tonic_subset_temp_db_398dhjr23'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os, urllib, subprocess
from db_connect import DbConnect
import database_helper
from condenser.db_connect import DbConnect
from condenser import database_helper

class PsqlDatabaseCreator:
def __init__(self, source_dbc, destination_dbc, use_existing_dump = False):
Expand Down
4 changes: 2 additions & 2 deletions psql_database_helper.py → condenser/psql_database_helper.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import os, uuid, csv
import config_reader
from condenser import config_reader
from pathlib import Path
from psycopg2.extras import execute_values, register_default_json, register_default_jsonb
from subset_utils import columns_joined, columns_tupled, schema_name, table_name, fully_qualified_table, redact_relationships, quoter
from condenser.subset_utils import columns_joined, columns_tupled, schema_name, table_name, fully_qualified_table, redact_relationships, quoter

register_default_json(loads=lambda x: str(x))
register_default_jsonb(loads=lambda x: str(x))
Expand Down
6 changes: 3 additions & 3 deletions result_tabulator.py → condenser/result_tabulator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import config_reader
import database_helper
from db_connect import MySqlConnection
from condenser import config_reader
from condenser import database_helper
from condenser.db_connect import MySqlConnection


def tabulate(source_dbc, destination_dbc, tables):
Expand Down
8 changes: 4 additions & 4 deletions subset.py → condenser/subset.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from topo_orderer import get_topological_order_by_tables
from subset_utils import UnionFind, schema_name, table_name, find, compute_disconnected_tables, compute_downstream_tables, compute_upstream_tables, columns_joined, columns_tupled, columns_to_copy, quoter, fully_qualified_table, print_progress, mysql_db_name_hack, upstream_filter_match, redact_relationships
import database_helper
import config_reader
from condenser.topo_orderer import get_topological_order_by_tables
from condenser.subset_utils import UnionFind, schema_name, table_name, find, compute_disconnected_tables, compute_downstream_tables, compute_upstream_tables, columns_joined, columns_tupled, columns_to_copy, quoter, fully_qualified_table, print_progress, mysql_db_name_hack, upstream_filter_match, redact_relationships
from condenser import database_helper
from condenser import config_reader
import shutil, os, uuid, time, itertools

#
Expand Down
6 changes: 3 additions & 3 deletions subset_utils.py → condenser/subset_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import config_reader
import database_helper
from db_connect import MySqlConnection
from condenser import config_reader
from condenser import database_helper
from condenser.db_connect import MySqlConnection

# this function generally copies all columns as is, but if the table has been selected as
# breaking a dependency cycle, then it will insert NULLs instead of that table's foreign keys
Expand Down
2 changes: 1 addition & 1 deletion topo_orderer.py → condenser/topo_orderer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from toposort import toposort, toposort_flatten
import config_reader
from condenser import config_reader

def get_topological_order_by_tables(relationships, tables):
topsort_input = __prepare_topsort_input(relationships, tables)
Expand Down
Loading