diff --git a/README.md b/README.md index 001028b..1b2ce36 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ pandasql ======== -`pandasql` allows you to query `pandas` DataFrames using SQL syntax. It works -similarly to `sqldf` in R. `pandasql` seeks to provide a more familiar way of +`pandasql` allows you to query `pandas` DataFrames using SQL syntax. It works +similarly to `sqldf` in R. `pandasql` seeks to provide a more familiar way of manipulating and cleaning data for people new to Python or `pandas`. #### Installation @@ -15,15 +15,15 @@ The main function used in pandasql is `sqldf`. `sqldf` accepts 2 parametrs - a sql query string - a set of session/environment variables (`locals()` or `globals()`) -Specifying `locals()` or `globals()` can get tedious. You can define a short +Specifying `locals()` or `globals()` can get tedious. You can define a short helper function to fix this. from pandasql import sqldf - pysqldf = lambda q: sqldf(q, globals()) + pysqldf = lambda q, params=None: sqldf(q, globals(), params=params) #### Querying -`pandasql` uses [SQLite syntax](http://www.sqlite.org/lang.html). Any `pandas` -dataframes will be automatically detected by `pandasql`. You can query them as +`pandasql` uses [SQLite syntax](http://www.sqlite.org/lang.html). Any `pandas` +dataframes will be automatically detected by `pandasql`. You can query them as you would any regular SQL table. @@ -76,9 +76,19 @@ joins and aggregations are also supported 4 1948 8766 ``` +queries with parameters are supported +``` +>>> iris = load_iris() +>>> iris_df = pd.DataFrame(iris.data, columns=iris.feature_names) +>>> print pysqldf("SELECT DISTINCT species FROM iris_df WHERE species <> ? ", params=('versicolor',) ) + species +0 setosa +1 virginica +``` + More information and code samples available in the [examples](https://github.com/yhat/pandasql/blob/master/examples/demo.py) folder or on [our blog](http://blog.yhathq.com/posts/pandasql-sql-for-pandas-dataframes.html). 
-[![Analytics](https://ga-beacon.appspot.com/UA-46996803-1/pandasql/README.md)](https://github.com/yhat/pandasql) +[![Analytics](https://ga-beacon.appspot.com/UA-46996803-1/pandasql/README.md)](https://github.com/yhat/pandasql) diff --git a/examples/demo2.py b/examples/demo2.py new file mode 100644 index 0000000..ea3a32c --- /dev/null +++ b/examples/demo2.py @@ -0,0 +1,25 @@ +import os, time +import pandas as pd +from pandasql import sqldf + +# dummy DataFrame +data = [ [ "abc", 123, True, "C:\\temp" ], [ "d'ef", -45.6, False, "C:\\windows" ], [ "xyz", 0.89, 0, "/usr/" ] ] +df = pd.DataFrame(data, columns = [ "id", "n", "b", "f" ]) + + +# define 'pysqldf' as per pandasql documentation, with extra params and user-defined-functions registration + +def my_sqlite_connect_listener( dbapi_con, con_record ): + # registering a few extra functions to SQLite + dbapi_con.create_function( 'IIF', 3, lambda b, t, f : t if b else f ) + dbapi_con.create_function( 'CUBE', 1, lambda x : x*x*x ) + dbapi_con.create_function( 'FileExists', 1, lambda f : os.path.exists(f) ) + dbapi_con.create_function( 'FileModificationDate', 1, lambda f : time.ctime(os.path.getmtime(f)) if os.path.exists(f) else None) + +pysqldf = lambda q, params=None: sqldf(q, globals(), params=params, sqlite_connect_listener=my_sqlite_connect_listener) + + +# demo of request using the extra functions +print(pysqldf("select n, IIF(n<0, 'n is negative', 'n is positive') from df where id<>?", params = ('abc', ))) +print(pysqldf("select CUBE(2), CUBE(3), CUBE(4), CUBE(5)")) +print(pysqldf("select f, FileExists(f), FileModificationDate(f) from df")) diff --git a/pandasql/sqldf.py b/pandasql/sqldf.py index e25398a..be2c4be 100644 --- a/pandasql/sqldf.py +++ b/pandasql/sqldf.py @@ -15,7 +15,7 @@ class PandaSQLException(Exception): class PandaSQL: - def __init__(self, db_uri='sqlite:///:memory:', persist=False): + def __init__(self, db_uri='sqlite:///:memory:', persist=False, sqlite_connect_listener=None): """ Initialize 
with a specific database. @@ -26,6 +26,8 @@ def __init__(self, db_uri='sqlite:///:memory:', persist=False): if self.engine.name == 'sqlite': listen(self.engine, 'connect', self._set_text_factory) + if self.engine.name == 'sqlite' and sqlite_connect_listener is not None: + listen(self.engine, 'connect', sqlite_connect_listener) if self.engine.name not in ('sqlite', 'postgresql'): raise PandaSQLException('Currently only sqlite and postgresql are supported.') @@ -36,7 +38,7 @@ def __init__(self, db_uri='sqlite:///:memory:', persist=False): self._conn = self.engine.connect() self._init_connection(self._conn) - def __call__(self, query, env=None): + def __call__(self, query, env=None, params=None): """ Execute the SQL query. Automatically creates tables mentioned in the query from dataframes before executing. @@ -61,7 +63,7 @@ def __call__(self, query, env=None): write_table(env[table_name], table_name, conn) try: - result = read_sql(query, conn) + result = read_sql(query, conn, params=params) except DatabaseError as ex: raise PandaSQLException(ex) except ResourceClosedError: @@ -126,7 +128,7 @@ def write_table(df, tablename, conn): index=not any(name is None for name in df.index.names)) # load index into db if all levels are named -def sqldf(query, env=None, db_uri='sqlite:///:memory:'): +def sqldf(query, env=None, db_uri='sqlite:///:memory:', persist=False, sqlite_connect_listener=None, params=None): """ Query pandas data frames using sql syntax This function is meant for backward compatibility only. New users are encouraged to use the PandaSQL class. @@ -158,4 +160,4 @@ def sqldf(query, env=None, db_uri='sqlite:///:memory:'): >>> sqldf("select * from df;", locals()) >>> sqldf("select avg(x) from df;", locals()) """ - return PandaSQL(db_uri)(query, env) + return PandaSQL(db_uri, persist, sqlite_connect_listener)(query, env, params)