mirror of
https://github.com/harelba/q.git
synced 2024-10-03 22:39:52 +03:00
option to list udfs, and added new functions
This commit is contained in:
parent
0473927e94
commit
e85c4c50a0
180
bin/q.py
180
bin/q.py
@ -31,6 +31,8 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from collections import OrderedDict
|
||||
|
||||
q_version = '2.0.16'
|
||||
|
||||
__all__ = [ 'QTextAsData' ]
|
||||
@ -72,11 +74,26 @@ def get_stdout_encoding(encoding_override=None):
|
||||
|
||||
SHOW_SQL = False
|
||||
|
||||
def sha1(data):
|
||||
if not isinstance(data,str) and not isinstance(data,unicode):
|
||||
return hashlib.sha1(str(data)).hexdigest()
|
||||
return hashlib.sha1(data).hexdigest()
|
||||
# Maps the numeric selector accepted by the sha() user function to the
# hashlib constructor implementing that SHA variant.
sha_algorithms = {
    1: hashlib.sha1,
    224: hashlib.sha224,
    256: hashlib.sha256,
    384: hashlib.sha384,
    # 386 was a typo for 384. It is kept as an alias so that queries which
    # relied on the misspelled selector continue to work.
    386: hashlib.sha384,
    512: hashlib.sha512,
}
|
||||
|
||||
def sha(data,algorithm,encoding):
    """Return the hex digest of ``data`` using the selected SHA variant.

    ``data`` is converted to text and encoded with ``encoding`` before
    hashing. ``algorithm`` must be one of the keys of ``sha_algorithms``.

    This is a best-effort function: on any failure (unknown algorithm,
    unknown encoding, data that cannot be encoded) the problem is reported
    on stderr and None is returned.
    """
    try:
        hash_factory = sha_algorithms[algorithm]
    except (KeyError, TypeError):
        print("Unknown sha algorithm %r. Supported algorithms: %s" %
              (algorithm, ",".join(str(k) for k in sorted(sha_algorithms))),
              file=sys.stderr)
        return None

    try:
        return hash_factory(six.text_type(data).encode(encoding)).hexdigest()
    except Exception as e:
        # Report on stderr so the diagnostic never mixes into query output
        print(e, file=sys.stderr)
        return None
|
||||
|
||||
# For backward compatibility
def sha1(data,encoding):
    """Return the hex SHA-1 digest of ``data`` encoded with ``encoding``.

    Thin wrapper that delegates to sha() with the algorithm fixed to 1.
    """
    return sha(data, algorithm=1, encoding=encoding)
|
||||
|
||||
# TODO Add caching of compiled regexps - Will be added after benchmarking capability is baked in
|
||||
def regexp(regular_expression, data):
|
||||
if data is not None:
|
||||
if not isinstance(data, str) and not isinstance(data, unicode):
|
||||
@ -85,15 +102,16 @@ def regexp(regular_expression, data):
|
||||
else:
|
||||
return False
|
||||
|
||||
def md5(data,encoding='utf-8'):
    """Return the hex MD5 digest of ``data``.

    ``data`` is converted to text and encoded with ``encoding`` before
    hashing. ``encoding`` defaults to 'utf-8' so that both the one-argument
    and the two-argument call forms are accepted.
    """
    payload = six.text_type(data).encode(encoding)
    return hashlib.md5(payload).hexdigest()
|
||||
|
||||
class Sqlite3DBResults(object):
|
||||
def __init__(self,query_column_names,results):
|
||||
self.query_column_names = query_column_names
|
||||
self.results = results
|
||||
def sqrt(data):
    """Return the square root of ``data``.

    A NULL input (None) yields None instead of raising, so a single NULL
    value does not abort the whole query. Negative or non-numeric input
    still raises, as math.sqrt does.
    """
    if data is None:
        return None
    return math.sqrt(data)
|
||||
|
||||
def power(data,p):
    """Return ``data`` raised to the power ``p``.

    If either operand is NULL (None) the result is None instead of a
    TypeError, so a single NULL value does not abort the whole query.
    """
    if data is None or p is None:
        return None
    return data**p
|
||||
|
||||
def percentile(l, p):
|
||||
# TODO Alpha implementation, need to provide multiple interpolation methods, and add tests
|
||||
@ -106,6 +124,7 @@ def percentile(l, p):
|
||||
return l[int(k)]
|
||||
return (c-k) * l[int(f)] + (k-f) * l[int(c)]
|
||||
|
||||
# TODO Streaming Percentile to prevent memory consumption blowup for large datasets
|
||||
class StrictPercentile(object):
|
||||
def __init__(self):
|
||||
self.values = []
|
||||
@ -121,6 +140,130 @@ class StrictPercentile(object):
|
||||
else:
|
||||
return percentile(sorted(self.values),self.p)
|
||||
|
||||
class StdevPopulation(object):
    """Aggregate that computes the population standard deviation.

    Values are folded in one at a time using Welford's online update, so
    the individual values are never stored. step() is called once per
    value and finalize() once at the end.
    """

    def __init__(self):
        self.M = 0.0  # running mean of the values seen so far
        self.S = 0.0  # running sum of squared deviations from the mean
        self.k = 0    # number of non-null values seen so far

    def step(self, value):
        """Fold one value into the running statistics.

        None is ignored. Any other value that cannot be converted to a
        float raises an Exception naming the offending value.
        """
        # Ignore nulls
        if value is None:
            return
        try:
            val = float(value)
        except (TypeError, ValueError):
            # TODO propagate udf errors to console
            raise Exception("Data is not numeric when calculating stddev (%s)" % value)

        tM = self.M
        self.k += 1
        self.M += ((val - tM) / self.k)
        self.S += ((val - tM) * (val - self.M))

    def finalize(self):
        """Return the population standard deviation, or None if no values were seen."""
        # The population formula divides by k, so only an empty set needs a
        # guard; a single value has a well-defined deviation of 0.0.
        if self.k == 0:
            return None
        return math.sqrt(self.S / self.k)
|
||||
|
||||
class StdevSample(object):
    """Aggregate that computes the sample standard deviation.

    Values are folded in one at a time using Welford's online update, so
    the individual values are never stored. step() is called once per
    value and finalize() once at the end.
    """

    def __init__(self):
        self.M = 0.0  # running mean of the values seen so far
        self.S = 0.0  # running sum of squared deviations from the mean
        self.k = 0    # number of non-null values seen so far

    def step(self, value):
        """Fold one value into the running statistics.

        None is ignored. Any other value that cannot be converted to a
        float raises an Exception naming the offending value.
        """
        # Ignore nulls
        if value is None:
            return
        try:
            val = float(value)
        except (TypeError, ValueError):
            # TODO propagate udf errors to console
            raise Exception("Data is not numeric when calculating stddev (%s)" % value)

        tM = self.M
        self.k += 1
        self.M += ((val - tM) / self.k)
        self.S += ((val - tM) * (val - self.M))

    def finalize(self):
        """Return the sample standard deviation, or None with fewer than two values."""
        # The sample formula divides by k-1, which is zero for a single value
        if self.k <= 1:
            return None
        return math.sqrt(self.S / (self.k-1))
|
||||
|
||||
class FunctionType(object):
    """Tags stored in UserFunctionDef.func_type to label a user function."""

    # Scalar function
    REGULAR = 1
    # Aggregate function
    AGG = 2
|
||||
|
||||
class UserFunctionDef(object):
    """Describes one user-defined SQL function.

    Attributes:
        func_type: one of the FunctionType constants.
        name: name under which the function is registered.
        usage: one-line usage string shown when listing functions.
        description: free-text description shown when listing functions.
        func_or_obj: the callable or class that implements the function.
        param_count: number of parameters the function is registered with.
    """

    def __init__(self,func_type,name,usage,description,func_or_obj,param_count):
        self.func_type = func_type
        self.name = name
        self.usage = usage
        self.description = description
        self.func_or_obj = func_or_obj
        self.param_count = param_count

    def __repr__(self):
        # Keeps error messages that format a definition readable instead of
        # showing the default object address.
        return "UserFunctionDef(func_type=%r, name=%r, usage=%r, param_count=%r)" % (
            self.func_type, self.name, self.usage, self.param_count)
|
||||
|
||||
# Table of all user functions. Each entry carries the function's type, name,
# usage string, description, implementation and parameter count.
user_functions = [
    UserFunctionDef(FunctionType.REGULAR,
                    "regexp","regexp(<regular_expression>,<expr>) = <1|0>",
                    "Find regexp in string expression. Returns 1 if found or 0 if not",
                    regexp,
                    2),
    # Usage lists the parameters in the order sha() actually takes them:
    # expression, algorithm, encoding.
    UserFunctionDef(FunctionType.REGULAR,
                    "sha","sha(<expr>,<algorithm>,<encoding>) = <hex-string-of-sha>",
                    "Calculate sha of some expression. Algorithm can be one of 1,224,256,384,512. For now encoding must be manually provided. Will use the input encoding automatically in the future.",
                    sha,
                    3),
    UserFunctionDef(FunctionType.REGULAR,
                    "sha1","sha1(<expr>,<encoding>) = <hex-string-of-sha>",
                    "Calculate sha1 of some expression. For now encoding must be manually provided. Will be taken automatically from the input encoding in the future.",
                    sha1,
                    2),
    UserFunctionDef(FunctionType.REGULAR,
                    "md5","md5(<expr>,<encoding>) = <hex-string-of-md5>",
                    "Calculate md5 of expression. Returns a hex-string of the result. Currently requires to manually provide the encoding of the data. Will be taken automatically from the input encoding in the future.",
                    md5,
                    2),
    UserFunctionDef(FunctionType.REGULAR,
                    "sqrt","sqrt(<expr>) = <square-root>",
                    "Calculate the square root of the expression",
                    sqrt,
                    1),
    UserFunctionDef(FunctionType.REGULAR,
                    "power","power(<expr1>,<expr2>) = <expr1-to-the-power-of-expr2>",
                    "Raise expr1 to the power of expr2",
                    power,
                    2),
    UserFunctionDef(FunctionType.AGG,
                    "percentile","percentile(<expr>,<percentile-in-the-range-0-to-1>) = <percentile-value>",
                    "Calculate the strict percentile of a set of values.",
                    StrictPercentile,
                    2),
    UserFunctionDef(FunctionType.AGG,
                    "stddev_pop","stddev_pop(<expr>) = <stddev-value>",
                    "Calculate the population standard deviation of a set of values",
                    StdevPopulation,
                    1),
    UserFunctionDef(FunctionType.AGG,
                    "stddev_sample","stddev_sample(<expr>) = <stddev-value>",
                    "Calculate the sample standard deviation of a set of values",
                    StdevSample,
                    1)
]
|
||||
|
||||
def print_user_functions():
    """Print the name, usage and description of every entry in user_functions."""
    for udf in user_functions:
        report_lines = (
            "Function: %s" % udf.name,
            " Usage: %s" % udf.usage,
            " Description: %s" % udf.description,
        )
        print("\n".join(report_lines))
|
||||
|
||||
class Sqlite3DBResults(object):
    """Plain holder pairing query column names with the query results."""

    def __init__(self,query_column_names,results):
        # Both values are stored exactly as given
        self.results = results
        self.query_column_names = query_column_names
|
||||
|
||||
class Sqlite3DB(object):
|
||||
|
||||
def __init__(self, show_sql=SHOW_SQL):
|
||||
@ -169,11 +312,13 @@ class Sqlite3DB(object):
|
||||
raise ValueError('Unknown store-db-to-disk method %s' % method)
|
||||
|
||||
def add_user_functions(self):
    """Register every entry of ``user_functions`` on ``self.conn``.

    Classes are registered as aggregates and any other callable as a
    scalar function, each with the parameter count declared in its
    definition. The table is the only source of registrations: separate
    hard-coded one-argument registrations of sha1/md5 would bind names to
    functions that require two arguments.

    Raises:
        Exception: if a definition's ``func_or_obj`` is neither a class nor
            a callable.
    """
    for udf in user_functions:
        target = udf.func_or_obj
        if isinstance(target, type):
            self.conn.create_aggregate(udf.name,udf.param_count,target)
        elif callable(target):
            self.conn.create_function(udf.name,udf.param_count,target)
        else:
            raise Exception("Invalid user function definition %s" % str(udf))
|
||||
|
||||
def is_numeric_type(self, column_type):
|
||||
return column_type in self.numeric_column_types
|
||||
@ -1791,6 +1936,8 @@ def run_standalone():
|
||||
help="Output encoding. Defaults to 'none', leading to selecting the system/terminal encoding")
|
||||
output_data_option_group.add_option("-W","--output-quoting-mode",dest="output_quoting_mode",default="minimal",
|
||||
help="Output quoting mode. Possible values are all, minimal, nonnumeric and none. Note the slightly misleading parameter name, and see the matching -w parameter for input quoting.")
|
||||
output_data_option_group.add_option("-L","--list-user-functions",dest="list_user_functions",default=False,action="store_true",
|
||||
help="List all user functions")
|
||||
parser.add_option_group(output_data_option_group)
|
||||
#-----------------------------------------------
|
||||
query_option_group = OptionGroup(parser,"Query Related Options")
|
||||
@ -1808,6 +1955,11 @@ def run_standalone():
|
||||
sys.exit(0)
|
||||
|
||||
###
|
||||
|
||||
if options.list_user_functions:
|
||||
print_user_functions()
|
||||
sys.exit(0)
|
||||
|
||||
if len(args) == 0 and options.query_filename is None:
|
||||
print_credentials()
|
||||
print("Must provide at least one query in the command line, or through a file with the -q parameter", file=sys.stderr)
|
||||
|
@ -89,7 +89,7 @@ Usage:
|
||||
|
||||
Its purpose is to bring SQL expressive power to manipulating text data using the Linux command line.
|
||||
|
||||
Basic usage is q "<sql-like query>" where table names are just regular file names (Use - to read from standard input)
|
||||
Basic usage is q "<sql like query>" where table names are just regular file names (Use - to read from standard input)
|
||||
When the input contains a header row, use -H, and column names will be set according to the header row content. If there isn't a header row, then columns will automatically be named c1..cN.
|
||||
|
||||
Column types are detected automatically. Use -A in order to see the column name/type analysis.
|
||||
@ -133,6 +133,8 @@ Options:
|
||||
-d DELIMITER, --delimiter=DELIMITER
|
||||
Field delimiter. If none specified, then space is used
|
||||
as the delimiter.
|
||||
-p, --pipe-delimited
|
||||
Same as -d '|'. Added for convenience and readability
|
||||
-t, --tab-delimited
|
||||
Same as -d <tab>. Just a shorthand for handling
|
||||
standard tab delimited file You can use $'\t' if you
|
||||
@ -186,6 +188,8 @@ Options:
|
||||
Field delimiter for output. If none specified, then
|
||||
the -d delimiter is used if present, or space if no
|
||||
delimiter is specified
|
||||
-P, --pipe-delimited-output
|
||||
Same as -D '|'. Added for convenience and readability.
|
||||
-T, --tab-delimited-output
|
||||
Same as -D <tab>. Just a shorthand for outputting tab
|
||||
delimited output. You can use -D $'\t' if you want.
|
||||
@ -210,6 +214,8 @@ Options:
|
||||
nonnumeric and none. Note the slightly misleading
|
||||
parameter name, and see the matching -w parameter for
|
||||
input quoting.
|
||||
-L, --list-user-functions
|
||||
List all user functions
|
||||
|
||||
Query Related Options:
|
||||
-q QUERY_FILENAME, --query-filename=QUERY_FILENAME
|
||||
|
146
test/test-suite
146
test/test-suite
@ -11,7 +11,6 @@
|
||||
#
|
||||
|
||||
import unittest
|
||||
import pytest
|
||||
import random
|
||||
import json
|
||||
from json import JSONEncoder
|
||||
@ -283,34 +282,6 @@ class BasicTests(AbstractQTestCase):
|
||||
|
||||
self.cleanup(tmpfile)
|
||||
|
||||
def test_regexp_int_data_handling(self):
|
||||
tmpfile = self.create_file_with_data(sample_data_no_header)
|
||||
|
||||
cmd = Q_EXECUTABLE + ' -d , "select c2 from %s where regexp(\'^1\',c2)"' % tmpfile.name
|
||||
retcode, o, e = run_command(cmd)
|
||||
|
||||
self.assertEqual(retcode, 0)
|
||||
self.assertEqual(len(o), 1)
|
||||
self.assertEqual(len(e), 0)
|
||||
|
||||
self.assertEqual(o[0],six.b("1"))
|
||||
|
||||
self.cleanup(tmpfile)
|
||||
|
||||
def test_regexp_null_data_handling(self):
|
||||
tmpfile = self.create_file_with_data(sample_data_no_header)
|
||||
|
||||
cmd = Q_EXECUTABLE + ' -d , "select count(*) from %s where regexp(\'^\',c2)"' % tmpfile.name
|
||||
retcode, o, e = run_command(cmd)
|
||||
|
||||
self.assertEqual(retcode, 0)
|
||||
self.assertEqual(len(o), 1)
|
||||
self.assertEqual(len(e), 0)
|
||||
|
||||
self.assertEqual(o[0],six.b("2"))
|
||||
|
||||
self.cleanup(tmpfile)
|
||||
|
||||
def test_select_one_column(self):
|
||||
tmpfile = self.create_file_with_data(sample_data_no_header)
|
||||
|
||||
@ -1525,6 +1496,55 @@ class BasicTests(AbstractQTestCase):
|
||||
|
||||
|
||||
class UserFunctionTests(AbstractQTestCase):
|
||||
def test_regexp_int_data_handling(self):
    # regexp() is applied to column c2 and must match the single value starting with 1
    tmpfile = self.create_file_with_data(sample_data_no_header)

    q_arguments = ' -d , "select c2 from %s where regexp(\'^1\',c2)"' % tmpfile.name
    retcode, o, e = run_command(Q_EXECUTABLE + q_arguments)

    self.assertEqual(retcode, 0)
    self.assertEqual(len(o), 1)
    self.assertEqual(len(e), 0)

    first_line = o[0]
    self.assertEqual(first_line,six.b("1"))

    self.cleanup(tmpfile)
|
||||
|
||||
def test_percentile_func(self):
    # Groups 1000..1999 by the c1 prefix and checks the 0, 0.5 and 1 percentiles of each group
    cmd = 'seq 1000 1999 | %s "select substr(c1,0,3),percentile(c1,0),percentile(c1,0.5),percentile(c1,1) from - group by substr(c1,0,3)" -c 1' % Q_EXECUTABLE
    retcode, o, e = run_command(cmd)

    self.assertEqual(retcode, 0)
    self.assertEqual(len(o), 10)
    self.assertEqual(len(e), 0)

    separator = six.b(" ")
    rows = [line.split(separator) for line in o]
    group_labels = [int(fields[0]) for fields in rows]
    minimum_values = [float(fields[1]) for fields in rows]
    median_values = [float(fields[2]) for fields in rows]
    max_values = [float(fields[3]) for fields in rows]

    base_values = list(range(1000,2000,100))

    self.assertEqual(group_labels,list(range(10,20)))
    self.assertEqual(minimum_values,base_values)
    self.assertEqual(median_values,[base + 49.5 for base in base_values])
    self.assertEqual(max_values,[base + 99 for base in base_values])
|
||||
|
||||
def test_regexp_null_data_handling(self):
    # Counts the rows whose c2 matches '^'; the expected count is 2
    tmpfile = self.create_file_with_data(sample_data_no_header)

    q_arguments = ' -d , "select count(*) from %s where regexp(\'^\',c2)"' % tmpfile.name
    retcode, o, e = run_command(Q_EXECUTABLE + q_arguments)

    self.assertEqual(retcode, 0)
    self.assertEqual(len(o), 1)
    self.assertEqual(len(e), 0)

    matching_row_count = o[0]
    self.assertEqual(matching_row_count,six.b("2"))

    self.cleanup(tmpfile)
|
||||
|
||||
def test_md5_function(self):
|
||||
cmd = 'seq 1 4 | %s -c 1 -d , "select c1,md5(c1,\'utf-8\') from -"' % Q_EXECUTABLE
|
||||
retcode, o, e = run_command(cmd)
|
||||
@ -1538,6 +1558,74 @@ class UserFunctionTests(AbstractQTestCase):
|
||||
self.assertEqual(tuple(o[2].split(six.b(','),1)),(six.b('3'),six.b('eccbc87e4b5ce2fe28308fd9f2a7baf3')))
|
||||
self.assertEqual(tuple(o[3].split(six.b(','),1)),(six.b('4'),six.b('a87ff679a2f3e71d9181a67b7542122c')))
|
||||
|
||||
def test_stddev_functions(self):
    # Checks stddev_pop and stddev_sample over a fixed set of nine values
    tmpfile = self.create_file_with_data(six.b("\n".join(map(str,[234,354,3234,123,4234,234,634,56,65]))))

    cmd = '%s -c 1 -d , "select round(stddev_pop(c1),10),round(stddev_sample(c1),10) from %s"' % (Q_EXECUTABLE,tmpfile.name)
    retcode, o, e = run_command(cmd)

    self.assertEqual(retcode,0)
    self.assertEqual(len(o),1)
    self.assertEqual(len(e),0)

    # Output lines are bytes, so the expected value goes through six.b like
    # in the other tests; a plain str never compares equal on Python 3.
    self.assertEqual(o[0],six.b('1479.7015464838,1569.4604964764'))

    self.cleanup(tmpfile)
|
||||
|
||||
def test_sqrt_function(self):
    # sqrt() of 1..5, rounded to 10 decimal places
    cmd = 'seq 1 5 | %s -c 1 -d , "select round(sqrt(c1),10) from -"' % Q_EXECUTABLE
    retcode, o, e = run_command(cmd)

    self.assertEqual(retcode,0)
    self.assertEqual(len(o),5)
    self.assertEqual(len(e),0)

    expected_lines = ['1.0', '1.4142135624', '1.7320508076', '2.0', '2.2360679775']
    for index, expected in enumerate(expected_lines):
        self.assertEqual(o[index],six.b(expected))
|
||||
|
||||
def test_power_function(self):
    # power(c1,2.5) for 1..5, rounded to 10 decimal places
    cmd = 'seq 1 5 | %s -c 1 -d , "select round(power(c1,2.5),10) from -"' % Q_EXECUTABLE
    retcode, o, e = run_command(cmd)

    self.assertEqual(retcode,0)
    self.assertEqual(len(o),5)
    self.assertEqual(len(e),0)

    expected_lines = ['1.0', '5.6568542495', '15.5884572681', '32.0', '55.9016994375']
    for index, expected in enumerate(expected_lines):
        self.assertEqual(o[index],six.b(expected))
|
||||
|
||||
def test_sha1_function(self):
    # sha1() of the values 1..4 with an explicit utf-8 encoding
    cmd = 'seq 1 4 | %s -c 1 -d , "select c1,sha1(c1,\'utf-8\') from -"' % Q_EXECUTABLE
    retcode, o, e = run_command(cmd)

    self.assertEqual(retcode,0)
    self.assertEqual(len(o),4)
    self.assertEqual(len(e),0)

    expected_lines = [
        '1,356a192b7913b04c54574d18c28d46e6395428ab',
        '2,da4b9237bacccdf19c0760cab7aec4a8359010b0',
        '3,77de68daecd823babbb58edb1c8e14d7106e83bb',
        '4,1b6453892473a467d07372d45eb05abc2031647a',
    ]
    for index, expected in enumerate(expected_lines):
        self.assertEqual(o[index],six.b(expected))
|
||||
|
||||
def test_sha_function(self):
    # sha() of the values 1..4 with algorithms 1, 224 and 256 and an explicit utf-8 encoding
    cmd = 'seq 1 4 | %s -c 1 -d , "select c1,sha(c1,1,\'utf-8\') as sha1,sha(c1,224,\'utf-8\') as sha224,sha(c1,256,\'utf-8\') as sha256 from -"' % Q_EXECUTABLE
    retcode, o, e = run_command(cmd)

    self.assertEqual(retcode,0)
    self.assertEqual(len(o),4)
    self.assertEqual(len(e),0)

    expected_lines = [
        '1,356a192b7913b04c54574d18c28d46e6395428ab,e25388fde8290dc286a6164fa2d97e551b53498dcbf7bc378eb1f178,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b',
        '2,da4b9237bacccdf19c0760cab7aec4a8359010b0,58b2aaa0bfae7acc021b3260e941117b529b2e69de878fd7d45c61a9,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35',
        '3,77de68daecd823babbb58edb1c8e14d7106e83bb,4cfc3a1811fe40afa401b25ef7fa0379f1f7c1930a04f8755d678474,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce',
        '4,1b6453892473a467d07372d45eb05abc2031647a,271f93f45e9b4067327ed5c8cd30a034730aaace4382803c3e1d6c2f,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a',
    ]
    for index, expected in enumerate(expected_lines):
        self.assertEqual(o[index],six.b(expected))
|
||||
|
||||
|
||||
class MultiHeaderTests(AbstractQTestCase):
|
||||
def test_output_header_when_multiple_input_headers_exist(self):
|
||||
|
Loading…
Reference in New Issue
Block a user