1
1
mirror of https://github.com/harelba/q.git synced 2024-10-04 06:48:13 +03:00

option to list udfs, and added new functions

This commit is contained in:
Harel Ben-Attia 2020-09-13 17:29:53 +03:00
parent 0473927e94
commit e85c4c50a0
3 changed files with 290 additions and 44 deletions

180
bin/q.py
View File

@ -31,6 +31,8 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import OrderedDict
q_version = '2.0.16'
__all__ = [ 'QTextAsData' ]
@ -72,11 +74,26 @@ def get_stdout_encoding(encoding_override=None):
SHOW_SQL = False
def sha1(data):
if not isinstance(data,str) and not isinstance(data,unicode):
return hashlib.sha1(str(data)).hexdigest()
return hashlib.sha1(data).hexdigest()
# Maps the numeric <algorithm> argument of the sha() user function to the
# hashlib constructor implementing it.
sha_algorithms = {
    1: hashlib.sha1,
    224: hashlib.sha224,
    256: hashlib.sha256,
    384: hashlib.sha384,  # was mistakenly keyed as 386, making sha(...,384,...) fail
    512: hashlib.sha512,
}
def sha(data,algorithm,encoding):
    """Return the hex SHA digest of data, or None if it cannot be computed.

    data is converted to text and encoded using encoding before hashing.
    algorithm is looked up in sha_algorithms to select the hash constructor.

    Any failure (for example an algorithm missing from sha_algorithms, or an
    encoding error) is reported and None is returned instead of raising.
    """
    try:
        f = sha_algorithms[algorithm]
        return f(six.text_type(data).encode(encoding)).hexdigest()
    except Exception as e:
        # Best effort: report on stderr so the message never gets mixed into
        # the query results that are written to stdout.
        print(e, file=sys.stderr)
        return None
# For backward compatibility
def sha1(data,encoding):
    """Return the hex SHA-1 digest of data by delegating to sha() with algorithm 1."""
    return sha(data,algorithm=1,encoding=encoding)
# TODO Add caching of compiled regexps - Will be added after benchmarking capability is baked in
def regexp(regular_expression, data):
if data is not None:
if not isinstance(data, str) and not isinstance(data, unicode):
@ -85,15 +102,16 @@ def regexp(regular_expression, data):
else:
return False
def md5(data,encoding='utf-8'):
def md5(data,encoding):
    """Return the hex MD5 digest of data.

    data is converted to text and encoded using encoding before hashing.
    """
    encoded = six.text_type(data).encode(encoding)
    return hashlib.md5(encoded).hexdigest()
class Sqlite3DBResults(object):
def __init__(self,query_column_names,results):
self.query_column_names = query_column_names
self.results = results
def sqrt(data):
    """Return the square root of data.

    A None input (SQL NULL) yields None instead of raising, so a single
    NULL cell does not abort the whole query.
    """
    if data is None:
        return None
    return math.sqrt(data)
def power(data,p):
    """Return data raised to the power p.

    If either argument is None (SQL NULL) the result is None instead of a
    TypeError, so a single NULL cell does not abort the whole query.
    """
    if data is None or p is None:
        return None
    return data**p
def percentile(l, p):
# TODO Alpha implementation, need to provide multiple interpolation methods, and add tests
@ -106,6 +124,7 @@ def percentile(l, p):
return l[int(k)]
return (c-k) * l[int(f)] + (k-f) * l[int(c)]
# TODO Streaming Percentile to prevent memory consumption blowup for large datasets
class StrictPercentile(object):
def __init__(self):
self.values = []
@ -121,6 +140,130 @@ class StrictPercentile(object):
else:
return percentile(sorted(self.values),self.p)
class StdevPopulation(object):
    """Aggregate that computes the population standard deviation.

    Keeps a running mean (M), a running sum of squared deviations (S) and a
    count (k), updated once per value, so the values themselves are not stored.
    """
    def __init__(self):
        self.M = 0.0  # running mean of the values seen so far
        self.S = 0.0  # running sum of squared deviations from the mean
        self.k = 0    # number of non-null values seen so far

    def step(self, value):
        """Fold one value into the running statistics. None values are ignored.

        Raises Exception if the value cannot be parsed as a number.
        """
        try:
            # Ignore nulls
            if value is None:
                return
            val = float(value)  # non-numeric text raises ValueError, handled below
            tM = self.M
            self.k += 1
            self.M += ((val - tM) / self.k)
            self.S += ((val - tM) * (val - self.M))
        except ValueError:
            # TODO propagate udf errors to console
            raise Exception("Data is not numeric when calculating stddev (%s)" % value)

    def finalize(self):
        """Return the population standard deviation, or None if no values were seen."""
        # Only an empty set divides by zero. A single value has a well defined
        # population standard deviation of 0.
        if self.k < 1:
            return None
        else:
            return math.sqrt(self.S / (self.k))
class StdevSample(object):
    """Aggregate that computes the sample standard deviation (divides by count - 1).

    Keeps a running mean (M), a running sum of squared deviations (S) and a
    count (k), updated once per value.
    """
    def __init__(self):
        self.k = 0
        self.M = 0.0
        self.S = 0.0

    def step(self, value):
        """Fold one value into the running statistics. None values are ignored."""
        # Ignore nulls
        if value is None:
            return
        try:
            sample = float(value)
        except ValueError:
            # TODO propagate udf errors to console
            raise Exception("Data is not numeric when calculating stddev (%s)" % value)
        previous_mean = self.M
        self.k += 1
        self.M += ((sample - previous_mean) / self.k)
        self.S += ((sample - previous_mean) * (sample - self.M))

    def finalize(self):
        """Return the sample standard deviation, or None with fewer than two values."""
        # With k <= 1 the divisor (k-1) would be zero or negative
        if self.k > 1:
            return math.sqrt(self.S / (self.k-1))
        return None
class FunctionType(object):
    """Constants tagging a user function definition as regular or aggregate."""
    REGULAR, AGG = 1, 2
class UserFunctionDef(object):
    """Describes one user defined function: its type, name, help texts and implementation."""
    def __init__(self,func_type,name,usage,description,func_or_obj,param_count):
        self.func_type = func_type      # one of the FunctionType constants
        self.name = name                # name the function is registered under
        self.usage = usage              # one-line usage text shown when listing functions
        self.description = description  # longer text shown when listing functions
        self.func_or_obj = func_or_obj  # the function, or the aggregate class
        self.param_count = param_count  # number of parameters the function is registered with

    def __repr__(self):
        # Definitions get formatted into error messages, so make them readable
        return "UserFunctionDef(func_type=%r, name=%r, param_count=%r, func_or_obj=%r)" % (
            self.func_type, self.name, self.param_count, self.func_or_obj)
# Declarations of all user defined functions. The usage and description texts
# are what gets printed when the functions are listed.
user_functions = [
    UserFunctionDef(FunctionType.REGULAR,
                    "regexp","regexp(<regular_expression>,<expr>) = <1|0>",
                    "Find regexp in string expression. Returns 1 if found or 0 if not",
                    regexp,
                    2),
    # Usage lists the arguments in the order sha() actually takes them: data, algorithm, encoding
    UserFunctionDef(FunctionType.REGULAR,
                    "sha","sha(<expr>,<algorithm>,<encoding>) = <hex-string-of-sha>",
                    "Calculate sha of some expression. Algorithm can be one of 1,224,256,384,512. For now encoding must be manually provided. Will use the input encoding automatically in the future.",
                    sha,
                    3),
    UserFunctionDef(FunctionType.REGULAR,
                    "sha1","sha1(<expr>,<encoding>) = <hex-string-of-sha>",
                    "Calculate sha1 of some expression. For now encoding must be manually provided. Will be taken automatically from the input encoding in the future.",
                    sha1,
                    2),
    UserFunctionDef(FunctionType.REGULAR,
                    "md5","md5(<expr>,<encoding>) = <hex-string-of-md5>",
                    "Calculate md5 of expression. Returns a hex-string of the result. Currently requires to manually provide the encoding of the data. Will be taken automatically from the input encoding in the future.",
                    md5,
                    2),
    UserFunctionDef(FunctionType.REGULAR,
                    "sqrt","sqrt(<expr>) = <square-root>",
                    "Calculate the square root of the expression",
                    sqrt,
                    1),
    UserFunctionDef(FunctionType.REGULAR,
                    "power","power(<expr1>,<expr2>) = <expr1-to-the-power-of-expr2>",
                    "Raise expr1 to the power of expr2",
                    power,
                    2),
    UserFunctionDef(FunctionType.AGG,
                    "percentile","percentile(<expr>,<percentile-in-the-range-0-to-1>) = <percentile-value>",
                    "Calculate the strict percentile of a set of a values.",
                    StrictPercentile,
                    2),
    UserFunctionDef(FunctionType.AGG,
                    "stddev_pop","stddev_pop(<expr>) = <stddev-value>",
                    "Calculate the population standard deviation of a set of values",
                    StdevPopulation,
                    1),
    UserFunctionDef(FunctionType.AGG,
                    "stddev_sample","stddev_sample(<expr>) = <stddev-value>",
                    "Calculate the sample standard deviation of a set of values",
                    StdevSample,
                    1)
]
def print_user_functions():
    """Print the name, usage and description of every entry in user_functions."""
    for udf in user_functions:
        fields = (
            ("Function", udf.name),
            (" Usage", udf.usage),
            (" Description", udf.description),
        )
        for label, text in fields:
            print("%s: %s" % (label, text))
class Sqlite3DBResults(object):
    """Pairs the column names of a query with the results of that query."""
    def __init__(self,query_column_names,results):
        self.query_column_names, self.results = query_column_names, results
class Sqlite3DB(object):
def __init__(self, show_sql=SHOW_SQL):
@ -169,11 +312,13 @@ class Sqlite3DB(object):
raise ValueError('Unknown store-db-to-disk method %s' % method)
def add_user_functions(self):
self.conn.create_function("regexp", 2, regexp)
self.conn.create_function("sha1", 1, sha1)
self.conn.create_function("md5", 2, md5)
self.conn.create_function("md5", 1, md5)
self.conn.create_aggregate("percentile",2,StrictPercentile)
# Register each declared user function according to its declared type,
# instead of guessing the type from the Python type of the implementation.
for udf in user_functions:
    if udf.func_type == FunctionType.AGG:
        self.conn.create_aggregate(udf.name,udf.param_count,udf.func_or_obj)
    elif udf.func_type == FunctionType.REGULAR:
        self.conn.create_function(udf.name,udf.param_count,udf.func_or_obj)
    else:
        raise Exception("Invalid user function definition %s" % str(udf))
def is_numeric_type(self, column_type):
    """Tell whether column_type is listed in this instance's numeric_column_types."""
    numeric_types = self.numeric_column_types
    return column_type in numeric_types
@ -1791,6 +1936,8 @@ def run_standalone():
help="Output encoding. Defaults to 'none', leading to selecting the system/terminal encoding")
output_data_option_group.add_option("-W","--output-quoting-mode",dest="output_quoting_mode",default="minimal",
help="Output quoting mode. Possible values are all, minimal, nonnumeric and none. Note the slightly misleading parameter name, and see the matching -w parameter for input quoting.")
output_data_option_group.add_option("-L","--list-user-functions",dest="list_user_functions",default=False,action="store_true",
help="List all user functions")
parser.add_option_group(output_data_option_group)
#-----------------------------------------------
query_option_group = OptionGroup(parser,"Query Related Options")
@ -1808,6 +1955,11 @@ def run_standalone():
sys.exit(0)
###
if options.list_user_functions:
print_user_functions()
sys.exit(0)
if len(args) == 0 and options.query_filename is None:
print_credentials()
print("Must provide at least one query in the command line, or through a file with the -q parameter", file=sys.stderr)

View File

@ -89,7 +89,7 @@ Usage:
Its purpose is to bring SQL expressive power to manipulating text data using the Linux command line.
Basic usage is q "<sql-like query>" where table names are just regular file names (Use - to read from standard input)
Basic usage is q "<sql like query>" where table names are just regular file names (Use - to read from standard input)
When the input contains a header row, use -H, and column names will be set according to the header row content. If there isn't a header row, then columns will automatically be named c1..cN.
Column types are detected automatically. Use -A in order to see the column name/type analysis.
@ -133,6 +133,8 @@ Options:
-d DELIMITER, --delimiter=DELIMITER
Field delimiter. If none specified, then space is used
as the delimiter.
-p, --pipe-delimited
Same as -d '|'. Added for convenience and readability
-t, --tab-delimited
Same as -d <tab>. Just a shorthand for handling
standard tab delimited file. You can use $'\t' if you
@ -186,6 +188,8 @@ Options:
Field delimiter for output. If none specified, then
the -d delimiter is used if present, or space if no
delimiter is specified
-P, --pipe-delimited-output
Same as -D '|'. Added for convenience and readability.
-T, --tab-delimited-output
Same as -D <tab>. Just a shorthand for outputting tab
delimited output. You can use -D $'\t' if you want.
@ -210,6 +214,8 @@ Options:
nonnumeric and none. Note the slightly misleading
parameter name, and see the matching -w parameter for
input quoting.
-L, --list-user-functions
List all user functions
Query Related Options:
-q QUERY_FILENAME, --query-filename=QUERY_FILENAME

View File

@ -11,7 +11,6 @@
#
import unittest
import pytest
import random
import json
from json import JSONEncoder
@ -283,34 +282,6 @@ class BasicTests(AbstractQTestCase):
self.cleanup(tmpfile)
def test_regexp_int_data_handling(self):
tmpfile = self.create_file_with_data(sample_data_no_header)
cmd = Q_EXECUTABLE + ' -d , "select c2 from %s where regexp(\'^1\',c2)"' % tmpfile.name
retcode, o, e = run_command(cmd)
self.assertEqual(retcode, 0)
self.assertEqual(len(o), 1)
self.assertEqual(len(e), 0)
self.assertEqual(o[0],six.b("1"))
self.cleanup(tmpfile)
def test_regexp_null_data_handling(self):
tmpfile = self.create_file_with_data(sample_data_no_header)
cmd = Q_EXECUTABLE + ' -d , "select count(*) from %s where regexp(\'^\',c2)"' % tmpfile.name
retcode, o, e = run_command(cmd)
self.assertEqual(retcode, 0)
self.assertEqual(len(o), 1)
self.assertEqual(len(e), 0)
self.assertEqual(o[0],six.b("2"))
self.cleanup(tmpfile)
def test_select_one_column(self):
tmpfile = self.create_file_with_data(sample_data_no_header)
@ -1525,6 +1496,55 @@ class BasicTests(AbstractQTestCase):
class UserFunctionTests(AbstractQTestCase):
def test_regexp_int_data_handling(self):
    """regexp() filtering on column c2 with '^1' yields exactly one row, '1'."""
    tmpfile = self.create_file_with_data(sample_data_no_header)

    query = '"select c2 from %s where regexp(\'^1\',c2)"' % tmpfile.name
    cmd = Q_EXECUTABLE + ' -d , ' + query
    retcode, stdout_lines, stderr_lines = run_command(cmd)

    self.assertEqual(retcode, 0)
    self.assertEqual(len(stdout_lines), 1)
    self.assertEqual(len(stderr_lines), 0)

    self.assertEqual(stdout_lines[0],six.b("1"))

    self.cleanup(tmpfile)
def test_percentile_func(self):
    """percentile() at 0, 0.5 and 1 gives min, median and max for each group of 100 values."""
    cmd = 'seq 1000 1999 | %s "select substr(c1,0,3),percentile(c1,0),percentile(c1,0.5),percentile(c1,1) from - group by substr(c1,0,3)" -c 1' % Q_EXECUTABLE
    retcode, o, e = run_command(cmd)

    self.assertEqual(retcode, 0)
    self.assertEqual(len(o), 10)
    self.assertEqual(len(e), 0)

    # Each output line is: <group label> <p0> <p50> <p100>
    rows = [line.split(six.b(" ")) for line in o]
    group_labels = [int(cells[0]) for cells in rows]
    minimum_values = [float(cells[1]) for cells in rows]
    median_values = [float(cells[2]) for cells in rows]
    max_values = [float(cells[3]) for cells in rows]

    # First value of every group: 1000, 1100, ..., 1900
    base_values = list(range(1000,2000,100))

    self.assertEqual(group_labels,list(range(10,20)))
    self.assertEqual(minimum_values,base_values)
    self.assertEqual(median_values,[base + 49.5 for base in base_values])
    self.assertEqual(max_values,[base + 99 for base in base_values])
def test_regexp_null_data_handling(self):
    """Counting rows where regexp('^',c2) holds must succeed and report 2."""
    tmpfile = self.create_file_with_data(sample_data_no_header)

    query = '"select count(*) from %s where regexp(\'^\',c2)"' % tmpfile.name
    cmd = Q_EXECUTABLE + ' -d , ' + query
    retcode, stdout_lines, stderr_lines = run_command(cmd)

    self.assertEqual(retcode, 0)
    self.assertEqual(len(stdout_lines), 1)
    self.assertEqual(len(stderr_lines), 0)

    self.assertEqual(stdout_lines[0],six.b("2"))

    self.cleanup(tmpfile)
def test_md5_function(self):
cmd = 'seq 1 4 | %s -c 1 -d , "select c1,md5(c1,\'utf-8\') from -"' % Q_EXECUTABLE
retcode, o, e = run_command(cmd)
@ -1538,6 +1558,74 @@ class UserFunctionTests(AbstractQTestCase):
self.assertEqual(tuple(o[2].split(six.b(','),1)),(six.b('3'),six.b('eccbc87e4b5ce2fe28308fd9f2a7baf3')))
self.assertEqual(tuple(o[3].split(six.b(','),1)),(six.b('4'),six.b('a87ff679a2f3e71d9181a67b7542122c')))
def test_stddev_functions(self):
    """stddev_pop() and stddev_sample() return the expected values for a fixed data set."""
    tmpfile = self.create_file_with_data(six.b("\n".join(map(str,[234,354,3234,123,4234,234,634,56,65]))))
    cmd = '%s -c 1 -d , "select round(stddev_pop(c1),10),round(stddev_sample(c1),10) from %s"' % (Q_EXECUTABLE,tmpfile.name)
    retcode, o, e = run_command(cmd)

    self.assertEqual(retcode,0)
    self.assertEqual(len(o),1)
    self.assertEqual(len(e),0)

    # Output lines are compared as bytes, like in the other tests of this
    # class; a plain str never equals bytes on Python 3.
    self.assertEqual(o[0],six.b('1479.7015464838,1569.4604964764'))

    self.cleanup(tmpfile)
def test_sqrt_function(self):
    """sqrt() of 1..5, rounded to 10 digits, matches the known square roots."""
    cmd = 'seq 1 5 | %s -c 1 -d , "select round(sqrt(c1),10) from -"' % Q_EXECUTABLE
    retcode, o, e = run_command(cmd)

    self.assertEqual(retcode,0)
    self.assertEqual(len(o),5)
    self.assertEqual(len(e),0)

    expected_lines = ['1.0','1.4142135624','1.7320508076','2.0','2.2360679775']
    for index, expected in enumerate(expected_lines):
        self.assertEqual(o[index],six.b(expected))
def test_power_function(self):
    """power(c1,2.5) of 1..5, rounded to 10 digits, matches the known values."""
    cmd = 'seq 1 5 | %s -c 1 -d , "select round(power(c1,2.5),10) from -"' % Q_EXECUTABLE
    retcode, o, e = run_command(cmd)

    self.assertEqual(retcode,0)
    self.assertEqual(len(o),5)
    self.assertEqual(len(e),0)

    expected_lines = ['1.0','5.6568542495','15.5884572681','32.0','55.9016994375']
    for index, expected in enumerate(expected_lines):
        self.assertEqual(o[index],six.b(expected))
def test_sha1_function(self):
    """sha1(c1,'utf-8') of 1..4 matches the known SHA-1 hex digests."""
    cmd = 'seq 1 4 | %s -c 1 -d , "select c1,sha1(c1,\'utf-8\') from -"' % Q_EXECUTABLE
    retcode, o, e = run_command(cmd)

    self.assertEqual(retcode,0)
    self.assertEqual(len(o),4)
    self.assertEqual(len(e),0)

    expected_lines = [
        '1,356a192b7913b04c54574d18c28d46e6395428ab',
        '2,da4b9237bacccdf19c0760cab7aec4a8359010b0',
        '3,77de68daecd823babbb58edb1c8e14d7106e83bb',
        '4,1b6453892473a467d07372d45eb05abc2031647a',
    ]
    for index, expected in enumerate(expected_lines):
        self.assertEqual(o[index],six.b(expected))
def test_sha_function(self):
    """sha() with algorithms 1, 224 and 256 over 1..4 matches the known hex digests."""
    cmd = 'seq 1 4 | %s -c 1 -d , "select c1,sha(c1,1,\'utf-8\') as sha1,sha(c1,224,\'utf-8\') as sha224,sha(c1,256,\'utf-8\') as sha256 from -"' % Q_EXECUTABLE
    retcode, o, e = run_command(cmd)

    self.assertEqual(retcode,0)
    self.assertEqual(len(o),4)
    self.assertEqual(len(e),0)

    # One line per input value: <value>,<sha1>,<sha224>,<sha256>
    expected_lines = [
        '1,356a192b7913b04c54574d18c28d46e6395428ab,e25388fde8290dc286a6164fa2d97e551b53498dcbf7bc378eb1f178,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b',
        '2,da4b9237bacccdf19c0760cab7aec4a8359010b0,58b2aaa0bfae7acc021b3260e941117b529b2e69de878fd7d45c61a9,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35',
        '3,77de68daecd823babbb58edb1c8e14d7106e83bb,4cfc3a1811fe40afa401b25ef7fa0379f1f7c1930a04f8755d678474,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce',
        '4,1b6453892473a467d07372d45eb05abc2031647a,271f93f45e9b4067327ed5c8cd30a034730aaace4382803c3e1d6c2f,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a',
    ]
    for index, expected in enumerate(expected_lines):
        self.assertEqual(o[index],six.b(expected))
class MultiHeaderTests(AbstractQTestCase):
def test_output_header_when_multiple_input_headers_exist(self):