option to list udfs, and added new functions

2024-10-03 22:39:52 +03:00 · 2020-09-13 17:29:53 +03:00 · 2020-09-13 17:29:53 +03:00 · e85c4c50a0
commit e85c4c50a0
parent 0473927e94
3 changed files with 290 additions and 44 deletions
--- a/bin/q.py
+++ b/bin/q.py
@ -31,6 +31,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+from collections import OrderedDict
+
 q_version = '2.0.16'

 __all__ = [ 'QTextAsData' ]
@ -72,11 +74,26 @@ def get_stdout_encoding(encoding_override=None):

 SHOW_SQL = False

-def sha1(data):
-    if not isinstance(data,str) and not isinstance(data,unicode):
-        return hashlib.sha1(str(data)).hexdigest()
-    return hashlib.sha1(data).hexdigest()
+# Maps the <algorithm> argument of the sha() user function to the matching
+# hashlib constructor. Keys are 1 for SHA-1, otherwise the digest size in bits.
+sha_algorithms = {
+    1 : hashlib.sha1,
+    224: hashlib.sha224,
+    256: hashlib.sha256,
+    384: hashlib.sha384,
+    512: hashlib.sha512
+}

+def sha(data,algorithm,encoding):
+    """Return the hex SHA digest of ``data``.
+
+    ``data`` is converted to text and encoded with ``encoding`` before hashing.
+    ``algorithm`` selects the hash through the ``sha_algorithms`` table.
+
+    Best effort: on any failure (unknown algorithm, bad encoding, ...) the error
+    is reported on stderr and None is returned.
+    """
+    try:
+        f = sha_algorithms[algorithm]
+        return f(six.text_type(data).encode(encoding)).hexdigest()
+    except Exception as e:
+        # Report on stderr so the message is not mixed into the query output on stdout
+        print(e, file=sys.stderr)
+        return None
+
+# For backward compatibility
+def sha1(data,encoding):
+    """Return the hex SHA-1 digest of ``data``; delegates to sha() with algorithm 1."""
+    return sha(data,algorithm=1,encoding=encoding)
+
+# TODO Add caching of compiled regexps - Will be added after benchmarking capability is baked in
 def regexp(regular_expression, data):
    if data is not None:
        if not isinstance(data, str) and not isinstance(data, unicode):
@ -85,15 +102,16 @@ def regexp(regular_expression, data):
    else:
        return False

-def md5(data,encoding='utf-8'):
+def md5(data,encoding):
+    # Returns the hex MD5 digest of data, after converting it to text and
+    # encoding it with the given encoding. The encoding argument is mandatory.
    m = hashlib.md5()
    m.update(six.text_type(data).encode(encoding))
    return m.hexdigest()

-class Sqlite3DBResults(object):
-    def __init__(self,query_column_names,results):
-        self.query_column_names = query_column_names
-        self.results = results
+def sqrt(data):
+    """Return the square root of ``data``, or None when ``data`` is None.
+
+    Returning None for None lets a NULL input yield a NULL result instead of
+    raising TypeError from math.sqrt.
+    """
+    if data is None:
+        return None
+    return math.sqrt(data)
+
+def power(data,p):
+    """Return ``data`` raised to the power ``p``.
+
+    If either argument is None the result is None, so a NULL input yields a
+    NULL result instead of a TypeError.
+    """
+    if data is None or p is None:
+        return None
+    return data**p

 def percentile(l, p):
    # TODO Alpha implementation, need to provide multiple interpolation methods, and add tests
@ -106,6 +124,7 @@ def percentile(l, p):
        return l[int(k)]
    return (c-k) * l[int(f)] + (k-f) * l[int(c)]

+# TODO Streaming Percentile to prevent memory consumption blowup for large datasets
 class StrictPercentile(object):
    def __init__(self):
        self.values = []
@ -121,6 +140,130 @@ class StrictPercentile(object):
        else:
            return percentile(sorted(self.values),self.p)

+class StdevPopulation(object):
+    """Aggregate computing the population standard deviation.
+
+    Values are accumulated one at a time: ``M`` is the running mean, ``S`` the
+    running sum of squared deviations from the mean and ``k`` the number of
+    values seen, so the values themselves are never stored.
+    """
+    def __init__(self):
+        self.M = 0.0
+        self.S = 0.0
+        self.k = 0
+
+    def step(self, value):
+        """Feed one value. None is ignored; non-numeric data raises Exception."""
+        try:
+            # Ignore nulls
+            if value is None:
+                return
+            val = float(value) # raises ValueError for non-numeric data, handled below
+            tM = self.M
+            self.k += 1
+            self.M += ((val - tM) / self.k)
+            self.S += ((val - tM) * (val - self.M))
+        except ValueError:
+            # TODO propagate udf errors to console
+            raise Exception("Data is not numeric when calculating stddev (%s)" % value)
+
+    def finalize(self):
+        """Return the population stddev, or None when no values were seen."""
+        # Population stddev divides by k, so only k == 0 must be guarded;
+        # a single value has a well-defined deviation of 0.0
+        if self.k < 1: # avoid division by zero
+            return None
+        else:
+            return math.sqrt(self.S / (self.k))
+
+class StdevSample(object):
+    """Aggregate computing the sample standard deviation (divides by k-1).
+
+    ``M`` holds the running mean, ``S`` the running sum of squared deviations
+    and ``k`` the number of values seen so far.
+    """
+    def __init__(self):
+        self.M = 0.0
+        self.S = 0.0
+        self.k = 0
+
+    def step(self, value):
+        """Feed one value. None is ignored; non-numeric data raises Exception."""
+        # Nulls do not take part in the calculation
+        if value is None:
+            return
+        try:
+            number = float(value)
+        except ValueError:
+            # TODO propagate udf errors to console
+            raise Exception("Data is not numeric when calculating stddev (%s)" % value)
+        previous_mean = self.M
+        self.k += 1
+        self.M = previous_mean + ((number - previous_mean) / self.k)
+        self.S += ((number - previous_mean) * (number - self.M))
+
+    def finalize(self):
+        """Return the sample stddev, or None when fewer than two values were seen."""
+        if self.k > 1:
+            return math.sqrt(self.S / (self.k-1))
+        # k-1 would be zero (or negative), so there is no defined result
+        return None
+
+class FunctionType(object):
+    """Kinds of user function definitions: REGULAR functions and AGG (aggregate) ones."""
+    REGULAR, AGG = 1, 2
+
+class UserFunctionDef(object):
+    """Describes one user function: its type, name, help texts and implementation.
+
+    func_type   -- one of the FunctionType constants
+    name        -- name under which the function is registered
+    usage       -- one-line usage string shown when listing functions
+    description -- free-text description shown when listing functions
+    func_or_obj -- the implementing function, or the aggregate class
+    param_count -- number of parameters the function is registered with
+    """
+    def __init__(self,func_type,name,usage,description,func_or_obj,param_count):
+        self.func_type = func_type
+        self.name = name
+        self.usage = usage
+        self.description = description
+        self.func_or_obj = func_or_obj
+        self.param_count = param_count
+
+    def __repr__(self):
+        # Makes error messages that embed str(udf) identify the offending definition
+        return "UserFunctionDef(func_type=%r,name=%r,usage=%r,param_count=%r)" % (
+            self.func_type,self.name,self.usage,self.param_count)
+
+# Registry of all user functions. Used both for registering the functions on
+# the sqlite connection and for listing them to the user.
+# NOTE: the usage strings must follow the actual parameter order of each function.
+user_functions = [
+    UserFunctionDef(FunctionType.REGULAR,
+                    "regexp","regexp(<regular_expression>,<expr>) = <1|0>",
+                    "Find regexp in string expression. Returns 1 if found or 0 if not",
+                    regexp,
+                    2),
+    UserFunctionDef(FunctionType.REGULAR,
+                    "sha","sha(<expr>,<algorithm>,<encoding>) = <hex-string-of-sha>",
+                    "Calculate sha of some expression. Algorithm can be one of 1,224,256,384,512. For now encoding must be manually provided. Will use the input encoding automatically in the future.",
+                    sha,
+                    3),
+    UserFunctionDef(FunctionType.REGULAR,
+                    "sha1","sha1(<expr>,<encoding>) = <hex-string-of-sha>",
+                    "Calculate sha1 of some expression. For now encoding must be manually provided. Will be taken automatically from the input encoding in the future.",
+                    sha1,
+                    2),
+    UserFunctionDef(FunctionType.REGULAR,
+                    "md5","md5(<expr>,<encoding>) = <hex-string-of-md5>",
+                    "Calculate md5 of expression. Returns a hex-string of the result. Currently requires to manually provide the encoding of the data. Will be taken automatically from the input encoding in the future.",
+                    md5,
+                    2),
+    UserFunctionDef(FunctionType.REGULAR,
+                    "sqrt","sqrt(<expr>) = <square-root>",
+                    "Calculate the square root of the expression",
+                    sqrt,
+                    1),
+    UserFunctionDef(FunctionType.REGULAR,
+                    "power","power(<expr1>,<expr2>) = <expr1-to-the-power-of-expr2>",
+                    "Raise expr1 to the power of expr2",
+                    power,
+                    2),
+    UserFunctionDef(FunctionType.AGG,
+                    "percentile","percentile(<expr>,<percentile-in-the-range-0-to-1>) = <percentile-value>",
+                    "Calculate the strict percentile of a set of values.",
+                    StrictPercentile,
+                    2),
+    UserFunctionDef(FunctionType.AGG,
+                    "stddev_pop","stddev_pop(<expr>) = <stddev-value>",
+                    "Calculate the population standard deviation of a set of values",
+                    StdevPopulation,
+                    1),
+    UserFunctionDef(FunctionType.AGG,
+                    "stddev_sample","stddev_sample(<expr>) = <stddev-value>",
+                    "Calculate the sample standard deviation of a set of values",
+                    StdevSample,
+                    1)
+]
+
+def print_user_functions():
+    """Print the name, usage and description of every entry in user_functions."""
+    for function_def in user_functions:
+        report_lines = (
+            ("Function: %s",function_def.name),
+            ("     Usage: %s",function_def.usage),
+            ("     Description: %s",function_def.description),
+        )
+        for template,value in report_lines:
+            print(template % value)
+
+class Sqlite3DBResults(object):
+    """Plain holder pairing query results with the names of the result columns."""
+    def __init__(self,query_column_names,results):
+        # Both values are stored as given, without copying or validation
+        self.results = results
+        self.query_column_names = query_column_names
+
 class Sqlite3DB(object):

    def __init__(self, show_sql=SHOW_SQL):
@ -169,11 +312,13 @@ class Sqlite3DB(object):
            raise ValueError('Unknown store-db-to-disk method %s' % method)

    def add_user_functions(self):
-        self.conn.create_function("regexp", 2, regexp)
-        self.conn.create_function("sha1", 1, sha1)
-        self.conn.create_function("md5", 2, md5)
-        self.conn.create_function("md5", 1, md5)
-        self.conn.create_aggregate("percentile",2,StrictPercentile)
+        # Register every entry of user_functions on the connection. Dispatch on the
+        # declared func_type rather than on the python type of the implementation,
+        # so any callable or aggregate class is accepted.
+        for udf in user_functions:
+            if udf.func_type == FunctionType.AGG:
+                self.conn.create_aggregate(udf.name,udf.param_count,udf.func_or_obj)
+            elif udf.func_type == FunctionType.REGULAR:
+                self.conn.create_function(udf.name,udf.param_count,udf.func_or_obj)
+            else:
+                raise Exception("Invalid user function definition %s" % str(udf))

    def is_numeric_type(self, column_type):
        return column_type in self.numeric_column_types
@ -1791,6 +1936,8 @@ def run_standalone():
                      help="Output encoding. Defaults to 'none', leading to selecting the system/terminal encoding")
    output_data_option_group.add_option("-W","--output-quoting-mode",dest="output_quoting_mode",default="minimal",
                      help="Output quoting mode. Possible values are all, minimal, nonnumeric and none. Note the slightly misleading parameter name, and see the matching -w parameter for input quoting.")
+    output_data_option_group.add_option("-L","--list-user-functions",dest="list_user_functions",default=False,action="store_true",
+                      help="List all user functions")
    parser.add_option_group(output_data_option_group)
    #-----------------------------------------------
    query_option_group = OptionGroup(parser,"Query Related Options")
@ -1808,6 +1955,11 @@ def run_standalone():
        sys.exit(0)

 ###
+
+    if options.list_user_functions:
+        print_user_functions()
+        sys.exit(0)
+
    if len(args) == 0 and options.query_filename is None:
        print_credentials()
        print("Must provide at least one query in the command line, or through a file with the -q parameter", file=sys.stderr)
--- a/mkdocs/docs/index.md
+++ b/mkdocs/docs/index.md
@ -89,7 +89,7 @@ Usage:

        Its purpose is to bring SQL expressive power to manipulating text data using the Linux command line.

-        Basic usage is q "<sql-like query>" where table names are just regular file names (Use - to read from standard input)
+        Basic usage is q "<sql like query>" where table names are just regular file names (Use - to read from standard input)
            When the input contains a header row, use -H, and column names will be set according to the header row content. If there isn't a header row, then columns will automatically be named c1..cN.

        Column types are detected automatically. Use -A in order to see the column name/type analysis.
@ -133,6 +133,8 @@ Options:
    -d DELIMITER, --delimiter=DELIMITER
                        Field delimiter. If none specified, then space is used
                        as the delimiter.
+    -p, --pipe-delimited
+                        Same as -d '|'. Added for convenience and readability
    -t, --tab-delimited
                        Same as -d <tab>. Just a shorthand for handling
                        standard tab delimited file You can use $'\t' if you
@ -186,6 +188,8 @@ Options:
                        Field delimiter for output. If none specified, then
                        the -d delimiter is used if present, or space if no
                        delimiter is specified
+    -P, --pipe-delimited-output
+                        Same as -D '|'. Added for convenience and readability.
    -T, --tab-delimited-output
                        Same as -D <tab>. Just a shorthand for outputting tab
                        delimited output. You can use -D $'\t' if you want.
@ -210,6 +214,8 @@ Options:
                        nonnumeric and none. Note the slightly misleading
                        parameter name, and see the matching -w parameter for
                        input quoting.
+    -L, --list-user-functions
+                        List all user functions

  Query Related Options:
    -q QUERY_FILENAME, --query-filename=QUERY_FILENAME
--- a/test/test-suite
+++ b/test/test-suite
@ -11,7 +11,6 @@
 #

 import unittest
-import pytest
 import random
 import json
 from json import JSONEncoder
@ -283,34 +282,6 @@ class BasicTests(AbstractQTestCase):

        self.cleanup(tmpfile)

-    def test_regexp_int_data_handling(self):
-        tmpfile = self.create_file_with_data(sample_data_no_header)
-
-        cmd = Q_EXECUTABLE + ' -d , "select c2 from %s where regexp(\'^1\',c2)"' % tmpfile.name
-        retcode, o, e = run_command(cmd)
-
-        self.assertEqual(retcode, 0)
-        self.assertEqual(len(o), 1)
-        self.assertEqual(len(e), 0)
-
-        self.assertEqual(o[0],six.b("1"))
-
-        self.cleanup(tmpfile)
-
-    def test_regexp_null_data_handling(self):
-        tmpfile = self.create_file_with_data(sample_data_no_header)
-
-        cmd = Q_EXECUTABLE + ' -d , "select count(*) from %s where regexp(\'^\',c2)"' % tmpfile.name
-        retcode, o, e = run_command(cmd)
-
-        self.assertEqual(retcode, 0)
-        self.assertEqual(len(o), 1)
-        self.assertEqual(len(e), 0)
-
-        self.assertEqual(o[0],six.b("2"))
-
-        self.cleanup(tmpfile)
-
    def test_select_one_column(self):
        tmpfile = self.create_file_with_data(sample_data_no_header)

@ -1525,6 +1496,55 @@ class BasicTests(AbstractQTestCase):


 class UserFunctionTests(AbstractQTestCase):
+    def test_regexp_int_data_handling(self):
+        # regexp() must also match against a column holding integer data
+        tmpfile = self.create_file_with_data(sample_data_no_header)
+
+        query_part = ' -d , "select c2 from %s where regexp(\'^1\',c2)"' % tmpfile.name
+        exit_code, out_lines, err_lines = run_command(Q_EXECUTABLE + query_part)
+
+        self.assertEqual(exit_code, 0)
+        self.assertEqual(len(out_lines), 1)
+        self.assertEqual(len(err_lines), 0)
+
+        self.assertEqual(out_lines[0],six.b("1"))
+
+        self.cleanup(tmpfile)
+
+    def test_percentile_func(self):
+        # Groups 1000..1999 by the leading digits and checks the 0, 0.5 and 1 percentiles per group
+        cmd = 'seq 1000 1999 | %s "select substr(c1,0,3),percentile(c1,0),percentile(c1,0.5),percentile(c1,1) from - group by substr(c1,0,3)" -c 1' % Q_EXECUTABLE
+        exit_code, out_lines, err_lines = run_command(cmd)
+
+        self.assertEqual(exit_code, 0)
+        self.assertEqual(len(out_lines), 10)
+        self.assertEqual(len(err_lines), 0)
+
+        # Parse all columns up front, before any of the value assertions
+        rows = [line.split(six.b(" ")) for line in out_lines]
+        labels = [int(fields[0]) for fields in rows]
+        lowest = [float(fields[1]) for fields in rows]
+        medians = [float(fields[2]) for fields in rows]
+        highest = [float(fields[3]) for fields in rows]
+
+        group_starts = list(range(1000,2000,100))
+
+        self.assertEqual(labels,list(range(10,20)))
+        self.assertEqual(lowest,group_starts)
+        self.assertEqual(medians,[start + 49.5 for start in group_starts])
+        self.assertEqual(highest,[start + 99 for start in group_starts])
+
+    def test_regexp_null_data_handling(self):
+        # Counts the rows whose c2 matches the pattern '^'; two rows are expected
+        tmpfile = self.create_file_with_data(sample_data_no_header)
+
+        query_part = ' -d , "select count(*) from %s where regexp(\'^\',c2)"' % tmpfile.name
+        exit_code, out_lines, err_lines = run_command(Q_EXECUTABLE + query_part)
+
+        self.assertEqual(exit_code, 0)
+        self.assertEqual(len(out_lines), 1)
+        self.assertEqual(len(err_lines), 0)
+
+        self.assertEqual(out_lines[0],six.b("2"))
+
+        self.cleanup(tmpfile)
+
    def test_md5_function(self):
        cmd = 'seq 1 4 | %s -c 1 -d , "select c1,md5(c1,\'utf-8\') from -"' % Q_EXECUTABLE
        retcode, o, e = run_command(cmd)
@ -1538,6 +1558,74 @@ class UserFunctionTests(AbstractQTestCase):
        self.assertEqual(tuple(o[2].split(six.b(','),1)),(six.b('3'),six.b('eccbc87e4b5ce2fe28308fd9f2a7baf3')))
        self.assertEqual(tuple(o[3].split(six.b(','),1)),(six.b('4'),six.b('a87ff679a2f3e71d9181a67b7542122c')))

+    def test_stddev_functions(self):
+        # Checks stddev_pop and stddev_sample, rounded to 10 digits, over a fixed set of values
+        tmpfile = self.create_file_with_data(six.b("\n".join(map(str,[234,354,3234,123,4234,234,634,56,65]))))
+
+        cmd = '%s -c 1 -d , "select round(stddev_pop(c1),10),round(stddev_sample(c1),10) from %s"' % (Q_EXECUTABLE,tmpfile.name)
+        retcode, o, e = run_command(cmd)
+
+        self.assertEqual(retcode,0)
+        self.assertEqual(len(o),1)
+        self.assertEqual(len(e),0)
+
+        # Output lines are compared as bytes, like in the rest of the suite
+        self.assertEqual(o[0],six.b('1479.7015464838,1569.4604964764'))
+
+        self.cleanup(tmpfile)
+
+    def test_sqrt_function(self):
+        # sqrt() of 1..5, rounded to 10 digits
+        cmd = 'seq 1 5 | %s -c 1 -d , "select round(sqrt(c1),10) from -"' % Q_EXECUTABLE
+        exit_code, out_lines, err_lines = run_command(cmd)
+
+        self.assertEqual(exit_code,0)
+        self.assertEqual(len(out_lines),5)
+        self.assertEqual(len(err_lines),0)
+
+        expected_roots = ['1.0','1.4142135624','1.7320508076','2.0','2.2360679775']
+        for line,expected in zip(out_lines,expected_roots):
+            self.assertEqual(line,six.b(expected))
+
+    def test_power_function(self):
+        # power(x,2.5) for x in 1..5, rounded to 10 digits
+        cmd = 'seq 1 5 | %s -c 1 -d , "select round(power(c1,2.5),10) from -"' % Q_EXECUTABLE
+        exit_code, out_lines, err_lines = run_command(cmd)
+
+        self.assertEqual(exit_code,0)
+        self.assertEqual(len(out_lines),5)
+        self.assertEqual(len(err_lines),0)
+
+        expected_powers = ['1.0','5.6568542495','15.5884572681','32.0','55.9016994375']
+        for line,expected in zip(out_lines,expected_powers):
+            self.assertEqual(line,six.b(expected))
+
+    def test_sha1_function(self):
+        # sha1() of the values 1..4 using utf-8 encoding
+        cmd = 'seq 1 4 | %s -c 1 -d , "select c1,sha1(c1,\'utf-8\') from -"' % Q_EXECUTABLE
+        exit_code, out_lines, err_lines = run_command(cmd)
+
+        self.assertEqual(exit_code,0)
+        self.assertEqual(len(out_lines),4)
+        self.assertEqual(len(err_lines),0)
+
+        expected_lines = [
+            '1,356a192b7913b04c54574d18c28d46e6395428ab',
+            '2,da4b9237bacccdf19c0760cab7aec4a8359010b0',
+            '3,77de68daecd823babbb58edb1c8e14d7106e83bb',
+            '4,1b6453892473a467d07372d45eb05abc2031647a',
+        ]
+        for line,expected in zip(out_lines,expected_lines):
+            self.assertEqual(line,six.b(expected))
+
+    def test_sha_function(self):
+        # sha() with algorithms 1, 224 and 256 over the values 1..4, utf-8 encoded
+        cmd = 'seq 1 4 | %s -c 1 -d , "select c1,sha(c1,1,\'utf-8\') as sha1,sha(c1,224,\'utf-8\') as sha224,sha(c1,256,\'utf-8\') as sha256 from -"' % Q_EXECUTABLE
+        exit_code, out_lines, err_lines = run_command(cmd)
+
+        self.assertEqual(exit_code,0)
+        self.assertEqual(len(out_lines),4)
+        self.assertEqual(len(err_lines),0)
+
+        expected_lines = [
+            '1,356a192b7913b04c54574d18c28d46e6395428ab,e25388fde8290dc286a6164fa2d97e551b53498dcbf7bc378eb1f178,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b',
+            '2,da4b9237bacccdf19c0760cab7aec4a8359010b0,58b2aaa0bfae7acc021b3260e941117b529b2e69de878fd7d45c61a9,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35',
+            '3,77de68daecd823babbb58edb1c8e14d7106e83bb,4cfc3a1811fe40afa401b25ef7fa0379f1f7c1930a04f8755d678474,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce',
+            '4,1b6453892473a467d07372d45eb05abc2031647a,271f93f45e9b4067327ed5c8cd30a034730aaace4382803c3e1d6c2f,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a',
+        ]
+        for line,expected in zip(out_lines,expected_lines):
+            self.assertEqual(line,six.b(expected))
+

 class MultiHeaderTests(AbstractQTestCase):
    def test_output_header_when_multiple_input_headers_exist(self):