Add python script to get cabal package licenses

PR-URL: https://github.com/hasura/graphql-engine-mono/pull/10928 GitOrigin-RevId: d26495c8c9975b9e4f98e322b6d5b2977e66c247
2025-01-05 14:27:59 +03:00 · 2024-07-09 15:37:33 -06:00 · 2024-07-09 15:37:33 -06:00 · b2e0843045
commit b2e0843045
parent a94bace075
2 changed files with 143 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -42,6 +42,8 @@ tags
 docs/_build/
 docs/_ext/

+# cabal-plan artifacts
+licenses/

 # Ignore benchmark report output
 server/benchmarks/benchmark_sets/*/report.json
--- a/scripts/get_server_licenses.py
+++ b/scripts/get_server_licenses.py
@ -0,0 +1,141 @@
+"""
+Script to generate a combined CSV report of package licenses using cabal-plan.
+
+This script takes any number of Haskell package names, runs the `cabal-plan license-report`
+command for each package, and processes the resulting markdown output to generate a combined
+CSV file listing package dependencies with their names, versions, licenses, and descriptions.
+
+Usage:
+    python script.py <package1> <package2> ... <output_file.csv>
+
+Example:
+    python script.py graphql-engine-pro some-other-package output.csv
+
+Parameters:
+    <package1>, <package2>, ... : Names of the packages to process, e.g. lib:graphql-engine, exe:graphql-engine
+    <output_file.csv>           : Name of the output CSV file where the combined dependencies will be saved.
+
+Description:
+    - The script removes links, backticks, bold and italic formatting, and any remaining Markdown syntax from the content.
+    - It processes the markdown content generated by the cabal-plan command for each package.
+    - The script excludes specified packages from the final report.
+    - It assigns specified SPDX License IDs to certain packages.
+    - The combined dependencies are written to the specified output CSV file.
+
+Dependencies:
+    - Python 3.x
+    - cabal-plan (ensure it's installed and available in your PATH)
+
+Notes:
+    - Ensure you have cabal-plan installed and the packages are available for the command to run successfully.
+    - The script expects the cabal-plan license-report command to output markdown tables with specific columns.
+"""
+
+import re
+import csv
+import subprocess
+
+# Names that are filtered out of the combined report.
+# NOTE(review): these look like in-repo / first-party packages - confirm.
+PACKAGES_TO_REMOVE = [
+    "arrows-extra",
+    "aeson-ordered",
+    "ci-info",
+    "dc-api",
+    "ekg-prometheus",
+    "graphql-engine",
+    "graphql-parser",
+    "hasura-base",
+    "hasura-error-message",
+    "hasura-extras",
+    "hasura-json-encoding",
+    "hasura-prelude",
+    "incremental",
+    "kriti-lang",
+    "libdeflate-hs",
+    "pg-client",
+    "schema-parsers",
+]
+
+# SPDX License Ids that replace the reported license for the named packages.
+PACKAGE_LICENSES = {
+    "ekg-json": "BSD-3-Clause",
+    "odbc": "BSD-3-Clause",
+}
+
+def clean_text(text):
+    """Return ``text`` with Markdown decoration stripped.
+
+    The steps run in a fixed order: inline links collapse to their label,
+    backticks are dropped, ``*``-delimited emphasis is unwrapped, every
+    remaining ``#``, ``_`` and ``~`` character is deleted, and finally the
+    surrounding whitespace is trimmed.
+    """
+    # [label](target) -> label
+    label_only = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
+    no_code_marks = label_only.replace('`', '')
+    # *x*, **x**, ***x*** -> x
+    no_emphasis = re.sub(r'\*+([^*]+)\*+', r'\1', no_code_marks)
+    # These characters are deleted wherever they occur, including inside
+    # ordinary words.
+    plain = re.sub(r'[#_~]', '', no_emphasis)
+    return plain.strip()
+
+def parse_md_content(content):
+    """Extract dependency rows from a Markdown license report.
+
+    Looks for a table whose header row contains ``| Name`` followed by a
+    separator row, and reads its body up to the first blank line (or the end
+    of ``content``).
+
+    Parameters:
+        content: Markdown text to scan.
+
+    Returns:
+        A list of rows, each a list holding the first four cleaned cells of
+        a table line. Lines with fewer than four cells are skipped. An empty
+        list is returned when no table is found.
+    """
+    # NOTE(review): re.search stops at the first match, so only the first
+    # table in ``content`` is parsed; any later table is ignored.
+    found = re.search(r'\| Name.*\n\|[-\s|]*\n((.|\n)*?)(\n\n|$)', content)
+    if found is None:
+        return []
+
+    parsed = []
+    for raw_line in found.group(1).split('\n'):
+        if not raw_line.strip():
+            continue
+        # Drop the empty pieces before the leading and after the trailing pipe.
+        fields = [clean_text(piece) for piece in raw_line.split('|')[1:-1]]
+        if len(fields) < 4:
+            continue
+        parsed.append(fields[:4])
+
+    return parsed
+
+def run_cabal_plan(package):
+    """Run ``cabal-plan license-report`` for one package and return its output.
+
+    The command is handed to ``subprocess.run`` as an argument list, so the
+    package name reaches cabal-plan as a single argument and is never
+    interpreted by a shell.
+
+    Parameters:
+        package: Target to report on, e.g. ``lib:graphql-engine``.
+
+    Returns:
+        The command's standard output. An empty string is returned, after
+        printing an error message, when cabal-plan cannot be started or
+        exits with a non-zero status.
+    """
+    command = ['cabal-plan', 'license-report', '--licensedir=licenses', package]
+    try:
+        result = subprocess.run(command, capture_output=True, text=True)
+    except OSError as error:
+        # Without a shell, a missing or non-executable cabal-plan raises
+        # instead of producing a non-zero exit status; report it the same way.
+        print(f"Error running command for package {package}: {error}")
+        return ""
+    if result.returncode != 0:
+        print(f"Error running command for package {package}: {result.stderr}")
+        return ""
+    return result.stdout
+
+def _version_key(version):
+    """Turn a dotted version string into a tuple that orders numerically."""
+    key = []
+    for part in version.split('.'):
+        part = part.strip()
+        if part.isdecimal():
+            key.append((0, int(part), ''))
+        else:
+            # Non-numeric components sort after numeric ones, by text, so
+            # mixed components never raise a TypeError when compared.
+            key.append((1, 0, part))
+    return tuple(key)
+
+
+def combine_dependencies(packages):
+    """Merge the license reports of several packages into one list.
+
+    For every package the cabal-plan report is fetched and parsed. Names in
+    ``PACKAGES_TO_REMOVE`` are dropped, licenses listed in
+    ``PACKAGE_LICENSES`` override the reported one, and when a dependency
+    appears more than once the entry with the highest version is kept.
+
+    Versions are compared component by component as numbers, so ``1.10.0``
+    ranks above ``1.9.0``.
+
+    Parameters:
+        packages: Iterable of package names to report on.
+
+    Returns:
+        A list of ``(name, version, license, description)`` tuples, one per
+        dependency name.
+    """
+    combined_deps = {}
+
+    for package in packages:
+        md_content = run_cabal_plan(package)
+        if not md_content:
+            # The command failed or printed nothing; skip this package.
+            continue
+
+        for name, version, license_id, description in parse_md_content(md_content):
+            if name in PACKAGES_TO_REMOVE:
+                continue
+            license_id = PACKAGE_LICENSES.get(name, license_id)
+            known = combined_deps.get(name)
+            if known is None or _version_key(version) > _version_key(known[1]):
+                combined_deps[name] = (name, version, license_id, description)
+
+    return list(combined_deps.values())
+
+def write_csv(dependencies, output_file):
+    """Write the dependency rows to ``output_file`` as CSV.
+
+    A header row is written first, followed by the rows of ``dependencies``
+    in sorted order. The file is created or overwritten, encoded as UTF-8.
+    """
+    header = ['Name', 'Version', 'SPDX License Id', 'Description']
+    ordered_rows = sorted(dependencies)
+    with open(output_file, 'w', newline='', encoding='utf-8') as handle:
+        out = csv.writer(handle, quoting=csv.QUOTE_MINIMAL)
+        out.writerow(header)
+        out.writerows(ordered_rows)
+
+# Command-line entry point: every argument but the last is a package name,
+# and the last argument is the CSV file to write.
+if __name__ == "__main__":
+    import sys
+
+    cli_args = sys.argv[1:]
+    # At least one package plus the output file are required.
+    if len(cli_args) < 2:
+        print("Usage: python script.py <package1> <package2> ... <output_file.csv>")
+        sys.exit(1)
+
+    *packages, output_file = cli_args
+
+    combined_deps = combine_dependencies(packages)
+    write_csv(combined_deps, output_file)
+
+    print(f"Combined dependencies written to {output_file}")