From 045e9d22aa324eb72cb2ceaa39859cabd9ebf5e9 Mon Sep 17 00:00:00 2001
From: scronge <144865853+scronge@users.noreply.github.com>
Date: Sat, 9 Nov 2024 13:40:55 -0600
Subject: [PATCH] Create spark.html.markdown

This commit introduces a new "Learn Spark in Y Minutes" guide in the
LearnXinYMinutes documentation format. The guide provides a concise
overview of Apache Spark using PySpark, following repository style
guidelines and including examples of basic Spark operations.

Key sections cover:
- Initializing a Spark session
- Creating and manipulating DataFrames
- Applying transformations and actions
- Performing SQL queries
- Reading from and writing to various file formats
- Working with RDDs
- Ending a Spark session
---
 spark.html.markdown | 63 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 spark.html.markdown

diff --git a/spark.html.markdown b/spark.html.markdown
new file mode 100644
index 00000000..16acc229
--- /dev/null
+++ b/spark.html.markdown
@@ -0,0 +1,63 @@
+---
+language: Spark
+category: tool
+tool: Spark
+filename: learnspark.py
+contributors:
+  - ["Scronge", "https://github.com/Scronge"]
+---
+
+[Spark](https://spark.apache.org/) is an open-source distributed computing framework for large-scale data processing across clusters of machines. This guide covers the basics of **Apache Spark** using PySpark, its Python API.
+
+```python
+# Setting Up Spark
+from pyspark.sql import SparkSession
+
+spark = SparkSession.builder \
+    .appName("ExampleApp") \
+    .getOrCreate()
+
+# Working with DataFrames
+data = [("Alice", 30), ("Bob", 40)]
+columns = ["Name", "Age"]
+
+df = spark.createDataFrame(data, columns)
+df.show()
+# +-----+---+
+# | Name|Age|
+# +-----+---+
+# |Alice| 30|
+# |  Bob| 40|
+# +-----+---+
+
+# Transformations and Actions
+df_filtered = df.filter(df.Age > 35)
+df_filtered.show()
+# +----+---+
+# |Name|Age|
+# +----+---+
+# | Bob| 40|
+# +----+---+
+
+# SQL Queries
+df.createOrReplaceTempView("people")
+spark.sql("SELECT * FROM people WHERE Age > 30").show()
+
+# Reading and Writing Files
+csv_df = spark.read.csv("path/to/file.csv", header=True, inferSchema=True)
+df.write.parquet("output_path")
+
+# RDD Basics
+rdd = spark.sparkContext.parallelize([1, 2, 3, 4])
+squared_rdd = rdd.map(lambda x: x ** 2)
+print(squared_rdd.collect())
+# Output: [1, 4, 9, 16]
+
+# Ending the Spark Session
+spark.stop()
+```
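+
+Beyond the basics above, grouped aggregation is one of the most common
+DataFrame operations. Below is a minimal, self-contained sketch: it calls
+`getOrCreate()` for its own session (the walkthrough above ends with
+`spark.stop()`), and the names and amounts are invented for illustration.
+
+```python
+from pyspark.sql import SparkSession
+from pyspark.sql import functions as F
+
+spark = SparkSession.builder.getOrCreate()
+
+# Invented sample data: one row per purchase.
+sales = spark.createDataFrame(
+    [("Alice", "books", 12.0), ("Alice", "games", 30.0), ("Bob", "books", 8.0)],
+    ["Name", "Category", "Amount"],
+)
+
+# groupBy + agg produces one row per group.
+sales.groupBy("Name").agg(
+    F.count("*").alias("NumPurchases"),
+    F.sum("Amount").alias("TotalAmount"),
+).show()
+# Alice: 2 purchases totaling 42.0; Bob: 1 purchase totaling 8.0
+```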
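+
+Derived columns and joins also come up constantly. Another self-contained
+sketch, where the `Decade` column and the `cities` table are made up for
+illustration:
+
+```python
+from pyspark.sql import SparkSession
+from pyspark.sql import functions as F
+
+spark = SparkSession.builder.getOrCreate()
+
+people = spark.createDataFrame([("Alice", 30), ("Bob", 40)], ["Name", "Age"])
+cities = spark.createDataFrame([("Alice", "Paris"), ("Bob", "Oslo")],
+                               ["Name", "City"])
+
+# withColumn adds (or replaces) a column computed from existing ones.
+people = people.withColumn("Decade", (F.col("Age") / 10).cast("int") * 10)
+
+# join matches rows from both DataFrames on the "Name" key.
+people.join(cities, on="Name", how="inner").show()
+```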
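+
+The read/write section above uses default settings; in practice you usually
+choose a save mode explicitly. A small sketch, where every path is a
+placeholder:
+
+```python
+from pyspark.sql import SparkSession
+
+spark = SparkSession.builder.getOrCreate()
+
+df = spark.createDataFrame([("Alice", 30), ("Bob", 40)], ["Name", "Age"])
+
+# mode("overwrite") replaces existing output; the default mode raises an
+# error if the output path already exists.
+df.write.mode("overwrite").parquet("output_path")
+
+# JSON is read the same way as CSV; this path is a placeholder.
+json_df = spark.read.json("path/to/file.json")
+```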
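+
+Finally, `map` is only one of the RDD operations; `flatMap` and `reduce` are
+worth knowing too. A self-contained sketch with invented input:
+
+```python
+from pyspark.sql import SparkSession
+
+spark = SparkSession.builder.getOrCreate()
+sc = spark.sparkContext
+
+# flatMap flattens the list produced for each element into a single RDD.
+tokens = sc.parallelize(["spark makes", "clusters simple"]) \
+    .flatMap(lambda line: line.split())
+print(tokens.collect())
+# Output: ['spark', 'makes', 'clusters', 'simple']
+
+# reduce combines elements pairwise into one value.
+total = sc.parallelize([1, 2, 3, 4]).reduce(lambda a, b: a + b)
+print(total)
+# Output: 10
+```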