Basic queries: selecting, filtering, sorting

SparkSQL

SELECT * FROM table
SELECT column1, column2 FROM table
SELECT COUNT(*) FROM table
SELECT DISTINCT column FROM table
SELECT * FROM table WHERE column = value
SELECT * FROM table WHERE column > value
SELECT * FROM table ORDER BY column ASC
SELECT * FROM table ORDER BY column DESC
PySpark
df.select("*") df.select("column1", "column2") df.count() df.select("column").distinct() df.filter(df.column == value) df.filter(df.column > value) df.orderBy("column", ascending=True) df.orderBy("column", ascending=False)
Aggregations and grouping

SparkSQL

SELECT COUNT(column) FROM table
SELECT MAX(column) FROM table
SELECT MIN(column) FROM table
SELECT AVG(column) FROM table
SELECT SUM(column) FROM table
SELECT column, COUNT(*) FROM table GROUP BY column
SELECT column, SUM(column2) FROM table GROUP BY column
SELECT column, AVG(column2) FROM table GROUP BY column
SELECT column, MIN(column2) FROM table GROUP BY column
PySpark
from pyspark.sql import functions as F  # assumed by all F.* snippets on this sheet

df.agg(F.count("column").alias("count"))
df.agg(F.max("column").alias("max"))
df.agg(F.min("column").alias("min"))
df.agg(F.avg("column").alias("avg"))
df.agg(F.sum("column").alias("sum"))
df.groupBy("column").agg(F.count("*").alias("count"))
df.groupBy("column").agg(F.sum("column2").alias("sum"))
df.groupBy("column").agg(F.avg("column2").alias("avg"))
df.groupBy("column").agg(F.min("column2").alias("min"))
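A worked grouping sketch, assuming the df, spark, and people view from the setup above; several aggregates can be computed per group in a single pass.

from pyspark.sql import functions as F

df.groupBy("city").agg(
    F.count("*").alias("cnt"),
    F.avg("age").alias("avg_age"),
    F.max("age").alias("max_age"),
).show()

# SQL equivalent over the registered view
spark.sql("""
    SELECT city, COUNT(*) AS cnt, AVG(age) AS avg_age, MAX(age) AS max_age
    FROM people
    GROUP BY city
""").show()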
Joins

SparkSQL

SELECT * FROM table1 JOIN table2 ON condition
SELECT * FROM table1 LEFT JOIN table2 ON condition
SELECT * FROM table1 RIGHT JOIN table2 ON condition
SELECT * FROM table1 FULL JOIN table2 ON condition
PySpark
df1.join(df2, on=condition, how="inner")
df1.join(df2, on=condition, how="left")
df1.join(df2, on=condition, how="right")
df1.join(df2, on=condition, how="outer")

Here condition is either a column name shared by both DataFrames (e.g. "id") or a Column expression such as df1.id == df2.id; how="outer" is the counterpart of SQL's FULL JOIN.
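A join sketch, assuming the df from the setup above; the states lookup DataFrame and its column names are assumptions for illustration.

# Hypothetical lookup table keyed by city
states = spark.createDataFrame(
    [("Berlin", "BE"), ("Hamburg", "HH")],
    ["city", "state"],
)

# on= as a shared column name: the join key appears only once in the output
df.join(states, on="city", how="inner").show()

# on= as an explicit Column expression, like SQL's ON clause
df.join(states, on=df.city == states.city, how="left").show()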
Handling NULL values

SparkSQL

SELECT * FROM table WHERE column IS NULL
SELECT * FROM table WHERE column IS NOT NULL
SELECT column, COALESCE(column, 0) FROM table
SELECT column, IFNULL(column, 0) FROM table
PySpark
df.filter(df.column.isNull())
df.filter(df.column.isNotNull())
df.withColumn("column", F.coalesce(df.column, F.lit(0)))
df.withColumn("column", F.when(df.column.isNull(), 0).otherwise(df.column))
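A null-handling sketch; the DataFrame with a missing age is an assumption for illustration.

from pyspark.sql import functions as F

missing = spark.createDataFrame(
    [("Alice", 34), ("Bob", None)],
    ["name", "age"],
)

missing.filter(missing.age.isNull()).show()  # only the row with a missing age

# Replace missing ages with 0; F.when(...) from the list above gives the same result
missing.withColumn("age", F.coalesce(missing.age, F.lit(0))).show()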
String functions

SparkSQL

SELECT UPPER(column) FROM table
SELECT LOWER(column) FROM table
SELECT TRIM(column) FROM table
SELECT SUBSTRING(column, 1, 3) FROM table
SELECT LENGTH(column) FROM table
PySpark
df.withColumn("column", F.upper(df.column)) df.withColumn("column", F.lower(df.column)) df.withColumn("column", F.trim(df.column)) df.withColumn("column", F.substring(df.column, 1, 3)) df.withColumn("columnLength", F.length(df.column))
Date and time functions

SparkSQL

SELECT CURRENT_DATE()
SELECT CURRENT_TIMESTAMP()
SELECT YEAR(column) FROM table
SELECT MONTH(column) FROM table
SELECT DAY(column) FROM table
SELECT DATEDIFF(end_date, start_date) FROM table
PySpark
df.withColumn("current_date", F.current_date()) df.withColumn("current_timestamp", F.current_timestamp()) df.withColumn("year", F.year(df.column)) df.withColumn("month", F.month(df.column)) df.withColumn("day", F.dayofmonth(df.column)) df.withColumn("date_diff", F.datediff(df.end_date, df.start_date))
Reading and writing tables

SparkSQL

LOAD DATA INPATH '/path' INTO TABLE table
INSERT INTO table VALUES (1, 2)
SELECT * FROM table
CREATE TABLE table AS SELECT * FROM source_table
PySpark
df = spark.read.csv("/path")
df.write.insertInto("table", overwrite=True)
df = spark.table("table")
df.write.saveAsTable("table", mode="overwrite")
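A round-trip sketch; the path, reader options, and table name are assumptions for illustration.

df_in = spark.read.csv("/path/to/data.csv", header=True, inferSchema=True)

# Create (or replace) a managed table from the DataFrame
df_in.write.saveAsTable("my_table", mode="overwrite")

# Read the table back, then append further rows;
# note that insertInto matches columns by position, not by name
spark.table("my_table").show()
df_in.write.insertInto("my_table", overwrite=False)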