from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
# Create a SparkSession
spark = (SparkSession.builder
         .master('local[*]')                      # cluster location: local mode, using all cores
         .appName("Load and Query CSV with SQL")
         .getOrCreate())
# Define an explicit schema (an alternative to inferSchema below)
schema = StructType([
    StructField("col1", StringType()),
    StructField("col2", IntegerType()),
    StructField("col3", DoubleType())
])
# Load the CSV file into a DataFrame
# (pass schema=schema instead of inferSchema=True to use the explicit schema above)
df = spark.read.csv("file.csv", sep=',', header=True, inferSchema=True, nullValue='NA')
# Check column types
df.printSchema()
print(df.dtypes)  # list of (column name, type) tuples
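# Sketch (assumption): if type inference gets a column wrong, it can be
# cast explicitly; "col2" here is the hypothetical column from the schema above
from pyspark.sql.functions import col
df = df.withColumn("col2", col("col2").cast(IntegerType()))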
# Register the DataFrame as a temporary table or view
df.createOrReplaceTempView("my_table")
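# Note: a temp view is scoped to this SparkSession; for a view shared across
# sessions, use df.createOrReplaceGlobalTempView("my_table") instead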
# Print the tables in the catalog
print(spark.catalog.listTables())
# Run SQL queries on the DataFrame
query_result = spark.sql("SELECT * FROM my_table WHERE col1 = 'value'")
query_result.show()
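# A further illustrative query (sketch, using the hypothetical column names
# from the schema above): aggregate col3 grouped by col1
agg_result = spark.sql("SELECT col1, AVG(col3) AS avg_col3 FROM my_table GROUP BY col1")
agg_result.show()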
sc = spark.sparkContext  # Access the underlying SparkContext from the SparkSession
spark = SparkSession(sc)  # A SparkSession can also be constructed from an existing SparkContext
spark.stop()  # Stop the SparkSession (and its SparkContext) when done