ใช้ตัวอย่างไฟล์ json จาก Read data in JSON format
# Explicit schema for the sample JSON file. Importing the needed types by
# name (instead of the original `from pyspark.sql.types import *`) keeps the
# module namespace clean and makes the dependencies obvious.
from pyspark.sql.types import (
    ArrayType,
    LongType,
    StringType,
    StructField,
    StructType,
)

# Declaring the schema up front lets Spark skip the schema-inference job
# it would otherwise run over the JSON file.
jsonSchema = StructType([
    StructField("id", LongType(), True),                     # nullable bigint
    StructField("father", StringType(), True),
    StructField("mother", StringType(), True),
    StructField("children", ArrayType(StringType()), True),  # array<string>
])
# Load the sample file into a DataFrame, applying the predeclared schema so
# that reading the JSON triggers no schema-inference job.
jsonFile = "/mnt/training/sample.json"

testDF = (
    spark.read              # the DataFrameReader
    .schema(jsonSchema)     # use the explicit schema defined above
    .json(jsonFile)         # parse the JSON file into a DataFrame
)
ดู Schema และ column type
# Print the schema tree, then capture the (column name, column type) pairs
# for the assertions that follow.
testDF.printSchema()

columns = testDF.dtypes
print(columns)
root |-- id: long (nullable = true) |-- father: string (nullable = true) |-- mother: string (nullable = true) |-- children: array (nullable = true) | |-- element: string (containsNull = true) [('id', 'bigint'), ('father', 'string'), ('mother', 'string'), ('children', 'array<string>')]
ทดสอบด้วย assert
# Verify the DataFrame has exactly the four expected (name, type) columns.
# Fix for the original cell: every failure message said "column 0" and
# reported columns[0]'s value even when checking columns 1-3, so a failing
# assert would have pointed at the wrong column with the wrong observed
# value. Each message now names the column it actually checks.
assert len(columns) == 4, "Expected 4 columns but found " + str(len(columns))

_expected_columns = [
    ("id", "bigint"),
    ("father", "string"),
    ("mother", "string"),
    ("children", "array<string>"),
]
for _i, (_name, _dtype) in enumerate(_expected_columns):
    assert columns[_i][0] == _name, (
        "Expected column " + str(_i) + " to be \"" + _name
        + "\" but found \"" + columns[_i][0] + "\"."
    )
    assert columns[_i][1] == _dtype, (
        "Expected column " + str(_i) + " to be of type \"" + _dtype
        + "\" but found \"" + columns[_i][1] + "\"."
    )

print("Congratulations, all tests passed... that is if no jobs were triggered :-)\n")
Congratulations, all tests passed... that is if no jobs were triggered :-)