hadoop@hadoop:~/Downloads$ hdfs dfs -copyFromLocal orgs.json /SparkFiles/
hadoop@hadoop:~/Downloads$ hdfs dfs -cat hdfs://localhost:9000/SparkFiles/orgs.json
# We downloaded the .json file and copied it to HDFS
df_json = spark.read.format("json").load("hdfs://localhost:9000/SparkFiles/orgs.json")
df_json.printSchema()
root
|-- _corrupt_record: string (nullable = true)
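The default JSON source expects one object per line (JSON Lines), so a file holding one pretty-printed array lands entirely in _corrupt_record. A quick sketch, not part of the original session, to see what the reader actually captured:
df_bad = spark.read.format("json").load("hdfs://localhost:9000/SparkFiles/orgs.json")
df_bad.cache()  # Spark disallows querying only _corrupt_record from raw JSON unless the DataFrame is cached
df_bad.select("_corrupt_record").show(1, False)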
# The file is a single pretty-printed JSON array spanning multiple lines, so set multiline=True
df_json = spark.read.format("json").option("multiline",True).load("hdfs://localhost:9000/SparkFiles/orgs.json")
df_json.printSchema()
root
|-- avatar_url: string (nullable = true)
|-- description: string (nullable = true)
|-- events_url: string (nullable = true)
|-- hooks_url: string (nullable = true)
|-- id: long (nullable = true)
|-- issues_url: string (nullable = true)
|-- login: string (nullable = true)
|-- members_url: string (nullable = true)
|-- node_id: string (nullable = true)
|-- public_members_url: string (nullable = true)
|-- repos_url: string (nullable = true)
|-- url: string (nullable = true)
df_json.select("avatar_url","id","login").show(5,False)
+----------------------------------------------------+-------+--------+
|avatar_url |id |login |
+----------------------------------------------------+-------+--------+
|https://avatars1.githubusercontent.com/u/423638?v=4 |423638 |ggobi |
|https://avatars1.githubusercontent.com/u/513560?v=4 |513560 |rstudio |
|https://avatars1.githubusercontent.com/u/722735?v=4 |722735 |rstats |
|https://avatars3.githubusercontent.com/u/1200269?v=4|1200269|ropensci|
|https://avatars2.githubusercontent.com/u/3330561?v=4|3330561|rjournal|
+----------------------------------------------------+-------+--------+
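As a side note, the same projection can be run with Spark SQL once the DataFrame is registered as a temporary view (a sketch; the view name orgs is our choice):
# Register the DataFrame so it can be queried with plain SQL
df_json.createOrReplaceTempView("orgs")
spark.sql("SELECT login, id FROM orgs WHERE id > 1000000").show()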
# Next: write the program to read the JSON directly from a URL
# URL for the JSON: https://api.github.com/users/hadley/orgs
# Reading from a REST (Representational State Transfer) API
import requests
import json
# The JSON is served at the URL below
jsonapidata = requests.get("https://api.github.com/users/hadley/orgs")
jsondata = jsonapidata.json()
print(type(jsondata))   # <class 'list'> -- the API returns a JSON array of records
print(len(jsondata))
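One check worth adding right after the request, before parsing: confirm it succeeded, since unauthenticated GitHub API calls are rate-limited. requests has this built in:
# Raises requests.HTTPError on 4xx/5xx responses (e.g. when rate-limited)
jsonapidata.raise_for_status()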
# Write each record to a local file, one JSON object per line (JSON Lines);
# json.dumps() is needed because writing the raw dict would emit Python repr
# with single quotes, which Spark cannot parse as JSON
with open("/home/hadoop/Downloads/test.json", "w") as file:
    for record in jsondata:
        file.write(json.dumps(record) + "\n")
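Alternatively, the detour through a local file can be skipped by serializing the records back to JSON strings and letting Spark read them from an RDD (a sketch in the same session; json_rdd and df_api are our names):
# Distribute the JSON strings and parse them directly into a DataFrame
json_rdd = spark.sparkContext.parallelize([json.dumps(r) for r in jsondata])
df_api = spark.read.json(json_rdd)
df_api.printSchema()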
# Read the JSON Lines file back; each line is a complete object, so the
# multiline option must NOT be set (file:// forces the local filesystem
# instead of the default HDFS)
df_json = spark.read.format("json").load("file:///home/hadoop/Downloads/test.json")
df_json.count()
df_json.show(5)
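To keep the parsed result on HDFS for later jobs, the DataFrame can be written back in a columnar format (a sketch; the output path is our assumption):
# Hypothetical output path; mode("overwrite") replaces any previous run
df_json.write.mode("overwrite").parquet("hdfs://localhost:9000/SparkFiles/orgs_parquet")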