Friday, 22 May 2020

Read online JSON and create a local file using PySpark

hadoop@hadoop:~/Downloads$ hdfs dfs -copyFromLocal orgs.json /SparkFiles/

hadoop@hadoop:~/Downloads$ hdfs dfs -cat hdfs://localhost:9000/SparkFiles/orgs.json

# first attempt: read the .json file we just copied into HDFS
df_json = spark.read.format("json").load("hdfs://localhost:9000/SparkFiles/orgs.json")
df_json.printSchema()

root
 |-- _corrupt_record: string (nullable = true)
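
Before fixing this, it helps to peek at the raw text that failed to parse; a minimal sketch using the _corrupt_record column from the schema above:

# show the first unparsed record so we can see why it was rejected
df_json.select("_corrupt_record").show(1, truncate=False)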

# Spark's default JSON reader expects one JSON object per line (JSON Lines).
# orgs.json is a pretty-printed document spanning many lines, so everything
# lands in _corrupt_record. Setting multiline = True makes Spark parse the
# whole file as a single JSON document.

df_json = spark.read.format("json").option("multiline",True).load("hdfs://localhost:9000/SparkFiles/orgs.json")
df_json.printSchema()

root
 |-- avatar_url: string (nullable = true)
 |-- description: string (nullable = true)
 |-- events_url: string (nullable = true)
 |-- hooks_url: string (nullable = true)
 |-- id: long (nullable = true)
 |-- issues_url: string (nullable = true)
 |-- login: string (nullable = true)
 |-- members_url: string (nullable = true)
 |-- node_id: string (nullable = true)
 |-- public_members_url: string (nullable = true)
 |-- repos_url: string (nullable = true)
 |-- url: string (nullable = true)

df_json.select("avatar_url","id","login").show(5,False)

+----------------------------------------------------+-------+--------+
|avatar_url                                          |id     |login   |
+----------------------------------------------------+-------+--------+
|https://avatars1.githubusercontent.com/u/423638?v=4 |423638 |ggobi   |
|https://avatars1.githubusercontent.com/u/513560?v=4 |513560 |rstudio |
|https://avatars1.githubusercontent.com/u/722735?v=4 |722735 |rstats  |
|https://avatars3.githubusercontent.com/u/1200269?v=4|1200269|ropensci|
|https://avatars2.githubusercontent.com/u/3330561?v=4|3330561|rjournal|
+----------------------------------------------------+-------+--------+
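
As a side note, once the file is parsed you can write it back out in Spark's native layout. A small sketch (the output path is made up for illustration): the JSON writer emits one object per line, so the round trip needs no multiline option:

# write the parsed DataFrame back to HDFS as JSON Lines (hypothetical path)
df_json.write.mode("overwrite").json("hdfs://localhost:9000/SparkFiles/orgs_jsonlines")
# reading it back works with the default reader - no multiline needed
spark.read.format("json").load("hdfs://localhost:9000/SparkFiles/orgs_jsonlines").count()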

# next, write a program that reads the JSON directly from a URL

URL for the JSON: https://api.github.com/users/hadley/orgs

# reading from a REST API (Representational State Transfer)



import requests
import json

# the JSON payload is available at the URL below
jsonapidata = requests.get("https://api.github.com/users/hadley/orgs")
jsondata = jsonapidata.json()
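
One small robustness addition (not in the original post): requests can raise on HTTP errors, which catches cases like GitHub's rate limiting before you go on to use the data:

# raise requests.HTTPError if the GET returned a non-2xx status (e.g. 403)
jsonapidata.raise_for_status()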



print(type(jsondata))   # <class 'list'> - the API returns a JSON array
print(len(jsondata))    # number of org records returned

# create a local file and write one JSON object per line (JSON Lines);
# json.dumps keeps the records valid JSON instead of Python dict repr,
# and "w" mode avoids appending duplicates on re-runs
with open("/home/hadoop/Downloads/test.json", "w") as file:
    for record in jsondata:
        file.write("%s\n" % json.dumps(record))

# read the JSON file from local; file:// forces the local filesystem since
# paths without a scheme resolve against HDFS here. One object per line is
# Spark's default JSON layout, so the multiline option is not needed
# (and would break here)
df_json = spark.read.format("json").load("file:///home/hadoop/Downloads/test.json")
df_json.count()
df_json.show(5)
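
Alternatively, you can skip the intermediate file entirely and build the DataFrame straight from the API payload. A minimal sketch, reusing jsondata from above (df_api is just an illustrative name):

# turn each record back into a JSON string and let Spark parse the RDD directly
rdd = spark.sparkContext.parallelize([json.dumps(r) for r in jsondata])
df_api = spark.read.json(rdd)
df_api.select("avatar_url", "id", "login").show(5, False)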
