20170516: Python script to convert .csv to "triple store"
A python script to take a .csv file and convert it to a tab delimited file in "triple store" (id/name/value pairs).
Source File: https://data.nasa.gov/Space-Science/Meteorite-Landings/gh4g-9sfhtakes a file with 45,717 records (10 columns) and creates a file with over 500k records (3 columns).
I probably should add the following to remove any possible tabs in the values:
str1 = str1.replace('\t', '')
#createTripleStore.py
import pandas as pd
myFileName = "Meteorite_Landings.csv"
#if you do not know the id column, set it to N/A or ""
myIdColumn = "id"
#myIdColumn = ""
myOutFileName = myFileName + ".TabFileOut.txt"
df=pd.read_csv(myFileName, encoding='UTF-8')
df['myFileName'] = myFileName
rows_list=[]
i = 0
for index, row in df.iterrows():
i = i + 1
if myIdColumn is None or myIdColumn == "" or myIdColumn == "N/A":
myIdColumnValue = i
else:
myIdColumnValue = row[myIdColumn]
RecordtoAdd={} #initialise an empty dict
RecordtoAdd.update({'01-id' : myIdColumnValue})
RecordtoAdd.update({'02-columnName' : "myFileRecordId"})
RecordtoAdd.update({'03-columnValue' : i})
rows_list.append(RecordtoAdd)
for c in df:
myColumnName = c
if myColumnName == myIdColumn:
myDoNothing = "Y"
else:
RecordtoAdd={} #initialise an empty dict
RecordtoAdd.update({'01-id' : myIdColumnValue})
RecordtoAdd.update({'02-columnName' : myColumnName})
RecordtoAdd.update({'03-columnValue' : row[myColumnName]})
rows_list.append(RecordtoAdd)
#create a dataframe and write it out
df_out = pd.DataFrame(rows_list)
df_out.to_csv(myOutFileName, sep='\t', encoding='utf-8', index=False)
Comments