I am trying to read a CSV file located in an AWS S3 bucket into memory as a pandas dataframe using the following code:
import pandas as pd
import boto
data
You don't need pandas for this; you can simply use Python's built-in csv library:
def read_file(bucket_name, region, remote_file_name, aws_access_key_id,
              aws_secret_access_key, encoding="utf-8"):
    """Download a CSV object from an S3 bucket and return its data rows.

    Args:
        bucket_name: Name of the S3 bucket that holds the file.
        region: AWS region name passed to ``boto.s3.connect_to_region``.
        remote_file_name: Key (path) of the CSV object inside the bucket.
        aws_access_key_id: AWS access key id used to authenticate.
        aws_secret_access_key: AWS secret access key used to authenticate.
        encoding: Text encoding used to decode the object when it is
            returned as bytes. Defaults to ``"utf-8"``.

    Returns:
        A list of rows, each a list of strings. The first row of the file
        is treated as the header and is not included. An empty object
        yields an empty list.

    Raises:
        ValueError: If no connection could be created for ``region``.
    """
    # Imported locally so the function is self-contained: the surrounding
    # file only has ``import boto`` and never imports ``csv`` or ``Key``.
    import csv
    import io

    import boto.s3
    from boto.s3.key import Key

    # First establish a connection using the credentials and region name.
    conn = boto.s3.connect_to_region(
        region,
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )
    if conn is None:
        # Fail with a clear message instead of an AttributeError on the
        # next line when the region name is not recognised.
        raise ValueError("could not connect to S3 region %r" % (region,))

    # Next build the key of the CSV object from the bucket and file name.
    bucket = conn.get_bucket(bucket_name, validate=False)
    key = Key(bucket)
    key.key = remote_file_name
    try:
        raw = key.get_contents_as_string()
    finally:
        # Always release the key, even when the download fails.
        key.close()

    # The payload may arrive as bytes; the csv module needs text.
    text = raw.decode(encoding) if isinstance(raw, bytes) else raw

    # Let the csv module find the line boundaries itself (newline="" leaves
    # them untranslated) instead of splitting on a hard-coded '\r\n', which
    # breaks on '\n'-terminated files and on quoted fields with newlines.
    reader = csv.reader(io.StringIO(text, newline=""))
    next(reader, None)  # discard the header row; tolerate an empty file
    return list(reader)
Hope this solves your problem. Good luck! :)