I am trying to read a CSV file located in an AWS S3 bucket into memory as a pandas DataFrame using the following code:
import pandas as pd
import boto
data = pd.read_csv('s3://example_bucket/data_1.csv')
You can also use pandas' read_sql together with pyathena:
from pyathena import connect
import pandas as pd
conn = connect(s3_staging_dir='s3://bucket/folder', region_name='region')
df = pd.read_sql('select * from database.table', conn)  # keep the "database.table" form
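If the table is large, the same connection can stream the result in chunks rather than loading everything at once; a minimal sketch, with database.table standing in for your own Athena database and table:
for chunk in pd.read_sql('select * from database.table', conn, chunksize=10000):
    print(chunk.shape)  # each chunk is a regular DataFrame of up to 10,000 rows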
You don't need pandas for this; you can just use Python's built-in csv library:
import csv
import boto
from boto.s3.key import Key

def read_file(bucket_name, region, remote_file_name, aws_access_key_id, aws_secret_access_key):
    # reads a CSV from AWS S3 and returns its rows as a list of lists
    # first you establish a connection with your credentials and region
    conn = boto.s3.connect_to_region(
        region,
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key)
    # next you obtain the key of the CSV you want to read;
    # you will need the bucket name and the CSV file name
    bucket = conn.get_bucket(bucket_name, validate=False)
    key = Key(bucket)
    key.key = remote_file_name
    contents = key.get_contents_as_string()
    key.close()
    # the contents come back as a single string, so you need to split it;
    # the line terminator is usually '\r\n' -- if not, read the file normally
    # and find out what it is
    reader = csv.reader(contents.split('\r\n'))
    data = []
    header = next(reader)  # skip the header row
    for row in reader:
        data.append(row)
    return data
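For example, using the bucket, region and file name from the question, you could call it like this (and still wrap the rows in a DataFrame if you want one, assuming pandas is imported as pd):
# aws_id / aws_secret are placeholders for your own AWS credentials
rows = read_file('example_bucket', 'ap-southeast-2', 'data_1.csv', aws_id, aws_secret)
df = pd.DataFrame(rows)  # optional: turn the rows into a DataFrame after all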
Hope this solves your problem, good luck! :)
Using pandas 0.20.3
import os
import boto3
import pandas as pd
import sys
if sys.version_info[0] < 3:
    from StringIO import StringIO  # Python 2.x
else:
    from io import StringIO  # Python 3.x
# get your credentials from environment variables
aws_id = os.environ['AWS_ID']
aws_secret = os.environ['AWS_SECRET']
client = boto3.client('s3', aws_access_key_id=aws_id,
                      aws_secret_access_key=aws_secret)
bucket_name = 'my_bucket'
object_key = 'my_file.csv'
csv_obj = client.get_object(Bucket=bucket_name, Key=object_key)
body = csv_obj['Body']
csv_string = body.read().decode('utf-8')
df = pd.read_csv(StringIO(csv_string))
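As a side note, if the s3fs package is installed and your credentials are available through the standard AWS lookup (environment variables, ~/.aws/credentials, an instance role), pandas of this vintage can also read the object straight from an s3:// URL; a one-line sketch, with my_bucket/my_file.csv as above:
# requires s3fs; credentials come from the default AWS credential chain
df = pd.read_csv('s3://my_bucket/my_file.csv')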
Based on another answer that suggested using smart_open for reading from S3, this is how I used it with pandas:
import os
import pandas as pd
from smart_open import smart_open
aws_key = os.environ['AWS_ACCESS_KEY']
aws_secret = os.environ['AWS_SECRET_ACCESS_KEY']
bucket_name = 'my_bucket'
object_key = 'my_file.csv'
path = 's3://{}:{}@{}/{}'.format(aws_key, aws_secret, bucket_name, object_key)
df = pd.read_csv(smart_open(path))
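One caveat: putting the key and secret in the URL means they can end up in logs and tracebacks. If your credentials are already configured where boto can find them (environment variables or ~/.aws/credentials), smart_open picks them up on its own and a plain URL is enough:
# credentials resolved by boto's default chain instead of being embedded in the URL
df = pd.read_csv(smart_open('s3://my_bucket/my_file.csv'))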
I eventually realised that you also need to set the permissions on each individual object within the bucket in order to read it, which you can do with the following code:
from boto.s3.key import Key

# 'bucket' is the boto bucket object obtained earlier with conn.get_bucket()
k = Key(bucket)
k.key = 'data_1.csv'
k.set_canned_acl('public-read')
I also had to change the bucket address in the pd.read_csv command to the region-specific HTTPS endpoint, as follows:
data = pd.read_csv('https://s3-ap-southeast-2.amazonaws.com/example_bucket/data_1.csv')
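If you would rather not make the object publicly readable, a presigned URL gives pd.read_csv temporary access instead; a minimal boto3 sketch, with the bucket and file names from above:
import boto3
import pandas as pd

s3 = boto3.client('s3')
# the URL below is valid for one hour, so the object can stay private
url = s3.generate_presigned_url(
    'get_object',
    Params={'Bucket': 'example_bucket', 'Key': 'data_1.csv'},
    ExpiresIn=3600)
data = pd.read_csv(url)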