Paramiko fails to download large files >1GB

Submitted by 狂风中的少年 on 2019-11-28 18:53:52
Screwtape

The SFTP protocol doesn't have a way to stream file data; instead what it has is a way to request a block of data from a particular offset in an open file. The naive method of downloading a file would be to request the first block, write it to disk, then request the second block, and so forth. This is reliable, but very slow.

Instead, Paramiko has a performance trick it uses: when you call .get() it immediately sends a request for every block in the file, and it remembers what offset they're supposed to be written to. Then as each response arrives, it makes sure it gets written to the correct offset on-disk. For more information, see the SFTPFile.prefetch() and SFTPFile.readv() methods in the Paramiko documentation. I suspect the book-keeping information it stores when downloading your 1GB file might be causing... something to run out of resources, generating your "insufficient resources" message.
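For illustration, here's a rough sketch of what that prefetch path looks like from the caller's side; sftp, remote_path and local_path are placeholder names, not anything from the original question:

# Rough sketch of what sftp.get() effectively does internally: open the remote
# file, fire off read requests for every block (prefetch), then read the data
# back as the responses arrive. "sftp" is an open paramiko.SFTPClient.
with sftp.open(remote_path, "rb") as remote_f:
    remote_f.prefetch()                      # request every block of the file up front
    with open(local_path, "wb") as local_f:
        while True:
            data = remote_f.read(32768)      # served from the prefetched responses
            if not data:
                break
            local_f.write(data)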

Rather than using .get(), just call .open() to get an SFTPFile instance, then call .read() on that object, or hand it to the Python standard library function shutil.copyfileobj() to download the contents. That should avoid the Paramiko prefetch cache, and allow you to download the file even if it's not quite as fast.

i.e.:

def lazy_loading_ftp_file(sftp_host_conn, filename):
    """
    Lazily download a file over SFTP when a plain sftp.get() call fails.

    :param sftp_host_conn: sftp host connection factory
    :param filename: name of the file to be downloaded
    :return: status dict; the file is saved to the current directory
    """
    import shutil
    try:
        with sftp_host_conn() as host:
            with host.open(filename, 'rb') as sftp_file_instance:
                with open(filename, 'wb') as out_file:
                    shutil.copyfileobj(sftp_file_instance, out_file)
            return {"status": "success", "msg": "successfully downloaded file: {}".format(filename)}
    except Exception as ex:
        return {"status": "failed", "msg": "Exception in lazy reading too: {}".format(ex)}

I had a very similar problem; in my case the file was only ~400 MB, but it would consistently fail after downloading about 35 MB or so. It didn't always fail at the exact same number of bytes downloaded, but somewhere around 35-40 MB the file would stop transferring, and a minute or so later I would get the "There are insufficient resources to complete the request" error.

Downloading the file via WinSCP or PSFTP worked fine.

I tried Screwtape's method, and it did work but was painfully slow. My 400 MB file was on pace to take something like 4 hours to download, which was an unacceptable timeframe for this particular application.

Also, at one time, when we first set this up, everything worked fine. But the server administrator made some changes to the SFTP server and that's when things broke. I'm not sure what the changes were, but since it still worked OK using WinSCP/other SFTP methods I didn't think it was going to be fruitful to try attacking this from the server side.

I'm not going to pretend to understand why, but here's what ended up working for me:

  1. I downloaded and installed the current version of Paramiko (1.11.1 at this time). Initially this didn't make any difference at all but I figured I'd mention it just in case it was part of the solution.

  2. The stack trace for the exception was:

    File "C:\Python26\lib\site-packages\paramiko\sftp_client.py", line 676, in get
        size = self.getfo(remotepath, fl, callback)
    File "C:\Python26\lib\site-packages\paramiko\sftp_client.py", line 645, in getfo
        data = fr.read(32768)
    File "C:\Python26\lib\site-packages\paramiko\file.py", line 153, in read
        new_data = self._read(read_size)
    File "C:\Python26\lib\site-packages\paramiko\sftp_file.py", line 157, in _read
        data = self._read_prefetch(size)
    File "C:\Python26\lib\site-packages\paramiko\sftp_file.py", line 138, in _read_prefetch
        self._check_exception()
    File "C:\Python26\lib\site-packages\paramiko\sftp_file.py", line 483, in _check_exception
        raise x
    
  3. Poking around a bit in sftp_file.py, I noticed this (lines 43-45 in the current version):

    # Some sftp servers will choke if you send read/write requests larger than
    # this size.
    MAX_REQUEST_SIZE = 32768
    
  4. On a whim, I tried changing MAX_REQUEST_SIZE to 1024 and, lo and behold, I was able to download the whole file!

  5. After I got it to work by changing MAX_REQUEST_SIZE to 1024, I tried a bunch of other values between 1024 and 32768 to see if it affected performance or anything. But I always got the error sooner or later when the value was significantly bigger than 1024 (1025 was OK, but 1048 eventually failed). A runtime alternative to editing the library source is sketched just after this list.
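If you'd rather not edit sftp_file.py in site-packages, the same experiment can usually be done at runtime. This is a hedged sketch, assuming MAX_REQUEST_SIZE is an attribute of the SFTPFile class (that's where it lives in the Paramiko versions I've checked; verify against your install, since older releases may define it elsewhere in paramiko/sftp_file.py):

# Sketch: lower Paramiko's read-request size without patching the installed
# sources. Assumes MAX_REQUEST_SIZE is a class attribute of SFTPFile; check
# your paramiko/sftp_file.py if this has no effect.
from paramiko.sftp_file import SFTPFile

SFTPFile.MAX_REQUEST_SIZE = 1024  # down from the default 32768

# ...then connect and call .get() / .read() as usual; every read request
# (including the prefetch ones) should now be capped at 1 KB.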

damau

In addition to Screwtape's answer, it's also worth mentioning that you should probably limit the block size with .read([block size in bytes]).

See the lazy method for reading a big file in Python.

I had real issues with just file.read() without a block size in Python 2.4; it's possible that 2.7 determines the correct block size, though.
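A minimal sketch of that block-size approach (sftp, remote_path and local_path are placeholder names, not anything from the answers above):

def read_in_blocks(file_obj, block_size=32768):
    """Lazily yield successive fixed-size blocks from a file-like object."""
    while True:
        data = file_obj.read(block_size)
        if not data:
            break
        yield data

# "sftp" is an open paramiko.SFTPClient; remote_path/local_path are placeholders.
with sftp.open(remote_path, "rb") as remote_f, open(local_path, "wb") as local_f:
    for block in read_in_blocks(remote_f, 32768):  # try smaller sizes if the server chokes
        local_f.write(block)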

I have traced the code into Paramiko, and now I'm sure it is a server problem.

1. What prefetch does

In order to increase the download speed, Paramiko tries to prefetch the file. When the SFTPFile.prefetch() method is called, a new thread is created and tons of fetch requests are sent to the server until the whole file is covered.
We can see this in paramiko/sftp_file.py, around line 464.

2. How to be sure it is a server problem

The requests mentioned above run in async mode. SFTPFile._async_response() is used to receive the responses from the server asynchronously. Tracing down the code, we can see that the exception is created in SFTPFile._async_response(), converted from the message sent by the server. So we can be sure that the exception comes from the server.

3. How to solve the problem

Because I have no access to the server, using sftp on the command line is my best choice. But on the other hand, now we know that too many requests make the server crash, so we can sleep between requests when sending them to the server.
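A rough sketch of that throttled idea, downloading block by block with a short pause and no prefetch; sftp, remote_path, local_path and the pause length are placeholders, not anything taken from this server:

import time

BLOCK_SIZE = 32768   # shrink this if the server still complains
PAUSE = 0.05         # seconds to wait between read requests; tune for your server

# "sftp" is an open paramiko.SFTPClient; remote_path/local_path are placeholders.
with sftp.open(remote_path, "rb") as remote_f, open(local_path, "wb") as local_f:
    while True:
        data = remote_f.read(BLOCK_SIZE)   # one read request per block, no prefetch
        if not data:
            break
        local_f.write(data)
        time.sleep(PAUSE)                  # throttle so the server isn't flooded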

I use this type of script with paramiko for larger files. You can play around with the window_size / packet size to see what works best for you. If you want it to be more performant, you could run parallel processes to read different chunks of the file using the second method (see http://docs.paramiko.org/en/latest/api/sftp.html#paramiko.sftp_file.SFTPFile.readv).

import time, paramiko

MAX_RETRIES = 10

ftp_server = "ftp.someserver.com"
port = 22
sftp_file = "/somefolder/somefile.txt"
local_file = "/somefolder/somewhere/here.txt"
ssh_conn = sftp_client = None
username = "username"
password = "password"

start_time = time.time()

for retry in range(MAX_RETRIES):
    try:
        ssh_conn = paramiko.Transport((ftp_server, port))
        ssh_conn.connect(username=username, password=password)
        #method 1 using sftpfile.get and settings window_size, max_packet_size
        window_size = pow(4, 12)#about ~16MB chunks
        max_packet_size = pow(4, 12)
        sftp_client = paramiko.SFTPClient.from_transport(ssh_conn, window_size=window_size, max_packet_size=max_packet_size)
        sftp_client.get(sftp_file, local_file)
        #method 2 breaking up file into chunks to read in parallel
        sftp_client = paramiko.SFTPClient.from_transport(ssh_conn)
        filesize = sftp_client.stat(sftp_file).st_size
        chunksize = pow(4, 12)#<-- adjust this and benchmark speed
        chunks = [(offset, chunksize) for offset in range(0, filesize, chunksize)]
        with sftp_client.open(sftp_file, "rb") as infile:
            with open(local_file, "wb") as outfile:
                for chunk in infile.readv(chunks):
                    outfile.write(chunk)
        break
    except (EOFError, paramiko.ssh_exception.SSHException) as x:
        print("%s %s - > retrying %s..." % (type(x), x, retry + 1))
        time.sleep(retry * 10)  # back off in steps of 0, 10, 20... seconds
    finally:
        if hasattr(sftp_client, "close") and callable(sftp_client.close):
            sftp_client.close()
        if hasattr(ssh_conn, "close") and callable(ssh_conn.close):
            ssh_conn.close()


print("Loading File %s Took %d seconds " % (sftp_file, time.time() - start_time))

If you are really concerned about performance, you could run the second method and break the file across multiple processes / threads. Here's a code sample using multi-threading that writes multiple file parts and then joins them into one file:

import threading, os, time, paramiko

# you could make the number of threads relative to file size
NUM_THREADS = 4
MAX_RETRIES = 10

# connection settings (same as in the previous example)
ftp_server = "ftp.someserver.com"
port = 22
sftp_file = "/somefolder/somefile.txt"
local_file = "/somefolder/somewhere/here.txt"
username = "username"
password = "password"

def make_filepart_path(file_path, part_number):
    '''creates filepart path from filepath'''
    return "%s.filepart.%s" % (file_path, part_number+1)

def write_chunks(chunks, tnum, local_file_part, username, password, ftp_server, max_retries):
    ssh_conn = sftp_client = None
    for retry in range(max_retries):
        try:
            ssh_conn = paramiko.Transport((ftp_server, port))
            ssh_conn.connect(username=username, password=password)
            sftp_client = paramiko.SFTPClient.from_transport(ssh_conn)
            with sftp_client.open(sftp_file, "rb") as infile:
                with open(local_file_part, "wb") as outfile:
                    for chunk in infile.readv(chunks):
                        outfile.write(chunk)
            break
        except (EOFError, paramiko.ssh_exception.SSHException) as x:
            print("%s %s Thread %s - > retrying %s..." % (type(x), x, tnum, retry + 1))
            time.sleep(retry * 10)  # back off in steps of 0, 10, 20... seconds
        finally:
            if hasattr(sftp_client, "close") and callable(sftp_client.close):
                sftp_client.close()
            if hasattr(ssh_conn, "close") and callable(ssh_conn.close):
                ssh_conn.close()



start_time = time.time()

for retry in range(MAX_RETRIES):
    try:
        ssh_conn = paramiko.Transport((ftp_server, port))
        ssh_conn.connect(username=username, password=password)
        sftp_client = paramiko.SFTPClient.from_transport(ssh_conn)
        #connect to get the file's size in order to calculate chunks
        filesize = sftp_client.stat(sftp_file).st_size
        sftp_client.close()
        ssh_conn.close()
        chunksize = pow(4, 12)
        chunks = [(offset, chunksize) for offset in range(0, filesize, chunksize)]
        thread_chunk_size = (len(chunks) // NUM_THREADS) + 1
        #break the chunks into sub lists to hand off to threads
        # range over len(chunks), not len(chunks) - 1, so the last chunk isn't dropped
        thread_chunks = [chunks[i:i + thread_chunk_size] for i in range(0, len(chunks), thread_chunk_size)]
        threads = []
        fileparts = []
        for thread_num in range(len(thread_chunks)):
            local_file_part = make_filepart_path(local_file, thread_num) 
            args = (thread_chunks[thread_num], thread_num, local_file_part, username, password, ftp_server, MAX_RETRIES)
            threads.append(threading.Thread(target=write_chunks, args=args))
            fileparts.append(local_file_part)
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
        #join file parts into one file, remove fileparts
        with open(local_file, "wb") as outfile:
            for filepart in fileparts:
                with open(filepart, "rb") as infile:
                    outfile.write(infile.read())
                os.remove(filepart)
        break
    except (EOFError, paramiko.ssh_exception.SSHException) as x:
        print("%s %s - > retrying %s..." % (type(x), x, retry + 1))
        time.sleep(retry * 10)  # back off in steps of 0, 10, 20... seconds
    finally:
        if hasattr(sftp_client, "close") and callable(sftp_client.close):
            sftp_client.close()
        if hasattr(ssh_conn, "close") and callable(ssh_conn.close):
            ssh_conn.close()


print("Loading File %s Took %d seconds " % (sftp_file, time.time() - start_time))