What to do when http header wrongly reports content-length

随声附和 提交于 2020-01-26 04:51:07

问题


I am trying to download web pages over https by first downloading the headers with a HEAD request, then parsing to obtain the Content-Length and then using the Content-Length plus some space for headers to allocate memory for a buffer to store results from a GET request. It seems that stackoverflow.com gives a Content-Length that is too small and thus my code segfaults.

I've tried looking through stack overflow past questions to see how to go about dynamically allocating memory to handle pages which misreport their Content-Length but haven't been able to find any suitable answers.

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <openssl/bio.h>
#include <openssl/ssl.h>
#include <openssl/err.h>

#define MAX_HEADER_SIZE 8192

/**
 * Main SSL demonstration code entry point
 */
int main() {
    char* host_and_port = "stackoverflow.com:443"; 
    char* head_request = "HEAD / HTTP/1.1\r\nHost: stackoverflow.com\r\n\r\n"; 
    char* get_request = "GET / HTTP/1.1\r\nHost: stackoverflow.com\r\n\r\n"; 
    char* store_path = "mycert.pem"; 
    char *header_token, *line_token, content_length_line[1024];
    char *cmp = "\r\n";
    char *html;
    char *get;
    int content_length;
    size_t i = 0;
    char buffer[MAX_HEADER_SIZE];
    buffer[0] = 0;

    BIO* bio;
    SSL_CTX* ctx = NULL;
    SSL* ssl = NULL;

    /* initilise the OpenSSL library */
    SSL_load_error_strings();
    SSL_library_init();
    ERR_load_BIO_strings();
    OpenSSL_add_all_algorithms();

    bio = NULL;
    int r = 0;

    /* Set up the SSL pointers */
    ctx = SSL_CTX_new(TLS_client_method());
    ssl = NULL;
    r = SSL_CTX_load_verify_locations(ctx, store_path, NULL);

    if (r == 0) {
        fprintf(stdout,"Unable to load the trust store from %s.\n", store_path);
        fprintf(stdout, "Error: %s\n", ERR_reason_error_string(ERR_get_error()));
        fprintf(stdout, "%s\n", ERR_error_string(ERR_get_error(), NULL));
        ERR_print_errors_fp(stdout);
    }

    /* Setting up the BIO SSL object */
    bio = BIO_new_ssl_connect(ctx);
    BIO_get_ssl(bio, &ssl);
    if (!(ssl)) {
        printf("Unable to allocate SSL pointer.\n");
        fprintf(stdout, "Error: %s\n", ERR_reason_error_string(ERR_get_error()));
        fprintf(stdout, "%s\n", ERR_error_string(ERR_get_error(), NULL));
        ERR_print_errors_fp(stdout); 
        bio = NULL;           
    }
    SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY);

    /* Attempt to connect */
    BIO_set_conn_hostname(bio, host_and_port);

    /* Verify the connection opened and perform the handshake */
    if (BIO_do_connect(bio) < 1) {
        fprintf(stdout, "Unable to connect BIO.%s\n", host_and_port);
        fprintf(stdout, "Error: %s\n", ERR_reason_error_string(ERR_get_error()));
        fprintf(stdout, "%s\n", ERR_error_string(ERR_get_error(), NULL));
        ERR_print_errors_fp(stdout);
        bio = NULL;
    }

    if (SSL_get_verify_result(ssl) != X509_V_OK) {
        printf("Unable to verify connection result.\n");
        fprintf(stdout, "Error: %s\n", ERR_reason_error_string(ERR_get_error()));
        fprintf(stdout, "%s\n", ERR_error_string(ERR_get_error(), NULL));
        ERR_print_errors_fp(stdout);            
    }

    if (bio == NULL)
        return (EXIT_FAILURE);

    r = -1;

    while (r < 0) {

        r = BIO_write(bio, head_request, strlen(head_request));
        if (r <= 0) {
            if (!BIO_should_retry(bio)) {
                printf("BIO_read should retry test failed.\n");
                fprintf(stdout, "Error: %s\n", ERR_reason_error_string(ERR_get_error()));
                fprintf(stdout, "%s\n", ERR_error_string(ERR_get_error(), NULL));
                ERR_print_errors_fp(stdout);            
                continue;
            }
            /* It would be prudent to check the reason for the retry and handle
            * it appropriately here */
        }
    }

    r = -1;

    while (r < 0) {
        r = BIO_read(bio, buffer, MAX_HEADER_SIZE);
        if (r == 0) {
            printf("Reached the end of the data stream.\n");
            fprintf(stdout, "Error: %s\n", ERR_reason_error_string(ERR_get_error()));
            fprintf(stdout, "%s\n", ERR_error_string(ERR_get_error(), NULL));
            ERR_print_errors_fp(stdout);
            continue;
        } else if (r < 0) {
            if (!BIO_should_retry(bio)) {
                printf("BIO_read should retry test failed.\n");
                fprintf(stdout, "Error: %s\n", ERR_reason_error_string(ERR_get_error()));
                fprintf(stdout, "%s\n", ERR_error_string(ERR_get_error(), NULL));
                ERR_print_errors_fp(stdout);
                continue;
            }

            /* It would be prudent to check the reason for the retry and handle
            * it appropriately here */
        }
    };
    printf("%s\r\n", buffer);

    header_token = strtok(buffer, cmp);

    while (header_token != NULL)
    {
        //printf ("header_token: %s\n\n", header_token);
        if (strncmp(header_token, "Content-Length:", strlen("Content-Length:")) == 0 
        || strncmp(header_token, "content-length:", strlen("content-length:")) == 0)
        {
            //printf ("header_token %s is equal to Content-Length:\n", header_token);
            strcpy(content_length_line, header_token);
        }
        header_token = strtok(NULL, cmp);
    }

    if (strlen(content_length_line) > 0) 
    {
        line_token = strtok(content_length_line, " ");
        line_token = strtok(NULL, " ");
        content_length = atoi(line_token);
        printf ("Content-Length = %d\n", content_length);
    }

    //char get[content_length + MAX_HEADER_SIZE];
    get = malloc((content_length + MAX_HEADER_SIZE)*sizeof(char));
    if (get == NULL) {
        fprintf(stdout, "Out of memory\n");
        return (EXIT_FAILURE);
    }

    r = -1;

    while (r < 0) {

        r = BIO_write(bio, get_request, strlen(get_request));
        if (r <= 0) {
            if (!BIO_should_retry(bio)) {
                printf("BIO_read should retry test failed.\n");
                fprintf(stdout, "Error: %s\n", ERR_reason_error_string(ERR_get_error()));
                fprintf(stdout, "%s\n", ERR_error_string(ERR_get_error(), NULL));
                ERR_print_errors_fp(stdout);            
                continue;
            }
            /* It would be prudent to check the reason for the retry and handle
            * it appropriately here */
        }
    }

    r = -1;

    while (r) {
        while (r < 0) {
            r = BIO_read(bio, buffer, 4096);
            if (r == 0) {
                printf("Reached the end of the data stream.\n");
                fprintf(stdout, "Error: %s\n", ERR_reason_error_string(ERR_get_error()));
                fprintf(stdout, "%s\n", ERR_error_string(ERR_get_error(), NULL));
                ERR_print_errors_fp(stdout);
                continue;
            } else if (r < 0) {
                if (!BIO_should_retry(bio)) {
                    printf("BIO_read should retry test failed.\n");
                    fprintf(stdout, "Error: %s\n", ERR_reason_error_string(ERR_get_error()));
                    fprintf(stdout, "%s\n", ERR_error_string(ERR_get_error(), NULL));
                    ERR_print_errors_fp(stdout);
                    continue;
                }

                /* It would be prudent to check the reason for the retry and handle
                * it appropriately here */
            }
        };
        printf("Received %d bytes\n",r);
        printf("Received total of %ld bytes of %d\n", i+r, content_length);
        memcpy(get+i, buffer, r);
        i += r;
    }
    printf("%s\r\n", buffer);

    /* clean up the SSL context resources for the encrypted link */
    SSL_CTX_free(ctx);

    free(get);  

    return (EXIT_SUCCESS);
}

I would usually expect to be able to print out the full web page but because of the erroneous Content-Length I get the following output and segfault.

Received 1752 bytes
Received total of 248784 bytes of 105585

Program received signal SIGSEGV, Segmentation fault.
__memmove_sse2_unaligned_erms () at ../sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:404
404     ../sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S: No such file or directory.

How should I handle pages that give incorrect Content-Length?


回答1:


The Content-length in the response to a HEAD request is of no relevance. Only the Content-length in the response containing the actual body is relevant (i.e. response to GET, POST...). And this Content-length should be used to read the HTTP body, i.e. first read the HTTP header, determine the length and then read the body as specified. Even if more data could be read they don't belong to the response body.

Apart from that you are doing a HTTP/1.1 request. This means that the server might use Transfer-Encoding: chunked in which case the value of Content-length is irrelevant too. Instead chunked encoding takes preference and you need to read all the chunks of the body based on the length of each given chunk.



来源:https://stackoverflow.com/questions/57258387/what-to-do-when-http-header-wrongly-reports-content-length

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!