问题
I am trying to download web pages over https by first downloading the headers with a HEAD request, then parsing to obtain the Content-Length and then using the Content-Length plus some space for headers to allocate memory for a buffer to store results from a GET request. It seems that stackoverflow.com gives a Content-Length that is too small and thus my code segfaults.
I've tried looking through stack overflow past questions to see how to go about dynamically allocating memory to handle pages which misreport their Content-Length but haven't been able to find any suitable answers.
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <openssl/bio.h>
#include <openssl/ssl.h>
#include <openssl/err.h>
#define MAX_HEADER_SIZE 8192
/**
* Main SSL demonstration code entry point
*/
int main() {
char* host_and_port = "stackoverflow.com:443";
char* head_request = "HEAD / HTTP/1.1\r\nHost: stackoverflow.com\r\n\r\n";
char* get_request = "GET / HTTP/1.1\r\nHost: stackoverflow.com\r\n\r\n";
char* store_path = "mycert.pem";
char *header_token, *line_token, content_length_line[1024];
char *cmp = "\r\n";
char *html;
char *get;
int content_length;
size_t i = 0;
char buffer[MAX_HEADER_SIZE];
buffer[0] = 0;
BIO* bio;
SSL_CTX* ctx = NULL;
SSL* ssl = NULL;
/* initilise the OpenSSL library */
SSL_load_error_strings();
SSL_library_init();
ERR_load_BIO_strings();
OpenSSL_add_all_algorithms();
bio = NULL;
int r = 0;
/* Set up the SSL pointers */
ctx = SSL_CTX_new(TLS_client_method());
ssl = NULL;
r = SSL_CTX_load_verify_locations(ctx, store_path, NULL);
if (r == 0) {
fprintf(stdout,"Unable to load the trust store from %s.\n", store_path);
fprintf(stdout, "Error: %s\n", ERR_reason_error_string(ERR_get_error()));
fprintf(stdout, "%s\n", ERR_error_string(ERR_get_error(), NULL));
ERR_print_errors_fp(stdout);
}
/* Setting up the BIO SSL object */
bio = BIO_new_ssl_connect(ctx);
BIO_get_ssl(bio, &ssl);
if (!(ssl)) {
printf("Unable to allocate SSL pointer.\n");
fprintf(stdout, "Error: %s\n", ERR_reason_error_string(ERR_get_error()));
fprintf(stdout, "%s\n", ERR_error_string(ERR_get_error(), NULL));
ERR_print_errors_fp(stdout);
bio = NULL;
}
SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY);
/* Attempt to connect */
BIO_set_conn_hostname(bio, host_and_port);
/* Verify the connection opened and perform the handshake */
if (BIO_do_connect(bio) < 1) {
fprintf(stdout, "Unable to connect BIO.%s\n", host_and_port);
fprintf(stdout, "Error: %s\n", ERR_reason_error_string(ERR_get_error()));
fprintf(stdout, "%s\n", ERR_error_string(ERR_get_error(), NULL));
ERR_print_errors_fp(stdout);
bio = NULL;
}
if (SSL_get_verify_result(ssl) != X509_V_OK) {
printf("Unable to verify connection result.\n");
fprintf(stdout, "Error: %s\n", ERR_reason_error_string(ERR_get_error()));
fprintf(stdout, "%s\n", ERR_error_string(ERR_get_error(), NULL));
ERR_print_errors_fp(stdout);
}
if (bio == NULL)
return (EXIT_FAILURE);
r = -1;
while (r < 0) {
r = BIO_write(bio, head_request, strlen(head_request));
if (r <= 0) {
if (!BIO_should_retry(bio)) {
printf("BIO_read should retry test failed.\n");
fprintf(stdout, "Error: %s\n", ERR_reason_error_string(ERR_get_error()));
fprintf(stdout, "%s\n", ERR_error_string(ERR_get_error(), NULL));
ERR_print_errors_fp(stdout);
continue;
}
/* It would be prudent to check the reason for the retry and handle
* it appropriately here */
}
}
r = -1;
while (r < 0) {
r = BIO_read(bio, buffer, MAX_HEADER_SIZE);
if (r == 0) {
printf("Reached the end of the data stream.\n");
fprintf(stdout, "Error: %s\n", ERR_reason_error_string(ERR_get_error()));
fprintf(stdout, "%s\n", ERR_error_string(ERR_get_error(), NULL));
ERR_print_errors_fp(stdout);
continue;
} else if (r < 0) {
if (!BIO_should_retry(bio)) {
printf("BIO_read should retry test failed.\n");
fprintf(stdout, "Error: %s\n", ERR_reason_error_string(ERR_get_error()));
fprintf(stdout, "%s\n", ERR_error_string(ERR_get_error(), NULL));
ERR_print_errors_fp(stdout);
continue;
}
/* It would be prudent to check the reason for the retry and handle
* it appropriately here */
}
};
printf("%s\r\n", buffer);
header_token = strtok(buffer, cmp);
while (header_token != NULL)
{
//printf ("header_token: %s\n\n", header_token);
if (strncmp(header_token, "Content-Length:", strlen("Content-Length:")) == 0
|| strncmp(header_token, "content-length:", strlen("content-length:")) == 0)
{
//printf ("header_token %s is equal to Content-Length:\n", header_token);
strcpy(content_length_line, header_token);
}
header_token = strtok(NULL, cmp);
}
if (strlen(content_length_line) > 0)
{
line_token = strtok(content_length_line, " ");
line_token = strtok(NULL, " ");
content_length = atoi(line_token);
printf ("Content-Length = %d\n", content_length);
}
//char get[content_length + MAX_HEADER_SIZE];
get = malloc((content_length + MAX_HEADER_SIZE)*sizeof(char));
if (get == NULL) {
fprintf(stdout, "Out of memory\n");
return (EXIT_FAILURE);
}
r = -1;
while (r < 0) {
r = BIO_write(bio, get_request, strlen(get_request));
if (r <= 0) {
if (!BIO_should_retry(bio)) {
printf("BIO_read should retry test failed.\n");
fprintf(stdout, "Error: %s\n", ERR_reason_error_string(ERR_get_error()));
fprintf(stdout, "%s\n", ERR_error_string(ERR_get_error(), NULL));
ERR_print_errors_fp(stdout);
continue;
}
/* It would be prudent to check the reason for the retry and handle
* it appropriately here */
}
}
r = -1;
while (r) {
while (r < 0) {
r = BIO_read(bio, buffer, 4096);
if (r == 0) {
printf("Reached the end of the data stream.\n");
fprintf(stdout, "Error: %s\n", ERR_reason_error_string(ERR_get_error()));
fprintf(stdout, "%s\n", ERR_error_string(ERR_get_error(), NULL));
ERR_print_errors_fp(stdout);
continue;
} else if (r < 0) {
if (!BIO_should_retry(bio)) {
printf("BIO_read should retry test failed.\n");
fprintf(stdout, "Error: %s\n", ERR_reason_error_string(ERR_get_error()));
fprintf(stdout, "%s\n", ERR_error_string(ERR_get_error(), NULL));
ERR_print_errors_fp(stdout);
continue;
}
/* It would be prudent to check the reason for the retry and handle
* it appropriately here */
}
};
printf("Received %d bytes\n",r);
printf("Received total of %ld bytes of %d\n", i+r, content_length);
memcpy(get+i, buffer, r);
i += r;
}
printf("%s\r\n", buffer);
/* clean up the SSL context resources for the encrypted link */
SSL_CTX_free(ctx);
free(get);
return (EXIT_SUCCESS);
}
I would usually expect to be able to print out the full web page but because of the erroneous Content-Length I get the following output and segfault.
Received 1752 bytes
Received total of 248784 bytes of 105585
Program received signal SIGSEGV, Segmentation fault.
__memmove_sse2_unaligned_erms () at ../sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:404
404 ../sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S: No such file or directory.
How should I handle pages that give incorrect Content-Length?
回答1:
The Content-length in the response to a HEAD request is of no relevance. Only the Content-length in the response containing the actual body is relevant (i.e. response to GET, POST...). And this Content-length should be used to read the HTTP body, i.e. first read the HTTP header, determine the length and then read the body as specified. Even if more data could be read they don't belong to the response body.
Apart from that you are doing a HTTP/1.1 request. This means that the server might use Transfer-Encoding: chunked in which case the value of Content-length is irrelevant too. Instead chunked encoding takes preference and you need to read all the chunks of the body based on the length of each given chunk.
来源:https://stackoverflow.com/questions/57258387/what-to-do-when-http-header-wrongly-reports-content-length