C strip html between <…>

久未见 提交于 2021-02-20 05:16:22

问题


How can I strip the HTML from a document, between and including the <...> tags, using C? My current program uses curl to fetch the contents of the web page and writes it to a text file; it then reads from the text file and removes the '<' and '>' characters, but I am unsure how to remove everything between those tags.

#include <curl/curl.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>

#define WEBPAGE_URL "http://homepages.paradise.net.nz/adrianfu/index.html"
#define DESTINATION_FILE "/home/user/data.txt"

/*
 * Write callback handed to libcurl via CURLOPT_WRITEFUNCTION.
 *
 * ptr    - start of the chunk to store
 * size   - size of one item in the chunk
 * nmeb   - number of items in the chunk
 * stream - destination FILE *, passed through as an opaque pointer
 *
 * Returns whatever fwrite() reports, i.e. the number of items written.
 */
size_t write_data( void *ptr, size_t size, size_t nmeb, void *stream)
{
    FILE *destination = stream;
    size_t items_written = fwrite(ptr, size, nmeb, destination);

    return items_written;
}

/*
 * Downloads WEBPAGE_URL into DESTINATION_FILE with libcurl, then re-reads the
 * saved file and prints (at most) its first 1024 bytes twice: once unchanged
 * and once with every '<' and '>' replaced by a space.
 *
 * Returns EXIT_SUCCESS when the page was fetched and printed, EXIT_FAILURE if
 * the file could not be created, the transfer failed, or the file could not
 * be read back.
 */
int main(void)
{
    FILE *file = fopen(DESTINATION_FILE, "w+");
    if (file == NULL) {
        fputs("File error", stderr);
        return EXIT_FAILURE;
    }

    CURL *handle = curl_easy_init();
    if (handle == NULL) {
        fputs("curl_easy_init failed\n", stderr);
        fclose(file);
        return EXIT_FAILURE;
    }

    curl_easy_setopt(handle, CURLOPT_URL, WEBPAGE_URL); /* Using the http protocol */
    curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, write_data);
    curl_easy_setopt(handle, CURLOPT_WRITEDATA, file);

    /* curl_easy_perform() returns CURLE_OK, documented by libcurl as 0, on success. */
    int transfer_failed = (curl_easy_perform(handle) != 0);
    curl_easy_cleanup(handle);

    /*
     * Close the stream before reading the file back: fclose() flushes the
     * stdio buffer, so without it the data may not be on disk yet. Always
     * close, even when the transfer failed, so the handle is not leaked.
     */
    int close_failed = (fclose(file) != 0);
    if (transfer_failed || close_failed) {
        fputs("Download failed\n", stderr);
        return EXIT_FAILURE;
    }

    /* Re-open the very file that was just written, not a relative "data.txt". */
    int fd = open(DESTINATION_FILE, O_RDONLY);
    if (fd == -1) {
        fputs("Cannot open the file", stderr);
        return EXIT_FAILURE;
    }

    char buf[1024];
    ssize_t nRead = read(fd, buf, sizeof buf);
    if (nRead < 0) {
        perror("read");
        close(fd);
        return EXIT_FAILURE;
    }

    printf("Original String ");
    fwrite(buf, 1, (size_t)nRead, stdout);

    printf("\nReplaced String ");
    for (ssize_t i = 0; i < nRead; i++) {
        if (buf[i] == '<' || buf[i] == '>') {
            buf[i] = ' ';
        }
    }
    fwrite(buf, 1, (size_t)nRead, stdout);

    /* Close the descriptor that was actually opened. */
    close(fd);

    return EXIT_SUCCESS;
}

回答1:


Here is just the code that removes the contents between the '<' and '>' characters (assuming you are dealing with well-formed HTML, meaning that no tag is nested inside the declaration of another, as in <html < body> >). I am only changing a small portion of your code. I also remove the tags from the buf variable, instead of replacing the undesired characters with spaces, because I think this will be more useful to you (correct me if I am wrong).

int idx = 0;
int opened = 0; // false
for(i=0; i<nRead; i++)
{
    if(buf[i]=='<') {
        opened = 1; // true
    } else if (buf[i] == '>') {
        opened = 0; // false
    } else if (!opened) {
        buf[idx++] = buf[i];
    }
}
buf[idx] = '\0';
printf("%s\n", buf);



回答2:


This version also handles script and style tags, as well as HTML comments and character entities such as &nbsp;.

/*
 * Returns 1 when the n bytes at s equal the n-byte literal lit, ignoring
 * ASCII case; 0 otherwise. lit must already be lower case. Only ASCII is
 * folded because HTML tag names are ASCII.
 */
static int strip_html_equals_nocase(const char *s, const char *lit, size_t n)
{
    for (size_t pos = 0; pos < n; pos++) {
        char c = s[pos];
        if (c >= 'A' && c <= 'Z') {
            c = (char)(c - 'A' + 'a');
        }
        if (c != lit[pos]) {
            return 0;
        }
    }
    return 1;
}

/*
 * Strips HTML markup from sToClean in place.
 *
 *  - Every <...> tag is replaced by a single space.
 *  - Everything from "<script" to "</script>", from "<style" to "</style>"
 *    and from "<!--" to "-->" is removed entirely (tag names are matched
 *    without regard to ASCII case).
 *  - Every '&' up to and including the next ';' is replaced by a single
 *    space. Note that a bare '&' that is not an entity is treated the same
 *    way, so text up to the next ';' is dropped.
 *
 * sToClean - buffer to clean; may be NULL, in which case 0 is returned.
 * size     - number of bytes of sToClean to process. The result is always
 *            NUL-terminated, so the buffer must hold at least size + 1 bytes
 *            (true when size is strlen(sToClean)).
 *
 * Returns the length of the cleaned text, excluding the terminator.
 *
 * The input is scanned in place without scratch buffers, so tags and
 * script/style/comment bodies of any length are handled.
 */
int stripHTMLTags(char *sToClean,size_t size)
{
    /* Constructs whose whole body is discarded up to a closing marker. */
    static const struct {
        const char *opener;     /* bytes right after '<', lower case */
        size_t opener_len;
        const char *closer;     /* marker ending the discarded region */
        size_t closer_len;
    } raw_blocks[] = {
        { "script", 6, "</script>", 9 },
        { "style",  5, "</style>",  8 },
        { "!--",    3, "-->",       3 },
    };
    enum {
        ST_TEXT,    /* copying text, watching for '<' or '&' */
        ST_TAG,     /* inside <...>, waiting for '>' */
        ST_ENTITY,  /* inside &...;, waiting for ';' */
        ST_RAW      /* inside script/style/comment, waiting for closer */
    } state = ST_TEXT;

    size_t out = 0;             /* next write position; never ahead of the read position */
    size_t mark = 0;            /* ST_TAG: first byte after '<'; ST_RAW: first byte after the opener */
    const char *closer = NULL;
    size_t closer_len = 0;

    if (sToClean == NULL) {
        return 0;
    }

    for (size_t in = 0; in < size; in++) {
        const char c = sToClean[in];

        switch (state) {
        case ST_TEXT:
            if (c == '<') {
                state = ST_TAG;
                mark = in + 1;
            } else if (c == '&') {
                state = ST_ENTITY;
            } else {
                sToClean[out++] = c;
            }
            break;

        case ST_TAG: {
            /*
             * Nothing is written while inside a tag, so the bytes from mark
             * to in are still the original input and can be compared directly.
             */
            const size_t seen = in - mark + 1;

            for (size_t b = 0; b < sizeof raw_blocks / sizeof raw_blocks[0]; b++) {
                if (seen == raw_blocks[b].opener_len &&
                    strip_html_equals_nocase(sToClean + mark, raw_blocks[b].opener, seen)) {
                    state = ST_RAW;
                    closer = raw_blocks[b].closer;
                    closer_len = raw_blocks[b].closer_len;
                    mark = in + 1;
                    break;
                }
            }

            if (state == ST_TAG && c == '>') {
                sToClean[out++] = ' ';
                state = ST_TEXT;
            }
            break;
        }

        case ST_ENTITY:
            if (c == ';') {
                sToClean[out++] = ' ';
                state = ST_TEXT;
            }
            break;

        case ST_RAW: {
            /* Only compare once enough bytes have been seen to hold the closer. */
            const size_t seen = in - mark + 1;

            if (seen >= closer_len &&
                strip_html_equals_nocase(sToClean + in + 1 - closer_len, closer, closer_len)) {
                state = ST_TEXT;
            }
            break;
        }

        default:
            break;
        }
    }

    sToClean[out] = '\0';

    return (int)out;
}


来源:https://stackoverflow.com/questions/9444200/c-strip-html-between

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!