How to correctly extract text from a pdf using pdf.js

后端 未结 4 1705
生来不讨喜
生来不讨喜 2020-12-15 10:30

I'm new to ES6 and Promises. I'm trying to use pdf.js to extract the text from all pages of a PDF file into a string array. And when extraction is done, I want to parse the array som…

4条回答
  •  渐次进展
    2020-12-15 10:55

    Similar to https://stackoverflow.com/a/40494019/1765767 -- collect page promises using Promise.all and don't forget to chain then's:

    /**
     * Extracts the text of every page of a PDF and concatenates it.
     *
     * @param {string} pdfUrl - Location of the PDF; passed unchanged to
     *   PDFJS.getDocument.
     * @returns {Promise<string>} Resolves with the `str` of every text item of
     *   every page, joined in page order with no separator. Rejects if loading
     *   the document, a page, or a page's text content rejects.
     */
    function gettext(pdfUrl) {
      var loadingTask = PDFJS.getDocument(pdfUrl);
      // Depending on the pdf.js version, getDocument returns either a thenable
      // or a loading task whose promise lives on `.promise`; accept both.
      var docPromise = loadingTask.promise || loadingTask;
      return docPromise.then(function (pdf) {
        // Prefer `numPages`; fall back to the legacy `pdfInfo.numPages` for
        // builds that only expose the older shape.
        var maxPages = typeof pdf.numPages === 'number'
          ? pdf.numPages
          : pdf.pdfInfo.numPages;
        var pagePromises = []; // one text promise per page, in page order
        for (var j = 1; j <= maxPages; j++) {
          pagePromises.push(pdf.getPage(j).then(function (page) {
            return page.getTextContent().then(function (text) {
              // Each item carries one run of text in `str`.
              return text.items.map(function (s) { return s.str; }).join('');
            });
          }));
        }
        // Promise.all keeps input order, so pages stay in sequence even when
        // they resolve out of order.
        return Promise.all(pagePromises).then(function (texts) {
          return texts.join('');
        });
      });
    }
    
    // Start the extraction: show the combined text once it resolves, or log
    // the rejection reason if any step fails.
    gettext("https://cdn.mozilla.net/pdfjs/tracemonkey.pdf")
      .then(
        function (pageText) {
          alert('parse ' + pageText);
        },
        function (err) {
          console.error(err);
        }
      );

提交回复
热议问题