Realtime scrap a chat using Nodejs

狂风中的少年 提交于 2019-11-29 11:45:19

If possible you should use the API, the chat is using. Try to open the network tab inside the Chrome developer tools and try to figure out which network requests are happening.


If that is not possible, you can use a MutationObserver to monitor DOM changes. Expose a function via page.exposeFunction and then listen to relevant changes. You can then insert the obtained data into a database.

Here is some example code to get you started:

const puppeteer = require('puppeteer');
const { Client } = require('pg');

(async () => {
    const client = new Client(/* ... */);
    await client.connect(); // connect to database

    const browser = await puppeteer.launch({ headless: false });
    const [page] = await browser.pages();

    // call a handler when a mutation happens
    async function mutationListener(addedText) {
        console.log(`Added text: ${addedText}`);

        // insert data into database
        await client.query('INSERT INTO users(text) VALUES($1)', [addedText]);
    }
    page.exposeFunction('mutationListener', mutationListener);

    await page.goto('http://...');
    await page.waitForSelector('.msg-nickname');

    await page.evaluate(() => {
        // wait for any mutations inside a specific element (e.g. the chatbox)
        const observerTarget = document.querySelector('ELEMENT-TO-MONITOR');
        const mutationObserver = new MutationObserver((mutationsList) => {
            // handle change by checking which elements were added and which were deleted
            for (const mutation of mutationsList) {
                const { removedNodes, addedNodes } = mutation;
                // example: pass innerText of first added element to our mutationListener
                mutationListener(addedNodes[0].innerText);
            }
        });
        mutationObserver.observe( // start observer
            observerTarget,
            { childList: true }, // wait for new child nodes to be added/removed
        );
    });
})();
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!