Ad

Scrape And Store Shopify Ecommerce Websites Using Node.js

- 1 answer

I wrote some code to scrape an array of Shopify e-commerce websites using the website-scraper npm module in Node.js. It fails with a 403 error on these sites, although the same code works for other websites.

How can we get around this problem?

My scraperTest.js file is:

var scrape = require('website-scraper');
let test = require('./test')
// Bare hostnames to mirror. Each one is first passed through
// test.checkRedirect so that the final (post-redirect) domain is scraped.
const urlList = ['1500.academy'];
let urls = [];

// Resolve every hostname first, then scrape once. Scraping inside the loop
// with a shared, growing `urls` array would re-download earlier sites on each
// iteration and write into the same output directory several times.
Promise.all(urlList.map((url) => test.checkRedirect(url)))
    .then((domains) => {
        urls = domains.map((domain) => 'https://' + domain);
        console.log(urls);

        const options = {
            urls: urls,
            directory: './autochat/',
            // HTTP headers must be nested under `request.headers`; a top-level
            // 'User-Agent' key is not an option website-scraper reads, so the
            // header was never sent.
            // NOTE(review): some sites reject non-browser user agents — if the
            // 403 persists, try a browser-like User-Agent string here.
            request: {
                headers: {
                    'User-Agent': 'request',
                },
            },
        };

        // Call scrape exactly once: invoking both the promise form and the
        // callback form starts two scrapes into the same directory.
        return scrape(options);
    })
    .then((result) => {
        /* some code here */
    })
    .catch((err) => {
        // Surface failures (e.g. HTTP 403) instead of silently dropping them.
        console.error('Scrape failed:', err);
    });

And the test.js file is:

const request = require('request');
const extractDomain = require('extract-domain');

//var link = 'oneplustwocase.com';

/**
 * Requests `http://<link>` and reports which domain the request ended up on.
 *
 * @param {string} link - Bare hostname to probe (no scheme), e.g. 'example.com'.
 * @returns {Promise<string>} Resolves with the redirected domain when the
 *   final response has status 200 and its domain differs from `link`;
 *   otherwise resolves with `link` unchanged. The promise never rejects:
 *   request failures also fall back to `link`.
 */
function checkRedirect(link) {
    return new Promise((resolve) => {
        const options = {
            url: 'http://' + link,
            headers: {
                'User-Agent': 'request'
            }
        };

        request(options, (error, response) => {
            // On a network failure `response` is undefined, so it must be
            // checked BEFORE reading response.request; otherwise the callback
            // throws and the fallback below is never reached.
            if (error || response === undefined || response === null) {
                resolve(link);
                return;
            }

            const redirectedDomain = extractDomain(response.request.uri.href);
            if (response.statusCode === 200 && link !== redirectedDomain) {
                resolve(redirectedDomain);
            } else {
                resolve(link);
            }
        });
    });
}

module.exports.checkRedirect = checkRedirect;
Ad

Answer

I found a solution. We can fetch the HTML of the domain directly with request(); response.body contains the HTML data.

I got it working with the following code:

const request = require('request');
const extractDomain = require('extract-domain');
let fs = require('fs');

/**
 * Fetches `http://<link>` and saves the returned HTML to
 * `<redirected-domain>.html` in the current working directory.
 *
 * The work happens asynchronously in the request callback; failures are
 * logged rather than thrown.
 *
 * @param {string} link - Bare hostname to fetch (no scheme), e.g. 'example.com'.
 * @returns {void}
 */
function checkRedirect(link) {
    const options = {
        url: 'http://' + link,
        headers: {
            'User-Agent': 'request'
        }
    };

    request(options, (error, response) => {
        // No response means the request itself failed; there is nothing to save.
        if (error || response === undefined || response === null) {
            console.error('Request for ' + options.url + ' failed:', error);
            return;
        }

        // Name the file after the domain the request finally landed on.
        const redirectedDomain = extractDomain(response.request.uri.href);
        const writeStream = fs.createWriteStream(redirectedDomain + '.html');
        // Without an 'error' listener a failed write would crash the process.
        writeStream.on('error', (err) => {
            console.error('Could not write ' + redirectedDomain + '.html:', err);
        });
        // end(chunk) writes the final chunk and closes the stream.
        writeStream.end(response.body);
    });
}

module.exports.checkRedirect = checkRedirect;

//checkRedirect('oneplustwocase.com')

/*
var r = request(url, function (e, resp) {
    r.uri
    resp.request.uri
  })*/
Ad
source: stackoverflow.com
Ad