Ad

How To Get Promises From Nested Arrays?

Can somebody help me with this?

I am trying to scrape a website and store the collected data in a JSON file. I'm using cheerio and request-promise.

The JSON structure looks like this: companies > packages > cities

      "companies": [
    {
      "id": 0,
      "name": "companyName",
      "url": "https://www.url-company.net/",
      "packages": [
        {
          "id": 0,
          "name": "general",
          "url": "https://www.url-company.net/package",
          "cities": [
            {
              "id": 0,
              "name": "cityName",
              "url": "https://www.url-company.net/package/city",
            },
            ...]
        }
      ...]
    }
  ..]

I have extracted the array of companies from this site.

  • Each COMPANY has a specific url --> from every url I scraped the packages for each company.
  • Each PACKAGE has a specific url --> from every url I want to scrape the cities for each package but I am NOT able to do it.

I am only able to populate companies and packagesByCompany, but I'm lost when trying to populate citiesByPackage:

const rp = require('request-promise');
const cheerio = require('cheerio');
const jsonfile = require('jsonfile');
const baseUrl = 'https://www.base-url-example.net';

// Start the scrape. scrapeAll() returns a promise, so attach a handler here:
// without one, any failure surfaces only as an unhandled rejection.
scrapeAll().catch(function (err) {
    console.error('scrape failed', err);
    // Signal the failure to the calling shell without cutting off pending output.
    process.exitCode = 1;
});


/**
 * Scrape every company, then the packages of each company, then the cities of
 * each package, and hand the assembled tree to saveScrapedDateIntoJsonFile().
 *
 * The company objects delivered by scrapeCompanies() are the single source of
 * truth: scrapePackagesByCompany() and scrapeCitiesByPackage() fill them in
 * place (`company.packages`, `package.cities`), so the values those calls
 * resolve with are deliberately ignored here.
 *
 * @returns {Promise<Array<Object>>} resolves with the populated companies;
 *     rejects when the company list is unavailable or any scrape call rejects.
 */
function scrapeAll() {
    return scrapeCompanies()
        .then(function (companies) {
            if (!Array.isArray(companies)) {
                throw new TypeError('scrapeCompanies() did not resolve with an array of companies');
            }

            // One request per company, all in flight at once.
            const packageRequests = companies.map(function (company) {
                return scrapePackagesByCompany(company);
            });

            return Promise.all(packageRequests).then(function () {
                return companies;
            });
        })
        .then(function (companies) {
            // Flatten companies > packages into ONE list of promises.
            // Promise.all() needs an iterable of promises; passing it the single
            // promise returned by scrapeCitiesByPackage() rejects with a TypeError.
            const cityRequests = [];

            companies.forEach(function (company) {
                // `packages` can be missing when this company's package scrape failed.
                (company.packages || []).forEach(function (companyPackage) {
                    cityRequests.push(scrapeCitiesByPackage(companyPackage));
                });
            });

            return Promise.all(cityRequests).then(function () {
                return companies;
            });
        })
        .then(function (companies) {
            saveScrapedDateIntoJsonFile(companies);

            return companies;
        });
}

/**
 * Fetch the page at `baseUrl` and collect one record per company tile.
 *
 * A tile is kept only when it has both a link (`href`) and a lazy-loaded
 * image (`data-lazy-src`). `id` is the tile's position among ALL matched
 * tiles, so ids have gaps when tiles are skipped.
 *
 * @returns {Promise<Array<{id: number, name: string, url: string, img: *}>>}
 *     resolves with the companies found (possibly empty); rejects with the
 *     original error when the page cannot be fetched or parsed.
 */
function scrapeCompanies() {
    const tileSelector = '#content section .elementor-container > .elementor-row > .elementor-element.elementor-top-column .elementor-widget-wrap .elementor-widget-image >.elementor-widget-container > .elementor-image';

    return rp(baseUrl)
        .then(function (html) {
            const $ = cheerio.load(html);
            const companies = [];

            $(tileSelector).each(function (index, element) {
                const $tile = $(element);
                const urlCompany = $tile.find('a').attr('href');
                const imgCompany = $tile.find('img').data('lazy-src');

                if (!urlCompany || !imgCompany) {
                    return; // incomplete tile: skip it, keep iterating
                }

                companies.push({
                    id: index,
                    // The link path doubles as the company name.
                    name: urlCompany,
                    // NOTE(review): assumes the href is relative to baseUrl; confirm.
                    url: baseUrl + urlCompany,
                    img: imgCompany,
                });
            });

            return companies;
        })
        .catch(function (err) {
            console.error('errorrr2', err);
            // Re-throw after logging. Swallowing the error would resolve with
            // `undefined` and make the caller fail later on `.map` of undefined,
            // hiding the real cause.
            throw err;
        });
}


  /**
   * Fetch `company.url` and store the packages found there on `company.packages`.
   *
   * A tile is kept only when it has both a link (`href`) and a lazy-loaded
   * image (`data-lazy-src`). `id` is the tile's position among ALL matched
   * tiles, so ids have gaps when tiles are skipped.
   *
   * The package name is the link's visible text, falling back to the link URL
   * when the link has no text (e.g. it only wraps an image). Calling `.text()`
   * on the href string itself throws, because the href is a plain string.
   *
   * @param {{url: string}} company - company record; mutated in place.
   * @returns {Promise<Object>} always resolves with `company`. When the page
   *     cannot be fetched or parsed, the error is logged and `company.packages`
   *     is left as an empty array so callers can keep iterating.
   */
  function scrapePackagesByCompany(company) {
    const tileSelector = '#content section .elementor-container > .elementor-row > .elementor-element.elementor-top-column .elementor-widget-wrap .elementor-widget-image >.elementor-widget-container > .elementor-image';

    return rp(company.url)
        .then(function (html) {
            const $ = cheerio.load(html);
            const packages = [];

            $(tileSelector).each(function (index, element) {
                const $link = $(element).find('a');
                const urlPackage = $link.attr('href');
                const imgPackage = $(element).find('img').data('lazy-src');

                if (!urlPackage || !imgPackage) {
                    return; // incomplete tile: skip it, keep iterating
                }

                packages.push({
                    id: index,
                    name: $link.text().trim() || urlPackage,
                    url: urlPackage,
                    img: imgPackage,
                });
            });

            company.packages = packages;
            return company;
        })
        .catch(function (err) {
            console.error('errorrr2', err);
            // Best effort: one failing company must not leave a hole (`undefined`)
            // in the caller's result list.
            if (!Array.isArray(company.packages)) {
                company.packages = [];
            }
            return company;
        });
  }


  /**
   * Fetch `insurancePackage.url` and store the cities listed there on
   * `insurancePackage.cities`.
   *
   * An article is kept only when its link has both an `href` and non-empty
   * text. `id` is the article's position among ALL matched articles.
   *
   * @param {{url: string}} insurancePackage - package record; mutated in place.
   * @returns {Promise<Object|undefined>} resolves with `insurancePackage` on
   *     success; on failure the error is logged and the promise resolves with
   *     `undefined`.
   */
  function scrapeCitiesByPackage(insurancePackage) {
    const articleSelector = '#content section .elementor-container > .elementor-row > .elementor-element .elementor-widget.elementor-widget-posts .elementor-posts-container article';

    // Parse the fetched page and fill `insurancePackage.cities`.
    const collectCities = function (html) {
        const found = [];
        insurancePackage.cities = found;
        const $ = cheerio.load(html);

        $(articleSelector).each(function (position, article) {
            const $anchor = $(article).find('a');
            const href = $anchor.attr('href');
            const label = $anchor.text();

            if (!href || !label) {
                return; // incomplete entry: skip it, keep iterating
            }

            found.push({
                id: position,
                name: label,
                url: href,
            });
        });

        return insurancePackage;
    };

    // Log the failure; the chain then resolves with undefined.
    const logFailure = function (err) {
        console.error('errorrr2', err);
    };

    return rp(insurancePackage.url).then(collectCities).catch(logFailure);
  }


  /**
   * Write the scraped data to ./data/company.json as `{ companies: data }`,
   * indented with two spaces.
   *
   * Fire-and-forget: nothing is returned and a write failure is only logged.
   * NOTE(review): presumably the ./data directory must already exist; confirm.
   *
   * @param {Array<Object>} data - the companies to persist.
   */
  function saveScrapedDateIntoJsonFile(data) {
    jsonfile.writeFile(
        './data/company.json',
        { companies: data },
        { spaces: 2 },
        function (err) {
            // The callback also runs after a successful write, with a falsy `err`;
            // only a real failure should reach the error log.
            if (err) {
                console.error('errorrr', err);
            }
        });
  }

Thanks in advance :)

Ad

Answer

What you are trying could be made to work, but it's arguably better for scrapePackagesByCompany() and scrapeCitiesByPackage() simply to deliver data, and to perform all the "assembly" work (i.e. bundling the delivered arrays into higher-level objects) in scrapeAll().

You can write something like this:

// Run the whole scrape; any rejection from the chain is printed here so it
// never goes unhandled.
scrapeAll().catch(function reportScrapeFailure(failure) {
    console.log(failure);
});

/**
 * Scrape companies, their packages and each package's cities, assemble the
 * tree on the company objects, then save it.
 *
 * This version expects scrapePackagesByCompany(company) to resolve with the
 * array of packages and scrapeCitiesByPackage(package) to resolve with the
 * array of cities; all "assembly" happens here.
 *
 * The package variable is named `companyPackage` because `package` is a
 * reserved word in strict mode and ES modules, where using it as an
 * identifier is a SyntaxError.
 *
 * @returns {Promise<*>} resolves with whatever saveScrapedDateIntoJsonFile()
 *     returns; rejects as soon as any scrape call rejects.
 */
function scrapeAll() {
    return scrapeCompanies()
    .then(function(companies) {
        return Promise.all(companies.map(function(company) {
            return scrapePackagesByCompany(company)
            .then(function(packages) {
                company.packages = packages; // assembly
                return Promise.all(packages.map(function(companyPackage) {
                    return scrapeCitiesByPackage(companyPackage)
                    .then(function(cities) {
                        companyPackage.cities = cities; // assembly
                    });
                }));
            });
        }))
        .then(function() {
            // Reached only once every company, package and city request has settled successfully.
            return saveScrapedDateIntoJsonFile(companies);
        });
    });
}

Then it's fairly trivial to simplify scrapePackagesByCompany() and scrapeCitiesByPackage() such that they deliver a packages array and a cities array respectively.

Ad
source: stackoverflow.com
Ad