Quantcast
Channel: Hot Weekly Questions - Web Applications Stack Exchange
Viewing all articles
Browse latest Browse all 9712

Using UrlFetchApp.fetch(url) with regex to extract website data

$
0
0

I'm trying to extract data from a list of >1000 URLs using a script that uses UrlFetchApp.fetch(url) and regex based on this article.

This is the code I'm using.

function importRegex(url, regex_string) {  var html, content = '';  var response = UrlFetchApp.fetch(url);  if (response) {    html = response.getContentText();    if (html.length && regex_string.length) {      var regex = new RegExp( regex_string, "i" );      content = html.match(regex)[1];    }  }  content = unescapeHTML(content);  Utilities.sleep(1000); // avoid call limit by adding a delay  return content;  }var htmlEntities = {  cent:  '¢',  pound: '£',  yen:   '¥',  euro:  '€',  copy:  '©',  reg:   '®',  lt:    '<',  gt:    '>',  mdash: '–',  quot:  '"',  amp:   '&',  apos:  '\''};function unescapeHTML(str) {    return str.replace(/\&([^;]+);/g, function (entity, entityCode) {        var match;        if (entityCode in htmlEntities) {            return htmlEntities[entityCode];        } else if (match = entityCode.match(/^#x([\da-fA-F]+)$/)) {            return String.fromCharCode(parseInt(match[1], 16));        } else if (match = entityCode.match(/^#(\d+)$/)) {            return String.fromCharCode(~~match[1]);        } else {            return entity;        }    });};

and the importregex function formula I'm using is

=importRegex(A4, "<h1 class=""ch-title"".*?>(.*)<\/h1>")

It gives the following error

TypeError: Cannot read property '1' of null (line 9).

enter image description here

I'm not sure how to fix it.


Viewing all articles
Browse latest Browse all 9712

Latest Images

Trending Articles



Latest Images

<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>