Error: First argument to Readability constructor should be a document object

article parser version { "article-parser": "^7.2.4" } snippet i used import { extract } from 'article-parser'; import axios from 'axios'; axios .get('https:/...

article parser version

{
  "article-parser": "^7.2.4"
}

snippet i used

import { extract } from 'article-parser';
import axios from 'axios';

// Download the sitemap text file and run article-parser's `extract` on every
// URL listed in it, logging each parsed result.
axios
  .get('https://www.webmanajemen.com/chimeraland/sitemap.txt')
  .then((response) => {
    // Split on real line breaks (LF or CRLF). The escapes matter: an
    // unescaped /r?n/ would split on the letter "n" and mangle every URL.
    // Blank lines (e.g. a trailing newline) are dropped so `extract` never
    // receives an empty string.
    const list: string[] = (response.data as string)
      .split(/\r?\n/)
      .map((line) => line.trim())
      .filter((line) => line.length > 0);

    // forEach rather than map: the callback is run only for its side effects.
    list.forEach((url) => {
      extract(url)
        .then((data) => {
          console.log(data);
        })
        .catch((error) => {
          // Report which URL failed instead of leaving the rejection unhandled.
          console.error(`Failed to extract ${url}:`, error);
        });
    });
  })
  .catch((error) => {
    console.error('Failed to download sitemap:', error);
  });

error logs

dimas@DESKTOP-9JFNTEA /cygdrive/d/Repositories/hexo-backend
$ ts-node "d:\Repositories\hexo-backend\src\standalone\get-links.ts"
d:\Repositories\hexo-backend\node_modules\article-parser\dist\cjs\article-parser.js:54
`)});let n=e.nonTextTags||["script","style","textarea","option"],p,u;e.allowedAttributes&&(p={},u={},rs(e.allowedAttributes,function(v,y){p[y]=[];let T=[];v.forEach(function(P){typeof P=="string"&&P.indexOf("*")>=0?T.push(lm(P).replace(/\*/g,".*")):p[y].push(P)}),T.length&&(u[y]=new RegExp("^("+T.join("|")+")$"))}));let o={},d={},c={};rs(e.allowedClasses,function(v,y){p&&(qr(p,y)||(p[y]=[]),p[y].push("class")),o[y]=[],c[y]=[];let T=[];v.forEach(function(P){typeof P=="string"&&P.indexOf("*")>=0?T.push(lm(P).replace(/\*/g,".*")):P instanceof RegExp?c[y].push(P):o[y].push(P)}),T.length&&(d[y]=new RegExp("^("+T.join("|")+")$"))});let l={},h;rs(e.transformTags,function(v,y){let T;typeof v=="function"?T=v:typeof v=="string"&&(T=as.simpleTransform(v)),y==="*"?h=T:l[y]=T});let g,_,S,R,O,M,E=!1;A();let V=new Lb.Parser({onopentag:function(v,y){if(e.enforceHtmlBoundary&&v==="html"&&A(),O){M++;return}let T=new s(v,y);_.push(T);let P=!1,De=!!T.text,Le;if(qr(l,v)&&(Le=l[v](v,y),T.attribs=y=Le.attribs,Le.text!==void 0&&(T.innerText=Le.text),v!==Le.tagName&&(T.name=v=Le.tagName,R[g]=Le.tagName)),h&&(Le=h(v,y),T.attribs=y=Le.attribs,v!==Le.tagName&&(T.name=v=Le.tagName,R[g]=Le.tagName)),(e.allowedTags&&e.allowedTags.indexOf(v)===-1||e.disallowedTagsMode==="recursiveEscape"&&!Pb(S)||e.nestingLimit!=null&&g>=e.nestingLimit)&&(P=!0,S[g]=!0,e.disallowedTagsMode==="discard"&&n.indexOf(v)!==-1&&(O=!0,M=1),S[g]=!0),g++,P){if(e.disallowedTagsMode==="discard")return;i=a,a=""}a+="<"+v,v==="script"&&(e.allowedScriptHostnames||e.allowedScriptDomains)&&(T.innerText=""),(!p||qr(p,v)||p["*"])&&rs(y,function(k,Q){if(!kb.test(Q)){delete T.attribs[Q];return}let P1=!1;if(!p||qr(p,v)&&p[v].indexOf(Q)!==-1||p["*"]&&p["*"].indexOf(Q)!==-1||qr(u,v)&&u[v].test(Q)||u["*"]&&u["*"].test(Q))P1=!0;else if(p&&p[v]){for(let $ of p[v])if(Ob($)&&$.name&&$.name===Q){P1=!0;let W="";if($.multiple===!0){let Vt=k.split(" ");for(let Je of Vt)$.values.indexOf(Je)!==-1&&(W===""?W=Je:W+=" "+Je)}else 
$.values.indexOf(k)>=0&&(W=k);k=W}}if(P1){if(e.allowedSchemesAppliedToAttributes.indexOf(Q)!==-1&&X(v,k)){delete T.attribs[Q];return}if(v==="script"&&Q==="src"){let $=!0;try{let W=I(k);if(e.allowedScriptHostnames||e.allowedScriptDomains){let Vt=(e.allowedScriptHostnames||[]).find(function(Ge){return Ge===W.url.hostname}),Je=(e.allowedScriptDomains||[]).find(function(Ge){return W.url.hostname===Ge||W.url.hostname.endsWith(`.${Ge}`)});$=Vt||Je}}catch{$=!1}if(!$){delete T.attribs[Q];return}}if(v==="iframe"&&Q==="src"){let $=!0;try{let W=I(k);if(W.isRelativeUrl)$=qr(e,"allowIframeRelativeUrls")?e.allowIframeRelativeUrls:!e.allowedIframeHostnames&&!e.allowedIframeDomains;else if(e.allowedIframeHostnames||e.allowedIframeDomains){let Vt=(e.allowedIframeHostnames||[]).find(function(Ge){return Ge===W.url.hostname}),Je=(e.allowedIframeDomains||[]).find(function(Ge){return W.url.hostname===Ge||W.url.hostname.endsWith(`.${Ge}`)});$=Vt||Je}}catch{$=!1}if(!$){delete T.attribs[Q];return}}if(Q==="srcset")try{let $=Rb(k);if($.forEach(function(W){X("srcset",W.url)&&(W.evil=!0)}),$=mm($,function(W){return!W.evil}),$.length)k=qb(mm($,function(W){return!W.evil})),T.attribs[Q]=k;else{delete T.attribs[Q];return}}catch{delete T.attribs[Q];return}if(Q==="class"){let $=o[v],W=o["*"],Vt=d[v],Je=c[v],Ge=d["*"],kr=[Vt,Ge].concat(Je).filter(function(l0){return l0});if($&&W?k=de(k,cm($,W),kr):k=de(k,$||W,kr),!k.length){delete 
T.attribs[Q];return}}if(Q==="style")try{let $=Mb(v+" {"+k+"}"),W=J($,e.allowedStyles);if(k=xe(W),k.length===0){delete T.attribs[Q];return}}catch{delete T.attribs[Q];return}a+=" "+Q,k&&k.length&&(a+='="'+K(k,!0)+'"')}else delete T.attribs[Q]}),e.selfClosing.indexOf(v)!==-1?a+=" />":(a+=">",T.innerText&&!De&&!e.textFilter&&(a+=K(T.innerText),E=!0)),P&&(a=i+K(a),i="")},ontext:function(v){if(O)return;let y=_[_.length-1],T;if(y&&(T=y.tag,v=y.innerText!==void 0?y.innerText:v),e.disallowedTagsMode==="discard"&&(T==="script"||T==="style"))a+=v;else{let P=K(v,!1);e.textFilter&&!E?a+=e.textFilter(P,T):E||(a+=P)}if(_.length){let P=_[_.length-1];P.text+=v}},onclosetag:function(v){if(O)if(M--,!M)O=!1;else return;let y=_.pop();if(!y)return;if(y.tag!==v){_.push(y);return}O=e.enforceHtmlBoundary?v==="html":!1,g--;let T=S[g];if(T){if(delete S[g],e.disallowedTagsMode==="discard"){y.updateParentNodeText();return}i=a,a=""}if(R[g]&&(v=R[g],delete R[g]),e.exclusiveFilter&&e.exclusiveFilter(y)){a=a.substr(0,y.tagPosition);return}if(y.updateParentNodeMediaChildren(),y.updateParentNodeText(),e.selfClosing.indexOf(v)!==-1){T&&(a=i,i="");return}a+="</"+v+">",T&&(a=i+K(a),i=""),E=!1}},e.parser);return V.write(t),V.end(),a;function A(){a="",g=0,_=[],S={},R={},O=!1,M=0}function K(v,y){return typeof v!="string"&&(v=v+""),e.parser.decodeEntities&&(v=v.replace(/&/g,"&amp;").replace(/</g,"&lt;").replace(/>/g,"&gt;"),y&&(v=v.replace(/"/g,"&quot;"))),v=v.replace(/&(?![a-zA-Z0-9#]{1,20};)/g,"&amp;").replace(/</g,"&lt;").replace(/>/g,"&gt;"),y&&(v=v.replace(/"/g,"&quot;")),v}function X(v,y){for(y=y.replace(/[x00-x20]+/g,"");;){let De=y.indexOf("<!--");if(De===-1)break;let Le=y.indexOf("-->",De+4);if(Le===-1)break;y=y.substring(0,De)+y.substring(Le+3)}let T=y.match(/^([a-zA-Z][a-zA-Z0-9.-+]*):/);if(!T)return y.match(/^[/\]{2}/)?!e.allowProtocolRelative:!1;let P=T[1].toLowerCase();return 
qr(e.allowedSchemesByTag,v)?e.allowedSchemesByTag[v].indexOf(P)===-1:!e.allowedSchemes||e.allowedSchemes.indexOf(P)===-1}function I(v){if(v=v.replace(/^(w+:)?s*[\/]s*[\/]/,"$1//"),v.startsWith("relative:"))throw new Error("relative: exploit attempt");let y="relative://relative-site";for(let De=0;De<100;De++)y+=`/${De}`;let T=new URL(v,y);return{isRelativeUrl:T&&T.hostname==="relative-site"&&T.protocol==="relative:",url:T}}function J(v,y){if(!y)return v;let T=v.nodes[0],P;return y[T.selector]&&y["*"]?P=cm(y[T.selector],y["*"]):P=y[T.selector]||y["*"],P&&(v.nodes[0].nodes=T.nodes.reduce(ne(P),[])),v}function xe(v){return v.nodes[0].nodes.reduce(function(y,T){return y.push(`${T.prop}:${T.value}${T.important?" !important":""}`),y},[]).join(";")}function ne(v){return function(y,T){return qr(v,T.prop)&&v[T.prop].some(function(De){return De.test(T.value)})&&y.push(T),y}}function de(v,y,T){return y?(v=v.split(/s+/),v.filter(function(P){return y.indexOf(P)!==-1||T.some(function(De){return De.test(P)})}).join(" ")):v}}var Bb={decodeEntities:!0};as.defaults={allowedTags:["address","article","aside","footer","header","h1","h2","h3","h4","h5","h6","hgroup","main","nav","section","blockquote","dd","div","dl","dt","figcaption","figure","hr","li","main","ol","p","pre","ul","a","abbr","b","bdi","bdo","br","cite","code","data","dfn","em","i","kbd","mark","q","rb","rp","rt","rtc","ruby","s","samp","small","span","strong","sub","sup","time","u","var","wbr","caption","col","colgroup","table","tbody","td","tfoot","th","thead","tr"],disallowedTagsMode:"discard",allowedAttributes:{a:["href","name","target"],img:["src","srcset","alt","title","width","height","loading"]},selfClosing:["img","br","hr","area","base","basefont","input","link","meta"],allowedSchemes:["http","https","ftp","mailto","tel"],allowedSchemesByTag:{},allowedSchemesAppliedToAttributes:["href","src","cite"],allowProtocolRelative:!0,enforceHtmlBoundary:!1};as.simpleTransform=function(t,e,r){return r=r===void 
0?!0:r,e=e||{},function(a,i){let s;if(r)for(s in e)i[s]=e[s];else i=e;return{tagName:t,attribs:i}}}});var _m=b((PO,xm)=>{xm.exports={compareTwoStrings:ym,findBestMatch:Hb};function ym(t,e){if(t=t.replace(/s+/g,""),e=e.replace(/s+/g,""),t===e)return 1;if(t.length<2||e.length<2)return 0;let r=new Map;for(let i=0;i<t.length-1;i++){let s=t.substring(i,i+2),n=r.has(s)?r.get(s)+1:1;r.set(s,n)}let a=0;for(let i=0;i<e.length-1;i++){let s=e.substring(i,i+2),n=r.has(s)?r.get(s):0;n>0&&(r.set(s,n-1),a++)}return 2*a/(t.length+e.length-2)}function Hb(t,e){if(!jb(t,e))throw new Error("Bad arguments: First argument should be a string, second should be an array 
of strings");let r=[],a=0;for(let s=0;s<e.length;s++){let n=e[s],p=ym(t,n);r.push({target:n,rating:p}),p>r[a].rating&&(a=s)}let i=r[a];return{ratings:r,bestMatch:i,bestMatchIndex:a}}function jb(t,e){return!(typeof t!="string"||!Array.isArray(e)||!e.length||e.find(function(r){return typeof r!="string"}))}});var Om=b((FO,fd)=>{function Lm(t,e){if(e&&e.documentElement)t=e,e=arguments[2];else if(!t||!t.documentElement)throw new Error("First argument to Readability constructor should be a document object.");if(e=e||{},this._doc=t,this._docJSDOMParser=this._doc.firstChild.__JSDOMParser__,this._articleTitle=null,this._articleByline=null,this._articleDir=null,this._articleSiteName=null,this._attempts=[],this._debug=!!e.debug,this._maxElemsToParse=e.maxElemsToParse||this.DEFAULT_MAX_ELEMS_TO_PARSE,this._nbTopCandidates=e.nbTopCandidates||this.DEFAULT_N_TOP_CANDIDATES,this._charThreshold=e.charThreshold||this.DEFAULT_CHAR_THRESHOLD,this._classesToPreserve=this.CLASSES_TO_PRESERVE.concat(e.classesToPreserve||[]),this._keepClasses=!!e.keepClasses,this._serializer=e.serializer||function(r){return r.innerHTML},this._disableJSONLD=!!e.disableJSONLD,this._flags=this.FLAG_STRIP_UNLIKELYS|this.FLAG_WEIGHT_CLASSES|this.FLAG_CLEAN_CONDITIONALLY,this._debug){let r=function(a){if(a.nodeType==a.TEXT_NODE)return`${a.nodeName} ("${a.textContent}")`;let i=Array.from(a.attributes||[],function(s){return`${s.name}="${s.value}"`}).join(" ");return`<${a.localName} ${i}>`};this.log=function(){if(typeof dump<"u"){var a=Array.prototype.map.call(arguments,function(i){return i&&i.nodeName?r(i):i}).join(" ");dump("Reader: (Readability) "+a+`
----LONG WHITESPACE HERE, I DELETED IT----
            ^
Error: First argument to Readability constructor should be a document object.
    at new Lm (d:\Repositories\hexo-backend\node_modules\article-parser\dist\cjs\article-parser.js:54:8356)
    at qm (d:\Repositories\hexo-backend\node_modules\article-parser\dist\cjs\article-parser.js:64:3456)
    at bd (d:\Repositories\hexo-backend\node_modules\article-parser\dist\cjs\article-parser.js:64:4693)
    at Yb (d:\Repositories\hexo-backend\node_modules\article-parser\dist\cjs\article-parser.js:64:5168)
    at d:\Repositories\hexo-backend\src\standalone\get-links.ts:9:14
    at Array.map (<anonymous>)
    at d:\Repositories\hexo-backend\src\standalone\get-links.ts:8:10
    at processTicksAndRejections (node:internal/process/task_queues:96:5)

#node.js #cheerio #jsdom

#node.js #cheerio #jsdom

Вопрос:

Я хотел бы использовать Readability для извлечения содержимого «статьи» из веб-страниц. Readability хорошо справляется с этой задачей, но зависит от JSDOM, который кажется очень медленным и выдаёт ошибки, если синтаксический анализ CSS завершается неудачей (а CSS мне совсем не нужен); при этом игнорировать CSS в JSDOM невозможно, насколько я понимаю из issues проекта на GitHub.

Я пытался заменить JSDOM на cheerio, но я не смог выяснить, какая часть его API совместима с выводом JSDOM .

В JSDOM инструкция

 var doc = new JSDOM(html);
 

создает объект DOM, который может быть передан в

 let reader = new Readability(doc.window.document);
 

Однако в cheerio, похоже, нет ничего, что создаёт объект DOM.
Я пробовал

 var $ = cheerio.load(html);
var object_something = $('html');
 

Он выдает ошибку при вызове new Readability(object_something)

 Error: First argument to Readability constructor should be a document object.
 

что явно означает именно то, что в ней сказано. object_something — это объект, но я не уверен, что он собой представляет на самом деле.

Возможно ли вообще создать объект DOM с помощью cheerio? У меня более 10 миллионов локальных HTML-документов, поэтому любое улучшение производительности сэкономит мне много времени.

Recommend Projects

  • React photo

    React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo

    Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo

    Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo

    TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo

    Django

    The Web framework for perfectionists with deadlines.

  • Laravel photo

    Laravel

    A PHP framework for web artisans

  • D3 photo

    D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Something interesting about the web. A new door to the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Visualization

    Something interesting about visualization; use data art.

  • Game

    Something interesting about games; make everyone happy.

Recommend Org

  • Facebook photo

    Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo

    Microsoft

    Open source projects and samples from Microsoft.

  • Google photo

    Google

    Google ❤️ Open Source for everyone.

  • Alibaba photo

    Alibaba

    Alibaba Open Source for everyone

  • D3 photo

    D3

    Data-Driven Documents codes.

  • Tencent photo

    Tencent

    China tencent open source team.

I’d like to use Readability to parse out the «article» content within web pages. Readability does a good job, but it depends on JSDOM, which seems to be very slow and throws errors if parsing CSS content fails (which I do not need at all), but it’s not possible to ignore CSS in JSDOM, as I understand from the GitHub issues of the project.

I’ve been trying to replace JSDOM with cheerio, but I haven’t been able to figure out what part of its API is compatible with the output of JSDOM.

In JSDOM, the statement

var doc = new JSDOM(html);

produces a DOM object which can be passed into

let reader = new Readability(doc.window.document);

In cheerio, however, there doesn’t seem to be anything that produces a DOM object.
I’ve tried

var $ = cheerio.load(html);
var object_something = $('html');

It throws an error when I call new Readability(object_something)

Error: First argument to Readability constructor should be a document object.

which clearly means what it says. object_something is an object, but I’m not sure what it actually is.

Is it even possible to produce a DOM object with cheerio? I have over 10 million local HTML documents, so any performance improvement would save me a lot of time.

I’d like to use Readability to parse out the «article» content within web pages. Readability does a good job, but it depends on JSDOM, which seems to be very slow and throws errors if parsing CSS content fails (which I do not need at all), but it’s not possible to ignore CSS in JSDOM, as I understand from the GitHub issues of the project.

I’ve been trying to replace JSDOM with cheerio, but I haven’t been able to figure out what part of its API is compatible with the output of JSDOM.

In JSDOM, the statement

var doc = new JSDOM(html);

produces a DOM object which can be passed into

let reader = new Readability(doc.window.document);

In cheerio, however, there doesn’t seem to be anything that produces a DOM object.
I’ve tried

var $ = cheerio.load(html);
var object_something = $('html');

It throws an error when I call new Readability(object_something)

Error: First argument to Readability constructor should be a document object.

which clearly means what it says. object_something is an object, but I’m not sure what it actually is.

Is it even possible to produce a DOM object with cheerio? I have over 10 million local HTML documents, so any performance improvement would save me a lot of time.

Recommend Projects

  • React photo
    React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo
    Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo
    Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo
    TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo
    Django

    The Web framework for perfectionists with deadlines.

  • Laravel photo
    Laravel

    A PHP framework for web artisans

  • D3 photo
    D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Something interesting about the web. A new door to the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Visualization

    Something interesting about visualization; use data art.

  • Game

    Something interesting about games; make everyone happy.

Recommend Org

  • Facebook photo
    Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo
    Microsoft

    Open source projects and samples from Microsoft.

  • Google photo
    Google

    Google ❤️ Open Source for everyone.

  • Alibaba photo
    Alibaba

    Alibaba Open Source for everyone

  • D3 photo
    D3

    Data-Driven Documents codes.

  • Tencent photo
    Tencent

    China tencent open source team.

Понравилась статья? Поделиться с друзьями:
  • Error firefox not found
  • Error firebaseerror missing or insufficient permissions
  • Error finding your systems active partition windows loader
  • Error finding upload folder site storage location site directory maybe it does not exist
  • Error finding machine could not find a registered machine named