article parser version
{ "article-parser": "^7.2.4" }
snippet i used
import { extract } from 'article-parser';
import axios from 'axios';

// Download the sitemap (expected to hold one URL per line) and run
// article-parser's extract() on every URL, logging each result.
axios
  .get('https://www.webmanajemen.com/chimeraland/sitemap.txt')
  .then((response) => {
    // Split on LF or CRLF line endings, then drop blank lines so that
    // extract() is never called with an empty string (e.g. from a trailing newline).
    const list = (response.data as string)
      .split(/\r?\n/)
      .map((line) => line.trim())
      .filter((line) => line.length > 0);

    // Return the combined promise so no extraction is left floating; each URL
    // gets its own catch so one failing page does not hide the other results.
    return Promise.all(
      list.map((url) =>
        extract(url)
          .then((data) => {
            console.log(data);
          })
          .catch((err) => {
            console.error(`Failed to extract ${url}:`, err);
          }),
      ),
    );
  })
  .catch((err) => {
    console.error('Failed to download or process the sitemap:', err);
  });
error logs
dimas@DESKTOP-9JFNTEA /cygdrive/d/Repositories/hexo-backend
$ ts-node "d:\Repositories\hexo-backend\src\standalone\get-links.ts"
d:\Repositories\hexo-backend\node_modules\article-parser\dist\cjs\article-parser.js:54
`)});let n=e.nonTextTags||["script","style","textarea","option"],p,u;e.allowedAttributes&&(p={},u={},rs(e.allowedAttributes,function(v,y){p[y]=[];let T=[];v.forEach(function(P){typeof P=="string"&&P.indexOf("*")>=0?T.push(lm(P).replace(/\*/g,".*")):p[y].push(P)}),T.length&&(u[y]=new RegExp("^("+T.join("|")+")$"))}));let o={},d={},c={};rs(e.allowedClasses,function(v,y){p&&(qr(p,y)||(p[y]=[]),p[y].push("class")),o[y]=[],c[y]=[];let T=[];v.forEach(function(P){typeof P=="string"&&P.indexOf("*")>=0?T.push(lm(P).replace(/\*/g,".*")):P instanceof RegExp?c[y].push(P):o[y].push(P)}),T.length&&(d[y]=new RegExp("^("+T.join("|")+")$"))});let l={},h;rs(e.transformTags,function(v,y){let T;typeof v=="function"?T=v:typeof v=="string"&&(T=as.simpleTransform(v)),y==="*"?h=T:l[y]=T});let g,_,S,R,O,M,E=!1;A();let V=new Lb.Parser({onopentag:function(v,y){if(e.enforceHtmlBoundary&&v==="html"&&A(),O){M++;return}let T=new s(v,y);_.push(T);let P=!1,De=!!T.text,Le;if(qr(l,v)&&(Le=l[v](v,y),T.attribs=y=Le.attribs,Le.text!==void 0&&(T.innerText=Le.text),v!==Le.tagName&&(T.name=v=Le.tagName,R[g]=Le.tagName)),h&&(Le=h(v,y),T.attribs=y=Le.attribs,v!==Le.tagName&&(T.name=v=Le.tagName,R[g]=Le.tagName)),(e.allowedTags&&e.allowedTags.indexOf(v)===-1||e.disallowedTagsMode==="recursiveEscape"&&!Pb(S)||e.nestingLimit!=null&&g>=e.nestingLimit)&&(P=!0,S[g]=!0,e.disallowedTagsMode==="discard"&&n.indexOf(v)!==-1&&(O=!0,M=1),S[g]=!0),g++,P){if(e.disallowedTagsMode==="discard")return;i=a,a=""}a+="<"+v,v==="script"&&(e.allowedScriptHostnames||e.allowedScriptDomains)&&(T.innerText=""),(!p||qr(p,v)||p["*"])&&rs(y,function(k,Q){if(!kb.test(Q)){delete T.attribs[Q];return}let P1=!1;if(!p||qr(p,v)&&p[v].indexOf(Q)!==-1||p["*"]&&p["*"].indexOf(Q)!==-1||qr(u,v)&&u[v].test(Q)||u["*"]&&u["*"].test(Q))P1=!0;else if(p&&p[v]){for(let $ of p[v])if(Ob($)&&$.name&&$.name===Q){P1=!0;let W="";if($.multiple===!0){let Vt=k.split(" ");for(let Je of Vt)$.values.indexOf(Je)!==-1&&(W===""?W=Je:W+=" "+Je)}else 
$.values.indexOf(k)>=0&&(W=k);k=W}}if(P1){if(e.allowedSchemesAppliedToAttributes.indexOf(Q)!==-1&&X(v,k)){delete T.attribs[Q];return}if(v==="script"&&Q==="src"){let $=!0;try{let W=I(k);if(e.allowedScriptHostnames||e.allowedScriptDomains){let Vt=(e.allowedScriptHostnames||[]).find(function(Ge){return Ge===W.url.hostname}),Je=(e.allowedScriptDomains||[]).find(function(Ge){return W.url.hostname===Ge||W.url.hostname.endsWith(`.${Ge}`)});$=Vt||Je}}catch{$=!1}if(!$){delete T.attribs[Q];return}}if(v==="iframe"&&Q==="src"){let $=!0;try{let W=I(k);if(W.isRelativeUrl)$=qr(e,"allowIframeRelativeUrls")?e.allowIframeRelativeUrls:!e.allowedIframeHostnames&&!e.allowedIframeDomains;else if(e.allowedIframeHostnames||e.allowedIframeDomains){let Vt=(e.allowedIframeHostnames||[]).find(function(Ge){return Ge===W.url.hostname}),Je=(e.allowedIframeDomains||[]).find(function(Ge){return W.url.hostname===Ge||W.url.hostname.endsWith(`.${Ge}`)});$=Vt||Je}}catch{$=!1}if(!$){delete T.attribs[Q];return}}if(Q==="srcset")try{let $=Rb(k);if($.forEach(function(W){X("srcset",W.url)&&(W.evil=!0)}),$=mm($,function(W){return!W.evil}),$.length)k=qb(mm($,function(W){return!W.evil})),T.attribs[Q]=k;else{delete T.attribs[Q];return}}catch{delete T.attribs[Q];return}if(Q==="class"){let $=o[v],W=o["*"],Vt=d[v],Je=c[v],Ge=d["*"],kr=[Vt,Ge].concat(Je).filter(function(l0){return l0});if($&&W?k=de(k,cm($,W),kr):k=de(k,$||W,kr),!k.length){delete
T.attribs[Q];return}}if(Q==="style")try{let $=Mb(v+" {"+k+"}"),W=J($,e.allowedStyles);if(k=xe(W),k.length===0){delete T.attribs[Q];return}}catch{delete T.attribs[Q];return}a+=" "+Q,k&&k.length&&(a+='="'+K(k,!0)+'"')}else delete T.attribs[Q]}),e.selfClosing.indexOf(v)!==-1?a+=" />":(a+=">",T.innerText&&!De&&!e.textFilter&&(a+=K(T.innerText),E=!0)),P&&(a=i+K(a),i="")},ontext:function(v){if(O)return;let y=_[_.length-1],T;if(y&&(T=y.tag,v=y.innerText!==void 0?y.innerText:v),e.disallowedTagsMode==="discard"&&(T==="script"||T==="style"))a+=v;else{let P=K(v,!1);e.textFilter&&!E?a+=e.textFilter(P,T):E||(a+=P)}if(_.length){let P=_[_.length-1];P.text+=v}},onclosetag:function(v){if(O)if(M--,!M)O=!1;else return;let y=_.pop();if(!y)return;if(y.tag!==v){_.push(y);return}O=e.enforceHtmlBoundary?v==="html":!1,g--;let T=S[g];if(T){if(delete S[g],e.disallowedTagsMode==="discard"){y.updateParentNodeText();return}i=a,a=""}if(R[g]&&(v=R[g],delete R[g]),e.exclusiveFilter&&e.exclusiveFilter(y)){a=a.substr(0,y.tagPosition);return}if(y.updateParentNodeMediaChildren(),y.updateParentNodeText(),e.selfClosing.indexOf(v)!==-1){T&&(a=i,i="");return}a+="</"+v+">",T&&(a=i+K(a),i=""),E=!1}},e.parser);return V.write(t),V.end(),a;function A(){a="",g=0,_=[],S={},R={},O=!1,M=0}function K(v,y){return typeof v!="string"&&(v=v+""),e.parser.decodeEntities&&(v=v.replace(/&/g,"&").replace(/</g,"<").replace(/>/g,">"),y&&(v=v.replace(/"/g,"""))),v=v.replace(/&(?![a-zA-Z0-9#]{1,20};)/g,"&").replace(/</g,"<").replace(/>/g,">"),y&&(v=v.replace(/"/g,""")),v}function X(v,y){for(y=y.replace(/[x00-x20]+/g,"");;){let De=y.indexOf("<!--");if(De===-1)break;let Le=y.indexOf("-->",De+4);if(Le===-1)break;y=y.substring(0,De)+y.substring(Le+3)}let T=y.match(/^([a-zA-Z][a-zA-Z0-9.-+]*):/);if(!T)return y.match(/^[/\]{2}/)?!e.allowProtocolRelative:!1;let P=T[1].toLowerCase();return qr(e.allowedSchemesByTag,v)?e.allowedSchemesByTag[v].indexOf(P)===-1:!e.allowedSchemes||e.allowedSchemes.indexOf(P)===-1}function 
I(v){if(v=v.replace(/^(w+:)?s*[\/]s*[\/]/,"$1//"),v.startsWith("relative:"))throw new Error("relative: exploit attempt");let y="relative://relative-site";for(let De=0;De<100;De++)y+=`/${De}`;let T=new URL(v,y);return{isRelativeUrl:T&&T.hostname==="relative-site"&&T.protocol==="relative:",url:T}}function J(v,y){if(!y)return v;let T=v.nodes[0],P;return y[T.selector]&&y["*"]?P=cm(y[T.selector],y["*"]):P=y[T.selector]||y["*"],P&&(v.nodes[0].nodes=T.nodes.reduce(ne(P),[])),v}function xe(v){return v.nodes[0].nodes.reduce(function(y,T){return y.push(`${T.prop}:${T.value}${T.important?" !important":""}`),y},[]).join(";")}function ne(v){return function(y,T){return qr(v,T.prop)&&v[T.prop].some(function(De){return De.test(T.value)})&&y.push(T),y}}function de(v,y,T){return y?(v=v.split(/s+/),v.filter(function(P){return y.indexOf(P)!==-1||T.some(function(De){return De.test(P)})}).join(" ")):v}}var Bb={decodeEntities:!0};as.defaults={allowedTags:["address","article","aside","footer","header","h1","h2","h3","h4","h5","h6","hgroup","main","nav","section","blockquote","dd","div","dl","dt","figcaption","figure","hr","li","main","ol","p","pre","ul","a","abbr","b","bdi","bdo","br","cite","code","data","dfn","em","i","kbd","mark","q","rb","rp","rt","rtc","ruby","s","samp","small","span","strong","sub","sup","time","u","var","wbr","caption","col","colgroup","table","tbody","td","tfoot","th","thead","tr"],disallowedTagsMode:"discard",allowedAttributes:{a:["href","name","target"],img:["src","srcset","alt","title","width","height","loading"]},selfClosing:["img","br","hr","area","base","basefont","input","link","meta"],allowedSchemes:["http","https","ftp","mailto","tel"],allowedSchemesByTag:{},allowedSchemesAppliedToAttributes:["href","src","cite"],allowProtocolRelative:!0,enforceHtmlBoundary:!1};as.simpleTransform=function(t,e,r){return r=r===void 0?!0:r,e=e||{},function(a,i){let s;if(r)for(s in e)i[s]=e[s];else i=e;return{tagName:t,attribs:i}}}});var 
_m=b((PO,xm)=>{xm.exports={compareTwoStrings:ym,findBestMatch:Hb};function ym(t,e){if(t=t.replace(/s+/g,""),e=e.replace(/s+/g,""),t===e)return 1;if(t.length<2||e.length<2)return 0;let r=new Map;for(let i=0;i<t.length-1;i++){let s=t.substring(i,i+2),n=r.has(s)?r.get(s)+1:1;r.set(s,n)}let a=0;for(let i=0;i<e.length-1;i++){let s=e.substring(i,i+2),n=r.has(s)?r.get(s):0;n>0&&(r.set(s,n-1),a++)}return 2*a/(t.length+e.length-2)}function Hb(t,e){if(!jb(t,e))throw new Error("Bad arguments: First argument should be a string, second should be an array
of strings");let r=[],a=0;for(let s=0;s<e.length;s++){let n=e[s],p=ym(t,n);r.push({target:n,rating:p}),p>r[a].rating&&(a=s)}let i=r[a];return{ratings:r,bestMatch:i,bestMatchIndex:a}}function jb(t,e){return!(typeof t!="string"||!Array.isArray(e)||!e.length||e.find(function(r){return typeof r!="string"}))}});var Om=b((FO,fd)=>{function Lm(t,e){if(e&&e.documentElement)t=e,e=arguments[2];else if(!t||!t.documentElement)throw new Error("First argument to Readability constructor should be a document object.");if(e=e||{},this._doc=t,this._docJSDOMParser=this._doc.firstChild.__JSDOMParser__,this._articleTitle=null,this._articleByline=null,this._articleDir=null,this._articleSiteName=null,this._attempts=[],this._debug=!!e.debug,this._maxElemsToParse=e.maxElemsToParse||this.DEFAULT_MAX_ELEMS_TO_PARSE,this._nbTopCandidates=e.nbTopCandidates||this.DEFAULT_N_TOP_CANDIDATES,this._charThreshold=e.charThreshold||this.DEFAULT_CHAR_THRESHOLD,this._classesToPreserve=this.CLASSES_TO_PRESERVE.concat(e.classesToPreserve||[]),this._keepClasses=!!e.keepClasses,this._serializer=e.serializer||function(r){return r.innerHTML},this._disableJSONLD=!!e.disableJSONLD,this._flags=this.FLAG_STRIP_UNLIKELYS|this.FLAG_WEIGHT_CLASSES|this.FLAG_CLEAN_CONDITIONALLY,this._debug){let r=function(a){if(a.nodeType==a.TEXT_NODE)return`${a.nodeName} ("${a.textContent}")`;let i=Array.from(a.attributes||[],function(s){return`${s.name}="${s.value}"`}).join(" ");return`<${a.localName} ${i}>`};this.log=function(){if(typeof dump<"u"){var a=Array.prototype.map.call(arguments,function(i){return i&&i.nodeName?r(i):i}).join(" ");dump("Reader: (Readability) "+a+`
----LONG WHITESPACE HERE, I DELETED IT----
^
Error: First argument to Readability constructor should be a document object.
at new Lm (d:\Repositories\hexo-backend\node_modules\article-parser\dist\cjs\article-parser.js:54:8356)
at qm (d:\Repositories\hexo-backend\node_modules\article-parser\dist\cjs\article-parser.js:64:3456)
at bd (d:\Repositories\hexo-backend\node_modules\article-parser\dist\cjs\article-parser.js:64:4693)
at Yb (d:\Repositories\hexo-backend\node_modules\article-parser\dist\cjs\article-parser.js:64:5168)
at d:\Repositories\hexo-backend\src\standalone\get-links.ts:9:14
at Array.map (<anonymous>)
at d:\Repositories\hexo-backend\src\standalone\get-links.ts:8:10
at processTicksAndRejections (node:internal/process/task_queues:96:5)
#node.js #cheerio #jsdom
#node.js #cheerio #jsdom
Вопрос:
Я хотел бы использовать Readability для извлечения содержимого «статьи» из веб-страниц. Readability хорошо справляется с задачей, но зависит от JSDOM, который кажется очень медленным и выдаёт ошибки, если не удаётся разобрать CSS (который мне совсем не нужен); при этом игнорировать CSS в JSDOM невозможно, насколько я понимаю из обсуждений (issues) проекта на GitHub.
Я пытался заменить JSDOM на cheerio, но не смог выяснить, какая часть его API совместима с выводом JSDOM.
В JSDOM инструкция
var doc = new JSDOM(html);
создает объект DOM, который может быть передан в
let reader = new Readability(doc.window.document);
Однако в cheerio, похоже, нет ничего, что создаёт объект DOM.
Я пробовал
var $ = cheerio.load(html);
var object_something = $('html');
Он выдает ошибку при вызове new Readability(object_something)
Error: First argument to Readability constructor should be a document object.
что явно означает именно то, что написано. object_something — это объект, но я не уверен, что он собой представляет.
Возможно ли вообще создать объект DOM с помощью cheerio? У меня более 10 миллионов локальных HTML-документов, поэтому любое улучшение производительности сэкономит мне много времени.
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
I’d like to use Readability to parse out the «article» content within web pages. Readability does a good job, but it depends on JSDOM, which seems to be very slow and throws errors if parsing CSS content fails (which I do not need at all), but it’s not possible to ignore CSS in JSDOM, as I understand from the GitHub issues of the project.
I’ve been trying to replace JSDOM with cheerio, but I haven’t been able to figure out what part of its API is compatible with the output of JSDOM.
In JSDOM, the statement
var doc = new JSDOM(html);
produces a DOM object which can be passed into
let reader = new Readability(doc.window.document);
In cheerio, however, there doesn’t seem to be anything that produces a DOM object.
I’ve tried
var $ = cheerio.load(html);
var object_something = $('html');
It throws an error when I call new Readability(object_something)
Error: First argument to Readability constructor should be a document object.
which clearly means what it says. object_something is an object, but I’m not sure what it actually is.
Is it even possible to produce a DOM object with cheerio? I have over 10 million local HTML documents, so any performance improvement would save me a lot of time.
I’d like to use Readability to parse out the «article» content within web pages. Readability does a good job, but it depends on JSDOM, which seems to be very slow and throws errors if parsing CSS content fails (which I do not need at all), but it’s not possible to ignore CSS in JSDOM, as I understand from the GitHub issues of the project.
I’ve been trying to replace JSDOM with cheerio, but I haven’t been able to figure out what part of its API is compatible with the output of JSDOM.
In JSDOM, the statement
var doc = new JSDOM(html);
produces a DOM object which can be passed into
let reader = new Readability(doc.window.document);
In cheerio, however, there doesn’t seem to be anything that produces a DOM object.
I’ve tried
var $ = cheerio.load(html);
var object_something = $('html');
It throws an error when I call new Readability(object_something)
Error: First argument to Readability constructor should be a document object.
which clearly means what it says. object_something is an object, but I’m not sure what it actually is.
Is it even possible to produce a DOM object with cheerio? I have over 10 million local HTML documents, so any performance improvement would save me a lot of time.
Recommend Projects
-
React: A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js: 🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
TypeScript: TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow: An Open Source Machine Learning Framework for Everyone
-
Django: The Web framework for perfectionists with deadlines.
-
Laravel: A PHP framework for web artisans
-
D3: Bring data to life with SVG, Canvas and HTML. 📊📈🎉
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook: We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft: Open source projects and samples from Microsoft.
-
Google: Google ❤️ Open Source for everyone.
-
Alibaba: Alibaba Open Source for everyone
-
D3: Data-Driven Documents codes.
-
Tencent: China tencent open source team.