Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dist/cli.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@
`)}return e.type="warcinfo",n.create(e,s())}getResponseInfo(){let e=this.httpHeaders;return e?{headers:e.headers,status:e.statusCode,statusText:e.statusText}:null}fixUp(){let e=this.warcHeaders.headers.get("WARC-Target-URI");e&&e.startsWith("<")&&e.endsWith(">")&&this.warcHeaders.headers.set("WARC-Target-URI",e.slice(1,-1))}async readFully(e=!1){if(this.httpHeaders){if(this.payload&&!this.payload.length)return this.payload;if(this._contentReader&&!e)throw new TypeError("WARC Record decoding already started, but requesting raw payload");if(e&&this.consumed==="raw"&&this.payload)return await this._createDecodingReader([this.payload]).readFully()}return this.payload?this.payload:(e?(this.payload=await super.readFully(),this.consumed="content"):(this.payload=await y.readFully(this._reader),this.consumed="raw"),this.payload)}get reader(){if(this.payload&&!this.payload.length)return G();if(this._contentReader)throw new TypeError("WARC Record decoding already started, but requesting raw payload");return this._reader}get contentReader(){return this.httpHeaders?(this._contentReader||(this._contentReader=this._createDecodingReader(this._reader)),this._contentReader):this._reader}_createDecodingReader(e){if(!this.httpHeaders)throw new Error("WARCRecord cannot call _createDecodingReader when this.httpHeaders === null");let t=this.httpHeaders.headers.get("Content-Encoding"),s=this.httpHeaders.headers.get("Transfer-Encoding"),i=s==="chunked";return!t&&!i&&(t=s),new R(e,t,i)}async readlineRaw(e){if(this.consumed)throw new Error("Record already consumed.. Perhaps a promise was not awaited?");if(this.contentReader instanceof y)return this.contentReader.readlineRaw(e);throw new Error("WARCRecord cannot call readlineRaw on this.contentReader if it does not extend BaseAsyncIterReader")}async contentText(){let e=await this.readFully(!0);return _e.decode(e)}async*[Symbol.asyncIterator](){for await(let e of this.contentReader)if(yield e,this.consumed)throw new Error("Record already consumed.. Perhaps a promise was not awaited?");this.consumed="content"}async skipFully(){if(!this.consumed){if(this._reader instanceof w){let e=await this._reader.skipFully();return this.consumed="skipped",e}throw new Error("WARCRecord cannot call skipFully on this._reader if it is not a LimitReader")}}warcHeader(e){return this.warcHeaders.headers.get(e)}get warcType(){return this.warcHeaders.headers.get("WARC-Type")}get warcTargetURI(){return this.warcHeaders.headers.get("WARC-Target-URI")}get warcDate(){return this.warcHeaders.headers.get("WARC-Date")}get warcRefersToTargetURI(){return this.warcHeaders.headers.get("WARC-Refers-To-Target-URI")}get warcRefersToDate(){return this.warcHeaders.headers.get("WARC-Refers-To-Date")}get warcPayloadDigest(){return this.warcHeaders.headers.get("WARC-Payload-Digest")}get warcBlockDigest(){return this.warcHeaders.headers.get("WARC-Block-Digest")}get warcContentType(){return this.warcHeaders.headers.get("Content-Type")}get warcContentLength(){return Number(this.warcHeaders.headers.get("Content-Length"))}};async function*G(){}var K=new TextDecoder,O=new Uint8Array([]),W=class n{static parse(r,e){return new n(r,e).parse()}static iterRecords(r,e){return new n(r,e)[Symbol.asyncIterator]()}constructor(r,{keepHeadersCase:e=!1,parseHttp:t=!0}={}){this._offset=0,this._warcHeadersLength=0,this._headersClass=e?Map:Headers,this._parseHttp=t,r instanceof R?this._reader=r:this._reader=new R(r),this._record=null}async readToNextRecord(){if(!this._reader||!this._record)return O;await this._record.skipFully(),this._reader.compressed&&(this._offset=this._reader.getRawOffset());let r=await this._reader.readlineRaw(),e=0;if(!r)r=O;else{if(e=r.byteLength-1,e===9&&K.decode(r).startsWith("WARC/"))return r;for(;e>0;){let t=r[e-1];if(t!==10&&t!==13)break;e--}e&&console.warn(`Content-Length Too Small: Record not followed by newline, Remainder Length: ${e}, Offset: ${this._reader.getRawOffset()-r.byteLength}`)}if(this._reader.compressed)await this._reader.skipSize(2),r=O;else{for(r=await this._reader.readlineRaw();r&&r.byteLength===2;)r=await this._reader.readlineRaw();this._offset=this._reader.getRawOffset(),r&&(this._offset-=r.length)}return r}_initRecordReader(r){return new w(this._reader,Number(r.headers.get("Content-Length")||0))}async parse(){let r=await this.readToNextRecord(),e=r?K.decode(r):"",t=new k,s=await t.parse(this._reader,{firstLine:e,headersClass:this._headersClass});if(!s)return null;this._warcHeadersLength=this._reader.getReadOffset();let i=new T({warcHeaders:s,reader:this._initRecordReader(s)});if(this._record=i,this._parseHttp)switch(i.warcType){case"response":case"request":await this._addHttpHeaders(i,t);break;case"revisit":i.warcContentLength>0&&await this._addHttpHeaders(i,t);break}return i}get offset(){return this._offset}get recordLength(){return this._reader.getRawLength(this._offset)}async*[Symbol.asyncIterator](){let r=null;for(;(r=await this.parse())!==null;)yield r;this._record=null}async _addHttpHeaders(r,e){let t=await e.parse(this._reader,{headersClass:this._headersClass});r.httpHeaders=t;let s=this._reader.getReadOffset()-this._warcHeadersLength;r.reader instanceof w&&r.reader.setLimitSkip(r.warcContentLength-s)}};var Te=["offset","warc-type","warc-target-uri"],P=class{constructor(r={}){this.opts=r,this.fields=r&&r.fields?r.fields.split(","):Te,this.parseHttp=!1}serialize(r){return JSON.stringify(r)+`
`}write(r,e){e.write(this.serialize(r))}async writeAll(r,e){for await(let t of this.iterIndex(r))this.write(t,e)}async*iterIndex(r){let e={strictHeaders:!0,parseHttp:this.parseHttp};for(let{filename:t,reader:s}of r){let i=new W(s,e);yield*this.iterRecords(i,t)}}async*iterRecords(r,e){for await(let t of r){await t.skipFully();let s=this.indexRecord(t,r,e);s&&(yield s)}}indexRecord(r,e,t){if(this.filterRecord&&!this.filterRecord(r))return null;let s={},{offset:i,recordLength:a}=e,o={offset:i,length:a,filename:t};for(let d of this.fields)d in o?s[d]=o[d]:this.setField(d,r,s);return s}setField(r,e,t){let s=this.getField(r,e);s!==null&&(t[r]=s)}getField(r,e){if(r==="http:status")return e.httpHeaders&&(e.warcType==="response"||e.warcType==="revisit")?e.httpHeaders.statusCode:null;if(r.startsWith("http:")){if(e.httpHeaders){let t=e.httpHeaders.headers;return t instanceof Map&&(t=new Headers(Object.fromEntries(t))),t.get(r.slice(5))}return null}return e.warcHeaders.headers.get(r)||null}},C=class extends P{constructor(r){super(r);for(let e of this.fields)if(e.startsWith("http:")){this.parseHttp=!0;break}}},We="urlkey,timestamp,url,mime,status,digest,length,offset,filename".split(","),Ue="urlkey,timestamp,url,mime,status,digest,redirect,meta,length,offset,filename".split(","),I=class extends C{constructor(e){super(e);switch(this.includeAll=!!e?.all,this.overrideIndexForAll=!!e?.all,this.fields=We,this.parseHttp=!0,this.noSurt=!!e?.noSurt,this._lastRecord=null,e?.format){case"cdxj":this.serialize=this.serializeCDXJ;break;case"cdx":this.serialize=this.serializeCDX11;break;case"json":default:break}}async*iterRecords(e,t){this._lastRecord=null;for await(let i of e){await i.readFully();let a=this.indexRecord(i,e,t);a&&(yield a)}let s=this.indexRecord(null,e,t);s&&(yield s)}filterRecord(e){if(this.includeAll)return!0;let t=e.warcType;return!(t==="request"||t==="warcinfo"||(t==="metadata"||t==="resource")&&e.warcContentType==="application/warc-fields")}indexRecord(e,t,s){if(this.overrideIndexForAll)return e?super.indexRecord(e,t,s):null;let i=this._lastRecord;if(this._lastRecord=e,e&&(e._offset=t.offset,e._length=t.recordLength),!i)return null;if(!e||i.warcTargetURI!=e.warcTargetURI)return this.indexRecordPair(i,null,t,s);let a=e.warcType,o=i.warcType;return a==="request"&&(o==="response"||o==="revisit")?(this._lastRecord=null,this.indexRecordPair(i,e,t,s)):(a==="response"||a==="revisit")&&o==="request"?(this._lastRecord=null,this.indexRecordPair(e,i,t,s)):this.indexRecordPair(i,null,t,s)}indexRecordPair(e,t,s,i){let a,o,d=e.warcTargetURI||"";if(t&&t.httpHeaders&&t.httpHeaders.method!=="GET"){let u={url:d,method:t.httpHeaders.method,headers:t.httpHeaders.headers,postData:t.payload};a=u.method,M(u)&&(o=u.requestBody,e.method=a,e.requestBody=o,d=u.url)}e._urlkey=d;let l=super.indexRecord(e,s,i);return l&&(e&&e._offset!==void 0&&(l.offset=e._offset,l.length=e._length),a&&(l.method=a),o&&(l.requestBody=o)),l}serializeCDXJ(e){let{urlkey:t,timestamp:s}=e;return delete e.urlkey,delete e.timestamp,`${t} ${s} ${JSON.stringify(e)}
`}serializeCDX11(e){let t=[];for(let s of Ue)t.push(e[s]!=null?e[s]:"-");return t.join(" ")+`
`}getField(e,t){let s=null;switch(e){case"urlkey":return s=t._urlkey||t.warcTargetURI||null,this.noSurt||s===null?s:z(s);case"timestamp":return s=t.warcDate??"",s.replace(/[-:T]/g,"").slice(0,14);case"url":return t.warcTargetURI;case"mime":switch(t.warcType){case"revisit":return"warc/revisit";case"response":case"request":e="http:content-type";break;default:e="content-type"}return s=super.getField(e,t),s?s.toString().split(";",1)[0]?.trim():null;case"status":return super.getField("http:status",t);case"digest":return s=t.warcPayloadDigest,s?s.split(":",2)[1]:null;default:return null}}};var Z="2.2.2";var ve=1024*128;function D(n=H.stdout,r){let e=Promise.resolve();return r=r||(0,re.hideBin)(process.argv),(0,te.default)().version(Z).usage("$0 [command]").command({command:"index <filenames..>",describe:"Index WARC(s)",builder:q,handler:async t=>{e=new C(t).writeAll(Y(t.filenames),n)}}).command({command:"cdx-index <filenames..>",describe:"CDX(J) Index of WARC(s)",builder:B,handler:async t=>{e=new I(t).writeAll(Y(t.filenames),n)}}).demandCommand(1,"Please specify a command").strictCommands().help().parseAsync(r),e}function Y(n){return n.reduce((r,e)=>{if(!(0,U.lstatSync)(e).isFile())return H.stderr.write(`Skipping ${e}, not a file
`}getField(e,t){let s=null;switch(e){case"urlkey":return s=t._urlkey||t.warcTargetURI||null,this.noSurt||s===null?s:z(s);case"timestamp":return s=t.warcDate??"",s.replace(/[-:T]/g,"").slice(0,14);case"url":return t.warcTargetURI;case"mime":switch(t.warcType){case"revisit":return"warc/revisit";case"response":case"request":e="http:content-type";break;default:e="content-type"}return s=super.getField(e,t),s?s.toString().split(";",1)[0]?.trim():null;case"status":return super.getField("http:status",t);case"digest":return s=t.warcPayloadDigest,s?s.split(":",2)[1]:null;default:return null}}};var Z="2.3.0";var ve=1024*128;function D(n=H.stdout,r){let e=Promise.resolve();return r=r||(0,re.hideBin)(process.argv),(0,te.default)().version(Z).usage("$0 [command]").command({command:"index <filenames..>",describe:"Index WARC(s)",builder:q,handler:async t=>{e=new C(t).writeAll(Y(t.filenames),n)}}).command({command:"cdx-index <filenames..>",describe:"CDX(J) Index of WARC(s)",builder:B,handler:async t=>{e=new I(t).writeAll(Y(t.filenames),n)}}).demandCommand(1,"Please specify a command").strictCommands().help().parseAsync(r),e}function Y(n){return n.reduce((r,e)=>{if(!(0,U.lstatSync)(e).isFile())return H.stderr.write(`Skipping ${e}, not a file
`),r;let t=(0,U.createReadStream)(e,{highWaterMark:ve});return e=(0,ee.basename)(e),r.push({filename:e,reader:t}),r},[])}D();
Loading