|
| 1 | +import pako from 'pako'; |
| 2 | +import { IHasher } from 'hash-wasm/dist/lib/WASMInterface'; |
| 3 | +import { WritableStreamBuffer } from 'stream-buffers'; |
| 4 | + |
| 5 | +type SourceReader = { /* Reader-like input: any object exposing a read member; its call signature is left as the loose Function type. */ |
| 6 | + read: Function; |
| 7 | +}; |
| 8 | +type SourceReadable = { /* Readable-like input: any object whose getReader() returns something with a read member (the same shape as SourceReader). */ |
| 9 | + getReader: (...args: any) => { |
| 10 | + read: Function; |
| 11 | + }; |
| 12 | +}; |
| 13 | +type Source = SourceReader | SourceReadable | AsyncIterable<Uint8Array> | Iterable<Uint8Array>; /* Union of every input kind accepted by parameters typed as Source: reader-like, readable-like, or a sync/async iterable of Uint8Array. */ |
| 14 | +type StreamResult = { /* A named stream: a filename paired with an async iterable of Uint8Array chunks. */ |
| 15 | + filename: string; |
| 16 | + reader: AsyncIterable<Uint8Array>; |
| 17 | +}; |
| 18 | +type StreamResults = StreamResult[]; /* Array of StreamResult entries; this is the files parameter type of BaseIndexer.writeAll and BaseIndexer.iterIndex. */ |
| 19 | +type IndexerOffsetLength = { /* Two numeric fields, offset and recordLength; WARCParser is declared to implement this shape. */ |
| 20 | + offset: number; |
| 21 | + recordLength: number; |
| 22 | +}; |
| 23 | +type Request = { /* Request description consumed by postToGetUrl. NOTE(review): inside this module the name shadows the global Fetch Request type - confirm that is intended. */ |
| 24 | + method: string; |
| 25 | + url: string; |
| 26 | + headers: Map<string, string> | Headers; |
| 27 | + postData?: any; /* Optional and untyped. */ |
| 28 | + requestBody?: any; /* Optional and untyped. */ |
| 29 | +}; |
| 30 | + |
| 31 | +declare class NoConcatInflator<T extends BaseAsyncIterReader> extends pako.Inflate { /* pako.Inflate subclass that additionally carries a reader of type T, an ended flag and a chunks array. */ |
| 32 | + reader: T; |
| 33 | + ended: boolean; |
| 34 | + chunks: Uint8Array[]; |
| 35 | + constructor(options: pako.InflateOptions, reader: T); |
| 36 | + onEnd(status: pako.ReturnCodes): void; /* Takes a pako return code; presumably the end-of-stream hook inherited from pako.Inflate - TODO confirm against the implementation. */ |
| 37 | +} |
| 38 | +declare abstract class BaseAsyncIterReader { /* Abstract reader base: subclasses must supply the async iterator and readlineRaw; every other member is declared non-abstract. */ |
| 39 | + static readFully(iter: AsyncIterable<Uint8Array> | Iterable<Uint8Array>): Promise<Uint8Array>; /* Static form: takes a sync or async iterable of chunks and resolves to a single Uint8Array. */ |
| 40 | + abstract [Symbol.asyncIterator](): AsyncIterator<Uint8Array>; |
| 41 | + getReadableStream(): ReadableStream<any>; /* Chunk type of the returned stream is left as any. */ |
| 42 | + readFully(): Promise<Uint8Array>; /* Instance form: no arguments, resolves to a single Uint8Array. */ |
| 43 | + abstract readlineRaw(maxLength?: number): Promise<Uint8Array | null>; /* Resolves to line bytes or null; presumably null signals that no data remains - TODO confirm. */ |
| 44 | + readline(maxLength?: number): Promise<string>; /* String-returning counterpart of readlineRaw; never resolves to null per this signature. */ |
| 45 | + iterLines(maxLength?: number): AsyncGenerator<string, void, unknown>; |
| 46 | +} |
| 47 | +type AsyncIterReaderOpts = { /* Type of the AsyncIterReader.opts field; holds a single required boolean, raw. */ |
| 48 | + raw: boolean; |
| 49 | +}; |
| 50 | +declare class AsyncIterReader extends BaseAsyncIterReader { /* Concrete reader constructed from a Source, with optional compressed and dechunk constructor arguments. */ |
| 51 | + compressed: string | null; /* The accepted string values are not visible in this declaration. */ |
| 52 | + opts: AsyncIterReaderOpts; |
| 53 | + inflator: NoConcatInflator<this> | null; /* Inflator parameterised on this reader type, or null. */ |
| 54 | + _sourceIter: AsyncIterator<Uint8Array | null>; |
| 55 | + lastValue: Uint8Array | null; |
| 56 | + errored: boolean; |
| 57 | + _savedChunk: Uint8Array | null; |
| 58 | + _rawOffset: number; /* NOTE(review): presumably the value behind getRawOffset() - confirm. */ |
| 59 | + _readOffset: number; /* NOTE(review): presumably the value behind getReadOffset() - confirm. */ |
| 60 | + numChunks: number; |
| 61 | + constructor(streamOrIter: Source, compressed?: string | null, dechunk?: boolean); |
| 62 | + _loadNext(): Promise<Uint8Array | null>; |
| 63 | + dechunk(source: AsyncIterable<Uint8Array>): AsyncIterator<Uint8Array | null>; /* Method sharing its name with the constructor's dechunk flag; returns an iterator whose items may be null. */ |
| 64 | + unread(chunk: Uint8Array): void; |
| 65 | + _next(): Promise<Uint8Array | null>; |
| 66 | + _push(value: Uint8Array): void; |
| 67 | + _getNextChunk(original?: Uint8Array): Uint8Array | null | undefined; |
| 68 | + [Symbol.asyncIterator](): AsyncGenerator<Uint8Array, void, unknown>; |
| 69 | + readlineRaw(maxLength?: number): Promise<Uint8Array | null>; |
| 70 | + readFully(): Promise<Uint8Array>; |
| 71 | + readSize(sizeLimit: number): Promise<Uint8Array>; /* sizeLimit is required here; resolves to the bytes. */ |
| 72 | + skipSize(sizeLimit: number): Promise<number>; /* Resolves to a number, presumably the count skipped - TODO confirm. */ |
| 73 | + _readOrSkip(sizeLimit?: number, skip?: boolean): Promise<readonly [number, Uint8Array]>; /* Resolves to a readonly two-element tuple of a number and a Uint8Array. */ |
| 74 | + getReadOffset(): number; |
| 75 | + getRawOffset(): number; |
| 76 | + getRawLength(prevOffset: number): number; |
| 77 | + static fromReadable<Readable extends SourceReader>(source: Readable): { /* Takes a SourceReader-compatible object and returns an object that is async-iterable over Uint8Array. */ |
| 78 | + [Symbol.asyncIterator](): AsyncGenerator<Uint8Array, void, unknown>; |
| 79 | + }; |
| 80 | + static fromIter(source: Iterable<Uint8Array>): { /* Takes a synchronous iterable and returns an object that is async-iterable over Uint8Array. */ |
| 81 | + [Symbol.asyncIterator](): AsyncGenerator<Uint8Array, void, unknown>; |
| 82 | + }; |
| 83 | +} |
| 84 | +declare class LimitReader extends BaseAsyncIterReader { /* Reader over an AsyncIterReader, constructed with a required limit and an optional skip. */ |
| 85 | + sourceIter: AsyncIterReader; |
| 86 | + length: number; |
| 87 | + limit: number; |
| 88 | + skip: number; |
| 89 | + constructor(streamIter: AsyncIterReader, limit: number, skip?: number); |
| 90 | + setLimitSkip(limit: number, skip?: number): void; /* Accepts the same limit/skip pair as the constructor. */ |
| 91 | + [Symbol.asyncIterator](): AsyncGenerator<Uint8Array, void, unknown>; |
| 92 | + readlineRaw(maxLength?: number): Promise<Uint8Array | null>; |
| 93 | + skipFully(): Promise<number>; /* Always resolves to a number in this class. */ |
| 94 | +} |
| 95 | + |
| 96 | +declare class StatusAndHeaders { /* Holds a status line string plus headers (Map or Headers), with getters for response fields (statusCode, protocol, statusText) and request fields (method, requestPath). */ |
| 97 | + statusline: string; |
| 98 | + headers: Map<string, string> | Headers; |
| 99 | + constructor({ statusline, headers, }: { /* Single options object; both fields are required. */ |
| 100 | + statusline: string; |
| 101 | + headers: Map<string, string> | Headers; |
| 102 | + }); |
| 103 | + toString(): string; |
| 104 | + iterSerialize(encoder: TextEncoder): AsyncGenerator<Uint8Array, void, unknown>; /* Async generator of Uint8Array chunks; the caller supplies the TextEncoder. */ |
| 105 | + _protocol: string; |
| 106 | + _statusCode: number | string; /* Same number-or-string union as the statusCode getter. */ |
| 107 | + _statusText: string; |
| 108 | + _parseResponseStatusLine(): void; |
| 109 | + get statusCode(): string | number; |
| 110 | + get protocol(): string; |
| 111 | + get statusText(): string; |
| 112 | + _method: string; |
| 113 | + _requestPath: string; |
| 114 | + _parseRequestStatusLine(): void; |
| 115 | + get method(): string; |
| 116 | + get requestPath(): string; |
| 117 | +} |
| 118 | +declare class StatusAndHeadersParser { /* Exposes a single parse method that reads from an AsyncIterReader. */ |
| 119 | + parse(reader: AsyncIterReader, { headersClass, firstLine, }?: { /* The options argument is optional; when supplied, headersClass is required and firstLine is optional. */ |
| 120 | + firstLine?: string; |
| 121 | + headersClass: typeof Map | typeof Headers; |
| 122 | + }): Promise<StatusAndHeaders | null>; /* Resolves to a StatusAndHeaders or null. */ |
| 123 | +} |
| 124 | + |
| 125 | +declare const WARC_1_1 = "WARC/1.1"; /* Version string constant; its literal type is one arm of WARCRecordOpts.warcVersion. */ |
| 126 | +declare const WARC_1_0 = "WARC/1.0"; /* Version string constant; its literal type is the other arm of WARCRecordOpts.warcVersion. */ |
| 127 | +type WARCType = "warcinfo" | "response" | "resource" | "request" | "metadata" | "revisit" | "conversion" | "continuation"; /* Closed set of record type names accepted by WARCRecordOpts.type. */ |
| 128 | +type WARCRecordOpts = { /* All-optional options bag; WARCRecord.create destructures every field listed here. */ |
| 129 | + url?: string; |
| 130 | + date?: string; |
| 131 | + type?: WARCType; |
| 132 | + warcHeaders?: Record<string, string>; |
| 133 | + filename?: string; |
| 134 | + httpHeaders?: HeadersInit; |
| 135 | + statusline?: string; |
| 136 | + warcVersion?: typeof WARC_1_0 | typeof WARC_1_1; /* Restricted to the two declared version constants. */ |
| 137 | + keepHeadersCase?: boolean; |
| 138 | + refersToUrl?: string; |
| 139 | + refersToDate?: string; |
| 140 | +}; |
| 141 | +declare class WARCRecord extends BaseAsyncIterReader { /* A single record: extends BaseAsyncIterReader and carries WARC headers, optional HTTP headers, an optional payload and readers over its data. */ |
| 142 | + static create({ url, date, type, warcHeaders, filename, httpHeaders, statusline, warcVersion, keepHeadersCase, refersToUrl, refersToDate, }?: WARCRecordOpts, reader?: AsyncIterable<Uint8Array> | Iterable<Uint8Array>): WARCRecord; /* Static factory; both the options object and the iterable reader are optional. */ |
| 143 | + static createWARCInfo(opts: WARCRecordOpts | undefined, info: Record<string, string>): WARCRecord; /* Static factory taking options (which may be undefined) and a string-to-string info map. */ |
| 144 | + warcHeaders: StatusAndHeaders; |
| 145 | + _reader: AsyncIterable<Uint8Array> | Iterable<Uint8Array>; |
| 146 | + _contentReader: BaseAsyncIterReader | null; |
| 147 | + payload: Uint8Array | null; |
| 148 | + httpHeaders: StatusAndHeaders | null; |
| 149 | + consumed: "content" | "raw" | "skipped" | ""; /* One of four string states; the empty string presumably means not yet consumed - TODO confirm. */ |
| 150 | + _offset: number; |
| 151 | + _length: number; |
| 152 | + method: string; |
| 153 | + requestBody: string; |
| 154 | + _urlkey: string; |
| 155 | + constructor({ warcHeaders, reader, }: { /* Single options object; both fields are required. */ |
| 156 | + warcHeaders: StatusAndHeaders; |
| 157 | + reader: AsyncIterable<Uint8Array> | Iterable<Uint8Array>; |
| 158 | + }); |
| 159 | + getResponseInfo(): { /* Returns headers, status and statusText, or null. */ |
| 160 | + headers: Map<string, string> | Headers; |
| 161 | + status: string | number; |
| 162 | + statusText: string; |
| 163 | + } | null; |
| 164 | + fixUp(): void; |
| 165 | + readFully(isContent?: boolean): Promise<Uint8Array>; /* Unlike the base class declaration, accepts an optional isContent flag. */ |
| 166 | + get reader(): AsyncIterable<Uint8Array> | Iterable<Uint8Array>; |
| 167 | + get contentReader(): AsyncIterable<Uint8Array> | Iterable<Uint8Array>; |
| 168 | + _createDecodingReader(source: Source): AsyncIterReader; |
| 169 | + readlineRaw(maxLength?: number): Promise<Uint8Array | null>; |
| 170 | + contentText(): Promise<string>; |
| 171 | + [Symbol.asyncIterator](): AsyncGenerator<Uint8Array, void, unknown>; |
| 172 | + skipFully(): Promise<number | undefined>; /* May resolve to undefined, unlike LimitReader.skipFully which resolves to number. */ |
| 173 | + warcHeader(name: string): string | null | undefined; /* Takes a header name; the result may be a string, null or undefined. */ |
| 174 | + get warcType(): string | null | undefined; /* Typed as a plain string here, not narrowed to WARCType. */ |
| 175 | + get warcTargetURI(): string | null | undefined; |
| 176 | + get warcDate(): string | null | undefined; |
| 177 | + get warcRefersToTargetURI(): string | null | undefined; |
| 178 | + get warcRefersToDate(): string | null | undefined; |
| 179 | + get warcPayloadDigest(): string | null | undefined; |
| 180 | + get warcBlockDigest(): string | null | undefined; |
| 181 | + get warcContentType(): string | null | undefined; |
| 182 | + get warcContentLength(): number; /* The only warc* getter typed as number rather than string, null or undefined. */ |
| 183 | +} |
| 184 | + |
| 185 | +type WARCParserOpts = { /* Optional flags; both are destructured by the WARCParser constructor. */ |
| 186 | + keepHeadersCase?: boolean; |
| 187 | + parseHttp?: boolean; |
| 188 | +}; |
| 189 | +declare class WARCParser implements IndexerOffsetLength { /* Parser over a Source that produces WARCRecord objects; its offset and recordLength getters satisfy IndexerOffsetLength. */ |
| 190 | + static parse(source: Source, options?: WARCParserOpts): Promise<WARCRecord | null>; /* Static form: resolves to one WARCRecord or null. */ |
| 191 | + static iterRecords(source: Source, options?: WARCParserOpts): AsyncGenerator<WARCRecord, void, unknown>; /* Static form: async generator of WARCRecord. */ |
| 192 | + _offset: number; |
| 193 | + _warcHeadersLength: number; |
| 194 | + _headersClass: typeof Map | typeof Headers; /* Same constructor union that StatusAndHeadersParser.parse takes as headersClass. */ |
| 195 | + _parseHttp: boolean; |
| 196 | + _reader: AsyncIterReader; |
| 197 | + _record: WARCRecord | null; |
| 198 | + constructor(source: Source, { keepHeadersCase, parseHttp }?: WARCParserOpts); |
| 199 | + readToNextRecord(): Promise<Uint8Array | null>; |
| 200 | + _initRecordReader(warcHeaders: StatusAndHeaders): LimitReader; |
| 201 | + parse(): Promise<WARCRecord | null>; /* Instance form: resolves to a WARCRecord or null. */ |
| 202 | + get offset(): number; /* Required by IndexerOffsetLength. */ |
| 203 | + get recordLength(): number; /* Required by IndexerOffsetLength. */ |
| 204 | + [Symbol.asyncIterator](): AsyncGenerator<WARCRecord, void, unknown>; |
| 205 | + _addHttpHeaders(record: WARCRecord, headersParser: StatusAndHeadersParser): Promise<void>; |
| 206 | +} |
| 207 | + |
| 208 | +type WARCSerializerOpts = { /* All-optional serializer settings: a gzip flag, nested digest settings and a preferPako flag. */ |
| 209 | + gzip?: boolean; |
| 210 | + digest?: { /* Each digest field is itself optional. */ |
| 211 | + algo?: AlgorithmIdentifier; |
| 212 | + prefix?: string; |
| 213 | + base32?: boolean; |
| 214 | + }; |
| 215 | + preferPako?: boolean; |
| 216 | +}; |
| 217 | +declare abstract class BaseSerializerBuffer { /* Abstract buffer contract: a synchronous write of one chunk, and readAll returning an async iterable of chunks. */ |
| 218 | + abstract write(chunk: Uint8Array): void; |
| 219 | + abstract readAll(): AsyncIterable<Uint8Array>; |
| 220 | +} |
| 221 | +declare class WARCSerializer extends BaseAsyncIterReader { /* Async-iterable serializer for a single WARCRecord, configured by WARCSerializerOpts and an optional external buffer. */ |
| 222 | + gzip: boolean; |
| 223 | + digestAlgo: AlgorithmIdentifier; |
| 224 | + digestAlgoPrefix: string; |
| 225 | + digestBase32: boolean; |
| 226 | + preferPako: boolean; |
| 227 | + record: WARCRecord; |
| 228 | + externalBuffer: BaseSerializerBuffer; /* Non-nullable field even though the constructor argument is optional. */ |
| 229 | + _alreadyDigested: boolean; |
| 230 | + blockHasher: IHasher | null; |
| 231 | + payloadHasher: IHasher | null; |
| 232 | + httpHeadersBuff: Uint8Array | null; |
| 233 | + warcHeadersBuff: Uint8Array | null; |
| 234 | + static serialize(record: WARCRecord, opts?: WARCSerializerOpts, externalBuffer?: BaseSerializerBuffer): Promise<Uint8Array>; /* Static form taking the same arguments as the constructor; resolves to one Uint8Array. */ |
| 235 | + constructor(record: WARCRecord, opts?: WARCSerializerOpts, externalBuffer?: BaseSerializerBuffer); |
| 236 | + static noComputeDigest(record: WARCRecord): string | true | null | undefined; /* Loose union return (string, true, null or undefined). NOTE(review): presumably meant to be tested for truthiness - confirm the intended contract. */ |
| 237 | + [Symbol.asyncIterator](): AsyncGenerator<any, void, unknown>; /* Element type is any here, while generateRecord and streamCompress yield Uint8Array. */ |
| 238 | + readlineRaw(maxLength?: number): Promise<Uint8Array | null>; |
| 239 | + pakoCompress(): AsyncGenerator<any, void, unknown>; /* Element type is any. */ |
| 240 | + streamCompress(cs: CompressionStream): AsyncGenerator<Uint8Array, void, unknown>; /* The caller supplies the CompressionStream. */ |
| 241 | + newHasher(): Promise<IHasher> | null; /* Returns either a Promise of IHasher or a bare null (not a Promise of null). */ |
| 242 | + getDigest(hasher: IHasher): string; |
| 243 | + digestRecord(): Promise<number>; |
| 244 | + generateRecord(): AsyncGenerator<Uint8Array, void, unknown>; |
| 245 | +} |
| 246 | + |
| 247 | +type IndexCommandArgs = any; /* Declared as any, so the indexer option fields are not described in this file. */ |
| 248 | +type CdxIndexCommandArgs = any; /* Declared as any, so the CDX indexer option fields are not described in this file. */ |
| 249 | + |
| 250 | +declare abstract class BaseIndexer { /* Abstract indexer base: holds opts, a fields list and a parseHttp flag, and declares the serialize, write and iterate members. */ |
| 251 | + opts: Partial<IndexCommandArgs>; |
| 252 | + fields: string[]; |
| 253 | + parseHttp: boolean; |
| 254 | + constructor(opts?: Partial<IndexCommandArgs>); |
| 255 | + serialize(result: Record<string, any>): string; |
| 256 | + write(result: Record<string, any>, out: WritableStreamBuffer | NodeJS.WriteStream): void; /* Output target is a WritableStreamBuffer or a Node write stream. */ |
| 257 | + writeAll(files: StreamResults, out: WritableStreamBuffer | NodeJS.WriteStream): Promise<void>; |
| 258 | + iterIndex(files: StreamResults): AsyncGenerator<Record<string, any>, void, unknown>; |
| 259 | + iterRecords(parser: WARCParser, filename: string): AsyncGenerator<Record<string, any>, void, unknown>; |
| 260 | + filterRecord?(record: WARCRecord): boolean; /* Optional member on the base class; CDXIndexer declares it as required. */ |
| 261 | + indexRecord(record: WARCRecord, indexerOffset: IndexerOffsetLength, filename: string): Record<string, any> | null; /* Returns a result object or null. */ |
| 262 | + setField(field: string, record: WARCRecord, result: Record<string, any>): void; |
| 263 | + getField(field: string, record: WARCRecord): string | number | null | undefined; |
| 264 | +} |
| 265 | +declare class Indexer extends BaseIndexer { /* Concrete subclass of BaseIndexer; only the constructor is redeclared. */ |
| 266 | + constructor(opts?: Partial<IndexCommandArgs>); |
| 267 | +} |
| 268 | +interface CDXAndRecord { /* Non-null return shape of CDXAndRecordIndexer.indexRecordPair: the cdx entry, the record, and its request record or null. NOTE(review): not included in the export list at the end of this file - confirm whether that is intended. */ |
| 269 | + cdx: Record<string, any>; |
| 270 | + record: WARCRecord; |
| 271 | + reqRecord: WARCRecord | null; |
| 272 | +} |
| 273 | +declare class CDXIndexer extends Indexer { /* Indexer subclass adding the includeAll, overrideIndexForAll and noSurt flags plus serializeCDXJ and serializeCDX11. */ |
| 274 | + includeAll: boolean; |
| 275 | + overrideIndexForAll: boolean; |
| 276 | + noSurt: boolean; |
| 277 | + _lastRecord: WARCRecord | null; |
| 278 | + constructor(opts?: Partial<CdxIndexCommandArgs>); |
| 279 | + iterRecords(parser: WARCParser, filename: string): AsyncGenerator<Record<string, any>, void, unknown>; |
| 280 | + filterRecord(record: WARCRecord): boolean; /* Required here, whereas BaseIndexer declares it optional. */ |
| 281 | + indexRecord(record: WARCRecord | null, indexOffset: IndexerOffsetLength, filename: string): Record<string, any> | null; /* Widens the base signature: record may be null here. */ |
| 282 | + indexRecordPair(record: WARCRecord, reqRecord: WARCRecord | null, indexOffset: IndexerOffsetLength, filename: string): Record<string, any> | null; /* Takes a record together with its request record, which may be null. */ |
| 283 | + serializeCDXJ(result: Record<string, any>): string; |
| 284 | + serializeCDX11(result: Record<string, any>): string; |
| 285 | + getField(field: string, record: WARCRecord): string | number | null | undefined; |
| 286 | +} |
| 287 | +declare class CDXAndRecordIndexer extends CDXIndexer { /* CDXIndexer subclass whose indexRecordPair returns a CDXAndRecord (or null) instead of a plain Record. */ |
| 288 | + constructor(opts?: Partial<CdxIndexCommandArgs>); |
| 289 | + indexRecordPair(record: WARCRecord, reqRecord: WARCRecord | null, indexOffset: IndexerOffsetLength, filename: string): CDXAndRecord | null; |
| 290 | +} |
| 291 | + |
| 292 | +declare function getSurt(url: string): string; /* Maps a URL string to a string, presumably its SURT form - TODO confirm against the implementation. */ |
| 293 | +declare function postToGetUrl(request: Request): boolean; /* Takes the module-local Request type declared in this file, not the global Fetch Request; returns a boolean. */ |
| 294 | +declare function appendRequestQuery(url: string, query: string, method: string): string; |
| 295 | +declare function jsonToQueryParams(json: string | any, ignoreInvalid?: boolean): URLSearchParams; /* The json parameter type collapses to any because of the union with any; returns URLSearchParams. */ |
| 296 | +declare function mfdToQueryParams(mfd: string | Uint8Array, contentType: string): URLSearchParams; /* Accepts the body as a string or bytes plus its content type; returns URLSearchParams. */ |
| 297 | +declare function jsonToQueryString(json: any, ignoreInvalid?: boolean): string; /* String-returning counterpart of jsonToQueryParams. */ |
| 298 | +declare function mfdToQueryString(mfd: string | Uint8Array, contentType: string): string; /* String-returning counterpart of mfdToQueryParams. */ |
| 299 | +declare function concatChunks(chunks: Uint8Array[], size: number): Uint8Array; /* Takes an array of chunks plus a size and returns one Uint8Array. */ |
| 300 | +declare function splitChunk(chunk: Uint8Array, inx: number): [Uint8Array, Uint8Array]; /* Takes one chunk and an index and returns a two-element tuple of Uint8Array. */ |
| 301 | + |
| 302 | +export { AsyncIterReader, AsyncIterReaderOpts, BaseAsyncIterReader, BaseSerializerBuffer, CDXAndRecordIndexer, CDXIndexer, Indexer, IndexerOffsetLength, LimitReader, NoConcatInflator, Request, Source, SourceReadable, SourceReader, StatusAndHeaders, StatusAndHeadersParser, StreamResult, StreamResults, WARCParser, WARCParserOpts, WARCRecord, WARCRecordOpts, WARCSerializer, WARCSerializerOpts, WARCType, WARC_1_0, WARC_1_1, appendRequestQuery, concatChunks, getSurt, jsonToQueryParams, jsonToQueryString, mfdToQueryParams, mfdToQueryString, postToGetUrl, splitChunk }; /* Single export list mixing types and values; BaseIndexer, CDXAndRecord, IndexCommandArgs and CdxIndexCommandArgs are declared in this file but not listed here. */ |
0 commit comments