1
1
import { BaseExtractor , ExtractorResult } from './_base' ;
2
2
3
3
export class TwitterExtractor extends BaseExtractor {
4
- private mainTweet : Element | null ;
5
- private threadTweets : Element [ ] ;
4
+ private mainTweet : Element | null = null ;
5
+ private threadTweets : Element [ ] = [ ] ;
6
6
7
7
/**
 * Locates the tweet elements this extractor will work on.
 *
 * When the conversation timeline container is present, the tweets inside it
 * are collected in document order and cut off at the first tweet that comes
 * after the parent of the first `section`/`h2` found in the timeline (per the
 * original comment, headings such as "Discover more"). The first remaining
 * tweet becomes `mainTweet`; the rest become `threadTweets`.
 *
 * When there is no timeline container, the first tweet article anywhere in
 * the document (if any) becomes `mainTweet` and `threadTweets` is not
 * assigned here.
 *
 * @param document - The DOM document to search for tweet articles.
 * @param url - Forwarded unchanged to the base extractor.
 */
constructor(document: Document, url: string) {
  super(document, url);

  const tweetSelector = 'article[data-testid="tweet"]';

  // Get all tweets from the timeline
  const timeline = document.querySelector('[aria-label="Timeline: Conversation"]');
  if (!timeline) {
    // Not in timeline view: fall back to a single tweet, or null if none exists.
    this.mainTweet = document.querySelector(tweetSelector);
    return;
  }

  const allTweets = Array.from(timeline.querySelectorAll(tweetSelector));
  const firstSection = timeline.querySelector('section, h2')?.parentElement;

  // Keep only the tweets that precede the first section. The cutoff index is
  // found up front instead of splicing the array while iterating over it, and
  // without relying on a `return false` that `forEach` would ignore.
  let conversationTweets = allTweets;
  if (firstSection) {
    // NOTE(review): per the DOM spec the FOLLOWING bit is also set for
    // descendants of `firstSection`, so tweets nested inside that parent are
    // treated as coming after it — confirm this is intended.
    const cutoff = allTweets.findIndex(
      (tweet) =>
        (firstSection.compareDocumentPosition(tweet) & Node.DOCUMENT_POSITION_FOLLOWING) !== 0,
    );
    if (cutoff !== -1) {
      conversationTweets = allTweets.slice(0, cutoff);
    }
  }

  // Set main tweet and thread tweets
  this.mainTweet = conversationTweets[0] ?? null;
  this.threadTweets = conversationTweets.slice(1);
}
13
39
14
40
canExtract ( ) : boolean {
@@ -17,14 +43,15 @@ export class TwitterExtractor extends BaseExtractor {
17
43
18
44
extract ( ) : ExtractorResult {
19
45
const mainContent = this . extractTweet ( this . mainTweet ) ;
20
- const threadContent = this . threadTweets . map ( tweet => this . extractTweet ( tweet ) ) . join ( '\n\n' ) ;
46
+ const threadContent = this . threadTweets . map ( tweet => this . extractTweet ( tweet ) ) . join ( '\n<hr> \n' ) ;
21
47
22
48
const contentHtml = `
23
49
<div class="tweet-thread">
24
50
<div class="main-tweet">
25
51
${ mainContent }
26
52
</div>
27
53
${ threadContent ? `
54
+ <hr>
28
55
<div class="thread-tweets">
29
56
${ threadContent }
30
57
</div>
0 commit comments