pubmed articles return a truncated abstract #413

brunoamaral · 2024-06-23T19:52:23Z

rss feed where the problem was found: https://pubmed.ncbi.nlm.nih.gov/rss/search/10guX6I3SqrbUeeLKSTD6FCRM44ewnrN2MKKTQLLPMHB4xNsZU/?limit=15&utm_campaign=pubmed-2&fc=20210216052009

The relevant code is in the feedreader.py file, which processes the feed data:

		def update_articles_from_feeds(self):
			"""Create or update ``Articles`` from every RSS source of science papers.

			For each entry of each feed this method:

			1. Picks the richest summary the feed offers: ``summary``, then
			   ``summary_detail``, then (for PubMed links) the first ``content``
			   element.
			2. Extracts a DOI when the feed exposes one (PubMed ``dc_identifier``
			   or FASEB ``prism_doi``).
			3. With a DOI: refreshes a ``SciencePaper`` and prefers its title and
			   abstract, creates or updates the matching article and links the
			   authors listed by ``crossref_paper.authors``.
			4. Without a DOI: creates or updates the article matched by title only.

			Returns nothing; all results are written to the database.
			"""
			sources = Sources.objects.filter(method='rss', source_for='science paper')
			for source in sources:
				feed = self.fetch_feed(source.link, source.ignore_ssl)
				for entry in feed['entries']:
					title = entry['title']
					self.stdout.write(f"Processing {title}")

					# Pick the most complete abstract available in the feed itself.
					summary = entry.get('summary', '')
					if hasattr(entry, 'summary_detail'):
						summary = entry['summary_detail']['value']
					if 'pubmed' in source.link and hasattr(entry, 'content'):
						# For PubMed links the first `content` element takes
						# precedence over `summary` / `summary_detail`.
						summary = entry['content'][0]['value']

					published_date = parse(entry.get('published') or entry.get('prism_coverdate'), tzinfos=self.tzinfos).astimezone(pytz.utc)
					link = greg.remove_utm(entry['link'])

					doi = None
					if 'pubmed' in source.link and entry.get('dc_identifier', '').startswith('doi:'):
						doi = entry['dc_identifier'].replace('doi:', '')
					elif 'faseb' in source.link:
						doi = entry.get('prism_doi', '')

					if doi:
						crossref_paper = SciencePaper(doi=doi)
						crossref_paper.refresh()
						title = crossref_paper.title or title
						# Fall back to the summary selected above, NOT to
						# entry.get('summary'): the latter threw away the fuller
						# `content` / `summary_detail` text and stored a truncated
						# abstract whenever Crossref had none.
						summary = crossref_paper.abstract or summary

						# Check if an article with the same DOI or title exists
						existing_article = Articles.objects.filter(Q(doi=doi) | Q(title=title)).first()
						if existing_article:
							science_paper = existing_article
							created = False
						else:
							science_paper = Articles.objects.create(
								doi=doi,
								title=title,
								summary=summary,
								link=link,
								published_date=published_date,
								container_title=crossref_paper.journal,
								publisher=crossref_paper.publisher,
								access=crossref_paper.access,
								crossref_check=timezone.now()
							)
							created = True

						if created:
							science_paper.teams.add(source.team)
							science_paper.subjects.add(source.subject)
							science_paper.sources.add(source)
							science_paper.save()
						else:
							cleaned_summary = SciencePaper.clean_abstract(abstract=summary)
							if any([science_paper.title != title, science_paper.summary != cleaned_summary,
									science_paper.link != link, science_paper.published_date != published_date]):
								science_paper.title = title
								science_paper.summary = cleaned_summary
								science_paper.link = link
								science_paper.published_date = published_date
								science_paper.sources.add(source)
								science_paper.teams.add(source.team)
								science_paper.subjects.add(source.subject)
								science_paper.save()

						# Process author information
						if crossref_paper is not None and crossref_paper.authors is not None:
							for author_info in crossref_paper.authors:
								given_name = author_info.get('given')
								family_name = author_info.get('family')
								orcid = author_info.get('ORCID', None)
								try:
									if orcid:  # If ORCID is present, use it as the primary key for author lookup/creation
										author_obj, author_created = Authors.objects.get_or_create(
											ORCID=orcid,
											defaults={
												'given_name': given_name,
												'family_name': family_name
											}
										)
									else:  # If no ORCID is provided, fallback to using given_name and family_name for lookup/creation
										if not given_name or not family_name:
											self.stdout.write(f"Missing given name or family name, skipping this author. {crossref_paper.doi}")
											continue
										author_obj, author_created = Authors.objects.get_or_create(
											given_name=given_name,
											family_name=family_name,
											defaults={'ORCID': orcid}  # orcid is falsy in this branch (None when the key is absent)
										)
								except MultipleObjectsReturned:
									# Handle the case where multiple authors are returned
									authors = Authors.objects.filter(given_name=given_name, family_name=family_name)
									print(f"Multiple authors found for {given_name} {family_name}:")
									for author in authors:
										print(f"Author ID: {author.author_id}, ORCID: {author.ORCID}")
									# Use the first author with an ORCID, if available
									author_obj = next((author for author in authors if author.ORCID), authors.first())

								if author_obj is None:
									# The name-based re-query above can come back empty
									# (e.g. the duplicates were found via ORCID); there
									# is nothing to link in that case.
									continue

								# Link the author to the article if not already linked
								if not science_paper.authors.filter(pk=author_obj.pk).exists():
									science_paper.authors.add(author_obj)
					else:
						print('no DOI, trying to create article')
						existing_article = Articles.objects.filter(title=title).first()
						if existing_article:
							science_paper = existing_article
							created = False
						else:
							# NOTE(review): this passes `source=` while the DOI branch
							# uses the `sources` relation, and a newly created article
							# gets no team/subject here - confirm this is intended.
							science_paper = Articles.objects.create(
								title=title,
								summary=summary,
								link=link,
								published_date=published_date,
								source=source,
								crossref_check=None
							)
							created = True

						if not created:
							cleaned_summary = SciencePaper.clean_abstract(abstract=summary)
							if any([science_paper.title != title, science_paper.summary != cleaned_summary,
									science_paper.link != link, science_paper.published_date != published_date]):
								science_paper.title = title
								science_paper.summary = cleaned_summary
								science_paper.link = link
								science_paper.published_date = published_date
								science_paper.teams.add(source.team)
								science_paper.subjects.add(source.subject)
								science_paper.sources.add(source)
								science_paper.save()

brunoamaral added bug Something isn't working help wanted Extra attention is needed labels Jun 23, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

pubmed articles return a truncated abstract #413

pubmed articles return a truncated abstract #413

brunoamaral commented Jun 23, 2024

pubmed articles return a truncated abstract #413

pubmed articles return a truncated abstract #413

Comments

brunoamaral commented Jun 23, 2024