Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pubmed articles return a truncated abstract #413

Open
brunoamaral opened this issue Jun 23, 2024 · 0 comments
Open

pubmed articles return a truncated abstract #413

brunoamaral opened this issue Jun 23, 2024 · 0 comments
Labels
bug Something isn't working help wanted Extra attention is needed

Comments

@brunoamaral
Copy link
Owner

RSS feed where the problem was found: https://pubmed.ncbi.nlm.nih.gov/rss/search/10guX6I3SqrbUeeLKSTD6FCRM44ewnrN2MKKTQLLPMHB4xNsZU/?limit=15&utm_campaign=pubmed-2&fc=20210216052009

This is the code in the feedreader.py file that processes the feed data:

		def update_articles_from_feeds(self):
			"""Create or update Articles from every RSS source marked 'science paper'.

			For each feed entry the method:

			1. Picks the best available abstract from the feed: ``summary``, then
			   ``summary_detail``, then (for PubMed links) the first ``content`` item.
			2. Extracts a DOI (``dc_identifier`` for PubMed links, ``prism_doi`` for
			   FASEB links).
			3. With a DOI: refreshes a ``SciencePaper`` and prefers its title and
			   abstract, falling back to the feed values; creates or updates the
			   article and links the authors listed on the ``SciencePaper``.
			4. Without a DOI: creates or updates the article from the feed values.

			Progress messages are written to ``self.stdout`` / ``print``.
			"""
			sources = Sources.objects.filter(method='rss', source_for='science paper')
			for source in sources:
				feed = self.fetch_feed(source.link, source.ignore_ssl)
				for entry in feed['entries']:
					title = entry['title']
					self.stdout.write(f"Processing {title}")

					# Choose the fullest abstract the feed offers; later sources win.
					summary = entry.get('summary', '')
					if hasattr(entry, 'summary_detail'):
						summary = entry['summary_detail']['value']
					if 'pubmed' in source.link and hasattr(entry, 'content'):
						summary = entry['content'][0]['value']

					# NOTE(review): if an entry has neither 'published' nor
					# 'prism_coverdate', parse() receives None and presumably raises —
					# confirm whether such entries should be skipped instead.
					published_date = parse(entry.get('published') or entry.get('prism_coverdate'), tzinfos=self.tzinfos).astimezone(pytz.utc)
					link = greg.remove_utm(entry['link'])

					doi = None
					if 'pubmed' in source.link and entry.get('dc_identifier', '').startswith('doi:'):
						doi = entry['dc_identifier'].replace('doi:', '')
					elif 'faseb' in source.link:
						doi = entry.get('prism_doi', '')

					if doi:
						crossref_paper = SciencePaper(doi=doi)
						crossref_paper.refresh()
						title = crossref_paper.title or title
						# Fall back to the feed-derived abstract chosen above, not to the
						# raw entry 'summary': the raw field discards the fuller
						# 'content'/'summary_detail' text and may be missing entirely.
						summary = crossref_paper.abstract or summary

						# Check if an article with the same DOI or title exists
						existing_article = Articles.objects.filter(Q(doi=doi) | Q(title=title)).first()
						if existing_article:
							science_paper = existing_article
							created = False
						else:
							science_paper = Articles.objects.create(
								doi=doi,
								title=title,
								summary=summary,
								link=link,
								published_date=published_date,
								container_title=crossref_paper.journal,
								publisher=crossref_paper.publisher,
								access=crossref_paper.access,
								crossref_check=timezone.now()
							)
							created = True

						if created:
							science_paper.teams.add(source.team)
							science_paper.subjects.add(source.subject)
							science_paper.sources.add(source)
							science_paper.save()
						else:
							# Only write when one of the tracked fields actually changed.
							cleaned_summary = SciencePaper.clean_abstract(abstract=summary)
							if any([science_paper.title != title, science_paper.summary != cleaned_summary,
									science_paper.link != link, science_paper.published_date != published_date]):
								science_paper.title = title
								science_paper.summary = cleaned_summary
								science_paper.link = link
								science_paper.published_date = published_date
								science_paper.sources.add(source)
								science_paper.teams.add(source.team)
								science_paper.subjects.add(source.subject)
								science_paper.save()

						# Process author information taken from the SciencePaper metadata
						if crossref_paper.authors is not None:
							for author_info in crossref_paper.authors:
								given_name = author_info.get('given')
								family_name = author_info.get('family')
								orcid = author_info.get('ORCID', None)
								try:
									if orcid:  # If ORCID is present, use it as the primary key for author lookup/creation
										author_obj, author_created = Authors.objects.get_or_create(
											ORCID=orcid,
											defaults={
												'given_name': given_name,
												'family_name': family_name
											}
										)
									else:  # If no ORCID is provided, fallback to using given_name and family_name for lookup/creation
										if not given_name or not family_name:
											self.stdout.write(f"Missing given name or family name, skipping this author. {crossref_paper.doi}")
											continue
										author_obj, author_created = Authors.objects.get_or_create(
											given_name=given_name,
											family_name=family_name,
											defaults={'ORCID': orcid}  # orcid is falsy here (None unless the metadata carried an empty value)
										)
								except MultipleObjectsReturned:
									# Handle the case where multiple authors are returned
									authors = Authors.objects.filter(given_name=given_name, family_name=family_name)
									print(f"Multiple authors found for {given_name} {family_name}:")
									for author in authors:
										print(f"Author ID: {author.author_id}, ORCID: {author.ORCID}")
									# Use the first author with an ORCID, if available
									author_obj = next((author for author in authors if author.ORCID), authors.first())
									if author_obj is None:
										# The name filter can match nothing (e.g. the duplicates
										# came from the ORCID lookup); skip rather than crash below.
										self.stdout.write(f"Could not resolve a single author for {given_name} {family_name}, skipping this author. {crossref_paper.doi}")
										continue

								# Link the author to the article if not already linked
								if not science_paper.authors.filter(pk=author_obj.pk).exists():
									science_paper.authors.add(author_obj)
					else:
						print('no DOI, trying to create article')
						existing_article = Articles.objects.filter(title=title).first()
						if existing_article:
							science_paper = existing_article
							created = False
						else:
							# NOTE(review): this passes `source=source`, while every other
							# path links via the `sources`/`teams`/`subjects` relations, and a
							# newly created no-DOI article gets none of those links — confirm
							# against the Articles model.
							science_paper = Articles.objects.create(
								title=title,
								summary=summary,
								link=link,
								published_date=published_date,
								source=source,
								crossref_check=None
							)
							created = True

						if not created:
							# Only write when one of the tracked fields actually changed.
							cleaned_summary = SciencePaper.clean_abstract(abstract=summary)
							if any([science_paper.title != title, science_paper.summary != cleaned_summary,
									science_paper.link != link, science_paper.published_date != published_date]):
								science_paper.title = title
								science_paper.summary = cleaned_summary
								science_paper.link = link
								science_paper.published_date = published_date
								science_paper.teams.add(source.team)
								science_paper.subjects.add(source.subject)
								science_paper.sources.add(source)
								science_paper.save()
@brunoamaral brunoamaral added bug Something isn't working help wanted Extra attention is needed labels Jun 23, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bug Something isn't working help wanted Extra attention is needed
Projects
None yet
Development

No branches or pull requests

1 participant