from metapub import PubMedArticle, PubMedFetcher, PubMedAuthor, pubmedcentral
from . import entry, file
from typing import List
import requests
from xml.etree import ElementTree
import time
from selenium import webdriver, common
import atexit
import os.path as op
from functools import lru_cache


@lru_cache()
def get_fetcher():
    return PubMedFetcher()


@lru_cache()
def get_firefox():
    """Returns a cached headless Firefox driver, closed automatically at exit"""
    fo = webdriver.FirefoxOptions()
    fo.headless = True
    firefox = webdriver.Firefox(options=fo, log_path=op.expanduser('~/.log/geckodriver.log'), timeout=5)
    atexit.register(lambda: firefox.quit())
    return firefox
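
# Example (illustrative sketch, not part of the original module): the cached
# driver is shared by every caller, so repeated calls reuse one headless
# browser; geckodriver must be installed and on the PATH.
#
#     driver = get_firefox()
#     driver.get('https://example.org')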


def query_mult(pmids) -> List[PubMedArticle]:
    """
    Creates a list of pubmed articles from a single query

    Args:
        pmids: sequence of pubmed ids

    Returns:
        list of PubMedArticle objects in the same order as `pmids`
        (None for ids that could not be fetched)
    """
    if isinstance(pmids, str):
        pmids = [pmids]
    if len(pmids) > 250:
        return query_mult(pmids[:250]) + query_mult(pmids[250:])
    res = get_fetcher().qs.efetch({'db': 'pubmed', 'id': ','.join(pmids)})
    lines = res.split(b'\n')
    open_tags = (b'<PubmedArticle>', b'<PubmedBookArticle>')
    close_tags = (b'</PubmedArticle>', b'</PubmedBookArticle>')
    start = [idx for idx, line in enumerate(lines) if any(m in line for m in open_tags)]
    end = [idx + 1 for idx, line in enumerate(lines) if any(m in line for m in close_tags)]
    articles = [None] * len(pmids)
    for idx1, idx2 in zip(start, end):
        # wrap each article element in the shared XML header and footer so it parses on its own
        single_lines = lines[:start[0]] + lines[idx1:idx2] + lines[end[-1]:]
        article = PubMedArticle(b'\n'.join(single_lines))
        # articles may come back in a different order, so place each one by its pmid
        articles[pmids.index(article.pmid)] = article
    return articles
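
# Example (illustrative sketch): the PMIDs below are placeholders rather than
# values used by this project; results come back in the same order as the
# input, with None for any id that could not be fetched.
#
#     articles = query_mult(['17284678', '20021716'])
#     print(articles[0].title, articles[0].journal)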


def to_bibtex(article: PubMedArticle) -> entry.BibTexEntry:
    """
    Converts a PubMedArticle to a bibtex entry

    Args:
        article: article metadata from pubmed

    Returns:
        bibtex entry
    """
    # bibtex tag names whose PubMedArticle attribute is named differently
    convert = {
        'author': 'author_list',
        'number': 'issue',
        'editor': 'book_editors',
        'publisher': 'book_publisher',
    }
    tags = {}
    for tag_name in (
            'pmid', 'doi', 'abstract', 'number', 'pages', 'editor',
            'author', 'volume', 'publisher', 'year', 'journal', 'title',
    ):
        value = getattr(article, convert.get(tag_name, tag_name))
        if value is not None:
            if tag_name == 'author':
                value = ' and '.join([format_author(val) for val in value])
            elif isinstance(value, list):
                value = ' '.join(value)
            tags[tag_name] = value.replace('%', r'\%')
    # any required tag still missing is filled with a placeholder
    for tag_name in entry.required_tags[article.pubmed_type]:
        if isinstance(tag_name, str) and tag_name not in tags:
            tags[tag_name] = '?'
    name = article.author_list[0].last_name if len(article.author_list) > 0 else 'UNK'
    key = f'{name}:{article.year}'
    return entry.BibTexEntry(article.pubmed_type, key, **tags)
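
# Example (illustrative sketch): chaining query_mult with to_bibtex; the PMID
# is a placeholder, and how the resulting BibTexEntry is rendered depends on
# the local `entry` module rather than anything shown here.
#
#     article = query_mult(['17284678'])[0]
#     bibtex_entry = to_bibtex(article)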


def doi_to_bibtex(doi: str) -> entry.BibTexEntry:
    """
    Converts a DOI into a bibtex entry using doi.org

    :param doi: DOI of the article
    :return: corresponding bibtex entry
    """
    as_text = requests.get(f'https://doi.org/{doi}', headers={'Accept': 'application/x-bibtex'}).text
    return file.BibTexSet.from_string(as_text)
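
# Example (illustrative sketch): doi.org returns bibtex through content
# negotiation when the Accept header asks for 'application/x-bibtex'; the DOI
# below is a placeholder.
#
#     bib = doi_to_bibtex('10.1000/xyz123')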


def pmid_from_doi(identifiers):
    """
    Converts a sequence of identifiers into pmids

    :param identifiers: list of dois or pmcids
    :return: list of pmids (None where no match was found)
    """
    if isinstance(identifiers, str):
        identifiers = [identifiers]
    if len(identifiers) > 25:
        return pmid_from_doi(identifiers[:25]) + pmid_from_doi(identifiers[25:])
    time.sleep(0.33)
    xml = requests.get(pubmedcentral.PMC_ID_CONVERSION_URI % ' '.join(identifiers)).content
    records = ElementTree.fromstring(xml).findall('record')
    # if fewer records come back than identifiers, mark the first unmatched
    # identifier as bad and retry the conversion with the remainder
    while len(records) < len(identifiers):
        print('bad ID:', identifiers[len(records)])
        records.append(None)
        time.sleep(0.33)
        xml = requests.get(pubmedcentral.PMC_ID_CONVERSION_URI % ' '.join(identifiers[len(records):])).content
        records.extend(ElementTree.fromstring(xml).findall('record'))
    res = [None] * len(identifiers)
    for record in records:
        if (
            record is not None and
            record.get('doi') is not None and
            record.get('doi') in identifiers
        ):
            res[identifiers.index(record.get('doi'))] = record.get('pmid')
    return res
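
# Example (illustrative sketch): positions that cannot be resolved stay None;
# the DOI below is a placeholder.
#
#     pmids = pmid_from_doi(['10.1000/xyz123'])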


def biorxiv_to_doi(bioarxiv_url: str) -> str:
    """
    Returns the DOI associated with a bioRxiv article

    If a published version is listed on the page, its DOI is returned;
    otherwise the DOI from the citation metadata is used. Returns None if the
    page times out or no DOI can be found.

    :param bioarxiv_url: URL of the article page (e.g., https://www.biorxiv.org/content/early/2018/06/14/266627)
    :return: DOI
    """
    firefox = get_firefox()
    try:
        firefox.get(bioarxiv_url)
    except common.exceptions.TimeoutException:
        print(f'timeout for {bioarxiv_url}')
        return None
    text = firefox.find_element_by_class_name('pub_jnl').text
    if 'doi:' in text:
        return text.split()[text.split().index('doi:') + 1]
    # fall back to the citation metadata block when the journal line has no doi
    for line in firefox.find_element_by_class_name('highwire-cite-metadata').text.splitlines():
        if 'doi:' in line:
            return line.split()[line.split().index('doi:') + 1]
    return None
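
# Example (illustrative sketch): uses the shared headless Firefox session from
# get_firefox(), so geckodriver must be available; the URL is the example from
# the docstring above.
#
#     doi = biorxiv_to_doi('https://www.biorxiv.org/content/early/2018/06/14/266627')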