from metapub import PubMedArticle, PubMedFetcher, PubMedAuthor, pubmedcentral
from . import entry, file
from typing import List
import requests
from xml.etree import ElementTree
import time
from selenium import webdriver, common
import atexit
import os.path as op
from functools import lru_cache


@lru_cache()
def get_fetcher():
    return PubMedFetcher()


@lru_cache()
def get_firefox():
    """Returns a cached headless Firefox driver, closed automatically at exit"""
    fo = webdriver.FirefoxOptions()
    fo.headless = True
    firefox = webdriver.Firefox(options=fo, log_path=op.expanduser('~/.log/geckodriver.log'), timeout=5)
    atexit.register(lambda: firefox.quit())
    return firefox
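
# Example (illustrative sketch, not part of the original module): the cached
# driver is shared by every caller, so repeated calls reuse one headless
# browser; geckodriver must be installed and on the PATH.
#
#     driver = get_firefox()
#     driver.get('https://example.org')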


def query_mult(pmids) -> List[PubMedArticle]:
    """
    Creates a list of pubmed articles from a single query

    Args:
        pmids: sequence of pubmed ids

    Returns:
        list of PubMedArticle objects in the same order as `pmids`
        (None for ids that could not be fetched)
    """
    if isinstance(pmids, str):
        pmids = [pmids]
    if len(pmids) > 250:
        return query_mult(pmids[:250]) + query_mult(pmids[250:])
    res = get_fetcher().qs.efetch({'db': 'pubmed', 'id': ','.join(pmids)})
    lines = res.split(b'\n')
    open_tags = (b'<PubmedArticle>', b'<PubmedBookArticle>')
    close_tags = (b'</PubmedArticle>', b'</PubmedBookArticle>')
    start = [idx for idx, line in enumerate(lines) if any(m in line for m in open_tags)]
    end = [idx + 1 for idx, line in enumerate(lines) if any(m in line for m in close_tags)]
    articles = [None] * len(pmids)
    for idx1, idx2 in zip(start, end):
        # wrap each article element in the shared XML header and footer so it parses on its own
        single_lines = lines[:start[0]] + lines[idx1:idx2] + lines[end[-1]:]
        article = PubMedArticle(b'\n'.join(single_lines))
        # articles may come back in a different order, so place each one by its pmid
        articles[pmids.index(article.pmid)] = article
    return articles
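
# Example (illustrative sketch): the PMIDs below are placeholders rather than
# values used by this project; results come back in the same order as the
# input, with None for any id that could not be fetched.
#
#     articles = query_mult(['17284678', '20021716'])
#     print(articles[0].title, articles[0].journal)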


def to_bibtex(article: PubMedArticle) -> entry.BibTexEntry:
    """
    Converts a PubMedArticle to a bibtex entry

    Args:
        article: article metadata from pubmed

    Returns:
        bibtex entry
    """
    # bibtex tag names whose PubMedArticle attribute is named differently
    convert = {
        'author': 'author_list',
        'number': 'issue',
        'editor': 'book_editors',
        'publisher': 'book_publisher',
    }
    tags = {}
    for tag_name in (
            'pmid', 'doi', 'abstract', 'number', 'pages', 'editor',
            'author', 'volume', 'publisher', 'year', 'journal', 'title',
    ):
        value = getattr(article, convert.get(tag_name, tag_name))
        if value is not None:
            if tag_name == 'author':
                value = ' and '.join([format_author(val) for val in value])
            elif isinstance(value, list):
                value = ' '.join(value)
            tags[tag_name] = value.replace('%', r'\%')
    # any required tag still missing is filled with a placeholder
    for tag_name in entry.required_tags[article.pubmed_type]:
        if isinstance(tag_name, str) and tag_name not in tags:
            tags[tag_name] = '?'
    name = article.author_list[0].last_name if len(article.author_list) > 0 else 'UNK'
    key = f'{name}:{article.year}'
    return entry.BibTexEntry(article.pubmed_type, key, **tags)
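
# Example (illustrative sketch): chaining query_mult with to_bibtex; the PMID
# is a placeholder, and how the resulting BibTexEntry is rendered depends on
# the local `entry` module rather than anything shown here.
#
#     article = query_mult(['17284678'])[0]
#     bibtex_entry = to_bibtex(article)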


def doi_to_bibtex(doi: str) -> entry.BibTexEntry:
    """
    Converts a DOI into a bibtex entry using doi.org

    :param doi: DOI of the article
    :return: corresponding bibtex entry
    """
    as_text = requests.get(f'https://doi.org/{doi}', headers={'Accept': 'application/x-bibtex'}).text
    return file.BibTexSet.from_string(as_text)
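
# Example (illustrative sketch): doi.org returns bibtex through content
# negotiation when the Accept header asks for 'application/x-bibtex'; the DOI
# below is a placeholder.
#
#     bib = doi_to_bibtex('10.1000/xyz123')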


def pmid_from_doi(identifiers):
    """
    Converts a sequence of identifiers into pmids

    :param identifiers: list of dois or pmcids
    :return: list of pmids (None where no match was found)
    """
    if isinstance(identifiers, str):
        identifiers = [identifiers]
    if len(identifiers) > 25:
        return pmid_from_doi(identifiers[:25]) + pmid_from_doi(identifiers[25:])
    time.sleep(0.33)
    xml = requests.get(pubmedcentral.PMC_ID_CONVERSION_URI % ' '.join(identifiers)).content
    records = ElementTree.fromstring(xml).findall('record')
    # if fewer records come back than identifiers, mark the first unmatched
    # identifier as bad and retry the conversion with the remainder
    while len(records) < len(identifiers):
        print('bad ID:', identifiers[len(records)])
        records.append(None)
        time.sleep(0.33)
        xml = requests.get(pubmedcentral.PMC_ID_CONVERSION_URI % ' '.join(identifiers[len(records):])).content
        records.extend(ElementTree.fromstring(xml).findall('record'))
    res = [None] * len(identifiers)
    for record in records:
        if (
            record is not None and
            record.get('doi') is not None and
            record.get('doi') in identifiers
        ):
            res[identifiers.index(record.get('doi'))] = record.get('pmid')
    return res
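
# Example (illustrative sketch): positions that cannot be resolved stay None;
# the DOI below is a placeholder.
#
#     pmids = pmid_from_doi(['10.1000/xyz123'])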


def biorxiv_to_doi(bioarxiv_url: str) -> str:
    """
    Returns the DOI associated with a bioRxiv article

    If a published version is listed on the page, its DOI is returned;
    otherwise the DOI from the citation metadata is used. Returns None if the
    page times out or no DOI can be found.

    :param bioarxiv_url: URL of the article page (e.g., https://www.biorxiv.org/content/early/2018/06/14/266627)
    :return: DOI
    """
    firefox = get_firefox()
    try:
        firefox.get(bioarxiv_url)
    except common.exceptions.TimeoutException:
        print(f'timeout for {bioarxiv_url}')
        return None
    text = firefox.find_element_by_class_name('pub_jnl').text
    if 'doi:' in text:
        return text.split()[text.split().index('doi:') + 1]
    # fall back to the citation metadata block when the journal line has no doi
    for line in firefox.find_element_by_class_name('highwire-cite-metadata').text.splitlines():
        if 'doi:' in line:
            return line.split()[line.split().index('doi:') + 1]
    return None
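
# Example (illustrative sketch): uses the shared headless Firefox session from
# get_firefox(), so geckodriver must be available; the URL is the example from
# the docstring above.
#
#     doi = biorxiv_to_doi('https://www.biorxiv.org/content/early/2018/06/14/266627')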