
HistoryClient.py

00001 """Search and retreive information using the EUtils history.

EUtils has two major modes.  One uses history while the other uses
database identifiers.  This is a high-level interface for working with
the history.  You should use this module if you expect to work with
a large or unknown number of identifiers.

See DBIdsClient if you want to get information about a set of known
database identifiers.

>>> import EUtils
>>> from EUtils import HistoryClient
>>> client = HistoryClient.HistoryClient()
>>> cancer = client.search("cancer")
>>> print len(cancer)
1458353
>>> 

That's quite a few hits.  Most people would like to see the first few
records then try to refine the search.

>>> print cancer[:5].efetch(retmode = "text", rettype = "docsum").read()

1:  Seow-Choen F.
Author's reply: Adjuvant therapy for rectal cancer cannot be based on the
results of other surgeons (Br J Surg 2002; 89: 946-947).
Br J Surg. 2003 Jan;90(1):121-122.
PMID: 12520589 [PubMed - as supplied by publisher]

2:  Mortensen N, Lindsey I.
Adjuvant therapy for rectal cancer cannot be based on the results of other
surgeons (Br J Surg 2002; 89: 946-947).
Br J Surg. 2003 Jan;90(1):121.
PMID: 12520588 [PubMed - as supplied by publisher]

3:  Osugi H, Takemura M, Higashino M, Takada N, Lee S, Kinoshita H.
A comparison of video-assisted thoracoscopic oesophagectomy and radical lymph
node dissection for squamous cell cancer of the oesophagus with open operation.
Br J Surg. 2003 Jan;90(1):108-13.
PMID: 12520585 [PubMed - in process]

4:  Tanaka M, Kitajima Y, Sato S, Miyazaki K.
Combined evaluation of mucin antigen and E-cadherin expression may help select
patients with gastric cancer suitable for minimally invasive therapy.
Br J Surg. 2003 Jan;90(1):95-101.
PMID: 12520583 [PubMed - in process]

5:  Diaz De Liano A, Oteiza Martinez F, Ciga MA, Aizcorbe M, Cobo F, Trujillo R.
Impact of surgical procedure for gastric cancer on quality of life.
Br J Surg. 2003 Jan;90(1):91-4.
PMID: 12520582 [PubMed - in process]

>>>

Now refine the query to publications from the last day.

>>> recent_cancer = client.search("#%s" % (cancer.query_key,),
...                               daterange = EUtils.WithinNDays(1))
>>> len(recent_cancer)
106
>>>

Still quite a few.  What's the last one about?
>>> for k, v in recent_cancer[-1].summary().dataitems.allitems():
...     print k, "=", v
...

PubDate = 2002/12/01
Source = Nippon Shokakibyo Gakkai Zasshi
Authors = Kuroki T
Title = [Strategy against cancer in 21 century, with emphasis of cancer prevention and refractory cancer]
Volume = 99
Pages = 1423-7
EntrezDate = 2003/01/10
PubMedId = 12518389
MedlineId = 22406828
Lang = Japanese
PubType =
RecordStatus = PubMed - in process
Issue = 12
SO = 2002 Dec;99(12):1423-7
DOI =
JTA = KJY
ISSN = 0446-6586
PubId =
PubStatus = 4
Status = 6
HasAbstract = 0
ArticleIds = {'MedlineUID': u'22406828', 'PubMedId': u'12518389'}
>>> 

Here's an interesting one.  Which articles are related to this one but
are not about cancer?  First, get the related articles.


>>> neighbors = recent_cancer[-1].neighbor_links()
>>> dbids = neighbors.linksetdbs["pubmed_pubmed"].dbids
>>> len(dbids)
10296
>>> 

Upload those identifiers back to the server.

>>> related_result = client.post(dbids)
>>>
>>> non_cancer = client.search("#%s NOT #%s" % (related_result.query_key,
...                                             cancer.query_key))
>>> len(non_cancer)
4000
>>>

The HistoryClient instance has an attribute named 'query_history'
which stores the searches done so far, keyed by the query_key value
assigned by the server.  The history on the server can expire.  If
that is detected during a search then previous results are invalidated
and removed from the query_history.  Future requests from invalidated
results will raise an error.

If a request is made from a search which has not been invalidated but
whose history has expired then queries like 'summary' will raise an
error.  Some other request (like 'dbids') may return success but
contain undefined information.
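
For example, each successful search is recorded in query_history under the
query_key assigned by the server (a sketch; the key values and stored record
sets depend on the searches done so far in this session):

>>> cancer.query_key in client.query_history
True
>>> client.query_history[cancer.query_key] is cancer
True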

""" #"
    
import types
import ThinClient, parse, Datatypes, Mixins, Config

class HistoryCookie:
    """Data needed to get back to the history"""
    def __init__(self, db, webenv_ref, query_key):
        self.db = db
        self.webenv_ref = webenv_ref
        self.query_key = query_key


class HistoryLookup(object):
    """Look up information about a search in history

    To get the list of dbids by fetching the server's "uilist",
    use the "dbids" attribute.
    """
    def __init__(self, eutils, cookie, retstart, retmax):
        self.eutils = eutils
        self.cookie = cookie
        self.retstart = retstart
        self.retmax = retmax
        self.db = cookie.db
        self.query_key = cookie.query_key

    def _check_invalid(self):
        # Check if we can get data from this history
        if self.cookie.query_key is None:
            raise NotImplementedError("empty data set")
        if self.query_key is None:
            raise Datatypes.EUtilsError(
                "query history no longer available on server")

    def esummary(self, retmode = 'xml', rettype = None):
        """Request the eSummary for this history; returns the socket handle"""
        self._check_invalid()
        infile = self.eutils.esummary_using_history(
            webenv = self.cookie.webenv_ref[0],
            db = self.cookie.db,
            query_key = self.cookie.query_key,
            retstart = self.retstart,
            retmax = self.retmax)
        return infile

    def summary(self):
        """the Datatypes.Summary for this history"""
        return parse.parse_summary_xml(self.esummary("xml"))

    def elink(self,
              db = "pubmed",
              cmd = "neighbor",
              term = None,
              field = None,
              daterange = None):
        """Request an eLink for this history; returns the socket handle"""
        self._check_invalid()
        return self.eutils.elink_using_history(
            webenv = self.cookie.webenv_ref[0],
            query_key = self.cookie.query_key,
            db = db,
            dbfrom = self.cookie.db,
            cmd = cmd,
            retstart = self.retstart,
            retmax = self.retmax,
            daterange = daterange,
            term = term,
            field = field,
            )

    def _get_dbids(self):
        infile = self.efetch(retmode = "text", rettype = "uilist")
        ids = parse.parse_fetch_identifiers(infile)
        return Datatypes.DBIds(self.cookie.db, ids)
    dbids = property(_get_dbids, None, None,
        "The DBIds for this results set, fetched from the server's 'uilist'")
    

class HistoryRecord(HistoryLookup):
    """Get information about a single record in a history"""
    def __init__(self, eutils, cookie, offset):
        HistoryLookup.__init__(self, eutils, cookie, offset, 1)
    def summary(self):
        """the Datatypes.Summary for this history record"""
        return HistoryLookup.summary(self)[0]
    


class SequenceHistoryFetchMixin:
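    """Mixin providing efetch for sequence databases, using the history"""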
    def efetch(self, retmode = 'xml', rettype = None,
               seq_start = None, seq_stop = None, strand = None,
               complexity = None):
        self._check_invalid()
        if strand not in (None, 1, 2):
            raise TypeError("Strand can only be 1 (plus, default) or 2 (minus)")
        return self.eutils.efetch_using_history(
            webenv = self.cookie.webenv_ref[0],
            db = self.cookie.db,
            query_key = self.cookie.query_key,
            retstart = self.retstart,
            retmax = self.retmax,
            retmode = retmode,
            rettype = rettype,
            seq_start = seq_start,
            seq_stop = seq_stop,
            strand = strand,
            complexity = complexity)

class SequenceHistoryRecord(Mixins.SequenceFetchMixin,
                            SequenceHistoryFetchMixin,
                            HistoryRecord):
    pass

class PublicationHistoryFetchMixin:
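    """Mixin providing efetch for publication databases, using the history"""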
    def efetch(self, retmode = "xml", rettype = None):
        self._check_invalid()
        return self.eutils.efetch_using_history(
            webenv = self.cookie.webenv_ref[0],
            db = self.cookie.db,
            query_key = self.cookie.query_key,
            retstart = self.retstart,
            retmax = self.retmax,
            retmode = retmode,
            rettype = rettype)

class PublicationHistoryRecord(Mixins.PublicationFetchMixin,
                               PublicationHistoryFetchMixin,
                               HistoryRecord):
    pass

class BaseHistoryRecordSet(HistoryLookup):
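    """A range of records from a history; supports len(), indexing and slicing"""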
    def __init__(self, eutils, cookie, retstart, retmax, metadata = None):
        HistoryLookup.__init__(self, eutils, cookie, retstart, retmax)
        self.metadata = metadata

    def __len__(self):
        return self.retmax
    
    def __getitem__(self, i):
        if isinstance(i, types.SliceType):
            if i.step is not None:
                raise TypeError("cannot set step size in slice")
            # Don't pass metadata downwards
            start = i.start
            if start is None: start = 0
            stop = i.stop
            if stop is None: stop = self.retmax
            # Potentially expensive, but this is the easy way
            # to get the semantics correct.
            x = range(self.retstart, self.retstart + self.retmax)[start:stop]
            if x:
                retstart = x[0]
                retmax = x[-1] - x[0] + 1
            else:
                retstart = 0
                retmax = 0
            return self.__class__(self.eutils, self.cookie, retstart,
                                  retmax)
        if 0 <= i < self.retmax:
            pos = self.retstart + i
        elif 1 <= -i <= self.retmax:
            pos = self.retstart + i + self.retmax
        else:
            raise IndexError(i)
        return self._record_class(self.eutils, self.cookie, pos)

class SequenceHistoryRecordSet(Mixins.SequenceFetchMixin,
                               SequenceHistoryFetchMixin,
                               BaseHistoryRecordSet):
    _record_class = SequenceHistoryRecord

class PublicationHistoryRecordSet(Mixins.PublicationFetchMixin,
                                  PublicationHistoryFetchMixin,
                                  BaseHistoryRecordSet):
    _record_class = PublicationHistoryRecord

def _get_recordset_constructor(db, dbtype):
    dbtype = Config.databases.gettype(db, dbtype)
    if dbtype == Config.SEQUENCE_TYPE:
        return SequenceHistoryRecordSet
    elif dbtype == Config.PUBLICATION_TYPE:
        return PublicationHistoryRecordSet
    else:
        raise TypeError("unknown database type: %r" % (dbtype,))

class HistoryClient:
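    """High-level interface to EUtils searches kept in the server's history

    Use 'search' to run an ESearch and 'post' (or its alias 'from_dbids')
    to upload a set of DBIds; both return record sets tied to this
    client's WebEnv and are stored in 'query_history' by query_key.
    """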
    def __init__(self, eutils = None):
        if eutils is None:
            eutils = ThinClient.ThinClient()
        self.eutils = eutils
        self.webenv_ref = [None]
        self.query_history = {}

    def _check_for_cache_reset(self, query_key):
        # If this is a repeat then it's because the history
        # has expired.  Set the existing results to have a
        # query_key of None and reset the cache.
        if query_key not in self.query_history:
            # New value, so there was no reset
            return

        for v in self.query_history.values():
            v.query_key = None
        
        self.query_history.clear()

    def search(self,
               term,
               db = "pubmed",
               field = None,
               daterange = None,
               dbtype = None
               ):
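        """Do an ESearch for 'term' using the history; returns a record set"""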
        
        set_klass = _get_recordset_constructor(db, dbtype)
        
        infile = self.eutils.esearch(
            term = term,
            db = db,
            field = field,

            retstart = 0,
            retmax = 0,
            
            daterange = daterange,
            
            usehistory = 1,
            webenv = self.webenv_ref[0],
            )
        searchinfo = parse.parse_search(infile, self.webenv_ref)

        if searchinfo.query_key is not None:
            cookie = HistoryCookie(db, self.webenv_ref, searchinfo.query_key)
        else:
            assert searchinfo.count == 0
            cookie = HistoryCookie(db, None, None)

        recordset = set_klass(self.eutils, cookie, 0, searchinfo.count,
                              searchinfo)
        # won't have a query_key if the search turned up empty
        if searchinfo.query_key is not None:
            self._check_for_cache_reset(searchinfo.query_key)
            self.query_history[searchinfo.query_key] = recordset

        return recordset

                                 

    def post(self, dbids, dbtype = None):
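        """EPost the given DBIds to the server; returns a record set for them"""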
        set_klass = _get_recordset_constructor(dbids.db, dbtype)

        infile = self.eutils.epost(dbids,
                                   webenv = self.webenv_ref[0])
        # Extract the webenv_ref since it may change.
        postinfo = parse.parse_post(infile, self.webenv_ref)

        # Were there any invalid identifiers?
        n = len(dbids) - len(postinfo.invalid_ids)

        cookie = HistoryCookie(dbids.db, self.webenv_ref,
                               postinfo.query_key)
        recordset = set_klass(self.eutils, cookie, 0, n, postinfo)
        self._check_for_cache_reset(postinfo.query_key)
        self.query_history[postinfo.query_key] = recordset
        return recordset

    from_dbids = post  # alias for similarity to DBIdsClient
