diff options
author | Mykyta Holubakha <hilobakho@gmail.com> | 2017-08-18 08:58:30 +0300 |
---|---|---|
committer | Mykyta Holubakha <hilobakho@gmail.com> | 2017-08-18 08:58:30 +0300 |
commit | 2b385f3b47656913d98b0df0abc3099554d4ba3a (patch) | |
tree | a6e71cbeceb3fba0ee92fd3ade9d8d55c9d6d553 | |
parent | Fixed iquery (diff) | |
download | pomu-2b385f3b47656913d98b0df0abc3099554d4ba3a.tar.gz pomu-2b385f3b47656913d98b0df0abc3099554d4ba3a.tar.bz2 pomu-2b385f3b47656913d98b0df0abc3099554d4ba3a.zip |
Created DataSource class
Implemented a Zugaina fetcher for fetching search results from zugaina
-rw-r--r-- | pomu/data/datasource.py | 19 | ||||
-rw-r--r-- | pomu/data/zugaina.py | 60 |
2 files changed, 79 insertions, 0 deletions
# Reconstructed from a cgit diff dump: this commit adds two new files,
# pomu/data/datasource.py and pomu/data/zugaina.py.

# --- pomu/data/datasource.py ---
"""
Base DataSource class
"""

class DataSource():
    """Abstract interface for a paginated package-search backend."""

    def __init__(self, query):
        pass

    def page_count(self):
        # Total number of result pages available for the query.
        pass

    def get_page(self, page):
        # Entries shown on the given results page.
        pass

    def list_items(self, ident):
        # Versions/variants available for one search result.
        pass

    def get_item(self, ident):
        # Fetch a single item by identifier.
        pass


# --- pomu/data/zugaina.py ---
"""
gpo.zugaina.org searcher and fetcher
"""
import lxml.html
import requests

from pomu.data.datasource import DataSource

BASE_URL = 'https://gpo.zugaina.org/'
SBASE_URL = BASE_URL + 'Search?search={}&page={}'

class ZugainaDataSource(DataSource):
    """DataSource backed by the gpo.zugaina.org overlay search."""

    def __init__(self, query):
        self.query = query
        self.pagecache = {}   # page number -> raw HTML of that results page
        self.itemcache = {}   # item ident -> raw HTML of that item page
        self.pagecount = -1   # lazily computed in page_count(); -1 = unknown

    def page_count(self):
        """Return the number of result pages (50 results per page)."""
        if self.pagecount > 0:
            return self.pagecount
        text = self.fetch_page(1)
        doc = lxml.html.document_fromstring(text)
        field = doc.xpath('//div[@class="pager"]/span')[0].text
        # BUGFIX: the pager text yields a string; convert to int before
        # doing arithmetic (original did (str + 49) // 50 -> TypeError).
        total = int(field.split(' ')[-1])
        self.pagecount = (total + 49) // 50  # ceiling-divide by page size
        return self.pagecount

    def get_page(self, page):
        """Return a list of (name, description) pairs for one page."""
        text = self.fetch_page(page)
        doc = lxml.html.document_fromstring(text)
        # BUGFIX: strip() is a str method, not a builtin function
        # (original called the nonexistent builtin strip(x.text)).
        return [(x.text.strip(), x.getchildren()[0].text)
                for x in doc.xpath('//div[@id="search_results"]/a/div')]

    def list_items(self, ident):
        """Return a list of (id, version, overlay) tuples for one result."""
        text = self.fetch_item(ident)
        doc = lxml.html.document_fromstring(text)
        res = []
        for div in doc.xpath('//div[@id="ebuild_list"]/ul/div'):
            id_ = div.xpath('li/a')[0].get('href').split('/')[3]
            # BUGFIX: xpath() returns a list of elements; take the first
            # before reading .text (original read .text on the list).
            pv = div.xpath('li/div/b')[0].text
            # NOTE(review): the original read div.xpath('@id') (a list);
            # the element's id attribute is presumably the overlay name.
            overlay = div.get('id')
            # BUGFIX: list.append takes a single argument; store a tuple
            # (original res.append(id_, pv, overlay) raises TypeError).
            res.append((id_, pv, overlay))
        # BUGFIX: the collected list was never returned.
        return res

    def get_item(self, ident):
        """Fetch a single ebuild via the AJAX endpoint."""
        # BUGFIX: 'results' was an undefined name; the HTTP library in
        # scope is 'requests'.
        return requests.get(BASE_URL + 'AJAX/Ebuild/' + ident).text

    def fetch_item(self, ident):
        """Fetch (and cache) the raw HTML of an item page."""
        if ident in self.itemcache:
            return self.itemcache[ident]
        res = requests.get(BASE_URL + ident).text
        # BUGFIX: the fetched page was never stored, so the cache above
        # could never hit (mirrors fetch_page below).
        self.itemcache[ident] = res
        return res

    def fetch_page(self, page):
        """Fetch (and cache) the raw HTML of a search-results page."""
        if page in self.pagecache:
            return self.pagecache[page]
        res = requests.get(SBASE_URL.format(self.query, page)).text
        self.pagecache[page] = res
        return res