fix twitter engine and add comments

* add language-support
* add comments
* little refactoring
This commit is contained in:
Thomas Pointhuber 2014-09-02 20:14:52 +02:00
parent 678a80f043
commit 9460750fea

View file

@ -1,30 +1,63 @@
## Twitter (Social media)
#
# @website https://twitter.com/
# @provide-api yes (https://dev.twitter.com/docs/using-search)
#
# @using-api no
# @results HTML (using search portal)
# @stable no (HTML can change)
# @parse url, title, content
#
# @todo publishedDate
from urlparse import urljoin from urlparse import urljoin
from urllib import urlencode from urllib import urlencode
from lxml import html from lxml import html
from cgi import escape from cgi import escape
# engine dependent config
categories = ['social media']
# request() sets a 'lang' cookie from params['language']
language_support = True

# search-url
base_url = 'https://twitter.com/'
search_url = base_url + 'search?'

# specific xpath variables
# one node per tweet in the result page
results_xpath = '//li[@data-item-type="tweet"]'
# anchor inside the timestamp element; its href is used as the result url
link_xpath = './/small[@class="time"]//a'
title_xpath = './/span[@class="username js-action-profile-name"]//text()'
content_xpath = './/p[@class="js-tweet-text tweet-text"]//text()'
# do search-request
def request(query, params):
    """Fill in the outgoing search request for ``query``.

    Sets ``params['url']`` to ``search_url`` followed by the URL-encoded
    query (``q=...``).  Unless ``params['language']`` is ``'all'``, the part
    of the language code before the first underscore is stored in
    ``params['cookies']['lang']``.

    Returns the same ``params`` dict, modified in place.
    """
    params['url'] = search_url + urlencode({'q': query})

    # set language if specified
    if params['language'] != 'all':
        # keep only the part before the first '_' of the language code
        params['cookies']['lang'] = params['language'].split('_')[0]

    return params
# get response from search-request
def response(resp):
    """Parse the search result page into a list of result dicts.

    ``resp.text`` is parsed as HTML.  For every node matched by
    ``results_xpath`` a dict with the keys ``url``, ``title`` and
    ``content`` is appended to the returned list.  Nodes without a link
    matching ``link_xpath`` are skipped.
    """
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for tweet in dom.xpath(results_xpath):
        links = tweet.xpath(link_xpath)
        if not links:
            # no link to build the result url from; skip this node rather
            # than letting an IndexError discard every other result
            continue

        # href is resolved against base_url
        url = urljoin(base_url, links[0].attrib.get('href'))
        title = ''.join(tweet.xpath(title_xpath))
        # escape the tweet text before it is handed on as content
        content = escape(''.join(tweet.xpath(content_xpath)))

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content})

    # return results
    return results