bazarr/libs/ftfy/streamtester/twitter_tester.py

89 lines
3.1 KiB
Python
Raw Normal View History

2018-11-01 00:08:29 +08:00
"""
Implements a StreamTester that runs over Twitter data. See the class
docstring.
This module is written for Python 3 only. The __future__ imports you see here
are just to let Python 2 scan the file without crashing with a SyntaxError.
"""
from __future__ import print_function, unicode_literals
import os
from collections import defaultdict
from ftfy.streamtester import StreamTester
class TwitterTester(StreamTester):
    """
    This class uses the StreamTester code (defined in `__init__.py`) to
    evaluate ftfy's real-world performance, by feeding it live data from
    Twitter.

    This is a semi-manual evaluation. It requires a human to look at the
    results and determine if they are good. The three possible cases we
    can see here are:

    - Success: the process takes in mojibake and outputs correct text.
    - False positive: the process takes in correct text, and outputs
      mojibake. Every false positive should be considered a bug, and
      reported on GitHub if it isn't already.
    - Confusion: the process takes in mojibake and outputs different
      mojibake. Not a great outcome, but not as dire as a false
      positive.

    This tester cannot reveal false negatives. So far, that can only be
    done by the unit tests.
    """

    # Directory (relative to the current working directory) that receives
    # the per-language tweet logs written by `save_files`.
    OUTPUT_DIR = './twitterlogs'

    def __init__(self):
        """
        Create a tester with an empty buffer of tweets grouped by language.
        """
        # Maps a language code to the tweet texts collected since the last
        # call to `save_files`.
        self.lines_by_lang = defaultdict(list)
        super().__init__()

    def save_files(self):
        """
        When processing data from live Twitter, save it to log files so that
        it can be replayed later.

        Each language's buffered tweets are appended to
        `tweets.<lang>.txt` inside `OUTPUT_DIR`, one tweet per line, and the
        buffer is then reset.
        """
        # exist_ok avoids the check-then-create race of testing for the
        # directory first and creating it afterwards.
        os.makedirs(self.OUTPUT_DIR, exist_ok=True)
        for lang, lines in self.lines_by_lang.items():
            filename = 'tweets.{}.txt'.format(lang)
            fullname = os.path.join(self.OUTPUT_DIR, filename)
            # The context manager closes the file even if a write fails.
            with open(fullname, 'a', encoding='utf-8') as langfile:
                for line in lines:
                    # Flatten embedded newlines so that every tweet occupies
                    # exactly one line of the log.
                    print(line.replace('\n', ' '), file=langfile)
        # Start a fresh buffer so tweets are not written twice.
        self.lines_by_lang = defaultdict(list)

    def run_sample(self):
        """
        Listen to live data from Twitter, and pass on the fully-formed tweets
        to `check_ftfy`. This requires the `twitter` Python package as a
        dependency.

        Tweets that carry a `user` entry are also buffered by the user's
        language and periodically flushed to disk with `save_files`.
        """
        # Imported here so the module can be loaded without the optional
        # `twitter` dependency or OAuth configuration.
        from twitter import TwitterStream
        from ftfy.streamtester.oauth import get_auth

        twitter_stream = TwitterStream(auth=get_auth())
        iterator = twitter_stream.statuses.sample()
        for tweet in iterator:
            # Stream messages without a 'text' key are skipped entirely.
            if 'text' not in tweet:
                continue
            text = tweet['text']
            self.check_ftfy(text)
            if 'user' in tweet:
                lang = tweet['user'].get('lang', 'NONE')
                self.lines_by_lang[lang].append(text)
            # `self.count` is maintained by StreamTester (not visible here;
            # presumably advanced by `check_ftfy`). Flush once per 10000
            # counted tweets, first at a count of 100.
            if self.count % 10000 == 100:
                self.save_files()
def main():
    """
    Command-line entry point: build a TwitterTester and let it consume the
    live Twitter sample stream for as long as the stream stays up.
    """
    TwitterTester().run_sample()


if __name__ == '__main__':
    main()