From e8a8cc1f62e2c572db66e66b87b29c0e27cda1b1 Mon Sep 17 00:00:00 2001
From: Michel Vedrine
Date: Mon, 28 Jan 2019 13:56:55 +0100
Subject: [PATCH] fulltext search now tries to guess the file encoding

---
 catfish/CatfishSearchEngine.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/catfish/CatfishSearchEngine.py b/catfish/CatfishSearchEngine.py
index a36032ef..ca7d0a9c 100644
--- a/catfish/CatfishSearchEngine.py
+++ b/catfish/CatfishSearchEngine.py
@@ -30,6 +30,8 @@ from mimetypes import guess_type
 
 from sys import version_info
 
+from chardet.universaldetector import UniversalDetector
+
 try:
     from zeitgeist.client import ZeitgeistDBusInterface
     from zeitgeist.datamodel import Event, TimeRange
@@ -87,6 +89,18 @@ def string_regex(keywords, path):
     return regex
 
 
+def detect_encoding(filepath):
+    """Tries to guess the encoding of a text file incrementally."""
+    detector = UniversalDetector()
+    with open(filepath, 'rb') as f:
+        for line in f:
+            detector.feed(line)
+            if detector.done:
+                break
+    detector.close()
+    return detector.result['encoding']
+
+
 class CatfishSearchEngine:
 
     """CatfishSearchEngine is the collection of search backends that are used
@@ -375,8 +389,12 @@ class CatfishSearchMethod_Fulltext(CatfishSearchMethod):
                 mime = guess_type(filename)[0]
                 if not mime or 'text' in mime:
                     try:
-                        opened = open(os.path.join(root, filename), 'r')
-
+                        filepath = os.path.join(root, filename)
+                        encoding = detect_encoding(filepath)
+                        if python3:
+                            opened = open(filepath, 'r', encoding=encoding)
+                        else:
+                            opened = io.open(filepath, 'r', encoding=encoding)
                         find_keywords = find_keywords_backup
 
                         # Check each line. If a keyword is found, yield.
-- 
2.17.1