cclauss
committed on
Commit
·
2c047e3
1
Parent(s):
e7d4bd4
Fix flake8 issues
Browse files
- scripts/calculate_coverages.py +5 -0
- scripts/convert_all_datasets.py +5 -0
- torchmoji/filter_utils.py +7 -4
- torchmoji/finetuning.py +8 -4
- torchmoji/word_generator.py +5 -4
scripts/calculate_coverages.py
CHANGED
|
@@ -11,6 +11,11 @@ sys.path.insert(0, dirname(dirname(abspath(__file__))))
|
|
| 11 |
|
| 12 |
from torchmoji.sentence_tokenizer import SentenceTokenizer, coverage
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
IS_PYTHON2 = int(sys.version[0]) == 2
|
| 15 |
|
| 16 |
OUTPUT_PATH = 'coverage.csv'
|
|
|
|
| 11 |
|
| 12 |
from torchmoji.sentence_tokenizer import SentenceTokenizer, coverage
|
| 13 |
|
| 14 |
+
try:
|
| 15 |
+
unicode # Python 2
|
| 16 |
+
except NameError:
|
| 17 |
+
unicode = str # Python 3
|
| 18 |
+
|
| 19 |
IS_PYTHON2 = int(sys.version[0]) == 2
|
| 20 |
|
| 21 |
OUTPUT_PATH = 'coverage.csv'
|
scripts/convert_all_datasets.py
CHANGED
|
@@ -14,6 +14,11 @@ from torchmoji.create_vocab import VocabBuilder
|
|
| 14 |
from torchmoji.sentence_tokenizer import SentenceTokenizer, extend_vocab, coverage
|
| 15 |
from torchmoji.tokenizer import tokenize
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
IS_PYTHON2 = int(sys.version[0]) == 2
|
| 18 |
|
| 19 |
DATASETS = [
|
|
|
|
| 14 |
from torchmoji.sentence_tokenizer import SentenceTokenizer, extend_vocab, coverage
|
| 15 |
from torchmoji.tokenizer import tokenize
|
| 16 |
|
| 17 |
+
try:
|
| 18 |
+
unicode # Python 2
|
| 19 |
+
except NameError:
|
| 20 |
+
unicode = str # Python 3
|
| 21 |
+
|
| 22 |
IS_PYTHON2 = int(sys.version[0]) == 2
|
| 23 |
|
| 24 |
DATASETS = [
|
torchmoji/filter_utils.py
CHANGED
|
@@ -11,8 +11,11 @@ import numpy as np
|
|
| 11 |
from torchmoji.tokenizer import RE_MENTION, RE_URL
|
| 12 |
from torchmoji.global_variables import SPECIAL_TOKENS
|
| 13 |
|
| 14 |
-
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
AtMentionRegex = re.compile(RE_MENTION)
|
| 18 |
urlRegex = re.compile(RE_URL)
|
|
@@ -36,8 +39,8 @@ VARIATION_SELECTORS = [ '\ufe00',
|
|
| 36 |
'\ufe0f']
|
| 37 |
|
| 38 |
# from https://stackoverflow.com/questions/92438/stripping-non-printable-characters-from-a-string-in-python
|
| 39 |
-
ALL_CHARS = (unichr(i) for i in range(sys.maxunicode))
|
| 40 |
-
CONTROL_CHARS = ''.join(map(unichr, list(range(0,32)) + list(range(127,160))))
|
| 41 |
CONTROL_CHAR_REGEX = re.compile('[%s]' % re.escape(CONTROL_CHARS))
|
| 42 |
|
| 43 |
def is_special_token(word):
|
|
|
|
| 11 |
from torchmoji.tokenizer import RE_MENTION, RE_URL
|
| 12 |
from torchmoji.global_variables import SPECIAL_TOKENS
|
| 13 |
|
| 14 |
+
try:
|
| 15 |
+
unichr # Python 2
|
| 16 |
+
except NameError:
|
| 17 |
+
unichr = chr # Python 3
|
| 18 |
+
|
| 19 |
|
| 20 |
AtMentionRegex = re.compile(RE_MENTION)
|
| 21 |
urlRegex = re.compile(RE_URL)
|
|
|
|
| 39 |
'\ufe0f']
|
| 40 |
|
| 41 |
# from https://stackoverflow.com/questions/92438/stripping-non-printable-characters-from-a-string-in-python
|
| 42 |
+
ALL_CHARS = (unichr(i) for i in range(sys.maxunicode))
|
| 43 |
+
CONTROL_CHARS = ''.join(map(unichr, list(range(0,32)) + list(range(127,160))))
|
| 44 |
CONTROL_CHAR_REGEX = re.compile('[%s]' % re.escape(CONTROL_CHARS))
|
| 45 |
|
| 46 |
def is_special_token(word):
|
torchmoji/finetuning.py
CHANGED
|
@@ -3,7 +3,6 @@
|
|
| 3 |
"""
|
| 4 |
from __future__ import print_function
|
| 5 |
|
| 6 |
-
import sys
|
| 7 |
import uuid
|
| 8 |
from time import sleep
|
| 9 |
from io import open
|
|
@@ -28,8 +27,13 @@ from torchmoji.global_variables import (FINETUNING_METHODS,
|
|
| 28 |
from torchmoji.tokenizer import tokenize
|
| 29 |
from torchmoji.sentence_tokenizer import SentenceTokenizer
|
| 30 |
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
def load_benchmark(path, vocab, extend_with=0):
|
| 35 |
""" Loads the given benchmark dataset.
|
|
@@ -66,7 +70,7 @@ def load_benchmark(path, vocab, extend_with=0):
|
|
| 66 |
|
| 67 |
# Decode data
|
| 68 |
try:
|
| 69 |
-
texts = [unicode(x) for x in data['texts']]
|
| 70 |
except UnicodeDecodeError:
|
| 71 |
texts = [x.decode('utf-8') for x in data['texts']]
|
| 72 |
|
|
|
|
| 3 |
"""
|
| 4 |
from __future__ import print_function
|
| 5 |
|
|
|
|
| 6 |
import uuid
|
| 7 |
from time import sleep
|
| 8 |
from io import open
|
|
|
|
| 27 |
from torchmoji.tokenizer import tokenize
|
| 28 |
from torchmoji.sentence_tokenizer import SentenceTokenizer
|
| 29 |
|
| 30 |
+
try:
|
| 31 |
+
unicode
|
| 32 |
+
IS_PYTHON2 = True
|
| 33 |
+
except NameError:
|
| 34 |
+
unicode = str
|
| 35 |
+
IS_PYTHON2 = False
|
| 36 |
+
|
| 37 |
|
| 38 |
def load_benchmark(path, vocab, extend_with=0):
|
| 39 |
""" Loads the given benchmark dataset.
|
|
|
|
| 70 |
|
| 71 |
# Decode data
|
| 72 |
try:
|
| 73 |
+
texts = [unicode(x) for x in data['texts']]
|
| 74 |
except UnicodeDecodeError:
|
| 75 |
texts = [x.decode('utf-8') for x in data['texts']]
|
| 76 |
|
torchmoji/word_generator.py
CHANGED
|
@@ -7,7 +7,6 @@
|
|
| 7 |
|
| 8 |
from __future__ import division, print_function, unicode_literals
|
| 9 |
|
| 10 |
-
import sys
|
| 11 |
import re
|
| 12 |
import unicodedata
|
| 13 |
import numpy as np
|
|
@@ -26,8 +25,10 @@ from torchmoji.filter_utils import (convert_linebreaks,
|
|
| 26 |
remove_variation_selectors,
|
| 27 |
separate_emojis_and_text)
|
| 28 |
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
| 31 |
|
| 32 |
# Only catch retweets in the beginning of the tweet as those are the
|
| 33 |
# automatically added ones.
|
|
@@ -68,7 +69,7 @@ class WordGenerator():
|
|
| 68 |
that is not allowed.
|
| 69 |
"""
|
| 70 |
|
| 71 |
-
if not isinstance(sentence, unicode):
|
| 72 |
raise ValueError("All sentences should be Unicode-encoded!")
|
| 73 |
sentence = sentence.strip().lower()
|
| 74 |
|
|
|
|
| 7 |
|
| 8 |
from __future__ import division, print_function, unicode_literals
|
| 9 |
|
|
|
|
| 10 |
import re
|
| 11 |
import unicodedata
|
| 12 |
import numpy as np
|
|
|
|
| 25 |
remove_variation_selectors,
|
| 26 |
separate_emojis_and_text)
|
| 27 |
|
| 28 |
+
try:
|
| 29 |
+
unicode # Python 2
|
| 30 |
+
except NameError:
|
| 31 |
+
unicode = str # Python 3
|
| 32 |
|
| 33 |
# Only catch retweets in the beginning of the tweet as those are the
|
| 34 |
# automatically added ones.
|
|
|
|
| 69 |
that is not allowed.
|
| 70 |
"""
|
| 71 |
|
| 72 |
+
if not isinstance(sentence, unicode):
|
| 73 |
raise ValueError("All sentences should be Unicode-encoded!")
|
| 74 |
sentence = sentence.strip().lower()
|
| 75 |
|