Commit
·
943f44c
1
Parent(s):
2aa9556
Update tokenization_xgen.py (#16)
Browse files
- Update tokenization_xgen.py (f79aeb3407323d2df16be52abe61dbed426d58b9)
- tokenization_xgen.py +3 -3
tokenization_xgen.py
CHANGED
|
@@ -134,15 +134,15 @@ class XgenTokenizer(PreTrainedTokenizer):
|
|
| 134 |
):
|
| 135 |
pad_token_added = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
|
| 136 |
eos_token_added = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
|
|
|
|
|
|
|
| 137 |
super().__init__(
|
| 138 |
pad_token=pad_token_added,
|
| 139 |
eos_token=eos_token_added,
|
| 140 |
add_eos_token=add_eos_token,
|
| 141 |
add_special_tokens=add_special_tokens,
|
| 142 |
**kwargs,
|
| 143 |
-
)
|
| 144 |
-
self.add_eos_token = add_eos_token
|
| 145 |
-
self.encoder = tiktoken_tokenizer(base="gpt2", pad_token=pad_token, add_special=add_special_tokens)
|
| 146 |
|
| 147 |
@property
|
| 148 |
def vocab_size(self):
|
|
|
|
| 134 |
):
|
| 135 |
pad_token_added = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
|
| 136 |
eos_token_added = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
|
| 137 |
+
self.add_eos_token = add_eos_token
|
| 138 |
+
self.encoder = tiktoken_tokenizer(base="gpt2", pad_token=pad_token, add_special=add_special_tokens)
|
| 139 |
super().__init__(
|
| 140 |
pad_token=pad_token_added,
|
| 141 |
eos_token=eos_token_added,
|
| 142 |
add_eos_token=add_eos_token,
|
| 143 |
add_special_tokens=add_special_tokens,
|
| 144 |
**kwargs,
|
| 145 |
+
)
|
|
|
|
|
|
|
| 146 |
|
| 147 |
@property
|
| 148 |
def vocab_size(self):
|