Upload processor

Browse files

Files changed (8) hide show

added_tokens.json +4 -0
preprocessor_config.json +8 -0
processing_mists.py +82 -0
processor_config.json +6 -0
special_tokens_map.json +30 -0
tokenizer.json +0 -0
tokenizer.model +3 -0
tokenizer_config.json +0 -0

added_tokens.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "<pad>": 32769,
+  "<time_series>": 32768
+}

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "auto_map": {
+    "AutoFeatureExtractor": "HachiML/MOMENT-1-large-embedding-v0.1--feature_extraction_moment.MomentFeatureExtractor",
+    "AutoProcessor": "processing_mists.MistsProcessor"
+  },
+  "feature_extractor_type": "MomentFeatureExtractor",
+  "processor_class": "MistsProcessor"
+}

processing_mists.py ADDED Viewed

	@@ -0,0 +1,82 @@

+# What the processor does:
+# - Tokenizes text with the tokenizer.
+# - Converts time-series data given as a DataFrame, numpy array, or torch tensor into a torch tensor.
+# The result is returned in the form input_ids: ..., attention_mask: ..., time_series_values: ...
+from typing import List, Optional, Union
+from pandas import DataFrame
+import numpy as np
+import torch
+import tensorflow as tf
+import jax.numpy as jnp
+from transformers import ProcessorMixin
+from transformers import TensorType
+from transformers import BatchFeature
+from transformers.tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
+class MistsProcessor(ProcessorMixin):
+    """Processor that pairs a text tokenizer with a time-series feature extractor.
+
+    Calling the processor tokenizes ``text`` with ``self.tokenizer``, runs
+    ``time_series`` through ``self.feature_extractor``, and merges both results
+    into a single ``BatchFeature``.
+    """
+    # Ideally the Moment-side tokenizer would also be included here as a ts_tokenizer,
+    # but it is built into the model.
+    # refers: https://github.com/moment-timeseries-foundation-model/moment/blob/088b253a1138ac7e48a7efc9bf902336c9eec8d9/momentfm/models/moment.py#L105
+    # These two parts feel like they should really be the responsibility of a ts_tokenizer:
+    # (normalizer): RevIN()
+    # (tokenizer): Patching()
+    attributes = ["feature_extractor", "tokenizer"]
+    feature_extractor_class = "AutoFeatureExtractor"
+    tokenizer_class = "AutoTokenizer"
+    def __init__(self, feature_extractor=None, tokenizer=None):
+        """Store the feature extractor and tokenizer via ``ProcessorMixin.__init__``."""
+        super().__init__(feature_extractor, tokenizer)
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+        time_series: Union[DataFrame, np.ndarray, torch.Tensor, List[DataFrame], List[np.ndarray], List[torch.Tensor]] = None,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Optional[Union[bool, str, TruncationStrategy]] = None,
+        max_length: Union[int, None] = None,
+        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+        torch_dtype: Optional[Union[str, torch.dtype]] = torch.float,
+        time_series_padding: Union[bool, str] = False,
+        time_series_max_length: Union[int, None] = None,
+    ) -> BatchFeature:
+        """Tokenize ``text`` and/or extract features from ``time_series``.
+
+        Args:
+            text: Text input(s) forwarded to ``self.tokenizer``. Skipped when ``None``.
+            time_series: Time-series input(s) forwarded to ``self.feature_extractor``.
+                Skipped when ``None``.
+            padding: Forwarded to the tokenizer as ``padding``.
+            truncation: Forwarded to the tokenizer as ``truncation``.
+            max_length: Forwarded to the tokenizer as ``max_length``.
+            return_tensors: Forwarded to both the tokenizer and the feature extractor.
+            torch_dtype: Forwarded to the feature extractor as ``torch_dtype``.
+            time_series_padding: Forwarded to the feature extractor as ``padding``.
+            time_series_max_length: Forwarded to the feature extractor as
+                ``time_series_max_length``.
+
+        Returns:
+            A ``BatchFeature`` holding the tokenizer output merged with the feature
+            extractor output; on a key collision the feature extractor's value wins.
+
+        Raises:
+            ValueError: If both ``text`` and ``time_series`` are ``None``.
+        """
+        if text is None and time_series is None:
+            raise ValueError("You have to specify either text or time_series. Both cannot be None.")
+        # Each modality is optional: an absent one contributes an empty mapping so the
+        # merge below never tries to unpack ``None``.
+        time_series_values = {}
+        if time_series is not None:
+            time_series_values = self.feature_extractor(
+                time_series,
+                return_tensors=return_tensors,
+                torch_dtype=torch_dtype,
+                padding=time_series_padding,
+                time_series_max_length=time_series_max_length,
+            )
+        text_inputs = {}
+        if text is not None:
+            text_inputs = self.tokenizer(
+                text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length
+            )
+        return BatchFeature(data={**text_inputs, **time_series_values})
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to Tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to Tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+    @property
+    def model_input_names(self):
+        """Tokenizer input names followed by feature-extractor input names, without duplicates."""
+        tokenizer_input_names = self.tokenizer.model_input_names
+        feature_extractor_input_names = self.feature_extractor.model_input_names
+        # dict.fromkeys drops repeated names while keeping first-seen order.
+        return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names))

processor_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "auto_map": {
+    "AutoProcessor": "processing_mists.MistsProcessor"
+  },
+  "processor_class": "MistsProcessor"
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff