Purpose
Type hints give developers and other users information about a function's input and output types at the time the code is written, making the code clearer and easier to maintain.
Examples
1. No return value
def print_world() -> None:
    print("world")
-> : the syntax for writing a return type hint, which states what type of value the function returns.
-> None : means the function returns no value, i.e. a function that simply runs its body and finishes without a return statement.
result = print_world()
result is None.
Note: without a return type hint
def print_world():
    print("world")
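In both cases the function behaves identically at runtime; the hint is only metadata attached to the function object. For reference, the annotations can be inspected with standard Python introspection (the two functions below are the examples above, renamed so they can coexist):

from typing import get_type_hints

def print_world_hinted() -> None:
    print("world")

def print_world_plain():
    print("world")

print(print_world_hinted.__annotations__)  # {'return': None}
print(print_world_plain.__annotations__)   # {} - no hint declared
print(get_type_hints(print_world_hinted))  # {'return': <class 'NoneType'>}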
2. Return value of type int
def add(x: int, y: int) -> int:
    return x + y
result = add(3, 5)  # the return value is of type int
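Note that type hints are not enforced at runtime; they are advisory information for readers and for static analysis tools such as mypy. A small illustration:

def add(x: int, y: int) -> int:
    return x + y

print(add(3, 5))      # 8
print(add("a", "b"))  # "ab" - still runs, but a static checker such as mypy reports an argument type error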
3. Return value is an object
def __call__(
    self,
    text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
    text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
    text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
    text_pair_target: Optional[
        Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]
    ] = None,
    add_special_tokens: bool = True,
    padding: Union[bool, str, PaddingStrategy] = False,
    truncation: Union[bool, str, TruncationStrategy] = None,
    max_length: Optional[int] = None,
    stride: int = 0,
    is_split_into_words: bool = False,
    pad_to_multiple_of: Optional[int] = None,
    padding_side: Optional[bool] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    **kwargs,
) -> BatchEncoding:
    # To avoid duplicating
    all_kwargs = {
        "add_special_tokens": add_special_tokens,
        "padding": padding,
        "truncation": truncation,
        "max_length": max_length,
        "stride": stride,
        "is_split_into_words": is_split_into_words,
        "pad_to_multiple_of": pad_to_multiple_of,
        "padding_side": padding_side,
        "return_tensors": return_tensors,
        "return_token_type_ids": return_token_type_ids,
        "return_attention_mask": return_attention_mask,
        "return_overflowing_tokens": return_overflowing_tokens,
        "return_special_tokens_mask": return_special_tokens_mask,
        "return_offsets_mapping": return_offsets_mapping,
        "return_length": return_length,
        "split_special_tokens": kwargs.pop("split_special_tokens", self.split_special_tokens),
        "verbose": verbose,
    }
    all_kwargs.update(kwargs)
    if text is None and text_target is None:
        raise ValueError("You need to specify either `text` or `text_target`.")
    if text is not None:
        # The context manager will send the inputs as normal texts and not text_target, but we shouldn't change the
        # input mode in this case.
        if not self._in_target_context_manager:
            self._switch_to_input_mode()
        encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs)
    if text_target is not None:
        self._switch_to_target_mode()
        target_encodings = self._call_one(text=text_target, text_pair=text_pair_target, **all_kwargs)
        # Leave back tokenizer in input mode
        self._switch_to_input_mode()
    if text_target is None:
        return encodings
    elif text is None:
        return target_encodings
    else:
        encodings["labels"] = target_encodings["input_ids"]
        return encodings
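The signature above relies on constructs from the typing module: Union[A, B] means the argument may be of either type, Optional[X] is shorthand for Union[X, None], and List[X] is a list whose elements are of type X. A minimal sketch of the same idea with a made-up function (the name, parameters, and body below are illustrative only, not part of transformers):

from typing import List, Optional, Union

def tokenize_stub(text: Union[str, List[str]], max_length: Optional[int] = None) -> List[int]:
    # Illustrative only: accept one string or a list of strings,
    # return toy "token ids", optionally truncated to max_length.
    if isinstance(text, list):
        text = " ".join(text)
    ids = [ord(ch) for ch in text]
    return ids if max_length is None else ids[:max_length]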
The return type of the __call__ method is BatchEncoding, an object provided by Hugging Face's transformers library.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# input text
text = "I love AI."
text_target = "AI is amazing."
# call the tokenizer
output = tokenizer(text=text, text_target=text_target, padding="max_length", max_length=10, return_tensors="pt")
print(output)
Calling tokenizer(...) runs the tokenizer's internal __call__ method shown above; because text_target is also passed, the target's input_ids are attached to the result as labels.
Example output
BatchEncoding(data={
"input_ids": tensor([[ 101, 1045, 2293, 9931, 1012, 102, 0, 0, 0, 0]]),
"attention_mask": tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]),
"labels": tensor([[ 101, 9931, 2003, 6429, 1012, 102, 0, 0, 0, 0]])
})
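BatchEncoding supports dictionary-style access, so each field of the result can be read by key (a brief sketch, assuming the tokenizer call above):

print(type(output).__name__)       # BatchEncoding
input_ids = output["input_ids"]    # dict-style key access
labels = output["labels"]
print(input_ids.shape)             # torch.Size([1, 10]) with max_length=10 above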
Source code
transformers/src/transformers/tokenization_utils_base.py at main · huggingface/transformers