| """HuggingFace Transformers-compatible wrapper for JSONTokenizer. |
| |
| Provides JSONPreTrainedTokenizer, a PreTrainedTokenizer subclass that |
| wraps JSONTokenizer for use with the HuggingFace ecosystem: |
| - save_pretrained / from_pretrained |
| - AutoTokenizer.from_pretrained (with trust_remote_code=True) |
| - tokenizer(json_string) -> BatchEncoding |
| - Padding, truncation, batch processing, return_tensors |
| |
| Requires: pip install json-tokenizer[huggingface] |
| """ |
|
|
| from __future__ import annotations |
|
|
| import json |
| import os |
| from typing import Any, Dict, List, Optional, Tuple, Union |
|
|
try:
    from transformers import PreTrainedTokenizer
except ImportError as exc:
    # Chain the original ImportError so the real failure (wrong env,
    # broken install, missing optional dep) stays visible in tracebacks.
    raise ImportError(
        "The HuggingFace transformers library is required for this module. "
        "Install it with: pip install json-tokenizer[huggingface]"
    ) from exc
|
|
| from json_tokenizer.tokenizer import JSONTokenizer, StructuralTokens |
| from json_tokenizer.bpe import BPETrainer |
|
|
|
|
# Filename used by save_pretrained/from_pretrained for the vocab artifact;
# keys here must match the parameter names of __init__ (HF convention).
VOCAB_FILES_NAMES = {"vocab_file": "json_tokenizer_vocab.json"}


# Human-readable HF token strings for each fixed structural token ID.
# These names are what _tokenize() emits and what get_vocab() exposes;
# the IDs are the StructuralTokens values themselves.
_STRUCTURAL_TOKEN_NAMES = {
    StructuralTokens.PAD: "<pad>",
    StructuralTokens.START: "<s>",
    StructuralTokens.END: "</s>",
    StructuralTokens.OBJ_START: "<obj_start>",
    StructuralTokens.OBJ_END: "<obj_end>",
    StructuralTokens.ARR_START: "<arr_start>",
    StructuralTokens.ARR_END: "<arr_end>",
    StructuralTokens.COLON: "<colon>",
    StructuralTokens.COMMA: "<comma>",
    StructuralTokens.NULL: "<null>",
    StructuralTokens.TRUE: "<true>",
    StructuralTokens.FALSE: "<false>",
    StructuralTokens.STR_DELIM: "<str_delim>",
    StructuralTokens.NUM_PREFIX: "<num_prefix>",
    StructuralTokens.KEY_PREFIX: "<key_prefix>",
    StructuralTokens.UNK: "<unk>",
}


# Reverse lookup: token string -> structural token ID.
_STRUCTURAL_NAME_TO_ID = {v: k for k, v in _STRUCTURAL_TOKEN_NAMES.items()}
|
|
|
|
class JSONPreTrainedTokenizer(PreTrainedTokenizer):
    """HuggingFace-compatible wrapper around JSONTokenizer.

    Usage:
        # From a trained JSONTokenizer:
        tok = JSONTokenizer(bpe_vocab_size=4096)
        tok.train(data)
        hf_tok = JSONPreTrainedTokenizer.from_json_tokenizer(tok)

        # Encode/decode via HF API:
        output = hf_tok('{"name": "Alice", "age": 30}')
        print(output["input_ids"])
        print(hf_tok.decode(output["input_ids"]))

        # Save and reload:
        hf_tok.save_pretrained("./my_tokenizer")
        loaded = JSONPreTrainedTokenizer.from_pretrained("./my_tokenizer")
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        unk_token: str = "<unk>",
        bos_token: str = "<s>",
        eos_token: str = "</s>",
        pad_token: str = "<pad>",
        **kwargs,
    ):
        """Initialize the wrapper, optionally loading a saved vocab file.

        Args:
            vocab_file: Path to a JSON file written by save_vocabulary().
                If None or missing, no tokenizer is loaded here (e.g. when
                constructed via from_json_tokenizer, which attaches the
                wrapped tokenizer before calling __init__).
            unk_token: Unknown-token string; should match a name in
                _STRUCTURAL_TOKEN_NAMES.
            bos_token: Beginning-of-sequence token string (maps to START).
            eos_token: End-of-sequence token string (maps to END).
            pad_token: Padding token string (maps to PAD).
            **kwargs: Forwarded to PreTrainedTokenizer.__init__.
        """
        # from_json_tokenizer() pre-populates these attributes on a bare
        # instance (via __new__) before invoking __init__, so only
        # initialize them when they are not already set.
        if not hasattr(self, "_json_tokenizer"):
            self._json_tokenizer: Optional[JSONTokenizer] = None
        if not hasattr(self, "_hf_vocab"):
            self._hf_vocab: Dict[str, int] = {}
        if not hasattr(self, "_hf_id_to_token"):
            self._hf_id_to_token: Dict[int, str] = {}

        if vocab_file is not None and os.path.isfile(vocab_file):
            self._load_vocab_file(vocab_file)

        # The vocab must be built before calling the base __init__: recent
        # transformers versions invoke get_vocab()/convert_tokens_to_ids()
        # during base initialization.
        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            **kwargs,
        )

    # ------------------------------------------------------------------
    # Construction
    # ------------------------------------------------------------------

    @classmethod
    def from_json_tokenizer(
        cls, tokenizer: JSONTokenizer, **kwargs
    ) -> "JSONPreTrainedTokenizer":
        """Create from a trained JSONTokenizer instance.

        Args:
            tokenizer: A trained JSONTokenizer.
            **kwargs: Additional arguments passed to __init__.

        Returns:
            A new JSONPreTrainedTokenizer wrapping the provided tokenizer.

        Raises:
            ValueError: If the tokenizer has not been trained yet.
        """
        if not tokenizer._trained:
            raise ValueError("JSONTokenizer must be trained before wrapping.")

        # Bypass __init__ so we can attach the tokenizer and build the HF
        # vocab first; __init__ then skips re-initialization (hasattr checks)
        # and the base class sees a fully-populated vocab.
        instance = cls.__new__(cls)
        instance._json_tokenizer = tokenizer
        instance._hf_vocab = {}
        instance._hf_id_to_token = {}
        instance._build_hf_vocab()
        instance.__init__(vocab_file=None, **kwargs)
        return instance

    # ------------------------------------------------------------------
    # Vocab (de)serialization helpers
    # ------------------------------------------------------------------

    def _load_vocab_file(self, vocab_file: str) -> None:
        """Reconstruct a JSONTokenizer from our saved vocab file.

        Reverses save_vocabulary(): restores the config, key vocab,
        BPE model, and internal lookup tables, then marks the tokenizer
        as trained.
        """
        with open(vocab_file, "r", encoding="utf-8") as f:
            data = json.load(f)

        config = data["config"]
        tok = JSONTokenizer(
            bpe_vocab_size=config["bpe_vocab_size"],
            max_key_vocab=config["max_key_vocab"],
            min_key_freq=config["min_key_freq"],
            bpe_min_freq=config["bpe_min_freq"],
        )
        # JSON object keys are always strings; IDs were serialized as the
        # mapping values, so coerce them back to int on both directions.
        tok._key_to_id = {k: int(v) for k, v in data["key_vocab"].items()}
        tok._id_to_key = {int(v): k for k, v in data["key_vocab"].items()}
        tok._key_offset = config["key_offset"]
        tok._bpe_offset = config["bpe_offset"]

        bpe_data = data["bpe_model"]
        bpe = BPETrainer(
            vocab_size=bpe_data["vocab_size"],
            min_frequency=bpe_data["min_frequency"],
        )
        # Merges were serialized as lists; BPETrainer expects tuples.
        bpe.merges = [tuple(m) for m in bpe_data["merges"]]
        bpe.vocab = bpe_data["vocab"]
        bpe._id_to_tok = None  # lazily rebuilt by the BPE model on demand
        tok._bpe = bpe

        tok._build_vocab_lookup()
        tok._trained = True

        self._json_tokenizer = tok
        self._build_hf_vocab()

    def _build_hf_vocab(self) -> None:
        """Build the unified {token_string: id} mapping across all tiers."""
        tok = self._json_tokenizer
        self._hf_vocab = {}
        self._hf_id_to_token = {}

        # Tier 1: fixed structural tokens.
        for tid, name in _STRUCTURAL_TOKEN_NAMES.items():
            self._hf_vocab[name] = tid
            self._hf_id_to_token[tid] = name

        # Tier 1b: unused reserved slots get placeholder names so every ID
        # below RESERVED_END round-trips. NOTE(review): assumes the named
        # structural tokens occupy IDs 0-15 — confirm against StructuralTokens.
        for tid in range(16, StructuralTokens.RESERVED_END):
            name = f"<reserved_{tid}>"
            self._hf_vocab[name] = tid
            self._hf_id_to_token[tid] = name

        # Tier 2: learned JSON object keys, wrapped to avoid clashing with
        # BPE tokens that happen to share the raw key text.
        for key_str, tid in tok._key_to_id.items():
            name = f"<key:{key_str}>"
            self._hf_vocab[name] = tid
            self._hf_id_to_token[tid] = name

        # Tier 3: BPE subword tokens, shifted by the BPE offset. If a BPE
        # token string collides with an existing name, prefix it so the
        # string->id mapping stays injective.
        for bpe_token, bpe_local_id in tok._bpe.vocab.items():
            full_id = tok._bpe_offset + bpe_local_id
            if bpe_token in self._hf_vocab:
                bpe_token_name = f"bpe:{bpe_token}"
            else:
                bpe_token_name = bpe_token
            self._hf_vocab[bpe_token_name] = full_id
            self._hf_id_to_token[full_id] = bpe_token_name

    # ------------------------------------------------------------------
    # Core HF tokenizer API
    # ------------------------------------------------------------------

    @property
    def vocab_size(self) -> int:
        """Size of the base vocabulary (excludes added tokens)."""
        if self._json_tokenizer is None:
            return len(_STRUCTURAL_TOKEN_NAMES)
        return self._json_tokenizer.vocab_size

    def get_vocab(self) -> Dict[str, int]:
        """Return the full {token_string: id} map, including added tokens."""
        vocab = dict(self._hf_vocab)
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str, **kwargs) -> List[str]:
        """Tokenize a JSON string into HF token strings.

        The HF pipeline calls: tokenize(text) -> _tokenize -> list[str]
        then convert_tokens_to_ids maps those to IDs.

        We parse the JSON, encode via JSONTokenizer (skipping START/END
        since HF adds special tokens via build_inputs_with_special_tokens),
        then convert IDs to our HF token string names.
        """
        if self._json_tokenizer is None:
            return [self.unk_token]

        try:
            ids = self._json_tokenizer.encode(text)
        except (ValueError, json.JSONDecodeError):
            # Not valid JSON: fall back to encoding the raw text as a
            # string payload so the call still produces usable tokens.
            ids = [StructuralTokens.START]
            self._json_tokenizer._encode_string(text, ids)
            ids.append(StructuralTokens.END)

        # Strip START/END; build_inputs_with_special_tokens re-adds them.
        if ids and ids[0] == StructuralTokens.START:
            ids = ids[1:]
        if ids and ids[-1] == StructuralTokens.END:
            ids = ids[:-1]

        return [self._hf_id_to_token.get(tid, self.unk_token) for tid in ids]

    def _convert_token_to_id(self, token: str) -> int:
        """Map a token string to its ID, falling back to the UNK ID."""
        return self._hf_vocab.get(
            token, self._hf_vocab.get(self.unk_token, StructuralTokens.UNK)
        )

    def _convert_id_to_token(self, index: int) -> str:
        """Map an ID back to its token string, falling back to unk_token."""
        return self._hf_id_to_token.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Reconstruct a JSON string from token strings.

        Converts token strings -> IDs, wraps with START/END,
        and delegates to JSONTokenizer.decode().
        """
        if self._json_tokenizer is None:
            return ""

        ids = [StructuralTokens.START]
        for token in tokens:
            tid = self._convert_token_to_id(token)
            ids.append(tid)
        ids.append(StructuralTokens.END)

        try:
            return self._json_tokenizer.decode(ids)
        except Exception:
            # Deliberate best-effort: a partial/garbled token sequence may
            # not decode to valid JSON; return a readable approximation.
            return " ".join(tokens)

    # ------------------------------------------------------------------
    # Special-token plumbing
    # ------------------------------------------------------------------

    def build_inputs_with_special_tokens(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
    ) -> List[int]:
        """Wrap with START (bos) and END (eos) tokens."""
        bos = [self.bos_token_id]
        eos = [self.eos_token_id]
        if token_ids_1 is None:
            return bos + token_ids_0 + eos
        return bos + token_ids_0 + eos + bos + token_ids_1 + eos

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """1 for special tokens (START/END), 0 for content tokens."""
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )
        if token_ids_1 is None:
            return [1] + [0] * len(token_ids_0) + [1]
        return (
            [1] + [0] * len(token_ids_0) + [1]
            + [1] + [0] * len(token_ids_1) + [1]
        )

    def create_token_type_ids_from_sequences(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
    ) -> List[int]:
        """Segment IDs: 0 for first sequence, 1 for second."""
        bos_eos = 2  # each sequence is wrapped by one bos and one eos
        if token_ids_1 is None:
            return [0] * (len(token_ids_0) + bos_eos)
        return [0] * (len(token_ids_0) + bos_eos) + [1] * (len(token_ids_1) + bos_eos)

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------

    def save_vocabulary(
        self,
        save_directory: str,
        filename_prefix: Optional[str] = None,
    ) -> Tuple[str]:
        """Save the vocabulary to a single JSON file.

        This file contains everything needed to reconstruct the
        JSONTokenizer: config, key vocab, and BPE model.

        Args:
            save_directory: Existing directory to write the vocab file into.
            filename_prefix: Optional prefix prepended to the filename.

        Returns:
            A 1-tuple with the path of the written vocab file.

        Raises:
            ValueError: If save_directory is not a directory, or no
                underlying JSONTokenizer is attached.
        """
        if not os.path.isdir(save_directory):
            raise ValueError(f"Not a directory: {save_directory}")

        # Guard against an empty wrapper: without this, the attribute
        # accesses below would fail with an opaque AttributeError on None.
        if self._json_tokenizer is None:
            raise ValueError(
                "Cannot save vocabulary: no JSONTokenizer is attached. "
                "Build this tokenizer via from_json_tokenizer() or "
                "from_pretrained() first."
            )

        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "")
            + VOCAB_FILES_NAMES["vocab_file"],
        )

        tok = self._json_tokenizer
        data = {
            "version": "json-tokenizer-hf-v1",
            "config": {
                "bpe_vocab_size": tok.bpe_vocab_size,
                "max_key_vocab": tok.max_key_vocab,
                "min_key_freq": tok.min_key_freq,
                "bpe_min_freq": tok.bpe_min_freq,
                "key_offset": tok._key_offset,
                "bpe_offset": tok._bpe_offset,
            },
            "key_vocab": tok._key_to_id,
            "bpe_model": {
                "vocab_size": tok._bpe.vocab_size,
                "min_frequency": tok._bpe.min_frequency,
                # Tuples are not JSON-serializable; stored as lists and
                # converted back in _load_vocab_file().
                "merges": [list(m) for m in tok._bpe.merges],
                "vocab": tok._bpe.vocab,
            },
        }

        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

        return (vocab_file,)
|
|