# (stray "1" — paste/line-number artifact)
import json
import os
from statistics import mean

from tqdm import tqdm
from transformers import AutoTokenizer
def load_json(filename):
    """Read *filename* and return the parsed JSON payload.

    Opens the file with an explicit UTF-8 encoding so parsing does not
    depend on the platform's default locale encoding (JSON is UTF-8 by
    spec; the old code used the locale default on Windows).
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)
def num_tokens_from_string(string: str, tokenizer) -> int:
    """Return how many tokens *string* encodes to under *tokenizer*."""
    encoded = tokenizer.encode(string)
    return len(encoded)
def analyze_token_sizes(data, tokenizer):
    """Tokenize each record's combined text and print min/max/mean counts.

    Each item is expected to be a dict with 'instruction', 'input' and
    'output' string fields (Alpaca-style records) — TODO confirm schema
    against the producer of data.json.
    """
    if not data:
        # min()/max()/mean() all raise on an empty sequence — report and bail.
        print("Combined Data Statistics: no records to analyze.")
        return
    combined_tokens = []
    for item in tqdm(data):
        # Concatenate the three fields exactly as before (no separator) and
        # tokenize once. The original also tokenized each field separately
        # into locals that were never read — three wasted tokenizer passes
        # per record — so those calls are dropped.
        combined_text = item['instruction'] + item['input'] + item['output']
        combined_tokens.append(num_tokens_from_string(combined_text, tokenizer))
    print("Combined Data Statistics:")
    print(f" Minimum tokens: {min(combined_tokens)}")
    print(f" Maximum tokens: {max(combined_tokens)}")
    print(f" Average tokens: {mean(combined_tokens):.2f}")
def main():
    """Load the tokenizer and the dataset, then print token statistics."""
    # Load a tokenizer matching the target LLaMA model. The auth token is
    # read from the environment (HF_TOKEN) instead of a hard-coded ""
    # placeholder, which both invited committing a real credential to
    # source and passed a useless empty string to the hub.
    tokenizer = AutoTokenizer.from_pretrained(
        "meta-llama/Meta-Llama-3.1-8B-Instruct",
        token=os.environ.get("HF_TOKEN"),
    )
    data = load_json('data.json')
    analyze_token_sizes(data, tokenizer)


if __name__ == "__main__":
    main()
# For immediate assistance, please email our customer support: [email protected]