Compare commits

1 commit: multipack-...completion

Commit: da154e6d56
requirements.txt

@@ -2,15 +2,14 @@
 auto-gptq==0.5.1
 packaging
 peft==0.6.0
-transformers==4.35.2
-tokenizers==0.15.0
+transformers==4.35.1
 bitsandbytes>=0.41.1
 accelerate==0.24.1
 deepspeed
 addict
 fire
 PyYAML>=6.0
-datasets>=2.15.0
+datasets>=2.14.0
 flash-attn==2.3.3
 sentencepiece
 wandb

@@ -30,7 +29,7 @@ scikit-learn==1.2.2
 pynvml
 art
 fschat==0.2.29
-gradio==3.50.2
+gradio
 tensorboard
 
 # remote filesystems
src/axolotl/prompt_strategies/completion.py

@@ -1,6 +1,7 @@
 """
 Basic completion text
 """
+import json
 from collections import defaultdict
 from typing import Any, Dict, Generator, Optional, Tuple
 

@@ -64,6 +65,19 @@ class CompletionPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
         return next(iter(self.prompter.build_prompt(instruction, input, response)))
 
 
+class CompletionJSONPromptTokenizationStrategy(CompletionPromptTokenizingStrategy):
+    """
+    Strategy to return the stringified JSON of the entire row as the training data
+    """
+
+    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
+        return (
+            json.dumps(prompt),
+            "",
+            "",
+        )
+
+
 class CompletionPrompter:
     """
     Prompter for completion

@@ -82,7 +96,7 @@ def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
     strat = CompletionPromptTokenizingStrategy(
         CompletionPrompter(),
         tokenizer,
-        cfg.train_on_inputs,
+        True,
         cfg.sequence_len,
         max_length=cfg.sequence_len * 64,
     )

@@ -90,3 +104,15 @@ def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
         strat.field = ds_cfg["field"]
 
     return strat
+
+
+def load_json(tokenizer, cfg):
+    strat = CompletionJSONPromptTokenizationStrategy(
+        CompletionPrompter(),
+        tokenizer,
+        True,
+        cfg.sequence_len,
+        max_length=cfg.sequence_len * 64,
+    )
+
+    return strat
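The new strategy reuses the completion pipeline but feeds it the stringified row: `parse_instruction_fields` returns `json.dumps(prompt)` as the instruction and leaves the input and response empty, so the model trains on the raw JSON text. Note that `load_json` passes `True` where `load` previously passed `cfg.train_on_inputs`, hard-enabling training on the full text, and `max_length=cfg.sequence_len * 64` lets the tokenizer keep up to 64 sequence-lengths of tokens per row for later splitting. A minimal sketch of what the strategy produces for one row (the field names here are made up for illustration):

```python
import json

# Hypothetical dataset row; the field names are illustrative only.
row = {"title": "hello", "body": "world", "score": 3}

# What CompletionJSONPromptTokenizationStrategy.parse_instruction_fields
# returns for that row: the stringified JSON, with empty input/response.
instruction, model_input, response = json.dumps(row), "", ""
print(instruction)  # {"title": "hello", "body": "world", "score": 3}
```

Presumably a dataset entry would select this path via axolotl's dotted type syntax (e.g. `type: completion.load_json`), though that wiring isn't shown in this diff.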
src/axolotl/utils/data.py

@@ -698,24 +698,6 @@ def get_dataset_wrapper(
     return dataset_wrapper, dataset_prompter
 
 
-def encode_packed_pretraining(
-    tokenizer: PreTrainedTokenizerBase, max_tokens: int, examples: List[str]
-):
-    # tokenize all the examples
-    # rows get split with stride (overlap)
-    res = tokenizer(
-        examples,
-        truncation=True,
-        max_length=max_tokens,
-        add_special_tokens=True,
-        return_overflowing_tokens=True,
-        stride=256,
-    )
-    # convert to a dataset.from_list
-    # use a dataloader and multipack batch sampler to pack the data
-    pass
-
-
 def encode_pretraining(
     tokenizer: PreTrainedTokenizerBase, max_tokens: int, examples: List[str]
 ) -> Dict[str, List]:
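For reference on what the removed stub was sketching: with a fast Hugging Face tokenizer, `return_overflowing_tokens=True` splits each over-length row into multiple `max_length`-sized chunks, and `stride` sets how many tokens consecutive chunks overlap so no context is lost at chunk boundaries. A minimal sketch, using the `gpt2` tokenizer as a stand-in:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # placeholder model for the sketch

res = tok(
    ["word " * 5000],               # one row much longer than max_length
    truncation=True,
    max_length=1024,
    add_special_tokens=True,
    return_overflowing_tokens=True,
    stride=256,                     # consecutive chunks share 256 tokens
)

# One long row becomes several 1024-token chunks, each mapped back to
# its source row via overflow_to_sample_mapping.
print(len(res["input_ids"]))              # > 1
print(res["overflow_to_sample_mapping"])  # [0, 0, 0, ...]
```

The stub's remaining comments point at the intended next steps: wrap the chunks with a `Dataset.from_list`, then feed them through a dataloader with a multipack batch sampler to pack the data.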
@@ -831,7 +813,6 @@ def load_pretraining_dataset(path, tokenizer, max_tokens=2048, seed=42):
     dataset = dataset.map(
         encode,
         batched=True,
-        batch_size=10_000,
         input_columns="text",
         # remove all the existing columns after mapping since they end up having
         # a different length than the encoded/tokenized column
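The surviving comment explains why this map call drops the input columns: a batched `encode` can return more rows than it received (e.g. one row per token chunk), so the untouched original columns would no longer line up and `datasets` raises a length-mismatch error. A minimal sketch of that failure mode, using an in-memory `Dataset` with made-up columns (the real code maps over a streaming dataset):

```python
from datasets import Dataset

ds = Dataset.from_dict({"text": ["a b c", "d e"], "meta": [1, 2]})

def encode(batch):
    # Emits two output rows per input row, like chunked tokenization does.
    return {"tokens": [t.split() for t in batch["text"] for _ in range(2)]}

# Keeping `text`/`meta` would leave 2 original rows against 4 new ones;
# dropping them lets the row count change freely.
out = ds.map(encode, batched=True, remove_columns=ds.column_names)
print(out.column_names)  # ['tokens']
print(len(out))          # 4
```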