Fix: remove the numerous sequential logs (#2461)

* fix: remove sequential logs

* feat(doc): add docs for sample_pack_sequentially and curriculum_sampling
This commit is contained in:
NanoCode012
2025-04-01 20:20:00 +07:00
committed by GitHub
parent 9b95e06cbb
commit f4ae8816bb
2 changed files with 7 additions and 3 deletions

View File

@@ -320,9 +320,13 @@ total_num_tokens:
sample_packing_group_size: 100000
# The number of samples which can be packed into one sequence. Increase if using a large sequence_len with many short samples.
sample_packing_bin_size: 200
sample_pack_sequentially: # Optional[bool]. Whether to pack samples sequentially.
# Optional[bool]. Whether to concatenate samples during pretraining.
pretraining_sample_concatenation:
curriculum_sampling: # Optional[bool]. Whether to use sequential sampling for curriculum learning
# Use batch flattening for speedups when not using sample_packing
batch_flattening:

View File

@@ -12,7 +12,9 @@ from torch.utils.data import BatchSampler, Sampler, SequentialSampler
from axolotl.utils.distributed import reduce_and_broadcast
LOG = logging.getLogger("axolotl.utils.samplers.multipack")
LOG = logging.getLogger(__name__)
LOG.setLevel(logging.INFO)
@numba.njit
@@ -202,7 +204,6 @@ class MultipackBatchSampler(BatchSampler):
lengths_cumsum = np.cumsum(lengths)
if self.sequential:
LOG.debug("using sequential sample packing algorithm")
batches, total_used, total_slots = allocate_sequentially(
lengths=lengths,
rank=0,
@@ -210,7 +211,6 @@ class MultipackBatchSampler(BatchSampler):
n=1,
)
else:
LOG.debug("using non-sequential sample packing algorithm")
batches, total_used, total_slots = allocate(
lengths=lengths,
lengths_cumsum=lengths_cumsum,