* docs: fix codestyle placeholders in CONTRIBUTING.md
Replace unresolved {codestyle} and {URLofCodestyle} template
variables with Ruff, the project's actual linter/formatter
as configured in .pre-commit-config.yaml.
* fix: replace bare except clauses with specific exception types
- quantization.py: use except ImportError for optional torchao imports
(consistent with line 48 which already uses ImportError correctly)
- cli/config.py: use except (RuntimeError, AssertionError) for CUDA
device property query
Prevents masking unrelated errors like KeyboardInterrupt or SystemExit.
* test: add unit tests for convert.py JSON/JSONL utilities
Cover FileReader, FileWriter, StdoutWriter, JsonParser,
JsonlSerializer, and JsonToJsonlConverter with 8 test cases
including roundtrip and edge case (empty list) scenarios.
Previously this module had zero test coverage.
* fix: address CodeRabbit review feedback
- quantization.py: catch (ImportError, RuntimeError) for optional
torchao imports; CUDA wheel/GPU mismatches raise RuntimeError,
not ImportError
- convert.py: remove unused output_file_path parameter from
JsonToJsonlConverter.convert() — FileWriter already holds the
output path from construction
- tests/test_convert.py: update call site to match new signature
76 lines
1.8 KiB
Python
76 lines
1.8 KiB
Python
"""Module containing File Reader, File Writer, Json Parser, and Jsonl Serializer classes"""
|
|
|
|
import json
|
|
import sys
|
|
|
|
|
|
class FileReader:
    """
    Loads the complete text of a file
    """

    def read(self, file_path):
        """
        Open ``file_path`` as UTF-8 text and return everything in it as one string
        """
        with open(file_path, encoding="utf-8") as handle:
            text = handle.read()
        return text
|
|
|
|
|
|
class FileWriter:
    """
    Saves text to the file path supplied at construction time
    """

    def __init__(self, file_path):
        # Destination is fixed per instance; write() takes only the content.
        self.file_path = file_path

    def write(self, content):
        """
        Open the stored path in "w" mode as UTF-8 text and write ``content`` to it
        """
        with open(self.file_path, "w", encoding="utf-8") as sink:
            sink.write(content)
|
|
|
|
|
|
class StdoutWriter:
    """
    Emits text on standard output, followed by a newline
    """

    def write(self, content):
        """
        Send ``content`` and then a newline character to ``sys.stdout``
        """
        stream = sys.stdout
        for piece in (content, "\n"):
            stream.write(piece)
|
|
|
|
|
|
class JsonParser:
    """
    Turns JSON text into the corresponding Python value
    """

    def parse(self, content):
        """
        Decode ``content`` with ``json.loads`` and hand back the decoded value
        """
        decoded = json.loads(content)
        return decoded
|
|
|
|
|
|
class JsonlSerializer:
    """
    Renders an iterable of JSON-serializable items as JSONL text
    """

    def serialize(self, data):
        """
        Dump each item of ``data`` with ``json.dumps`` and join the results with newlines

        No trailing newline is appended; an empty ``data`` yields an empty string.
        """
        return "\n".join(map(json.dumps, data))
|
|
|
|
|
|
class JsonToJsonlConverter:
    """
    Drives a JSON-to-JSONL conversion using the injected collaborators
    """

    def __init__(self, file_reader, file_writer, json_parser, jsonl_serializer):
        # Collaborators are stored as given; convert() calls read/parse/serialize/write on them.
        self.file_reader = file_reader
        self.file_writer = file_writer
        self.json_parser = json_parser
        self.jsonl_serializer = jsonl_serializer

    def convert(self, input_file_path):
        """
        Read ``input_file_path``, parse it, serialize the result and pass it to the writer
        """
        raw_text = self.file_reader.read(input_file_path)
        records = self.json_parser.parse(raw_text)
        # Disabled filter, kept for reference (vicuna cleaned has rows with empty conversations):
        # records = [r for r in records if r["conversations"]]
        jsonl_text = self.jsonl_serializer.serialize(records)
        self.file_writer.write(jsonl_text)
|