diff --git a/src/axolotl/convert.py b/src/axolotl/convert.py
index a953252e9..357e0ec50 100644
--- a/src/axolotl/convert.py
+++ b/src/axolotl/convert.py
@@ -1,47 +1,76 @@
+"""Module containing File Reader, File Writer, Json Parser, and Jsonl Serializer classes"""
+
+
 import json
 import sys
 
 
 class FileReader:
+    """
+    Reads a file and returns its contents as a string
+    """
+
     def read(self, file_path):
-        with open(file_path, "r") as file:
+        with open(file_path, encoding="utf-8") as file:
             return file.read()
 
 
 class FileWriter:
+    """
+    Writes a string to a file
+    """
+
     def __init__(self, file_path):
         self.file_path = file_path
 
     def write(self, content):
-        with open(self.file_path, "w") as file:
+        with open(self.file_path, "w", encoding="utf-8") as file:
             file.write(content)
 
 
 class StdoutWriter:
+    """
+    Writes a string to stdout
+    """
+
     def write(self, content):
         sys.stdout.write(content)
         sys.stdout.write("\n")
 
 
 class JsonParser:
+    """
+    Parses a string as JSON and returns the result
+    """
+
     def parse(self, content):
         return json.loads(content)
 
 
 class JsonlSerializer:
+    """
+    Serializes a list of JSON objects into a JSONL string
+    """
+
     def serialize(self, data):
         lines = [json.dumps(item) for item in data]
         return "\n".join(lines)
 
 
 class JsonToJsonlConverter:
+    """
+    Converts a JSON file to JSONL
+    """
+
     def __init__(self, file_reader, file_writer, json_parser, jsonl_serializer):
         self.file_reader = file_reader
         self.file_writer = file_writer
         self.json_parser = json_parser
         self.jsonl_serializer = jsonl_serializer
 
-    def convert(self, input_file_path, output_file_path):
+    def convert(
+        self, input_file_path, output_file_path
+    ):  # pylint: disable=unused-argument
         content = self.file_reader.read(input_file_path)
         data = self.json_parser.parse(content)
         # data = [r for r in data if r["conversations"]] # vicuna cleaned has rows with empty conversations