"""Transcript container with text, JSON, HTML/Markdown and LaTeX exporters.

Module ``autotranscript/transcript_exporter.py``.
"""

import itertools
import json
from html import escape as _escape_html

ALPHABET = [*"abcdefghijklmnopqrstuvwxyz"]

# Characters with a special meaning in LaTeX and their literal replacements.
# Used with ``str.translate`` so every character is rewritten exactly once
# (the braces emitted for a backslash are not escaped a second time).
_TEX_ESCAPES = str.maketrans({
    "\\": r"\textbackslash{}",
    "&": r"\&",
    "%": r"\%",
    "$": r"\$",
    "#": r"\#",
    "_": r"\_",
    "{": r"\{",
    "}": r"\}",
    "~": r"\textasciitilde{}",
    "^": r"\textasciicircum{}",
})


def _tex_command_names(count: int) -> list:
    """Return ``count`` distinct letter-only names: a..z, then aa, ab, ..."""
    words = (
        "".join(letters)
        for width in itertools.count(1)
        for letters in itertools.product(ALPHABET, repeat=width)
    )
    return list(itertools.islice(words, count))


class Transcript:
    """
    Class for storing transcript data and exporting it to files in different formats
    """

    def __init__(self, transcript: dict) -> None:
        """
        :param transcript: mapping of an entry id to a dict providing the keys
            ``"speaker"``, ``"segment"`` and ``"text"``
        :type transcript: dict
        """
        self.transcript = transcript
        self.speakers = self._extract_speakers()
        self.segments = self._extract_segments()
        # Maps an original speaker label to the name shown in the exports.
        self.annotation = {}

    def annotate(self, *args, **kwargs) -> dict:
        """
        Annotate the transcript to define speaker names

        Positional and keyword names may be combined; keyword names win.
        Calling without any argument clears the annotation.

        :param args: speaker names, mapped sequentially onto ``self.speakers``;
            if given, there must be exactly one name per speaker
        :param kwargs: original speaker label as key, new speaker name as value

        :return: dict with original speaker labels as keys and names as values
        :rtype: dict
        :raises ValueError: if the number of positional names does not match
            the number of speakers, or a keyword is not a known speaker
        """
        # Only enforce the count for positional names, so that a
        # keyword-only call is possible.
        if args and len(args) != len(self.speakers):
            raise ValueError("Number of speaker names does not match number of speakers")

        annotations = dict(zip(self.speakers, args))

        for key, name in kwargs.items():
            if key not in self.speakers:
                raise ValueError(f"{key} is not a speaker")
            annotations[key] = name

        self.annotation = annotations
        return annotations

    def _extract_speakers(self) -> list:
        """
        Extract speaker names from transcript

        :return: list of unique speaker names in order of first appearance
        :rtype: list
        """
        # dict.fromkeys de-duplicates while keeping insertion order, so the
        # positional mapping in annotate() is the same on every run.
        return list(dict.fromkeys(
            entry["speaker"] for entry in self.transcript.values()
        ))

    def _extract_segments(self) -> list:
        """
        Extract segments from transcript

        :return: list of segments
        :rtype: list
        """
        return [entry["segment"] for entry in self.transcript.values()]

    def _display_name(self, speaker):
        """Return the annotated name of ``speaker``, or the label itself."""
        return self.annotation.get(speaker, speaker)

    def __str__(self) -> str:
        """
        Get transcript as string

        One ``"<speaker>: <text>"`` line per entry; speakers without an
        annotation keep their original label.

        :return: transcript as string
        :rtype: str
        """
        return "".join(
            f"{self._display_name(entry['speaker'])}: {entry['text']}\n"
            for entry in self.transcript.values()
        )

    def __repr__(self) -> str:
        return f"Transcript(speakers = {self.speakers},"\
               f"segments = {self.segments}, annotation = {self.annotation})"

    def get_dict(self) -> dict:
        """
        Get transcript as dict

        :return: transcript as dict
        :rtype: dict
        """
        return self.transcript

    def get_json(self, *args, **kwargs) -> str:
        """
        Get transcript as json string

        :param args: forwarded to ``json.dumps``
        :param kwargs: forwarded to ``json.dumps``; ``indent`` defaults to 4

        :return: transcript as json string
        :rtype: str
        """
        kwargs.setdefault("indent", 4)
        return json.dumps(self.transcript, *args, **kwargs)

    def get_html(self) -> str:
        """
        Get transcript as html string

        The text is HTML-escaped, line breaks become ``<br>`` and tabs become
        four non-breaking spaces.

        :return: transcript as html string
        :rtype: str
        """
        # NOTE(review): the markup literals below were reconstructed (the tags
        # were lost in the reviewed copy of this file) - confirm against the
        # intended output.
        # Escape first so the entities inserted for tabs are not escaped too.
        text = _escape_html(str(self), quote=False)
        markup = "<p>" + text.replace("\n", "<br>") + "</p>"
        markup = "<html><body>" + markup + "</body></html>"
        return markup.replace("\t", "&nbsp;&nbsp;&nbsp;&nbsp;")

    def get_md(self) -> str:
        """
        Get transcript as markdown string (identical to the html export)

        :return: transcript as markdown string
        :rtype: str
        """
        return self.get_html()

    def get_tex(self) -> str:
        """
        Get transcript as LaTeX ``drama`` environment

        Every speaker is declared with ``\\Character{<name>}{<cmd>}`` and
        speaks through ``\\<cmd>speaks:``. ``<cmd>`` is a generated letter-only
        name (a, b, ...) because TeX command names may only contain letters;
        ``<name>`` is the annotated name, or ``<cmd>`` when the speaker has no
        annotation. ``self.annotation`` is left untouched.

        :return: transcript as LaTeX string
        :rtype: str
        """
        commands = dict(zip(self.speakers, _tex_command_names(len(self.speakers))))

        parts = ["\\begin{drama}"]

        for speaker in self.speakers:
            command = commands[speaker]
            shown = str(self.annotation.get(speaker, command)).translate(_TEX_ESCAPES)
            parts.append("\n\t\\Character{" + shown + "}{" + command + "}")

        for entry in self.transcript.values():
            # Escape %, &, _ ... so spoken text cannot break the document.
            text = str(entry["text"]).translate(_TEX_ESCAPES)
            parts.append(f"\n\\{commands[entry['speaker']]}speaks:\n{text}")

        parts.append("\n\\end{drama}")

        return "".join(parts)

    def to_json(self, path, *args, **kwargs) -> None:
        """
        Save transcript as json file

        :param path: path to save file
        :type path: str
        :param args: forwarded to ``json.dump``
        :param kwargs: forwarded to ``json.dump``
        """
        with open(path, "w", encoding="utf-8") as file:
            json.dump(self.transcript, file, *args, **kwargs)

    def to_txt(self, path: str) -> None:
        """
        Save transcript as plain text file

        :param path: path to save file
        :type path: str
        """
        with open(path, "w", encoding="utf-8") as file:
            file.write(str(self))

    def to_md(self, path: str) -> None:
        """
        Save transcript as markdown file (identical to the html export)

        :param path: path to save file
        :type path: str
        """
        return self.to_html(path)

    def to_html(self, path: str) -> None:
        """
        Save transcript as html file

        :param path: path to save file
        :type path: str
        """
        with open(path, "w", encoding="utf-8") as file:
            file.write(self.get_html())

    def to_tex(self, path: str) -> None:
        """
        Save transcript as LaTeX file

        :param path: path to save file
        :type path: str
        """
        with open(path, "w", encoding="utf-8") as file:
            file.write(self.get_tex())

    def to_pdf(self, path: str) -> None:
        """
        Save transcript as pdf file

        Not implemented yet: calling it currently does nothing.

        :param path: path to save file
        :type path: str
        """
        pass


if __name__ == "__main__":
    with open("tests/test.json", "r") as handle:
        test = Transcript(json.load(handle))
    print(repr(test))
    print(test)