diff --git a/autotranscript/transcript_exporter.py b/autotranscript/transcript_exporter.py index 956b398..ae6f1b6 100644 --- a/autotranscript/transcript_exporter.py +++ b/autotranscript/transcript_exporter.py @@ -1,23 +1,192 @@ +import json + +ALPHABET = [*"abcdefghijklmnopqrstuvwxyz"] + class Transcript: """ Class for storing transcript data and exporting it to files in different formats """ - def __init__(self, transcript: str) -> None: + def __init__(self, transcript: dict) -> None: """ :param transcript: formated transcript string """ self.transcript = transcript + self.speakers = self._extract_speakers() + self.segments = self._extract_segments() + self.annotation = {} - def to_latex(self, path: str) -> None: + def annotate(self, *args, **kwargs) -> dict: + """ + Annote transcript to define speaker names + + :param args: list of speaker names will maped sequentially to the speakers + :param kwargs: dict with speaker names as keys and list of segments as values + + :return: dict with speaker names as keys and list of segments as values + :rtype: dict + """ + + annotatios = {} + + if len(args) != len(self.speakers): + raise ValueError("Number of speaker names does not match number of speakers") + + if args: + for arg,ospeaker in zip(args,self.speakers): + annotatios[ospeaker] = arg + + if kwargs: + for key in kwargs: + if key not in self.speakers: + raise ValueError(f"{key} is not a speaker") + annotatios[key] = kwargs[key] + + self.annotation = annotatios + return annotatios + + def _extract_speakers(self) -> list: + """ + Extract speaker names from transcript + :return: list of speaker names + :rtype: list + """ + return list(set([self.transcript[id]["speaker"] for id in self.transcript])) + + def _extract_segments(self) -> list: + """ + Extract segments from transcript + + :return: list of segments + :rtype: list + """ + return [self.transcript[id]["segment"] for id in self.transcript] + + def __str__(self) -> str: + """ + Get transcript as string + + :return: transcript as string + :rtype: str + """ + fstring = "" + + for id in self.transcript: + seq = self.transcript[id] + + if self.annotation: + speaker = self.annotation[seq["speaker"]] + else: + speaker = seq["speaker"] + + fstring += f"{speaker}: {seq['text']}\n" + + return fstring + + def __repr__(self) -> str: + return f"Transcript(speakers = {self.speakers},"\ + f"segments = {self.segments}, annotation = {self.annotation})" + + def get_dict(self) -> dict: + """ + Get transcript as dict + + :return: transcript as dict + :rtype: dict + """ + + return self.transcript + + def get_json(self, *args, **kwargs) -> str: + """ + Get transcript as json string + :return: transcript as json string + :rtype: str + """ + if "indent" not in kwargs: + kwargs["indent"] = 4 + return json.dumps(self.transcript, *args, **kwargs) + + def get_html(self) -> str: + """ + Get transcript as html string + + :return: transcript as html string + :rtype: str + """ + html = "
" + self.__str__().replace("\n", "
") + "