From 52efd41d21e1dfd5056abfa73401673a09a77dbc Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 16 Jun 2023 15:00:22 +0200
Subject: [PATCH] added Transcriptor class which handles Transcription output

---
 autotranscript/transcript_exporter.py | 181 +++++++++++++++++++++++++-
 1 file changed, 175 insertions(+), 6 deletions(-)

diff --git a/autotranscript/transcript_exporter.py b/autotranscript/transcript_exporter.py
index 956b398..ae6f1b6 100644
--- a/autotranscript/transcript_exporter.py
+++ b/autotranscript/transcript_exporter.py
@@ -1,23 +1,192 @@
+import json
+
+ALPHABET = list("abcdefghijklmnopqrstuvwxyz")  # default speaker labels used by get_tex
+
 
 class Transcript:
     """
     Class for storing transcript data
     and exporting it to files in different formats
     """
-    def __init__(self, transcript: str) -> None:
+    def __init__(self, transcript: dict) -> None:
         """
         :param transcript: formated transcript string
         """
         self.transcript = transcript
+        self.speakers = self._extract_speakers()  # unique "speaker" values found in the transcript
+        self.segments = self._extract_segments()  # one "segment" value per transcript entry
+        self.annotation = {}  # speaker label -> display name; stays empty until annotate() is called
     
-    def to_latex(self, path: str) -> None:
+    def annotate(self, *args, **kwargs) -> dict:
+        """
+        Annotate transcript to define speaker names
+
+        :param args: display names, mapped sequentially onto ``self.speakers``
+        :param kwargs: original speaker label as key, display name as value
+
+        :return: dict mapping every original speaker label to its display name
+        :rtype: dict
+        :raises ValueError: on a wrong number of names or an unknown speaker
+        """
+        # Start from the identity mapping so that speakers left unnamed keep
+        # their original label and every later lookup succeeds.
+        annotations = {speaker: speaker for speaker in self.speakers}
+
+        # The count is only enforced for positional names, which allows a
+        # call that uses keyword arguments alone.
+        if args and len(args) != len(self.speakers):
+            raise ValueError("Number of speaker names does not match number of speakers")
+        annotations.update(zip(self.speakers, args))
+
+        for key, name in kwargs.items():
+            if key not in self.speakers:
+                raise ValueError(f"{key} is not a speaker")
+            annotations[key] = name
+
+        self.annotation = annotations
+        return annotations
+    
+    def _extract_speakers(self) -> list:
+        """
+        Extract unique speaker names in order of first appearance
+        :return: list of speaker names, deterministic across runs
+        :rtype: list
+        """
+        return list(dict.fromkeys(seq["speaker"] for seq in self.transcript.values()))
+    
+    def _extract_segments(self) -> list:
+        """
+        Collect the ``"segment"`` value of every transcript entry
+
+        :return: segment values in transcript order
+        :rtype: list
+        """
+        return [entry["segment"] for entry in self.transcript.values()]
+
+    def __str__(self) -> str:
+        """
+        Get transcript as string
+
+        One ``"<speaker>: <text>"`` line is produced per transcript entry.
+        Speakers missing from ``self.annotation`` keep their original label.
+
+        :return: transcript as string
+        :rtype: str
+        """
+        lines = []
+
+        for seq in self.transcript.values():
+            label = seq["speaker"]
+            # Fall back to the raw label instead of raising KeyError when the
+            # annotation does not cover this speaker.
+            speaker = self.annotation.get(label, label)
+            lines.append(f"{speaker}: {seq['text']}\n")
+
+        return "".join(lines)
+    
+    def __repr__(self) -> str:
+        fields = (self.speakers, self.segments, self.annotation)
+        return "Transcript(speakers = {},segments = {}, annotation = {})".format(*fields)
+    
+    def get_dict(self) -> dict:
+        """
+        Get transcript as dict
+
+        :return: the stored transcript dict itself (no copy is made)
+        :rtype: dict
+        """
+        
+        return self.transcript
+    
+    def get_json(self, *args, **kwargs) -> str:
+        """
+        Get transcript as json string (``indent`` defaults to 4)
+        :return: transcript as json string
+        :rtype: str
+        """
+        # Caller-supplied keyword arguments override the default indent.
+        options = {"indent": 4, **kwargs}
+        return json.dumps(self.transcript, *args, **options)
+    
+    def get_html(self) -> str:
+        """
+        Get transcript as html string
+
+        :return: html document; ``&``, ``<`` and ``>`` in the text are escaped
+        :rtype: str
+        """
+        from html import escape  # local import: the file's import block is untouched
+        body = escape(str(self), quote=False).replace("\n", "<br>")
+        body = body.replace("\t", "&nbsp;&nbsp;&nbsp;&nbsp;")
+
+        return "<html><body><p>" + body + "</p></body></html>"
+        
+    
+    def get_md(self) -> str:
+        return self.get_html()  # the Markdown export is currently the HTML export, unchanged
+    
+    def get_tex(self) -> str:
+        """
+        Get transcript as tex string built around a ``drama`` environment
+
+        Unannotated speakers are labelled a, b, c, ... for this export only;
+        ``self.annotation`` is not modified.
+
+        :return: transcript as tex string
+        :raises ValueError: if a speaker has neither annotation nor letter
+        """
+        names = {**dict(zip(self.speakers, ALPHABET)), **self.annotation}
+        if any(speaker not in names for speaker in self.speakers):
+            raise ValueError("Number of speaker names does not match number of speakers")
+        parts = ["\\begin{drama}"]
+
+        for speaker in self.speakers:
+            parts.append("\n\t\\Character{%s}{%s}" % (names[speaker], names[speaker]))
+        for seq in self.transcript.values():
+            parts.append(f"\n\\{names[seq['speaker']]}speaks:\n{seq['text']}")
+        parts.append("\n\\end{drama}")
+        return "".join(parts)
+        
+            
+    def to_json(self, path, *args, **kwargs) -> None:
+        """
+        Save transcript as UTF-8 json file (extra arguments go to ``json.dump``)
+        :param path: path to save file
+        :type path: str
+        """
+        with open(path, "w", encoding="utf-8") as file:
+            json.dump(self.transcript, file, *args, **kwargs)
+    
+    def to_txt(self, path: str) -> None:
+        """Save the plain-text form of the transcript (``str(self)``) to ``path``."""
+        with open(path, "w", encoding="utf-8") as file:
+            file.write(str(self))
+    
+    def to_md(self, path: str) -> None:
+        return self.to_html(path)  # writes exactly what to_html writes; no Markdown-specific output
+    
+    def to_html(self, path: str) -> None:
+        """
+        Write the output of ``get_html`` to a file
+
+        :param path: destination file, opened in text write mode
+        :type path: str
+        """
+        # The file is opened before the markup is generated.
+        with open(path, "w") as out:
+            out.write(self.get_html())
+    
+    def to_tex(self, path: str) -> None:  # stub: the body is only ``pass``, nothing is written
         pass
     
     def to_pdf(self, path: str) -> None:
         pass
     
-    def to_txt(self, path: str) -> None:
-        pass
+if __name__ == "__main__":
+    with open("tests/test.json", "r") as test_file:  # closed on exit, unlike a bare open()
+        test = Transcript(json.load(test_file))
+    print(repr(test), test, sep="\n")
     
-    def to_json(self, path: str) -> None:
-        pass
\ No newline at end of file
+    
+    
+    
\ No newline at end of file