Auto fixes from PEP8, fixes from flake8.

2024-05-15 15:18:17 +02:00
parent 9f526a8f3b
commit 4bcd28d0ea
15 changed files with 391 additions and 417 deletions
@@ -1,5 +1,6 @@
 import json
 import time
+from json.decoder import JSONDecodeError

 from typing import Union

@@ -8,13 +9,12 @@ from .hallucinations import KNOWN_HALLUCINATIONS
 ALPHABET = [*"abcdefghijklmnopqrstuvwxyz"]


-
 class Transcript:
    """
    Class for storing transcript data, including speaker information and text segments, 
    and exporting it to various file formats such as JSON, HTML, and LaTeX.
    """
-    
+
    def __init__(self, transcript: dict) -> None:
        """
        Initializes the Transcript object with the given transcript data.
@@ -30,7 +30,7 @@ class Transcript:
        self.speakers = self._extract_speakers()
        self.segments = self._extract_segments()
        self.annotation = {}
-    
+
    def annotate(self, *args, **kwargs) -> dict:
        """
        Annotates the transcript to associate specific names with speakers.
@@ -46,36 +46,41 @@ class Transcript:
            ValueError: If the number of speaker names does not match the number 
                        of speakers, or if an unknown speaker is found.
        """
-        
+
        annotations = {}
        if args and len(args) != len(self.speakers):
-            raise ValueError("Number of speaker names does not match number of speakers")
-        
+            raise ValueError(
+                "Number of speaker names does not match number of speakers")
+
        if args:
            for arg, speaker in zip(args, sorted(self.speakers)):
-                
+
                annotations[speaker] = arg
-        
+
        invalid_speakers = set(kwargs.keys()) - set(self.speakers)
        if invalid_speakers:
-            raise ValueError(f"These keys are not speakers: {', '.join(invalid_speakers)}")
+            raise ValueError(
+                f"These keys are not speakers: {', '.join(invalid_speakers)}")

-        annotations.update({key: kwargs[key] for key in self.speakers if key in kwargs})
+        annotations.update({key: kwargs[key]
+                           for key in self.speakers if key in kwargs})

        self.annotation = annotations
-        
+
        return self
-    
+
    def _remove_hallucinations(self) -> None:
        """
        Removes all occurrences of known hallucinations from all segments of the transcript.
        Segments that are identical to empty strings afterwards are removed from the transcript.
        """
-        segments_to_drop=[]
+        segments_to_drop = []
        for id in self.transcript:
            for snippet in KNOWN_HALLUCINATIONS:
-                self.transcript[id]['text']=self.transcript[id]['text'].replace(snippet,'')
-            if self.transcript[id]['text'] == '': segments_to_drop.append(id)
+                self.transcript[id]['text'] = self.transcript[id]['text'].replace(
+                    snippet, '')
+            if self.transcript[id]['text'] == '':
+                segments_to_drop.append(id)

        for id in segments_to_drop:
            del self.transcript[id]
@@ -87,9 +92,9 @@ class Transcript:
        Returns:
            list: List of unique speaker names in the transcript.
        """
-        
+
        return list(set([self.transcript[id]["speakers"] for id in self.transcript]))
-    
+
    def _extract_segments(self) -> list:
        """
        Extracts all the text segments from the transcript.
@@ -109,23 +114,23 @@ class Transcript:
                time stamps for each segment.
        """
        fstring = ""
-        
+
        for _id in self.transcript:
            seq = self.transcript[_id]
-            
+
            if self.annotation:
                speaker = self.annotation[seq["speakers"]]
            else:
                speaker = seq["speakers"]
-            
+
            segm = seq["segments"]
-            sseg = time.strftime("%H:%M:%S",time.gmtime(segm[0]))
-            eseg = time.strftime("%H:%M:%S",time.gmtime(segm[1]))
-            
+            sseg = time.strftime("%H:%M:%S", time.gmtime(segm[0]))
+            eseg = time.strftime("%H:%M:%S", time.gmtime(segm[1]))
+
            fstring += f"{speaker} ({sseg} ; {eseg}):\t{seq['text']}\n"
-        
+
        return fstring
-    
+
    def __repr__(self) -> str:
        """Return a string representation of the Transcript object.

@@ -133,8 +138,8 @@ class Transcript:
            str: A string that provides an informative description of the object.
        """
        return f"Transcript(speakers = {self.speakers},"\
-                f"segments = {self.segments}, annotation = {self.annotation})"
-    
+            f"segments = {self.segments}, annotation = {self.annotation})"
+
    def get_dict(self) -> dict:
        """
        Get transcript as dict
@@ -142,10 +147,10 @@ class Transcript:
        :return: transcript as dict
        :rtype: dict
        """
-        
+
        return self.transcript
-    
-    def get_json(self, *args, use_annotation : bool = True, **kwargs) -> str:
+
+    def get_json(self, *args, use_annotation: bool = True, **kwargs) -> str:
        """
        Get transcript as json string
        :return: transcript as json string
@@ -153,14 +158,14 @@ class Transcript:
        """
        if "indent" not in kwargs:
            kwargs["indent"] = 3
-        
+
        if use_annotation and self.annotation:
            for _id in self.transcript:
                seq = self.transcript[_id]
                seq["speakers"] = self.annotation[seq["speakers"]]
-            
+
        return json.dumps(self.transcript, *args, **kwargs)
-    
+
    def get_html(self) -> str:
        """
        Get transcript as html string
@@ -171,9 +176,9 @@ class Transcript:
        html = "<p>" + self.__str__().replace("\n", "<br>") + "</p>"
        html = "<html><body>" + html + "</body></html>"
        html = html.replace("\t", "&nbsp;&nbsp;&nbsp;&nbsp;")
-       
-        return html   
-    
+
+        return html
+
    def get_md(self) -> str:
        """Get transcript as Markdown string, using HTML formatting.

@@ -181,7 +186,7 @@ class Transcript:
            str: Transcript as a Markdown string.
        """
        return self.get_html()
-    
+
    def get_tex(self) -> str:
        """Get transcript as LaTeX string. If no annotations are present, the speakers will
        be annotated with the first letters of the alphabet.
@@ -192,43 +197,42 @@ class Transcript:
        if not self.annotation:

            self.annotate(*ALPHABET[:len(self.speakers)])
-        
-        fstring ="\\begin{drama}"
-        
+
+        fstring = "\\begin{drama}"
+
        for speaker in self.speakers:
-            
-            fstring += "\n\t\\Character{"+ str(self.annotation[speaker]) + "}" \
-                "{"+ str(self.annotation[speaker]) + "}"
-        
+
+            fstring += "\n\t\\Character{" + str(self.annotation[speaker]) + "}" \
+                "{" + str(self.annotation[speaker]) + "}"
+
        for id in self.transcript:
            seq = self.transcript[id]
            speaker = self.annotation[seq["speakers"]]
            fstring += f"\n\\{speaker}speaks:\n{seq['text']}"
-        
+
        fstring += "\n\\end{drama}"
-        
+
        return fstring
-        
-            
-    def to_json(self,path, *args, **kwargs) -> None:
+
+    def to_json(self, path, *args, **kwargs) -> None:
        """Save transcript as json file
-        
+
        Args:
            path (str): path to save file
        """
        with open(path, "w") as f:
            json.dump(self.transcript, f, *args, **kwargs)
-    
+
    def to_txt(self, path: str) -> None:
        """Save transcript as a plain-text file.

        Args:
            path (str): Path to save the text file.
        """
-        
+
        with open(path, "w") as f:
            f.write(self.__str__())
-    
+
    def to_md(self, path: str) -> None:
        """Save transcript as a Markdown file, using HTML formatting.

@@ -236,7 +240,7 @@ class Transcript:
            str: Transcript as a Markdown string.
        """
        return self.to_html(path)
-    
+
    def to_html(self, path: str) -> None:
        """
        Save transcript as html file
@@ -244,10 +248,10 @@ class Transcript:
        :param path: path to save file
        :type path: str
        """
-        
+
        with open(path, "w") as file:
            file.write(self.get_html())
-    
+
    def to_tex(self, path: str) -> None:
        """Save transcript as a LaTeX file (placeholder function, implementation needed).

@@ -255,7 +259,7 @@ class Transcript:
            path (str): Path to save the LaTeX file.
        """
        pass
-    
+
    def to_pdf(self, path: str) -> None:
        """Save transcript as a PDF file (placeholder function, implementation needed).

@@ -263,7 +267,7 @@ class Transcript:
            path (str): Path to save the PDF file.
        """
        pass
-    
+
    def save(self, path: str, *args, **kwargs) -> None:
        """Save transcript to file with the given path and file format.

@@ -279,7 +283,7 @@ class Transcript:
        Raises:
            ValueError: If the file format specified in the path is unknown.
        """
-        
+
        if path.endswith(".json"):
            self.to_json(path, *args, **kwargs)
        elif path.endswith(".txt"):
@@ -294,7 +298,7 @@ class Transcript:
            self.to_pdf(path, *args, **kwargs)
        else:
            raise ValueError("Unknown file format")
-        
+
    @classmethod
    def from_json(cls, json: Union[dict, str]) -> "Transcript":
        """Load transcript from json file
@@ -310,10 +314,8 @@ class Transcript:
        else:
            try:
                transcript = json.loads(json)
-            except:
+            except (TypeError, JSONDecodeError):
                with open(json, "r") as f:
                    transcript = json.load(f)
-            
-            return cls(transcript)

-    
+            return cls(transcript)