Auto fixes from PEP8, fixes from flake8.
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
import json
|
||||
import time
|
||||
from json.decoder import JSONDecodeError
|
||||
|
||||
from typing import Union
|
||||
|
||||
@@ -8,13 +9,12 @@ from .hallucinations import KNOWN_HALLUCINATIONS
|
||||
ALPHABET = [*"abcdefghijklmnopqrstuvwxyz"]
|
||||
|
||||
|
||||
|
||||
class Transcript:
|
||||
"""
|
||||
Class for storing transcript data, including speaker information and text segments,
|
||||
and exporting it to various file formats such as JSON, HTML, and LaTeX.
|
||||
"""
|
||||
|
||||
|
||||
def __init__(self, transcript: dict) -> None:
|
||||
"""
|
||||
Initializes the Transcript object with the given transcript data.
|
||||
@@ -30,7 +30,7 @@ class Transcript:
|
||||
self.speakers = self._extract_speakers()
|
||||
self.segments = self._extract_segments()
|
||||
self.annotation = {}
|
||||
|
||||
|
||||
def annotate(self, *args, **kwargs) -> dict:
|
||||
"""
|
||||
Annotates the transcript to associate specific names with speakers.
|
||||
@@ -46,36 +46,41 @@ class Transcript:
|
||||
ValueError: If the number of speaker names does not match the number
|
||||
of speakers, or if an unknown speaker is found.
|
||||
"""
|
||||
|
||||
|
||||
annotations = {}
|
||||
if args and len(args) != len(self.speakers):
|
||||
raise ValueError("Number of speaker names does not match number of speakers")
|
||||
|
||||
raise ValueError(
|
||||
"Number of speaker names does not match number of speakers")
|
||||
|
||||
if args:
|
||||
for arg, speaker in zip(args, sorted(self.speakers)):
|
||||
|
||||
|
||||
annotations[speaker] = arg
|
||||
|
||||
|
||||
invalid_speakers = set(kwargs.keys()) - set(self.speakers)
|
||||
if invalid_speakers:
|
||||
raise ValueError(f"These keys are not speakers: {', '.join(invalid_speakers)}")
|
||||
raise ValueError(
|
||||
f"These keys are not speakers: {', '.join(invalid_speakers)}")
|
||||
|
||||
annotations.update({key: kwargs[key] for key in self.speakers if key in kwargs})
|
||||
annotations.update({key: kwargs[key]
|
||||
for key in self.speakers if key in kwargs})
|
||||
|
||||
self.annotation = annotations
|
||||
|
||||
|
||||
return self
|
||||
|
||||
|
||||
def _remove_hallucinations(self) -> None:
|
||||
"""
|
||||
Removes all occurrences of known hallucinations from all segments of the transcript.
|
||||
Segments that are identical to empty strings afterwards are removed from the transcript.
|
||||
"""
|
||||
segments_to_drop=[]
|
||||
segments_to_drop = []
|
||||
for id in self.transcript:
|
||||
for snippet in KNOWN_HALLUCINATIONS:
|
||||
self.transcript[id]['text']=self.transcript[id]['text'].replace(snippet,'')
|
||||
if self.transcript[id]['text'] == '': segments_to_drop.append(id)
|
||||
self.transcript[id]['text'] = self.transcript[id]['text'].replace(
|
||||
snippet, '')
|
||||
if self.transcript[id]['text'] == '':
|
||||
segments_to_drop.append(id)
|
||||
|
||||
for id in segments_to_drop:
|
||||
del self.transcript[id]
|
||||
@@ -87,9 +92,9 @@ class Transcript:
|
||||
Returns:
|
||||
list: List of unique speaker names in the transcript.
|
||||
"""
|
||||
|
||||
|
||||
return list(set([self.transcript[id]["speakers"] for id in self.transcript]))
|
||||
|
||||
|
||||
def _extract_segments(self) -> list:
|
||||
"""
|
||||
Extracts all the text segments from the transcript.
|
||||
@@ -109,23 +114,23 @@ class Transcript:
|
||||
time stamps for each segment.
|
||||
"""
|
||||
fstring = ""
|
||||
|
||||
|
||||
for _id in self.transcript:
|
||||
seq = self.transcript[_id]
|
||||
|
||||
|
||||
if self.annotation:
|
||||
speaker = self.annotation[seq["speakers"]]
|
||||
else:
|
||||
speaker = seq["speakers"]
|
||||
|
||||
|
||||
segm = seq["segments"]
|
||||
sseg = time.strftime("%H:%M:%S",time.gmtime(segm[0]))
|
||||
eseg = time.strftime("%H:%M:%S",time.gmtime(segm[1]))
|
||||
|
||||
sseg = time.strftime("%H:%M:%S", time.gmtime(segm[0]))
|
||||
eseg = time.strftime("%H:%M:%S", time.gmtime(segm[1]))
|
||||
|
||||
fstring += f"{speaker} ({sseg} ; {eseg}):\t{seq['text']}\n"
|
||||
|
||||
|
||||
return fstring
|
||||
|
||||
|
||||
def __repr__(self) -> str:
|
||||
"""Return a string representation of the Transcript object.
|
||||
|
||||
@@ -133,8 +138,8 @@ class Transcript:
|
||||
str: A string that provides an informative description of the object.
|
||||
"""
|
||||
return f"Transcript(speakers = {self.speakers},"\
|
||||
f"segments = {self.segments}, annotation = {self.annotation})"
|
||||
|
||||
f"segments = {self.segments}, annotation = {self.annotation})"
|
||||
|
||||
def get_dict(self) -> dict:
|
||||
"""
|
||||
Get transcript as dict
|
||||
@@ -142,10 +147,10 @@ class Transcript:
|
||||
:return: transcript as dict
|
||||
:rtype: dict
|
||||
"""
|
||||
|
||||
|
||||
return self.transcript
|
||||
|
||||
def get_json(self, *args, use_annotation : bool = True, **kwargs) -> str:
|
||||
|
||||
def get_json(self, *args, use_annotation: bool = True, **kwargs) -> str:
|
||||
"""
|
||||
Get transcript as json string
|
||||
:return: transcript as json string
|
||||
@@ -153,14 +158,14 @@ class Transcript:
|
||||
"""
|
||||
if "indent" not in kwargs:
|
||||
kwargs["indent"] = 3
|
||||
|
||||
|
||||
if use_annotation and self.annotation:
|
||||
for _id in self.transcript:
|
||||
seq = self.transcript[_id]
|
||||
seq["speakers"] = self.annotation[seq["speakers"]]
|
||||
|
||||
|
||||
return json.dumps(self.transcript, *args, **kwargs)
|
||||
|
||||
|
||||
def get_html(self) -> str:
|
||||
"""
|
||||
Get transcript as html string
|
||||
@@ -171,9 +176,9 @@ class Transcript:
|
||||
html = "<p>" + self.__str__().replace("\n", "<br>") + "</p>"
|
||||
html = "<html><body>" + html + "</body></html>"
|
||||
html = html.replace("\t", " ")
|
||||
|
||||
return html
|
||||
|
||||
|
||||
return html
|
||||
|
||||
def get_md(self) -> str:
|
||||
"""Get transcript as Markdown string, using HTML formatting.
|
||||
|
||||
@@ -181,7 +186,7 @@ class Transcript:
|
||||
str: Transcript as a Markdown string.
|
||||
"""
|
||||
return self.get_html()
|
||||
|
||||
|
||||
def get_tex(self) -> str:
|
||||
"""Get transcript as LaTeX string. If no annotations are present, the speakers will
|
||||
be annotated with the first letters of the alphabet.
|
||||
@@ -192,43 +197,42 @@ class Transcript:
|
||||
if not self.annotation:
|
||||
|
||||
self.annotate(*ALPHABET[:len(self.speakers)])
|
||||
|
||||
fstring ="\\begin{drama}"
|
||||
|
||||
|
||||
fstring = "\\begin{drama}"
|
||||
|
||||
for speaker in self.speakers:
|
||||
|
||||
fstring += "\n\t\\Character{"+ str(self.annotation[speaker]) + "}" \
|
||||
"{"+ str(self.annotation[speaker]) + "}"
|
||||
|
||||
|
||||
fstring += "\n\t\\Character{" + str(self.annotation[speaker]) + "}" \
|
||||
"{" + str(self.annotation[speaker]) + "}"
|
||||
|
||||
for id in self.transcript:
|
||||
seq = self.transcript[id]
|
||||
speaker = self.annotation[seq["speakers"]]
|
||||
fstring += f"\n\\{speaker}speaks:\n{seq['text']}"
|
||||
|
||||
|
||||
fstring += "\n\\end{drama}"
|
||||
|
||||
|
||||
return fstring
|
||||
|
||||
|
||||
def to_json(self,path, *args, **kwargs) -> None:
|
||||
|
||||
def to_json(self, path, *args, **kwargs) -> None:
|
||||
"""Save transcript as json file
|
||||
|
||||
|
||||
Args:
|
||||
path (str): path to save file
|
||||
"""
|
||||
with open(path, "w") as f:
|
||||
json.dump(self.transcript, f, *args, **kwargs)
|
||||
|
||||
|
||||
def to_txt(self, path: str) -> None:
|
||||
"""Save transcript as a plain-text file.
|
||||
|
||||
Args:
|
||||
path (str): Path to save the text file.
|
||||
"""
|
||||
|
||||
|
||||
with open(path, "w") as f:
|
||||
f.write(self.__str__())
|
||||
|
||||
|
||||
def to_md(self, path: str) -> None:
|
||||
"""Save transcript as a Markdown file, using HTML formatting.
|
||||
|
||||
@@ -236,7 +240,7 @@ class Transcript:
|
||||
str: Transcript as a Markdown string.
|
||||
"""
|
||||
return self.to_html(path)
|
||||
|
||||
|
||||
def to_html(self, path: str) -> None:
|
||||
"""
|
||||
Save transcript as html file
|
||||
@@ -244,10 +248,10 @@ class Transcript:
|
||||
:param path: path to save file
|
||||
:type path: str
|
||||
"""
|
||||
|
||||
|
||||
with open(path, "w") as file:
|
||||
file.write(self.get_html())
|
||||
|
||||
|
||||
def to_tex(self, path: str) -> None:
|
||||
"""Save transcript as a LaTeX file (placeholder function, implementation needed).
|
||||
|
||||
@@ -255,7 +259,7 @@ class Transcript:
|
||||
path (str): Path to save the LaTeX file.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
def to_pdf(self, path: str) -> None:
|
||||
"""Save transcript as a PDF file (placeholder function, implementation needed).
|
||||
|
||||
@@ -263,7 +267,7 @@ class Transcript:
|
||||
path (str): Path to save the PDF file.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
def save(self, path: str, *args, **kwargs) -> None:
|
||||
"""Save transcript to file with the given path and file format.
|
||||
|
||||
@@ -279,7 +283,7 @@ class Transcript:
|
||||
Raises:
|
||||
ValueError: If the file format specified in the path is unknown.
|
||||
"""
|
||||
|
||||
|
||||
if path.endswith(".json"):
|
||||
self.to_json(path, *args, **kwargs)
|
||||
elif path.endswith(".txt"):
|
||||
@@ -294,7 +298,7 @@ class Transcript:
|
||||
self.to_pdf(path, *args, **kwargs)
|
||||
else:
|
||||
raise ValueError("Unknown file format")
|
||||
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, json: Union[dict, str]) -> "Transcript":
|
||||
"""Load transcript from json file
|
||||
@@ -310,10 +314,8 @@ class Transcript:
|
||||
else:
|
||||
try:
|
||||
transcript = json.loads(json)
|
||||
except:
|
||||
except (TypeError, JSONDecodeError):
|
||||
with open(json, "r") as f:
|
||||
transcript = json.load(f)
|
||||
|
||||
return cls(transcript)
|
||||
|
||||
|
||||
return cls(transcript)
|
||||
|
||||
Reference in New Issue
Block a user