Skip to content

pySemanticSlides API Documentation

doi

Created on 2023-02-12

@author: wf

DOI dataclass

get DOI data

Source code in slides/doi.py
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
@dataclass
class DOI:
    """
    get DOI data

    Fetches bibliographic metadata for a DOI by requesting
    https://doi.org/<doi> with different Accept headers.

    Attributes:
        doi(str): the DOI to look up
        debug(bool): if True print fetched / converted data to stdout
    """
    doi:str
    debug:bool=False

    def debug_dump(self,d:dict):
        """
        dump the given dict if debug mode is on

        Args:
            d(dict): the dictionary to dump
        """
        if self.debug:
            print(json.dumps(d,indent=2))

    def fetchMeta(self,headers:dict)->str:
        """
        get the metadata for my doi

        Args:
            headers(dict): the HTTP headers to use (the Accept header
                selects the returned format)

        Returns:
            str: the decoded response body according to the given headers
        """
        url=f"https://doi.org/{self.doi}"
        req=urllib.request.Request(url,headers=headers)
        # the context manager closes the HTTP response even if reading fails
        with urllib.request.urlopen(req) as response:
            # fall back to utf-8 if the response declares no charset
            encoding = response.headers.get_content_charset('utf-8')
            content = response.read()
        text = content.decode(encoding)
        return text

    def fetchBibtexMeta(self)->str:
        """
        get the metadata for my doi as BibTeX text

        Returns:
            str: the BibTeX text as returned by fetchMeta
        """
        headers= {
            'Accept': 'application/x-bibtex; charset=utf-8'
        }
        text=self.fetchMeta(headers)
        if self.debug:
            print(text)
        return text

    def fetchCiteprocMeta(self)->dict:
        """
        get the metadata for my doi by getting the Citeproc JSON
        result for the doi

        see https://citeproc-js.readthedocs.io/en/latest/csl-json/markup.html

        Returns:
            dict: the parsed JSON metadata
        """
        headers= {
            'Accept': 'application/vnd.citationstyles.csl+json; charset=utf-8'
        }
        text=self.fetchMeta(headers)
        json_data=json.loads(text)
        self.debug_dump(json_data)
        return json_data

    def fetchBibTexDict(self)->dict:
        """
        get a latex BibTexDict for my doi

        Returns:
            dict: the first BibTeX entry with bibliographic metadata in
            bibtex latex format, or None if the result has no entries
        """
        meta_bibtex=self.fetchBibtexMeta()
        bd=bibtexparser.loads(meta_bibtex)
        btex=None
        if bd.entries:
            # only the first entry is used
            btex=bd.entries[0]
            self.debug_dump(btex)
        return btex

    def fetchPlainTextBibTexDict(self)->dict:
        """
        get a plain text BibTexDict for my doi

        Returns:
            dict: a dict with bibliographic metadata in bibtex utf-8 (no latex)
            format, or the empty/None result of fetchBibTexDict unchanged
        """
        btex=self.fetchBibTexDict()
        if btex:
            ln2t=LatexNodes2Text()
            # values are replaced in place; the key set does not change
            for key in btex:
                latex=btex[key]
                no_latex=ln2t.latex_to_text(latex)
                btex[key]=no_latex
            self.debug_dump(btex)
        return btex

debug_dump(d)

dump the given dict if debug mode is on

Parameters:

Name Type Description Default
d(dict)

the dictionary to dump

required
Source code in slides/doi.py
20
21
22
23
24
25
26
27
28
def debug_dump(self,d:dict):
    """
    Pretty-print the given dict as JSON, but only in debug mode.

    Args:
        d(dict): the dictionary to print
    """
    if not self.debug:
        return
    pretty_json=json.dumps(d,indent=2)
    print(pretty_json)

fetchBibTexDict()

get a latex BibTexDict for my doi

Returns:

Name Type Description
dict dict

a dict with bibliographic metadata in bibtex latex format

Source code in slides/doi.py
83
84
85
86
87
88
89
90
91
92
93
94
95
96
def fetchBibTexDict(self)->dict:
    """
    Fetch the BibTeX record for my doi and return its first entry.

    Returns:
        dict: the first entry with bibliographic metadata in bibtex latex
        format, or None if the parsed result has no entries
    """
    bibtex_text=self.fetchBibtexMeta()
    database=bibtexparser.loads(bibtex_text)
    if len(database.entries)==0:
        return None
    first_entry=database.entries[0]
    self.debug_dump(first_entry)
    return first_entry

fetchBibtexMeta()

get the metadata for my doi by fetching the BibTeX result for the doi

Returns:

Name Type Description
dict dict

metadata

Source code in slides/doi.py
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
def fetchBibtexMeta(self)->str:
    """
    get the metadata for my doi as BibTeX text

    Requests the doi with the Accept header 'application/x-bibtex'
    and prints the result if debug mode is on.

    Returns:
        str: the BibTeX text as returned by fetchMeta
    """
    headers= {
        'Accept': 'application/x-bibtex; charset=utf-8'
    }
    text=self.fetchMeta(headers)
    if self.debug:
        print(text)
    return text

fetchCiteprocMeta()

get the meta data for my doi by getting the Citeproc JSON result for the doi

see https://citeproc-js.readthedocs.io/en/latest/csl-json/markup.html

Returns:

Name Type Description
dict dict

metadata

Source code in slides/doi.py
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
def fetchCiteprocMeta(self)->dict:
    """
    Fetch the Citeproc (CSL) JSON metadata for my doi and parse it.

    see https://citeproc-js.readthedocs.io/en/latest/csl-json/markup.html

    Returns:
        dict: the parsed JSON metadata
    """
    accept='application/vnd.citationstyles.csl+json; charset=utf-8'
    response_text=self.fetchMeta({'Accept': accept})
    csl_json=json.loads(response_text)
    self.debug_dump(csl_json)
    return csl_json

fetchMeta(headers)

get the metadata for my doi

Parameters:

Name Type Description Default
headers(dict)

the headers to use

required

Returns:

Name Type Description
dict dict

the metadata according to the given headers

Source code in slides/doi.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
def fetchMeta(self,headers:dict)->str:
    """
    get the metadata for my doi

    Args:
        headers(dict): the HTTP headers to use (the Accept header
            selects the returned format)

    Returns:
        str: the decoded response body according to the given headers
    """
    url=f"https://doi.org/{self.doi}"
    req=urllib.request.Request(url,headers=headers)
    # the context manager closes the HTTP response even if reading fails
    with urllib.request.urlopen(req) as response:
        # fall back to utf-8 if the response declares no charset
        encoding = response.headers.get_content_charset('utf-8')
        content = response.read()
    text = content.decode(encoding)
    return text

fetchPlainTextBibTexDict()

get a plain text BibTexDict for my doi

Returns:

Name Type Description
dict dict

a dict with bibliographic metadata in bibtex utf-8 (no latex) format

Source code in slides/doi.py
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
def fetchPlainTextBibTexDict(self)->dict:
    """
    Fetch the BibTeX entry for my doi and convert every field value
    from LaTeX markup to plain text.

    Returns:
        dict: a dict with bibliographic metadata in bibtex utf-8 (no latex)
        format; an empty or None result of fetchBibTexDict is returned as is
    """
    bibtex_entry=self.fetchBibTexDict()
    if not bibtex_entry:
        # nothing to convert
        return bibtex_entry
    converter=LatexNodes2Text()
    # values are replaced in place; the key set does not change
    for field in bibtex_entry:
        bibtex_entry[field]=converter.latex_to_text(bibtex_entry[field])
    self.debug_dump(bibtex_entry)
    return bibtex_entry

keyvalue_parser

Created on 2023-02-14

@author: wf

BaseKeyValueParser

general KeyValue Parser

Source code in slides/keyvalue_parser.py
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
class BaseKeyValueParser():
    """
    general KeyValue Parser

    Holds the configuration, the list of errors collected while parsing
    and the keyword -> Keydef lookup shared by the concrete parsers.
    """

    def __init__(self,config:KeyValueParserConfig):
        """
        constructor

        Args:
            config(KeyValueParserConfig): the configuration to use
        """
        self.config=config
        self.errors=[]
        self.keydefs_by_keyword={}

    def setKeydefs(self,keydefs:typing.List[Keydef]):
        """
        set my key definitions

        Args:
             keydefs(List[Keydef]): a list of keyword definitions
        """
        self.keydefs_by_keyword=Keydef.as_dict(keydefs)

    def add_error(self,error_msg:str):
        """
        add the given error to my list of errors

        Args:
            error_msg(str): the error to add
        """
        if self.config.debug:
            print(error_msg)
        self.errors.append(error_msg)

    def handleErrors(self,text:str):
        """
        handle my errors with respect to the given text to parse

        Args:
            text(str): the text that was parsed (used in the error message)

        Raises:
            Exception: if errors were recorded and the configuration
                does not ask to ignore errors
        """
        # only fail if something actually went wrong; a clean parse
        # must not raise just because ignore_errors is switched off
        if self.errors and not self.config.ignore_errors:
            error_str="\n".join(self.errors)
            raise Exception(f"key/value parsing of {text} failed with {len(self.errors)} errors:\n{error_str}")

    def getStrippedValues(self,value_list)->list:
        """
        strip all values in the given value list

        Args:
            value_list(list): the list of string values

        Returns:
            list: a new list of stripped values, or the given list
            unchanged if stripping is disabled in the configuration
        """
        if not self.config.strip:
            return value_list
        return [value.strip() for value in value_list]

__init__(config)

constructor

Parameters:

Name Type Description Default
config(KeyValueParserConfig)

the configuration to use

required
Source code in slides/keyvalue_parser.py
102
103
104
105
106
107
108
109
110
111
def __init__(self,config:KeyValueParserConfig):
    """
    Create a parser with the given configuration, no recorded errors
    and no key definitions.

    Args:
        config(KeyValueParserConfig): the configuration to use
    """
    # start with an empty error list and an empty keyword lookup
    self.errors=list()
    self.keydefs_by_keyword=dict()
    self.config=config

add_error(error_msg)

add the given error to my list of errors

Parameters:

Name Type Description Default
error_msg(str)

the error to add

required
Source code in slides/keyvalue_parser.py
122
123
124
125
126
127
128
129
130
131
def add_error(self,error_msg:str):
    """
    Record a parse error; in debug mode the message is also printed.

    Args:
        error_msg(str): the error message to record
    """
    debug_enabled=self.config.debug
    if debug_enabled:
        print(error_msg)
    self.errors.append(error_msg)

getStrippedValues(value_list)

strip all values in the given value list

Source code in slides/keyvalue_parser.py
141
142
143
144
145
146
147
148
149
150
151
def getStrippedValues(self,value_list)->list:
    """
    Strip surrounding whitespace from every value of the given list.

    Args:
        value_list(list): the list of string values

    Returns:
        list: a new list with stripped values, or the given list itself
        if stripping is disabled in the configuration
    """
    if self.config.strip:
        return [value.strip() for value in value_list]
    return value_list

handleErrors(text)

handle my errors with respect to the given text to parse

Source code in slides/keyvalue_parser.py
133
134
135
136
137
138
139
def handleErrors(self,text:str):
    """
    handle my errors with respect to the given text to parse

    Args:
        text(str): the text that was parsed (used in the error message)

    Raises:
        Exception: if errors were recorded and the configuration
            does not ask to ignore errors
    """
    # only fail if something actually went wrong; a clean parse
    # must not raise just because ignore_errors is switched off
    if self.errors and not self.config.ignore_errors:
        error_str="\n".join(self.errors)
        raise Exception(f"key/value parsing of {text} failed with {len(self.errors)} errors:\n{error_str}")

setKeydefs(keydefs)

set my key definitions

Parameters:

Name Type Description Default
keydefs(List[Keydef])

a list of keyword definitions

required
Source code in slides/keyvalue_parser.py
113
114
115
116
117
118
119
120
def setKeydefs(self,keydefs:typing.List[Keydef]):
    """
    Store the given key definitions as a lookup dict by keyword.

    Args:
         keydefs(List[Keydef]): a list of keyword definitions
    """
    keyword_index=Keydef.as_dict(keydefs)
    self.keydefs_by_keyword=keyword_index

KeyValueParser

Bases: BaseKeyValueParser

Key Value Parser (which won't handle all details properly) see https://stackoverflow.com/a/75270267/1497139

Source code in slides/keyvalue_parser.py
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
class KeyValueParser(BaseKeyValueParser):
    """
    Key Value Parser (which won't handle all details properly)
    see https://stackoverflow.com/a/75270267/1497139

    The pyparsing grammar is built by setKeydefs() and stored in
    self.g_grammar; getKeyValues() uses that grammar.
    """

    def __init__(self,config:KeyValueParserConfig):
        """
        constructor

        Args:
            config(KeyValueParserConfig): the configuration to use
        """
        BaseKeyValueParser.__init__(self, config)
        # NOTE: this changes the default whitespace characters on the
        # pp.ParserElement class, not only for this parser instance:
        # tab only when records are newline separated, else newline only
        if config.record_delim=="\n":
            pp.ParserElement.setDefaultWhitespaceChars("\t")
        else:
            pp.ParserElement.setDefaultWhitespaceChars("\n")
        pass


    def setKeydefs(self,keydefs:typing.List[Keydef]):
        """
        overwrite how to set my key definitions

        Besides storing the key definitions this builds the pyparsing
        grammar (self.g_grammar) from the configured delimiters.

        Args:
             keydefs(List[Keydef]): a list of keyword definitions
        """
        BaseKeyValueParser.setKeydefs(self,keydefs)
        # set local variable from config
        record_delim=self.config.record_delim
        key_value_delim=self.config.key_value_delim
        value_delim=self.config.value_delim
        quote=self.config.quote
        #
        # initialize grammar
        #
        # valid keys are alphas
        g_key = pp.Word(pp.alphas)
        # items may not have record or value delimiters or must be quoted
        g_item = pp.OneOrMore(pp.Word(pp.printables+" "+self.config.unicode_chars, excludeChars=record_delim+value_delim+quote) | pp.QuotedString(quote_char=quote))
        # a value is a value_delim delimited list of items
        g_value = pp.delimited_list(g_item, delim=value_delim)
        # the key/value delimiter is matched but suppressed from the result
        l_key_value_sep = pp.Suppress(pp.Literal(key_value_delim))
        g_key_value = g_key + l_key_value_sep + g_value
        # the full grammar is a record_delim delimited list of key/value pairs
        self.g_grammar = pp.delimited_list(g_key_value, delim=record_delim)

        # map a defined keyword to its key; other tokens are returned as is
        g_key.add_parse_action(lambda x: 
            self.keydefs_by_keyword[x[0]].key if x[0] in self.keydefs_by_keyword else x
        )
        # wrap results with more than one item in a list
        # (presumably so a multi-item value stays one token - TODO confirm)
        g_value.add_parse_action(lambda x: 
            [x] if len(x) > 1 else x
        )
        # emit each record as a (key, value) tuple; a ParseResults value
        # is converted to a plain list
        g_key_value.add_parse_action(lambda x: 
            (x[0], x[1].as_list()) if isinstance(x[1],pp.ParseResults) else (x[0], x[1])
        )
        pass

    def getKeyValues(self,text:str)->dict:
        """
        get key/value pairs from the given text using the configured keys definition

        Any exception raised while parsing is recorded via add_error;
        handleErrors then decides whether to raise.

        Args:
            text(str): the text to parse

        Returns:
            dict: the resulting key-value pairs (empty for empty text)
        """
        # errors are collected per call
        self.errors=[]
        key_values = dict()
        if text:
            try:
                for k,v in self.g_grammar.parse_string(text, parse_all=True):
                    if self.config.strip:
                        if isinstance(v,list):
                            v=self.getStrippedValues(v)
                        else:
                            v=v.strip()
                    key_values[k] = v
            except Exception as ex:
                # keep the traceback in the recorded error message
                tb = traceback.format_exc()
                error_msg=f"parsing {text} failed: \n{str(ex)}\n{tb}"
                self.add_error(error_msg)
            self.handleErrors(text)
        return key_values

__init__(config)

constructor

Parameters:

Name Type Description Default
config(KeyValueParserConfig)

the configuration to use

required
Source code in slides/keyvalue_parser.py
219
220
221
222
223
224
225
226
227
228
229
230
231
def __init__(self,config:KeyValueParserConfig):
    """
    Create the parser and set the pyparsing default whitespace characters.

    Args:
        config(KeyValueParserConfig): the configuration to use
    """
    BaseKeyValueParser.__init__(self, config)
    # NOTE: the setting is made on the pp.ParserElement class itself:
    # tab only when records are newline separated, else newline only
    whitespace_chars="\t" if config.record_delim=="\n" else "\n"
    pp.ParserElement.setDefaultWhitespaceChars(whitespace_chars)

getKeyValues(text)

get key/value pairs from the given text using the configured keys definition

Parameters:

Name Type Description Default
text(str)

the text to parse

required

Returns:

Name Type Description
dict dict

the resulting key-value pairs

Source code in slides/keyvalue_parser.py
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
def getKeyValues(self,text:str)->dict:
    """
    Parse the given text with my grammar into key/value pairs.

    Any exception raised while parsing is recorded via add_error;
    handleErrors then decides whether to raise.

    Args:
        text(str): the text to parse

    Returns:
        dict: the resulting key-value pairs (empty for empty text)
    """
    # errors are collected per call
    self.errors=[]
    parsed={}
    if not text:
        return parsed
    try:
        pairs=self.g_grammar.parse_string(text, parse_all=True)
        for key,value in pairs:
            if self.config.strip:
                value=self.getStrippedValues(value) if isinstance(value,list) else value.strip()
            parsed[key]=value
    except Exception as ex:
        # keep the traceback in the recorded error message
        stack_trace=traceback.format_exc()
        self.add_error(f"parsing {text} failed: \n{str(ex)}\n{stack_trace}")
    self.handleErrors(text)
    return parsed

setKeydefs(keydefs)

overwrite how to set my key definitions

Parameters:

Name Type Description Default
keydefs(List[Keydef])

a list of keyword definitions

required
Source code in slides/keyvalue_parser.py
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
def setKeydefs(self,keydefs:typing.List[Keydef]):
    """
    overwrite how to set my key definitions

    Besides storing the key definitions this builds the pyparsing
    grammar (self.g_grammar) from the configured delimiters.

    Args:
         keydefs(List[Keydef]): a list of keyword definitions
    """
    BaseKeyValueParser.setKeydefs(self,keydefs)
    # set local variable from config
    record_delim=self.config.record_delim
    key_value_delim=self.config.key_value_delim
    value_delim=self.config.value_delim
    quote=self.config.quote
    #
    # initialize grammar
    #
    # valid keys are alphas
    g_key = pp.Word(pp.alphas)
    # items may not have record or value delimiters or must be quoted
    g_item = pp.OneOrMore(pp.Word(pp.printables+" "+self.config.unicode_chars, excludeChars=record_delim+value_delim+quote) | pp.QuotedString(quote_char=quote))
    # a value is a value_delim delimited list of items
    g_value = pp.delimited_list(g_item, delim=value_delim)
    # the key/value delimiter is matched but suppressed from the result
    l_key_value_sep = pp.Suppress(pp.Literal(key_value_delim))
    g_key_value = g_key + l_key_value_sep + g_value
    # the full grammar is a record_delim delimited list of key/value pairs
    self.g_grammar = pp.delimited_list(g_key_value, delim=record_delim)

    # map a defined keyword to its key; other tokens are returned as is
    g_key.add_parse_action(lambda x: 
        self.keydefs_by_keyword[x[0]].key if x[0] in self.keydefs_by_keyword else x
    )
    # wrap results with more than one item in a list
    # (presumably so a multi-item value stays one token - TODO confirm)
    g_value.add_parse_action(lambda x: 
        [x] if len(x) > 1 else x
    )
    # emit each record as a (key, value) tuple; a ParseResults value
    # is converted to a plain list
    g_key_value.add_parse_action(lambda x: 
        (x[0], x[1].as_list()) if isinstance(x[1],pp.ParseResults) else (x[0], x[1])
    )
    pass

KeyValueParserConfig dataclass

a configuration for a key/value Parser

Source code in slides/keyvalue_parser.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
@dataclass
class KeyValueParserConfig:
    """
    Settings shared by the key/value parsers: delimiters, quoting
    and error handling.
    """
    # separates a key from its value
    key_value_delim: str = ":"
    # separates the key/value records
    record_delim: str = "•"
    # separates the items of a list value
    value_delim: str = ","
    # quote character for quoted strings
    quote: str = "'"
    # additional (non ASCII) characters allowed in values
    unicode_chars: str = "•→–"
    # strip whitespace from values
    strip: bool = True
    # if False, handleErrors raises an exception
    ignore_errors: bool = True
    # report keywords without a key definition as errors
    defined_keys_only: bool = False
    # print errors as they are recorded
    debug: bool = False

KeyValueSplitParser

Bases: BaseKeyValueParser

Key / Value Parser

see https://stackoverflow.com/questions/75266188/pyparsing-syntax-tree-from-named-value-list/75270267#75270267

Source code in slides/keyvalue_parser.py
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
class KeyValueSplitParser(BaseKeyValueParser):
    """
    Key / Value Parser

    Splits the text with Split instances in three steps:
    records, key/value pairs and (for list keys) value items.

    see https://stackoverflow.com/questions/75266188/pyparsing-syntax-tree-from-named-value-list/75270267#75270267
    """

    def getKeyValues(self,text:str)->dict:
        """
        get key/value pairs from the given text using the configured keys definition

        Args:
            text(str): the text to parse

        Returns:
            dict: the resulting key-value pairs (empty for empty text)
        """
        # errors are collected per call
        self.errors=[]
        result = dict()
        if text:
            # settings shared by all splitters; the configured quote char
            # is passed on so that it is not silently replaced by Split's default
            split_args=dict(quote=self.config.quote,unicode_chars=self.config.unicode_chars)
            try:
                rsplit=Split(delim=self.config.record_delim,**split_args)
                records=rsplit.split(text)
            except Exception as rsplit_ex:
                self.add_error(f"record split failed {rsplit_ex}")
                records=[]
            for record in records:
                key_value_split=Split(delim=self.config.key_value_delim,**split_args)
                key_values=key_value_split.split(record)
                if len(key_values)!=2:
                    self.add_error(f"{key_values} has {len(key_values)}) elements but should have two")
                    continue
                key_str,values_str=key_values
                keyword=key_str.strip()
                if keyword in self.keydefs_by_keyword:
                    keydef=self.keydefs_by_keyword[keyword]
                    # map keyword to key
                    key=keydef.key
                    if keydef.has_list:
                        # value is a list; quotes are removed from the items
                        values_split=Split(delim=self.config.value_delim,keep_quotes=False,**split_args)
                        value=self.getStrippedValues(values_split.split(values_str))
                    else:
                        value=values_str
                else:
                    # an undefined keyword is reported (if configured)
                    # but still stored under its own name
                    if self.config.defined_keys_only:
                        self.add_error(f"undefined keyword {keyword}")
                    key=keyword
                    value=values_str
                if self.config.strip and isinstance(value,str):
                    value=value.strip()
                result[key]=value
            self.handleErrors(text)
        return result

getKeyValues(text)

get key/value pairs from the given text using the configured keys definition

Parameters:

Name Type Description Default
text(str)

the text to parse

required

Returns:

Name Type Description
dict dict

the resulting key-value pairs

Source code in slides/keyvalue_parser.py
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
def getKeyValues(self,text:str)->dict:
    """
    get key/value pairs from the given text using the configured keys definition

    The text is split with Split instances in three steps:
    records, key/value pairs and (for list keys) value items.

    Args:
        text(str): the text to parse

    Returns:
        dict: the resulting key-value pairs (empty for empty text)
    """
    # errors are collected per call
    self.errors=[]
    result = dict()
    if text:
        # settings shared by all splitters; the configured quote char
        # is passed on so that it is not silently replaced by Split's default
        split_args=dict(quote=self.config.quote,unicode_chars=self.config.unicode_chars)
        try:
            rsplit=Split(delim=self.config.record_delim,**split_args)
            records=rsplit.split(text)
        except Exception as rsplit_ex:
            self.add_error(f"record split failed {rsplit_ex}")
            records=[]
        for record in records:
            key_value_split=Split(delim=self.config.key_value_delim,**split_args)
            key_values=key_value_split.split(record)
            if len(key_values)!=2:
                self.add_error(f"{key_values} has {len(key_values)}) elements but should have two")
                continue
            key_str,values_str=key_values
            keyword=key_str.strip()
            if keyword in self.keydefs_by_keyword:
                keydef=self.keydefs_by_keyword[keyword]
                # map keyword to key
                key=keydef.key
                if keydef.has_list:
                    # value is a list; quotes are removed from the items
                    values_split=Split(delim=self.config.value_delim,keep_quotes=False,**split_args)
                    value=self.getStrippedValues(values_split.split(values_str))
                else:
                    value=values_str
            else:
                # an undefined keyword is reported (if configured)
                # but still stored under its own name
                if self.config.defined_keys_only:
                    self.add_error(f"undefined keyword {keyword}")
                key=keyword
                value=values_str
            if self.config.strip and isinstance(value,str):
                value=value.strip()
            result[key]=value
        self.handleErrors(text)
    return result

Keydef dataclass

a key definition

Source code in slides/keyvalue_parser.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
@dataclass
class Keydef:
    """
    Definition of a single key: maps a keyword found in the text
    to the key used in the result.
    """
    # the keyword as it appears in the text
    keyword: str
    # the key the keyword is mapped to
    key: str
    # True if the value is a list of items
    has_list: bool = False

    @classmethod
    def as_dict(cls, keydefs: typing.List['Keydef']) -> typing.Dict[str, 'Keydef']:
        """
        Index the given key definitions by their keyword.

        Args:
            keydefs(list): the list of key defs

        Returns:
            dict: a dict keyword -> Keydef; for duplicate keywords
            the last definition wins
        """
        return {keydef.keyword: keydef for keydef in keydefs}

as_dict(keydefs) classmethod

convert the given list of keydefs to a dict by keyword

Parameters:

Name Type Description Default
keydefs(list)

the list of key defs

required

Returns:

Name Type Description
dict Dict[str, Keydef]

a dict keyword -> Keydef

Source code in slides/keyvalue_parser.py
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
@classmethod
def as_dict(cls, keydefs: typing.List['Keydef']) -> typing.Dict[str, 'Keydef']:
    """
    Index the given key definitions by their keyword.

    Args:
        keydefs(list): the list of key defs

    Returns:
        dict: a dict keyword -> Keydef; for duplicate keywords
        the last definition wins
    """
    return {keydef.keyword: keydef for keydef in keydefs}

SimpleKeyValueParser

Bases: BaseKeyValueParser

a simple key value parser (which won't handle quote properly)

Source code in slides/keyvalue_parser.py
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
class SimpleKeyValueParser(BaseKeyValueParser):
    """
    a simple key value parser (which won't handle quote properly)

    Records, key/value pairs and list items are separated with
    plain str.split calls.
    """

    def getKeyValues(self,text:str)->dict:
        """
        get key/value pairs from the given text using the configured keys definition

        Faulty records are reported via add_error and skipped;
        handleErrors is called once after all records were processed.

        Args:
            text(str): the text to parse

        Returns:
            dict: the resulting key-value pairs (empty for empty text)
        """
        result={}
        # errors are collected per call
        self.errors=[]
        if not text:
            return result
        key_value_delim=self.config.key_value_delim
        for key_value in text.split(self.config.record_delim):
            if key_value_delim not in key_value:
                self.add_error(f"missing key_value delimiter '{key_value_delim} in {key_value}")
                # there is no value to extract, so the record is always
                # skipped (falling through would fail with an IndexError)
                continue
            parts=key_value.split(key_value_delim)
            if len(parts)>2:
                self.add_error(f"notes syntax error: {key_value} has {len(parts)}) elements but should have two")
                # as before, the remaining records are not processed
                break
            # parsed key and value
            pkey,value=parts[0],parts[1]
            pkey=pkey.strip()
            if self.config.strip:
                value=value.strip()
            if pkey in self.keydefs_by_keyword:
                keydef=self.keydefs_by_keyword[pkey]
                key=keydef.key
                if keydef.has_list:
                    # value is a list
                    value=self.getStrippedValues(value.split(self.config.value_delim))
            elif self.config.defined_keys_only:
                self.add_error(f"undefined key {pkey}")
                # do not store the value of a rejected key
                continue
            else:
                key=pkey
            result[key]=value
        # report all collected errors once, including those of
        # records that were skipped above
        self.handleErrors(text)
        return result

getKeyValues(text)

get key/value pairs from the given text using the configured keys definition

Parameters:

Name Type Description Default
text(str)

the text to parse

required

Returns:

Name Type Description
dict dict

the resulting key-value pairs

Source code in slides/keyvalue_parser.py
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
def getKeyValues(self,text:str)->dict:
    """
    get key/value pairs from the given text using the configured keys definition

    Args:
        text(str): the text to parser

    Returns:
        dict: the resulting key-value pairs
    """ 
    result={}
    self.errors=[]
    if text:
        key_values=text.split(self.config.record_delim)
        for key_value in key_values:
            if not self.config.key_value_delim in key_value:
                error_msg=f"missing key_value delimiter '{self.config.key_value_delim} in {key_value}"
                self.add_error(error_msg)
                if self.config.ignore_errors:
                    continue
            parts=key_value.split(self.config.key_value_delim)
            if len(parts)>2:
                error_msg=(f"notes syntax error: {key_value} has {len(parts)}) elements but should have two")
                self.add_error(error_msg)
                break
            # parsed key and value
            pkey,value=parts[0],parts[1]
            pkey=pkey.strip()
            if self.config.strip:
                value=value.strip()
            if pkey in self.keydefs_by_keyword:
                keydef=self.keydefs_by_keyword[pkey]
                key=keydef.key
                if keydef.has_list:
                    value_list=value.split(self.config.value_delim)
                    value_list=self.getStrippedValues(value_list)
                    # value is a list
                    value=value_list
            else:
                if self.config.defined_keys_only:
                    error_msg=f"undefined key {pkey}"
                    self.add_error(error_msg)
                else:
                    key=pkey
            result[key]=value 
            self.handleErrors(text)
    return result

Split

quoted string splitter

Source code in slides/keyvalue_parser.py
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
class Split():
    """
    splitter for delimited text that respects quoted strings
    """

    def __init__(self,delim:str=',',quote:str="'",unicode_chars:str="•→–",keep_quotes:bool=True):
        """
        set up the pyparsing grammar used by split

        Args:
            delim(str): the delimiter char, default comma
            quote(str): the quote char, default single quote
            unicode_chars(str): unicode characters to allow in unquoted values
            keep_quotes(bool): if True keep the quotes around quoted strings, if False remove them
        """
        self.delim=delim
        self.quote=quote
        self.keep_quotes=keep_quotes
        # set on the pyparsing ParserElement class: no default whitespace characters
        pp.ParserElement.setDefaultWhitespaceChars("")
        self.g_quoted=pp.QuotedString(quote_char=quote)
        # unquoted part: printable, extra unicode and blank chars without delimiter and quote
        allowed_chars=pp.printables+unicode_chars+" "
        g_unquoted=pp.Word(allowed_chars, excludeChars=delim+quote)
        self.g_value = pp.OneOrMore(g_unquoted | self.g_quoted)

        def quoted_action(tokens):
            # keep_quotes is evaluated at parse time
            if self.keep_quotes:
                return f"{quote}{tokens[0]}{quote}"
            return f"{tokens[0]}"

        def value_action(tokens):
            # several parts of a single value are merged to one string
            if len(tokens) > 1:
                return "".join(tokens)
            return tokens

        self.g_quoted.add_parse_action(quoted_action)
        self.g_value.add_parse_action(value_action)
        self.g_split = pp.delimited_list(self.g_value, delim=delim)

    def split(self,text:str)->list:
        """
        split the given text with my delimiter, acknowledging my quote char for quoted strings

        Args:
            text(str): the text to split

        Returns:
            list: a list of strings
        """
        return self.g_split.parse_string(text, parse_all=True).asList()

__init__(delim=',', quote="'", unicode_chars='•→–', keep_quotes=True)

constructor

Parameters:

Name Type Description Default
delim(str)

the delimiter char, default comma

required
quote(str)

the quote char, default single quote

required
unicode_chars(str)

unicode characters to allow

required
keep_quotes(bool)

if True keep the quoted strings if False remove quotes

required
Source code in slides/keyvalue_parser.py
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
def __init__(self,delim:str=',',quote:str="'",unicode_chars:str="•→–",keep_quotes:bool=True):
    """
    set up the pyparsing grammar used by split

    Args:
        delim(str): the delimiter char, default comma
        quote(str): the quote char, default single quote
        unicode_chars(str): unicode characters to allow in unquoted values
        keep_quotes(bool): if True keep the quotes around quoted strings, if False remove them
    """
    self.delim=delim
    self.quote=quote
    self.keep_quotes=keep_quotes
    # set on the pyparsing ParserElement class: no default whitespace characters
    pp.ParserElement.setDefaultWhitespaceChars("")
    self.g_quoted=pp.QuotedString(quote_char=quote)
    # unquoted part: printable, extra unicode and blank chars without delimiter and quote
    allowed_chars=pp.printables+unicode_chars+" "
    g_unquoted=pp.Word(allowed_chars, excludeChars=delim+quote)
    self.g_value = pp.OneOrMore(g_unquoted | self.g_quoted)

    def quoted_action(tokens):
        # keep_quotes is evaluated at parse time
        if self.keep_quotes:
            return f"{quote}{tokens[0]}{quote}"
        return f"{tokens[0]}"

    def value_action(tokens):
        # several parts of a single value are merged to one string
        if len(tokens) > 1:
            return "".join(tokens)
        return tokens

    self.g_quoted.add_parse_action(quoted_action)
    self.g_value.add_parse_action(value_action)
    self.g_split = pp.delimited_list(self.g_value, delim=delim)

split(text)

split the given text with my delimiter, acknowledging my quote char for quoted strings

Parameters:

Name Type Description Default
text(str)

the text to split

required

Returns:

Name Type Description
list list

a list of strings

Source code in slides/keyvalue_parser.py
83
84
85
86
87
88
89
90
91
92
93
94
95
def split(self,text:str)->list:
    """
    split the given text with my delimiter, acknowledging my quote char for quoted strings

    Args:
        text(str): the text to split

    Returns:
        list: a list of strings
    """
    # the complete text must match the grammar (parse_all)
    return self.g_split.parse_string(text, parse_all=True).asList()

semslides

Created on 2023-02-23

@author: wf

SemSlides

a semantic mediawiki for slides

Source code in slides/semslides.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
class SemSlides():
    """
    a semantic mediawiki for slides
    """

    def __init__(self, args):
        """
        construct me with the given command line arguments

        Args:
            args(Args): my command line arguments
        """
        self.args = args

    @classmethod
    def getArgParser(cls, version_msg) -> ArgumentParser:
        """
        set up the command line argument parser

        The parser description is taken from Version.description.

        Args:
            version_msg(str): the version message

        Returns:
            ArgumentParser: the argument parser
        """
        parser = ArgumentParser(
            description=Version.description,
            formatter_class=RawDescriptionHelpFormatter,
        )
        parser.add_argument(
            "-a",
            "--about",
            help="show about info [default: %(default)s]",
            action="store_true",
        )
        parser.add_argument(
            "--context",
            default="MetaModel",
            help="context to generate from [default: %(default)s]",
        )
        parser.add_argument(
            "-d",
            "--debug",
            dest="debug",
            action="store_true",
            help="show debug info",
        )
        parser.add_argument(
            "-V",
            "--version",
            action="version",
            version=version_msg,
        )
        parser.add_argument(
            "--wikiId",
            default="wiki",
            help="id of the wiki to generate for [default: %(default)s]",
        )
        return parser

__init__(args)

constructor

Parameters:

Name Type Description Default
args(Args)

my command line arguments

required
Source code in slides/semslides.py
19
20
21
22
23
24
25
26
def __init__(self, args):
    """
    construct me with the given command line arguments

    Args:
        args(Args): my command line arguments
    """
    self.args = args

getArgParser(version_msg) classmethod

Setup command line argument parser

Parameters:

Name Type Description Default
description(str)

the description

required
version_msg(str)

the version message

required

Returns:

Name Type Description
ArgumentParser ArgumentParser

the argument parser

Source code in slides/semslides.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
@classmethod
def getArgParser(cls, version_msg) -> ArgumentParser:
    """
    set up the command line argument parser

    The parser description is taken from Version.description.

    Args:
        version_msg(str): the version message

    Returns:
        ArgumentParser: the argument parser
    """
    parser = ArgumentParser(
        description=Version.description,
        formatter_class=RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "-a",
        "--about",
        help="show about info [default: %(default)s]",
        action="store_true",
    )
    parser.add_argument(
        "--context",
        default="MetaModel",
        help="context to generate from [default: %(default)s]",
    )
    parser.add_argument(
        "-d",
        "--debug",
        dest="debug",
        action="store_true",
        help="show debug info",
    )
    parser.add_argument(
        "-V",
        "--version",
        action="version",
        version=version_msg,
    )
    parser.add_argument(
        "--wikiId",
        default="wiki",
        help="id of the wiki to generate for [default: %(default)s]",
    )
    return parser

main(argv=None)

main routine

Source code in slides/semslides.py
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
def main(argv=None):
    '''
    main routine

    Parses the command line, creates a SemSlides instance and handles
    the --about option.

    Args:
        argv(list): the command line arguments, sys.argv is used if None

    Returns:
        int: 1 on keyboard interrupt, 2 if an exception was caught,
        otherwise None
    '''
    argv = sys.argv if argv is None else argv
    # the program name is always taken from sys.argv
    program_name = os.path.basename(sys.argv[0])
    # the stack trace is always printed - the flag is fixed
    debug = True
    try:
        version_msg = f'{program_name} (v{Version.version},{Version.updated})'
        args = SemSlides.getArgParser(version_msg).parse_args(argv[1:])
        SemSlides(args)
        if args.about:
            print(version_msg)
            print(f"see {Version.doc_url}")
            webbrowser.open(Version.doc_url)
    except KeyboardInterrupt:
        # handle keyboard interrupt
        return 1
    except Exception as ex:
        padding = " " * len(program_name)
        sys.stderr.write(f"{program_name}: {ex!r}\n")
        sys.stderr.write(f"{padding}  for help use --help")
        if debug:
            print(traceback.format_exc())
        return 2

slidewalker

Created on 2022-04-07

@author: wf

PPT

Bases: object

PowerPoint Presentation with lecture

Source code in slides/slidewalker.py
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
class PPT(object):
    '''
    PowerPoint Presentation with lecture
    '''

    def __init__(self,filepath,pageHeight=297):
        '''
        Constructor

        Args:
            filepath(str): the path of the presentation file
            pageHeight: stored as self.pageHeight

        Raises:
            Exception: if there is no file at the given filepath
        '''
        self.filepath=filepath
        self.basename=os.path.basename(filepath)
        self.pageHeight=pageHeight
        if not os.path.isfile(filepath):
            raise Exception("%s does not exist" % filepath)
        self.prs=None
        self.error=None
        self.slides=[]
        # the metadata is filled by open() - preset it so that summary()
        # and asDict() do not fail with an AttributeError before open()
        self.title=None
        self.author=None
        self.created=None

    def summary(self)->str:
        '''
        show a summary of the given lecture

        Returns:
            str: the error and file path if opening failed, otherwise
            title, author, creation time and base name
        '''
        if self.error:
            return f"error: {self.error} at {self.filepath}"
        if hasattr(self, "lecture"):
            return f"{self.title}({len(self.lecture)} lecture)/{self.author}/{self.created}  {self.basename}"
        return f"{self.title}/{self.author}/{self.created}  {self.basename}"

    def asDict(self)->dict:
        """
        convert me to a dict

        Returns:
            dict: error and path if opening failed, otherwise title,
            author, created and path
        """
        if self.error:
            return {"error":str(self.error),"path":self.filepath}
        return {"title":self.title,"author":self.author,"created":self.created,"path":self.filepath}

    def open(self):
        '''
        open my presentation and read author, created and title from its
        core properties

        Any exception is not raised but stored in self.error
        '''
        try:
            self.prs = Presentation(self.filepath)
            self.author=self.prs.core_properties.author
            self.created=self.prs.core_properties.created
            self.title=self.prs.core_properties.title
        except Exception as ex:
            self.error=ex

    def getSlides(self, excludeHiddenSlides:bool=False,runDelim:str=None):
        '''
        get my slides

        The presentation is opened if that has not happened yet. The slide
        list is rebuilt on each call.

        Args:
            excludeHiddenSlides(bool): if True exclude hidden Slides
            runDelim(str): the run delimiter handed to each Slide,
                Slide.defaultRunDelim is used if None

        Returns:
            list: my slides - empty if opening the presentation failed
        '''
        if runDelim is None:
            runDelim=Slide.defaultRunDelim
        if self.prs is None:
            self.open()
        # start with a fresh list - appending to the previous result
        # duplicated the slides when getSlides was called more than once
        self.slides=[]
        if not self.error:
            page=0
            pdf_page=0
            for slide in self.prs.slides:
                page+=1
                if excludeHiddenSlides and slide._element.get('show') == '0':
                    # slide is hidden → go to next slide
                    # hidden slides count for page but not for pdf_page
                    continue
                pdf_page += 1
                pptSlide=Slide(self,slide,page=page,pdf_page=pdf_page,runDelim=runDelim)
                self.slides.append(pptSlide)
        return self.slides

__init__(filepath, pageHeight=297)

Constructor

Source code in slides/slidewalker.py
163
164
165
166
167
168
169
170
171
172
173
174
def __init__(self,filepath,pageHeight=297):
    '''
    Constructor

    Args:
        filepath(str): the path of the presentation file
        pageHeight: stored as self.pageHeight

    Raises:
        Exception: if there is no file at the given filepath
    '''
    self.filepath=filepath
    self.basename=os.path.basename(filepath)
    self.pageHeight=pageHeight
    if not os.path.isfile(filepath):
        raise Exception("%s does not exist" % filepath)
    self.prs=None
    self.error=None
    self.slides=[]
    # preset the presentation metadata so that reading title, author or
    # created does not fail with an AttributeError while they are not set yet
    self.title=None
    self.author=None
    self.created=None

asDict()

convert me to a dict

Returns:

Name Type Description
dict dict

summary

Source code in slides/slidewalker.py
189
190
191
192
193
194
195
196
197
198
199
200
def asDict(self)->dict:
    """
    convert me to a dict

    Returns:
        dict: error and path if there is an error, otherwise title,
        author, created and path
    """
    if self.error:
        return {"error":str(self.error),"path":self.filepath}
    return {
        "title":self.title,
        "author":self.author,
        "created":self.created,
        "path":self.filepath
    }

getSlides(excludeHiddenSlides=False, runDelim=None)

get my slides

Parameters:

Name Type Description Default
excludeHiddenSlides(bool)

if True exclude hidden Slides

required
Source code in slides/slidewalker.py
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
def getSlides(self, excludeHiddenSlides:bool=False,runDelim:str=None):
    '''
    get my slides

    The presentation is opened if that has not happened yet. The slide
    list is rebuilt on each call.

    Args:
        excludeHiddenSlides(bool): if True exclude hidden Slides
        runDelim(str): the run delimiter handed to each Slide,
            Slide.defaultRunDelim is used if None

    Returns:
        list: my slides - empty if there is an error
    '''
    if runDelim is None:
        runDelim=Slide.defaultRunDelim
    if self.prs is None:
        self.open()
    # start with a fresh list - appending to the previous result
    # duplicated the slides when getSlides was called more than once
    self.slides=[]
    if not self.error:
        page=0
        pdf_page=0
        for slide in self.prs.slides:
            page+=1
            if excludeHiddenSlides and slide._element.get('show') == '0':
                # slide is hidden → go to next slide
                # hidden slides count for page but not for pdf_page
                continue
            pdf_page += 1
            pptSlide=Slide(self,slide,page=page,pdf_page=pdf_page,runDelim=runDelim)
            self.slides.append(pptSlide)
    return self.slides

open()

open my presentation

Source code in slides/slidewalker.py
202
203
204
205
206
207
208
209
210
211
212
def open(self):
    '''
    open my presentation and read author, created and title from its
    core properties

    Any exception is not raised but stored in self.error
    '''
    try:
        self.prs = Presentation(self.filepath)
        properties=self.prs.core_properties
        self.author=properties.author
        self.created=properties.created
        self.title=properties.title
    except Exception as failure:
        self.error=failure

summary()

show a summary of the given lecture

Source code in slides/slidewalker.py
176
177
178
179
180
181
182
183
184
185
186
187
def summary(self)->str:
    '''
    show a summary of the given lecture

    Returns:
        str: the error and file path if there is an error, otherwise
        title, author, creation time and base name - with the lecture
        count if a lecture attribute is available
    '''
    if self.error:
        return f"error: {self.error} at {self.filepath}"
    if hasattr(self, "lecture"):
        return f"{self.title}({len(self.lecture)} lecture)/{self.author}/{self.created}  {self.basename}"
    return f"{self.title}/{self.author}/{self.created}  {self.basename}"

Slide

Bases: object

a single slide

Source code in slides/slidewalker.py
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
class Slide(object):
    '''
    a single slide
    '''
    defaultRunDelim=""

    def __init__(self,ppt,slide,page,pdf_page,runDelim:str=None):
        """
        constructor

        Args:
            ppt: the presentation this slide belongs to
            slide: the wrapped slide object
            page: the page number of the slide
            pdf_page: the pdf page number of the slide
            runDelim(str): the delimiter for text runs, defaultRunDelim if None
        """
        self.ppt=ppt
        self.slide=slide
        self.page=page
        self.pdf_page=pdf_page
        self.name=slide.name
        self.title=None
        if runDelim is None:
            runDelim=Slide.defaultRunDelim
        self.runDelim=runDelim
        # https://stackoverflow.com/a/40821359/1497139
        if slide.shapes.title:
            self.title=slide.shapes.title.text
        # fall back to the slide name if there is no title
        if self.title is None:
            self.title=self.name

    def asDict(self):
        """
        convert me to a dict

        Returns:
            dict: page, pdf_page, title, name, text and notes
        """
        summary={
            "page": self.page,
            "pdf_page": self.pdf_page,
            "title": self.title,
            "name": self.name,
            "text": self.getText(),
            "notes": self.getNotes()
        }
        return summary

    def summary(self):
        """
        Returns:
            str: page, name and title in a single line
        """
        text=(f"{self.page:3d}({self.name}):{self.title}")
        return text

    def getMM(self,emu):
        """
        get the mm value of the given length

        Args:
            emu: a length offering an mm attribute or None

        Returns:
            the mm value, 0 if emu is None
        """
        # https://startbigthinksmall.wordpress.com/2010/01/04/points-inches-and-emus-measuring-units-in-office-open-xml/
        if emu is None:
            return 0
        return emu.mm

    def getText4Shapes(self,shapes,yRange,runDelim:str=None):
        """
        get the text for the given shapes in the given yRange using the given
        run delimiter

        Args:
            shapes: the shapes to get the text from
            yRange: the range handed to YRange.isIn together with the top position of a shape
            runDelim(str): the delimiter for text runs, my runDelim if None

        Returns:
            list: one string per shape with a text frame that is in range
        """
        # lines will be populated with a list of strings,
        # one for each "line"  in presentation
        lines = []
        if runDelim is None:
            runDelim=self.runDelim
        y=None
        for shape in shapes:
            if not shape.has_text_frame:
                continue
            # all runs of all paragraphs of a shape form a single line
            line=runDelim.join(
                str(run.text)
                for paragraph in shape.text_frame.paragraphs
                for run in paragraph.runs
            )
            y=self.getMM(shape.top)
            # a falsy y (None or 0) is never in range
            if y and YRange.isIn(yRange,y):
                lines.append(line)
        # compatibility: an empty entry is still added when the last text shape is in range
        if y and YRange.isIn(yRange,y):
            lines.append("")
        # always return the collected lines - the return statement used to
        # be part of the condition above so that None was returned and all
        # lines were lost if the last text shape was out of range
        return lines

    def getText(self,yRange=None):
        '''
        get the text in the given yRange

        Args:
            yRange: the range handed to getText4Shapes

        Return:
            list: the text lines of my shapes as returned by getText4Shapes
        '''
        text=self.getText4Shapes(self.slide.shapes,yRange,runDelim=self.runDelim)
        return text

    def getNotes(self,yRange=None,useShapes:bool=False)->str:
        """
        get the notes

        Args:
            yRange: the range handed to getText4Shapes if useShapes is True
            useShapes(bool): if True collect the text of the notes slide shapes
                with getText4Shapes, otherwise use the notes text frame

        Return:
            the notes for this slide - an empty string if there are none
        """
        text=""
        if self.slide.has_notes_slide:
            notes_slide=self.slide.notes_slide
            if useShapes:
                text=self.getText4Shapes(notes_slide.shapes,yRange,runDelim=self.runDelim)
            elif notes_slide.notes_text_frame:
                text=notes_slide.notes_text_frame.text
        return text

    def getLayoutName(self)->str:
        '''
        get the layoutName of this slide
        '''
        layoutName=self.slide.slide_layout.name
        return layoutName

__init__(ppt, slide, page, pdf_page, runDelim=None)

constructor

Source code in slides/slidewalker.py
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
def __init__(self,ppt,slide,page,pdf_page,runDelim:str=None):
    """
    constructor

    Args:
        ppt: the presentation this slide belongs to
        slide: the wrapped slide object
        page: the page number of the slide
        pdf_page: the pdf page number of the slide
        runDelim(str): the delimiter for text runs, Slide.defaultRunDelim if None
    """
    self.ppt=ppt
    self.slide=slide
    self.page=page
    self.pdf_page=pdf_page
    self.name=slide.name
    self.runDelim=Slide.defaultRunDelim if runDelim is None else runDelim
    # https://stackoverflow.com/a/40821359/1497139
    title_shape=slide.shapes.title
    title=title_shape.text if title_shape else None
    # the slide name is the fallback if there is no title
    self.title=self.name if title is None else title

getLayoutName()

get the layoutName of this slide

Source code in slides/slidewalker.py
150
151
152
153
154
155
def getLayoutName(self)->str:
    '''
    get the layoutName of this slide

    Returns:
        str: the name of the slide layout of my slide
    '''
    return self.slide.slide_layout.name

getNotes(yRange=None, useShapes=False)

get the notes

Return

str: the notes for this slide

Source code in slides/slidewalker.py
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
def getNotes(self,yRange=None,useShapes:bool=False)->str:
    """
    get the notes

    Args:
        yRange: the range handed to getText4Shapes if useShapes is True
        useShapes(bool): if True collect the text of the notes slide shapes
            with getText4Shapes, otherwise use the notes text frame

    Return:
        the notes for this slide - an empty string if there are none
    """
    if not self.slide.has_notes_slide:
        return ""
    notes_slide=self.slide.notes_slide
    if useShapes:
        return self.getText4Shapes(notes_slide.shapes,yRange,runDelim=self.runDelim)
    if notes_slide.notes_text_frame:
        return notes_slide.notes_text_frame.text
    return ""

getText(yRange=None)

get the text in the given yRange

Parameters:

Name Type Description Default
yRange
None
Return

list: the text lines of this slide as returned by getText4Shapes

Source code in slides/slidewalker.py
121
122
123
124
125
126
127
128
129
130
131
132
def getText(self,yRange=None):
    '''
    get the text in the given yRange

    Args:
        yRange: the range handed to getText4Shapes

    Return:
        the result of getText4Shapes for the shapes of my slide
    '''
    return self.getText4Shapes(self.slide.shapes,yRange,runDelim=self.runDelim)

getText4Shapes(shapes, yRange, runDelim=None)

get the text for the given shapes in the given yRange using the given run delimiter

Parameters:

Name Type Description Default
shapes
required
yRange
required
runDelim(str)

the delimiter for text runs

required
Source code in slides/slidewalker.py
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
def getText4Shapes(self,shapes,yRange,runDelim:str=None):
    """
    get the text for the given shapes in the given yRange using the given
    run delimiter

    Args:
        shapes: the shapes to get the text from
        yRange: the range handed to YRange.isIn together with the top position of a shape
        runDelim(str): the delimiter for text runs, my runDelim if None

    Returns:
        list: one string per shape with a text frame that is in range
    """
    # lines will be populated with a list of strings,
    # one for each "line"  in presentation
    lines = []
    if runDelim is None:
        runDelim=self.runDelim
    y=None
    for shape in shapes:
        if not shape.has_text_frame:
            continue
        # all runs of all paragraphs of a shape form a single line
        line=runDelim.join(
            str(run.text)
            for paragraph in shape.text_frame.paragraphs
            for run in paragraph.runs
        )
        y=self.getMM(shape.top)
        # a falsy y (None or 0) is never in range
        if y and YRange.isIn(yRange,y):
            lines.append(line)
    # compatibility: an empty entry is still added when the last text shape is in range
    if y and YRange.isIn(yRange,y):
        lines.append("")
    # always return the collected lines - the return statement used to
    # be part of the condition above so that None was returned and all
    # lines were lost if the last text shape was out of range
    return lines

SlideWalker

Bases: object

get meta information for all powerpoint presentations in a certain folder

Source code in slides/slidewalker.py
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
class SlideWalker(object):
    '''
    get meta information for all powerpoint presentations in a certain folder
    '''

    def __init__(self, rootFolder:str, debug:bool=False):
        '''
        Constructor

        Args:
            rootFolder(str): the path to the root folder of the analysis
            debug(bool): if True switch on debugging
        '''
        self.rootFolder=rootFolder
        self.debug=debug

    def asCsv(self,listOfDicts:list,fieldNames:list=None)->str:
        ''' convert the given list of dicts to CSV
        see https://stackoverflow.com/a/9157370/1497139

        Args:
            listOfDicts(list): the table to convert
            fieldNames(list): the column names to use - if None the keys
                of all records in order of their first appearance

        Returns:
            str: the CSV formatted result
        '''
        output=io.StringIO()
        if fieldNames is None:
            # dict.fromkeys keeps the first-seen order - collecting the
            # keys in a set made the column order non-deterministic
            fieldNames=list(dict.fromkeys(key for record in listOfDicts for key in record))
        writer=csv.DictWriter(output,fieldnames=fieldNames,quoting=csv.QUOTE_NONNUMERIC)
        writer.writeheader()
        for record in listOfDicts:
            writer.writerow(record)
        return output.getvalue()

    def yieldPowerPointFiles(self,verbose:bool=False):
        """
        generate  my power point files

        Presentations that could not be opened are skipped.

        Args:
            verbose(bool): if True show information about the processing
        """
        pptxFiles=self.findFiles(self.rootFolder, ".pptx")
        if verbose:
            print(f"found {len(pptxFiles)} powerpoint files")
        for pptxFile in pptxFiles:
            if verbose:
                print(f"Extracting data from {pptxFile}")
            ppt=PPT(pptxFile)
            ppt.open()
            if not ppt.error:
                yield ppt

    def yieldSlides(self,ppt,verbose:bool, excludeHiddenSlides:bool=False, runDelim:str=None,slideDetails:bool=False):
        """
        yield all slides

        Args:
            ppt: the presentation to get the slides from
            verbose(bool): if True print details on stdout
            excludeHiddenSlides(bool): If True hidden lecture will be excluded and also ignored in the page counting
            runDelim(str): the delimiter to use for powerpoint slide text
            slideDetails(bool): the slide summary is only printed if both verbose and slideDetails are True
        """
        ppt.getSlides(excludeHiddenSlides=excludeHiddenSlides,runDelim=runDelim)
        for slide in ppt.slides:
            if verbose and slideDetails:
                print(slide.summary())
            yield slide

    def dumpInfo(self,outputFormat:str, excludeHiddenSlides:bool=False, runDelim:str=None, slideDetails:bool=False):
        '''
        dump information about the lecture in the given format

        Args:
            outputFormat(str): json and csv are printed, lod returns the
                collected info, txt switches on verbose output
            excludeHiddenSlides(bool): If True hidden lecture will be excluded and also ignored in the page counting
            runDelim(str): the delimiter to use for powerpoint slide text
            slideDetails(bool): if True show the slide summaries in verbose mode

        Returns:
            dict: the info by presentation base name for the lod format, otherwise None
        '''
        info={}
        csvRecords=[]
        verbose=self.debug or outputFormat=="txt"
        for ppt in self.yieldPowerPointFiles(verbose):
            pptSummary=ppt.asDict()
            if verbose:
                print (f"{ppt.summary()}")
            slideSummary=[]
            for slide in self.yieldSlides(ppt,verbose, excludeHiddenSlides, runDelim,slideDetails=slideDetails):
                slideRecord=slide.asDict()
                csvRecord = OrderedDict()
                csvRecord["basename"]=ppt.basename
                csvRecord["page"]=slideRecord["page"]
                csvRecord["name"]=slideRecord["name"]
                # all whitespace is removed from the title for the CSV record
                title=''.join(slideRecord["title"].split())
                csvRecord["title"]=title
                csvRecords.append(csvRecord)
                slideSummary.append(slideRecord)
            pptSummary["slides"]=slideSummary
            info[ppt.basename]=pptSummary
        if outputFormat=="json":
            #
            # avoid the windows horror story
            # https://stackoverflow.com/questions/9233027/unicodedecodeerror-charmap-codec-cant-decode-byte-x-in-position-y-character
            # https://stackoverflow.com/a/18337754/1497139
            jsonStr=json.dumps(info,indent=2,default=str,ensure_ascii=False).encode('utf8')
            print(jsonStr.decode("utf-8"))
        elif outputFormat=="csv":
            sortedCsvRecords=sorted(csvRecords, key = lambda row: (row["basename"], int(row["page"])))
            csvText=self.asCsv(sortedCsvRecords,["basename","page","name","title"])
            print(csvText)
        elif outputFormat=="lod":
            return info

    def dumpInfoToString(self,outputFormat:str, excludeHiddenSlides:bool=True):
        """
        dump information about the presentations in the given format

        Args:
            outputFormat(str): csv, json or txt
            excludeHiddenSlides(bool): If True hidden lecture will be excluded and also ignored in the page counting

        Returns:
            str: everything dumpInfo has printed to stdout
        """
        f = StringIO()
        with redirect_stdout(f):
            self.dumpInfo(outputFormat, excludeHiddenSlides=excludeHiddenSlides)
        stdout = f.getvalue()
        return stdout

    def findFiles(self,path:str,ext:str)->list:
        '''
        find Files with the given extension in the given path

        File names starting with "~$" are ignored.

        Args:
            path(str): the path to start with
            ext(str): the extension to search for

        Returns:
            list: a list of files found
        '''
        foundFiles=[]
        for root, _dirs, files in os.walk(path, topdown=False):
            for name in files:
                if name.endswith(ext) and not name.startswith("~$"):
                    filepath=os.path.join(root, name)
                    foundFiles.append(filepath)
        return foundFiles

__init__(rootFolder, debug=False)

Constructor

Parameters:

Name Type Description Default
rootFolder(str)

the path to the root folder of the analysis

required
debug(bool)

if True switch on debugging

required
Source code in slides/slidewalker.py
244
245
246
247
248
249
250
251
252
253
def __init__(self, rootFolder:str, debug:bool=False):
    '''
    construct me for the given root folder

    Args:
        rootFolder(str): the path to the root folder of the analysis
        debug(bool): if True switch on debugging
    '''
    self.debug=debug
    self.rootFolder=rootFolder

asCsv(listOfDicts, fieldNames=None)

convert the given list of dicts to CSV; see https://stackoverflow.com/a/9157370/1497139

Parameters:

Name Type Description Default
listOfDicts(list)

the table to convert

required

Returns:

Name Type Description
str str

the CSV formatted result

Source code in slides/slidewalker.py
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
def asCsv(self,listOfDicts:list,fieldNames:list=None)->str:
    ''' convert the given list of dicts to CSV
    see https://stackoverflow.com/a/9157370/1497139

    Args:
        listOfDicts(list): the table to convert
        fieldNames(list): the column names to use - if None the keys
            of all records in order of their first appearance

    Returns:
        str: the CSV formatted result
    '''
    output=io.StringIO()
    if fieldNames is None:
        # dict.fromkeys keeps the first-seen order - collecting the
        # keys in a set made the column order non-deterministic
        fieldNames=list(dict.fromkeys(key for record in listOfDicts for key in record))
    writer=csv.DictWriter(output,fieldnames=fieldNames,quoting=csv.QUOTE_NONNUMERIC)
    writer.writeheader()
    for record in listOfDicts:
        writer.writerow(record)
    return output.getvalue()

dumpInfo(outputFormat, excludeHiddenSlides=False, runDelim=None, slideDetails=False)

dump information about the lecture in the given format

Parameters:

Name Type Description Default
outputFormat(str)

csv, json or txt

required
excludeHiddenSlides(bool)

If True hidden lecture will be excluded and also ignored in the page counting

required
runDelim(str)

the delimiter to use for powerpoint slide text

required
Source code in slides/slidewalker.py
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
def dumpInfo(self,outputFormat:str, excludeHiddenSlides:bool=False, runDelim:str=None, slideDetails:bool=False):
    '''
    dump information about the lecture in the given format

    Args:
        outputFormat(str): json and csv are printed, lod returns the
            collected info, txt switches on verbose output
        excludeHiddenSlides(bool): If True hidden lecture will be excluded and also ignored in the page counting
        runDelim(str): the delimiter to use for powerpoint slide text
        slideDetails(bool): handed to yieldSlides

    Returns:
        dict: the info by presentation base name for the lod format, otherwise None
    '''
    showProgress=self.debug or outputFormat=="txt"
    presentations={}
    rows=[]
    for ppt in self.yieldPowerPointFiles(showProgress):
        pptRecord=ppt.asDict()
        if showProgress:
            print (f"{ppt.summary()}")
        slideRecords=[]
        slides=self.yieldSlides(ppt,showProgress, excludeHiddenSlides, runDelim,slideDetails=slideDetails)
        for slide in slides:
            slideRecord=slide.asDict()
            rows.append(OrderedDict([
                ("basename",ppt.basename),
                ("page",slideRecord["page"]),
                ("name",slideRecord["name"]),
                # all whitespace is removed from the title for the CSV record
                ("title",''.join(slideRecord["title"].split())),
            ]))
            slideRecords.append(slideRecord)
        pptRecord["slides"]=slideRecords
        presentations[ppt.basename]=pptRecord
    if outputFormat=="lod":
        return presentations
    if outputFormat=="json":
        #
        # avoid the windows horror story
        # https://stackoverflow.com/questions/9233027/unicodedecodeerror-charmap-codec-cant-decode-byte-x-in-position-y-character
        # https://stackoverflow.com/a/18337754/1497139
        encoded=json.dumps(presentations,indent=2,default=str,ensure_ascii=False).encode('utf8')
        print(encoded.decode("utf-8"))
    elif outputFormat=="csv":
        def byBasenameAndPage(row):
            return (row["basename"], int(row["page"]))
        rows=sorted(rows, key = byBasenameAndPage)
        print(self.asCsv(rows,["basename","page","name","title"]))
    return None

dumpInfoToString(outputFormat, excludeHiddenSlides=True)

dump information about the presentations in the given format

Parameters:

Name Type Description Default
outputFormat(str)

csv, json or txt

required
excludeHiddenSlides(bool)

If True hidden lecture will be excluded and also ignored in the page counting

required
Source code in slides/slidewalker.py
354
355
356
357
358
359
360
361
362
363
364
365
366
def dumpInfoToString(self,outputFormat:str, excludeHiddenSlides:bool=True):
    """
    capture the output of dumpInfo as a string

    dumpInfo is run while stdout is redirected into an in-memory buffer;
    everything it printed is handed back to the caller.

    Args:
        outputFormat(str): csv, json or txt
        excludeHiddenSlides(bool): If True, hidden slides are excluded and also ignored in the page counting

    Returns:
        str: the text that dumpInfo wrote to stdout
    """
    # redirect_stdout hands back its target, so the buffer can be bound directly
    with redirect_stdout(StringIO()) as captured:
        self.dumpInfo(outputFormat, excludeHiddenSlides=excludeHiddenSlides)
    return captured.getvalue()

findFiles(path, ext)

find files with the given extension in the given path

Parameters:

Name Type Description Default
path(str)

the path to start with

required
ext(str)

the extension to search for

required

Returns:

Name Type Description
list list

a list of files found

Source code in slides/slidewalker.py
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
def findFiles(self,path:str,ext:str)->list:
    '''
    collect the files below the given path that have the given extension

    The directory tree is walked bottom-up (topdown=False), so files in
    sub directories are listed before the files of their parent directory.

    Args:
        path(str): the path to start with
        ext(str): the extension to search for

    Returns:
        list: the full paths of the files found
    '''
    return [
        os.path.join(folder, filename)
        for folder, _subfolders, filenames in os.walk(path, topdown=False)
        for filename in filenames
        # names starting with "~$" are skipped - presumably Office lock/temp files (TODO confirm)
        if filename.endswith(ext) and not filename.startswith("~$")
    ]

yieldPowerPointFiles(verbose=False)

generate my power point files

Parameters:

Name Type Description Default
verbose(bool)

if True show information about the processing

False
Source code in slides/slidewalker.py
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
def yieldPowerPointFiles(self,verbose:bool=False):
    """
    generate the PPT objects for the .pptx files below my root folder

    Each file found is wrapped in a PPT and opened; presentations whose
    error attribute is set after opening are skipped.

    Args:
        verbose(bool): if True show information about the processing

    Yields:
        PPT: the opened presentations
    """
    candidates=self.findFiles(self.rootFolder, ".pptx")
    if verbose:
        print(f"found {len(candidates)} powerpoint files")
    for candidate in candidates:
        if verbose:
            print(f"Extracting data from {candidate}")
        presentation=PPT(candidate)
        presentation.open()
        if presentation.error:
            # opening flagged a problem - leave this file out
            continue
        yield presentation

yieldSlides(ppt, verbose, excludeHiddenSlides=False, runDelim=None, slideDetails=False)

yield all slides

Parameters:

Name Type Description Default
verbose(bool)

if True print details on stdout

required
excludeHiddenSlides(bool)

If True, hidden slides will be excluded and also ignored in the page counting

False
runDelim(str)

the delimiter to use for powerpoint slide text

None
Source code in slides/slidewalker.py
296
297
298
299
300
301
302
303
304
305
306
307
308
309
def yieldSlides(self,ppt,verbose:bool, excludeHiddenSlides:bool=False, runDelim:str=None,slideDetails:bool=False):
    """
    yield all slides of the given presentation

    The slides are loaded via ppt.getSlides and then handed out one by one
    from ppt.slides.

    Args:
        ppt: the presentation to get the slides from
        verbose(bool): if True print details on stdout
        excludeHiddenSlides(bool): If True, hidden slides are excluded and also ignored in the page counting
        runDelim(str): the delimiter to use for powerpoint slide text
        slideDetails(bool): if True (and verbose is set) print the summary of each slide

    Yields:
        the slides of the presentation
    """
    ppt.getSlides(excludeHiddenSlides=excludeHiddenSlides,runDelim=runDelim)
    # both flags are needed before a per slide summary is printed
    showSummary=verbose and slideDetails
    for currentSlide in ppt.slides:
        if showSummary:
            print(currentSlide.summary())
        yield currentSlide

YRange

a Y range

Source code in slides/slidewalker.py
22
23
24
25
26
27
28
29
30
31
32
33
class YRange():
    '''
    a Y range given by an inclusive minimum and maximum y value
    '''
    def __init__(self,minY=0,maxY=300):
        '''
        constructor

        Args:
            minY: the lower bound of the range
            maxY: the upper bound of the range
        '''
        self.minY, self.maxY = minY, maxY

    @staticmethod
    def isIn(yRange,y):
        '''
        check whether the given y value is accepted for the given range

        Args:
            yRange(YRange): the range to check against - None accepts any y
            y: the y value to check - 0 is always accepted

        Returns:
            bool: True if y is 0, no range is given or minY <= y <= maxY
        '''
        if y==0 or yRange is None:
            return True
        return yRange.minY <= y <= yRange.maxY

main(argv=None)

main routine

Source code in slides/slidewalker.py
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
def main(argv=None):
    '''
    main routine: command line entry point of the SlideWalker

    Parses the command line, then either shows the about information
    (--about) or dumps the presentation info of the given root path
    in the requested format.

    Args:
        argv(list): the command line arguments with the program name at index 0 - sys.argv is used if None

    Returns:
        int: 1 on keyboard interrupt, 2 if any other exception occurred, None otherwise
    '''
    if argv is None:
        argv = sys.argv
    program_name = os.path.basename(sys.argv[0])
    program_version_message = f'{program_name} (v{Version.version},{Version.updated})'
    # args stays None until the command line has been parsed so that the
    # exception handler below never touches an unbound name
    args = None
    try:
        parser = argparse.ArgumentParser(description='SlideWalker - get meta information for all powerpoint presentations in a certain folder')
        parser.add_argument("-a","--about",help="show about info [default: %(default)s]",action="store_true")
        parser.add_argument("-d", "--debug", dest="debug", action="store_true", help="show debug info")
        parser.add_argument("-f", "--format", default="json", help="output format to create: csv,json or txt (default: %(default)s)")
        # the flag switches hidden slides ON - dumpInfo gets the negated value
        parser.add_argument("--includeHidden",action="store_true",help="include hidden slides (default: %(default)s)")
        parser.add_argument("--rd","--runDelimiter",dest="runDelim",help="text run delimiter (default: %(default)s) suggested: _↵•",default=Slide.defaultRunDelim)
        parser.add_argument("--rootPath",default=".")
        parser.add_argument('-V', '--version', action='version', version=program_version_message)
        args = parser.parse_args(argv[1:])
        if args.about:
            print(program_version_message)
            print(f"see {Version.doc_url}")
            webbrowser.open(Version.doc_url)
        else:
            sw=SlideWalker(args.rootPath,args.debug)
            sw.dumpInfo(args.format,excludeHiddenSlides=not args.includeHidden,runDelim=args.runDelim)

    except KeyboardInterrupt:
        ### handle keyboard interrupt ###
        return 1
    except Exception as e:
        indent = len(program_name) * " "
        sys.stderr.write(program_name + ": " + repr(e) + "\n")
        sys.stderr.write(indent + "  for help use --help\n")
        # the failure may have happened before parsing finished
        if args is not None and args.debug:
            print(traceback.format_exc())
        return 2

version

Created on 2022-04-01

@author: wf

Version

Bases: object

Version handling for pySemanticSlides

Source code in slides/version.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
class Version(object):
    '''
    Version handling for pySemanticSlides

    Holds the project metadata (name, version, dates, urls and license)
    as class attributes.
    '''
    # name and one line description of the project
    name="pySemanticSlides"
    description='generate Semantic Mediawiki for a set of powerpoint presentations with semantic annotations' 
    # the version number is taken from the slides package
    version=slides.__version__
    # creation date and date of the last update as shown in longDescription
    date = '2023-02-14'
    updated = '2023-02-22'
    authors='Wolfgang Fahl'
    # documentation, discussion and source code urls
    doc_url="https://wiki.bitplan.com/index.php/PySemanticSlides"
    chat_url="https://github.com/WolfgangFahl/pySemanticSlides/discussions"
    cm_url="https://github.com/WolfgangFahl/pySemanticSlides"
    license=f'''Copyright 2020-2023 contributors. All rights reserved.
  Licensed under the Apache License 2.0
  http://www.apache.org/licenses/LICENSE-2.0
  Distributed on an "AS IS" basis without warranties
  or conditions of any kind, either express or implied.'''
    # combines the attributes above - so it has to be defined after them
    longDescription=f"""{name} version {version}
{description}
  Created by {authors} on {date} last updated {updated}"""