pyCEURmake API Documentation

ceur_ws

Conference

Bases: JSONAble

Represents a conference

Source code in ceurws/ceur_ws.py, lines 762-780
class Conference(JSONAble):
    """
    Represents a conference
    """

    @staticmethod
    def getSamples() -> list[dict]:
        """
        get sample records of the entity
        """
        samples = [
            {
                "id": "Vol-2436",
                "fullTitle": "SIAM International Conference on Data Mining",
                "homepage": "https://www.siam.org/Conferences/CM/Main/sdm19",
                "acronym": "SDM 2019",
            }
        ]
        return samples

getSamples() staticmethod

get sample records of the entity

Source code in ceurws/ceur_ws.py, lines 767-780
@staticmethod
def getSamples() -> list[dict]:
    """
    get sample records of the entity
    """
    samples = [
        {
            "id": "Vol-2436",
            "fullTitle": "SIAM International Conference on Data Mining",
            "homepage": "https://www.siam.org/Conferences/CM/Main/sdm19",
            "acronym": "SDM 2019",
        }
    ]
    return samples
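
A minimal usage sketch (assuming the package is installed and the class is importable from ceurws.ceur_ws): the sample records can seed tests or documentation examples.

from ceurws.ceur_ws import Conference

# inspect the first sample record
sample = Conference.getSamples()[0]
print(sample["acronym"])   # SDM 2019
print(sample["homepage"])  # https://www.siam.org/Conferences/CM/Main/sdm19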

ConferenceManager

Bases: EntityManager

Contains multiple ceurws conferences

Source code in ceurws/ceur_ws.py, lines 783-798
class ConferenceManager(EntityManager):
    """
    Contains multiple ceurws conferences
    """

    def __init__(self):
        super().__init__(
            listName="conferences",
            clazz=Conference,
            tableName="conferences",
            entityName=Conference.__name__,
            primaryKey="id",
            entityPluralName="conferences",
            config=CEURWS.CONFIG,
            name=self.__class__.__name__,
        )

Editor

Bases: JSONAble

Represents a volume editor

Source code in ceurws/ceur_ws.py, lines 713-741
class Editor(JSONAble):
    """
    Represents a volume editor
    """

    @staticmethod
    def getSamples() -> list[dict]:
        """
        get sample records of the entity
        """
        samples = [
            {
                "id": "Vol-2436/John Doe",
                "name": "John Doe",
                "homepage": "http://www.example.org/john",
                "country": "Germany",
                "affiliation": "Leibniz University Hannover & L3S Research Center",
                "submitted": False,
            },
            {
                "id": "Vol-2436/Jane Doe",
                "name": "Jane Doe",
                "homepage": "http://www.example.org/jane",
                "country": "Germany",
                "affiliation": "Technical University Dortmund",
                "submitted": True,
            },
        ]
        return samples

getSamples() staticmethod

get sample records of the entity

Source code in ceurws/ceur_ws.py, lines 718-741
@staticmethod
def getSamples() -> list[dict]:
    """
    get sample records of the entity
    """
    samples = [
        {
            "id": "Vol-2436/John Doe",
            "name": "John Doe",
            "homepage": "http://www.example.org/john",
            "country": "Germany",
            "affiliation": "Leibniz University Hannover & L3S Research Center",
            "submitted": False,
        },
        {
            "id": "Vol-2436/Jane Doe",
            "name": "Jane Doe",
            "homepage": "http://www.example.org/jane",
            "country": "Germany",
            "affiliation": "Technical University Dortmund",
            "submitted": True,
        },
    ]
    return samples
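
The submitted flag marks the editor who uploaded the volume (cf. Volume.getSubmittingEditor below). A small sketch using the sample records:

from ceurws.ceur_ws import Editor

# collect the names of sample editors that submitted their volume
submitters = [e["name"] for e in Editor.getSamples() if e["submitted"]]
print(submitters)  # ['Jane Doe']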

EditorManager

Bases: EntityManager

Contains multiple ceurws editors

Source code in ceurws/ceur_ws.py, lines 744-759
class EditorManager(EntityManager):
    """
    Contains multiple ceurws editors
    """

    def __init__(self):
        super().__init__(
            listName="editors",
            clazz=Editor,
            tableName="editors",
            entityName=Editor.__name__,
            primaryKey="id",
            entityPluralName="editors",
            config=CEURWS.CONFIG,
            name=self.__class__.__name__,
        )

Paper

Bases: JSONAble

Represents a paper

Source code in ceurws/ceur_ws.py, lines 535-617
class Paper(JSONAble):
    """
    Represents a paper
    """

    def __init__(
        self,
        id: str | None = None,
        title: str | None = None,
        type: str | None = None,
        position: int | None = None,
        pagesFrom: int | None = None,
        pagesTo: int | None = None,
        authors: dict | None = None,
    ):
        super().__init__()
        self.id = id
        self.title = title
        self.type = type
        self.position = position
        self.pagesFrom = pagesFrom
        self.pagesTo = pagesTo
        self.authors = authors

    @staticmethod
    def getSamples() -> list[dict]:
        """
        get sample records of the entity
        """
        samples = [
            {  # id is constructed with volume and position
                # → <volNumber>/s<position>/<type>_<position_relative_to_type>
                "id": "Vol-2436/s1/summary",
                "type": "summary",
                "position": 0,
                "title": "1st Workshop on Evaluation and Experimental Design in Data Mining and "
                "Machine Learning (EDML 2019)",
                "pdf": "http://ceur-ws.org/Vol-2436/summary.pdf",
                "pagesFrom": 1,
                "pagesTo": 3,
                "authors": [
                    "Eirini Ntoutsi",
                    "Erich Schubert",
                    "Arthur Zimek",
                    "Albrecht Zimmermann",
                ],
            },
            {
                "id": "Vol-2436/s1/invited_1",
                "type": "invited",
                "position": 1,
                "title": "Evaluation of Unsupervised Learning Results: Making the Seemingly Impossible Possible",
                "pdf": "http://ceur-ws.org/Vol-2436/invited_1.pdf",
                "pagesFrom": 4,
                "pagesTo": 4,
                "authors": ["Ricardo J. G. B. Campello"],
            },
            {
                "id": "Vol-2436/s1/article_1",
                "type": "article",
                "position": 2,
                "title": "EvalNE: A Framework for Evaluating Network Embeddings on Link Prediction",
                "pdf": "http://ceur-ws.org/Vol-2436/article_2.pdf",
                "pagesFrom": 5,
                "pagesTo": 13,
                "authors": [
                    "Alexandru Mara",
                    "Jefrey Lijffijt",
                    "Tijl De Bie",
                ],
            },
        ]
        return samples

    def __str__(self):
        """
        return my string representation

        Returns:
            str: my text representation
        """
        text = self.title
        return text

__str__()

return my string representation

Returns:

    str: my text representation

Source code in ceurws/ceur_ws.py, lines 609-617
def __str__(self):
    """
    return my string representation

    Returns:
        str: my text representation
    """
    text = self.title
    return text

getSamples() staticmethod

get sample records of the entity

Source code in ceurws/ceur_ws.py, lines 559-607
@staticmethod
def getSamples() -> list[dict]:
    """
    get sample records of the entity
    """
    samples = [
        {  # id is constructed with volume and position
            # → <volNumber>/s<position>/<type>_<position_relative_to_type>
            "id": "Vol-2436/s1/summary",
            "type": "summary",
            "position": 0,
            "title": "1st Workshop on Evaluation and Experimental Design in Data Mining and "
            "Machine Learning (EDML 2019)",
            "pdf": "http://ceur-ws.org/Vol-2436/summary.pdf",
            "pagesFrom": 1,
            "pagesTo": 3,
            "authors": [
                "Eirini Ntoutsi",
                "Erich Schubert",
                "Arthur Zimek",
                "Albrecht Zimmermann",
            ],
        },
        {
            "id": "Vol-2436/s1/invited_1",
            "type": "invited",
            "position": 1,
            "title": "Evaluation of Unsupervised Learning Results: Making the Seemingly Impossible Possible",
            "pdf": "http://ceur-ws.org/Vol-2436/invited_1.pdf",
            "pagesFrom": 4,
            "pagesTo": 4,
            "authors": ["Ricardo J. G. B. Campello"],
        },
        {
            "id": "Vol-2436/s1/article_1",
            "type": "article",
            "position": 2,
            "title": "EvalNE: A Framework for Evaluating Network Embeddings on Link Prediction",
            "pdf": "http://ceur-ws.org/Vol-2436/article_2.pdf",
            "pagesFrom": 5,
            "pagesTo": 13,
            "authors": [
                "Alexandru Mara",
                "Jefrey Lijffijt",
                "Tijl De Bie",
            ],
        },
    ]
    return samples
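
A sketch of constructing a Paper directly; all constructor parameters are keyword arguments defaulting to None, and __str__ returns the title:

from ceurws.ceur_ws import Paper

paper = Paper(
    id="Vol-2436/s1/invited_1",
    title="Evaluation of Unsupervised Learning Results: Making the Seemingly Impossible Possible",
    type="invited",
    position=1,
    pagesFrom=4,
    pagesTo=4,
)
print(paper)  # prints the title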

PaperManager

Bases: EntityManager

Contains multiple ceurws papers

Source code in ceurws/ceur_ws.py, lines 620-637
class PaperManager(EntityManager):
    """
    Contains multiple ceurws papers
    """

    def __init__(self):
        super().__init__(
            listName="papers",
            clazz=Paper,
            tableName="papers",
            entityName=Paper.__name__,
            primaryKey="id",
            entityPluralName="papers",
            config=CEURWS.CONFIG,
            handleInvalidListTypes=True,
            listSeparator=",",
            name=self.__class__.__name__,
        )

Session

Bases: JSONAble

Represents a session in ceur-ws

Source code in ceurws/ceur_ws.py, lines 640-691
class Session(JSONAble):
    """
    Represents a session in ceur-ws
    """

    def __init__(self, id: str | None, title: str | None, position: int | None, papers: dict[str, "Paper"] | None):
        """
        constructor
        """
        super().__init__()
        self.id = id
        self.title = title
        self.position = position
        self._papers = papers

    @staticmethod
    def getSamples() -> list[dict]:
        """
        get sample records of the entity
        """
        samples = [
            {
                "id": "Vol-2436/s1",  # id is constructed with volume and position → <volNumber>/s<position>
                "title": "Information Technologies and Intelligent Decision Making Systems II",
                "position": 1,
                "papers": {  # 1:n relation / command chain
                    "VOL-2436/s1/p1": Paper,
                    "VOL-2436/s1/p2": Paper,
                },
            }
        ]
        return samples

    @property
    def papers(self):  # dict: str→Paper
        """
        papers of this session
        """
        # ToDo: add caching / lazy loading of papers
        return self._papers

    @papers.setter
    def papers(self, paper: Paper):
        # ToDo: Adjust to proper 1:n handling
        if hasattr(self, "_papers") and isinstance(self._papers, dict) and paper.id:
            self._papers[paper.id] = paper
        else:
            self._papers = paper

__init__(id, title, position, papers)

constructor

Source code in ceurws/ceur_ws.py, lines 645-653
def __init__(self, id: str | None, title: str | None, position: int | None, papers: dict[str, "Paper"] | None):
    """
    constructor
    """
    super().__init__()
    self.id = id
    self.title = title
    self.position = position
    self._papers = papers

getSamples() staticmethod

get sample records of the entity

Source code in ceurws/ceur_ws.py, lines 655-671
@staticmethod
def getSamples() -> list[dict]:
    """
    get sample records of the entity
    """
    samples = [
        {
            "id": "Vol-2436/s1",  # id is constructed with volume and position → <volNumber>/s<position>
            "title": "Information Technologies and Intelligent Decision Making Systems II",
            "position": 1,
            "papers": {  # 1:n relation / command chain
                "VOL-2436/s1/p1": Paper,
                "VOL-2436/s1/p2": Paper,
            },
        }
    ]
    return samples
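
A sketch of wiring a Paper into a Session via the papers setter; note that all four constructor arguments of Session are required, though each may be None:

from ceurws.ceur_ws import Paper, Session

session = Session(id="Vol-2436/s1", title="Example Session", position=1, papers={})
paper = Paper(id="Vol-2436/s1/p1", title="Example Paper")
session.papers = paper  # stored under its id because _papers is a dict
print(list(session.papers.keys()))  # ['Vol-2436/s1/p1']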

SessionManager

Bases: EntityManager

Contains multiple ceurws sessions

Source code in ceurws/ceur_ws.py, lines 694-710
class SessionManager(EntityManager):
    """
    Contains multiple ceurws sessions
    """

    def __init__(self):
        super().__init__(
            listName="sessions",
            clazz=Session,
            tableName="sessions",
            entityName=Session.__name__,
            primaryKey="id",
            # ToDo: check if just the title is a sufficient key or if an ID must be added
            entityPluralName="sessions",
            config=CEURWS.CONFIG,
            name=self.__class__.__name__,
        )

Volume

Bases: JSONAble

Represents a volume in ceur-ws

Source code in ceurws/ceur_ws.py, lines 21-386
class Volume(JSONAble):
    """
    Represents a volume in ceur-ws
    """

    def __init__(
        self,
        number: int | None = None,
        url: str | None = None,
        title: str | None = None,
        fullTitle: str | None = None,
        acronym: str | None = None,
        lang: str | None = None,
        location: str | None = None,
        country: str | None = None,
        countryWikidataId: str | None = None,
        region: str | None = None,
        city: str | None = None,
        cityWikidataId: str | None = None,
        ordinal: int | None = None,
        date: datetime.datetime | None = None,
        dateFrom: datetime.datetime | None = None,
        dateTo: datetime.datetime | None = None,
        pubYear: str | None = None,
        pubDate: datetime.datetime | None = None,
        submitDate: datetime.datetime | None = None,
        valid: bool = True,
        conference: Optional["Conference"] = None,
        editors: list["Editor"] | None = None,
        sessions: list["Session"] | None = None,
        virtualEvent: bool = False,
        submittedBy: str | None = None,
    ):
        """
        constructor
        """
        self.number = number
        self.url = url
        self.title = title
        self.fullTitle = fullTitle
        self.acronym = acronym
        self.lang = lang
        self.location = location
        self.country = country
        self.countryWikidataId = countryWikidataId
        self.region = region
        self.city = city
        self.cityWikidataId = cityWikidataId
        self.ordinal = ordinal
        self.date = date
        self.dateFrom = dateFrom
        self.dateTo = dateTo
        self.pubYear = pubYear
        self.pubDate = pubDate
        self.submitDate = submitDate
        self.valid = valid
        self.conference = conference
        self.editors = editors
        self.sessions = sessions
        self.virtualEvent = virtualEvent
        self.submittedBy = submittedBy

    def getSamples(self):
        samples = [
            {
                "number": 2436,
                "url": "http://ceur-ws.org/Vol-2436/",
                "title": "Evaluation and Experimental Design in Data Mining and Machine Learning",
                "fullTitle": "1st Workshop on Evaluation and Experimental Design in Data Mining and Machine Learning",
                "acronym": "EDML 2019",
                "lang": "en",
                "location": "Calgary, Alberta, Canada",
                "country": "Canada",
                "region": "Alberta",
                "city": "Calgary",
                "ordinal": 1,
                "date": datetime.datetime(year=2019, month=5, day=4),
                "dateFrom": "",
                "dateTo": "",
                "pubYear": 2019,
                "pubDate": "2019-09-08",
                "submitDate": "2019-07-28",
                "valid": True,
                "conference": Conference,
                "editors": [Editor],
                "sessions": [Session],
                "virtualEvent": False,
            }
        ]
        return samples

    def getVolumeNumber(self):
        """
        get number of the volume
        """
        number = getattr(self, "number", "Volume has no number")
        return number

    def getVolumeUrl(self) -> str | None:
        """
        get the url of the volume page
        """
        number = self.number
        if number is None:
            return None
        url = self.getVolumeUrlOf(number)
        return url

    @staticmethod
    def getVolumeUrlOf(
        number: str | int,
    ) -> str | None:
        """
        get the volume url of the given volume number
        Args:
            number: volume number
        """
        url = None
        if number is not None:
            url = f"http://ceur-ws.org/Vol-{number}/"
        return url

    def isVirtualEvent(self) -> bool:
        """
        Returns True if the event is a virtual event
        """
        return getattr(self, "virtualEvent", False)

    def normalize(self):
        """
        Tries to normalize the properties e.g. breaking loctime into designated location and time properties
        Example: 'Vienna, Austria, July 25th, 2022'
        """
        pass

    def get_loctime(self) -> str | None:
        """
        get the loctime
        """
        loctime = getattr(self, "loctime", None)
        if loctime is None:
            td_title = getattr(self, "tdtitle", None)
            if td_title:
                title_parts = td_title.split(",")
                del title_parts[0]
                loctime = ",".join(title_parts)
                loctime = loctime.strip(".")
                self.loctime = loctime
            else:
                pass
        elif not isinstance(loctime, str):
            loctime = None
        return loctime

    def resolveLoctime(self):
        """
        Resolve the loctime property by breaking it down to city, region, country, dateFrom, and dateTo
        """
        loctime = self.get_loctime()
        if loctime is None:
            return None
        dateFrom, dateTo = self.extractDates(loctime)
        if dateFrom is not None:
            self.dateFrom = dateFrom
        if dateTo is not None:
            self.dateTo = dateTo
        self.extractAndSetLocation(locationStr=loctime)

    def extractAndSetLocation(self, locationStr: str):
        """
        Extracts the location from the given string and returns the found city and country
        ToDo: Once the EventReferenceParser from cc is updated to support city country combinations switch to it
        Args:
            locationStr: string to extract the locations from
        """
        parser = self.__class__.__dict__.get("locationparser")
        if parser is None:
            parser = LocationContext.fromCache()
            self.__class__.locationparser = parser
        locationStr = self.removePartsMatching(locationStr, pattern=r"\d")
        for month in calendar.month_name:
            if month == "":
                continue
            locationStr = locationStr.replace(month, " ")
        locations = parser.locateLocation(locationStr, verbose=True)
        locations = self.rankLocations(locationStr, locations)
        city = None
        cityWikidataId = None
        country = None
        countryWikidataId = None
        if locations is not None and len(locations) > 0:
            bestMatch = locations[0]
            if isinstance(bestMatch, City):
                city = bestMatch.name
                cityWikidataId = bestMatch.wikidataid
                country = bestMatch.country.name
                countryWikidataId = bestMatch.country.wikidataid
            elif isinstance(bestMatch, Country):
                country = bestMatch.name
                countryWikidataId = bestMatch.wikidataid
        virtualEventKeywords = ["virtual", "online"]
        for keyword in virtualEventKeywords:
            if keyword in locationStr.lower():
                self.virtualEvent = True
        if city is not None:
            self.city = city
            self.cityWikidataId = cityWikidataId
        if countryWikidataId is not None:
            self.country = country
            self.countryWikidataId = countryWikidataId

    def extractDates(
        self, dateStr: str, durationThreshold: int = 11
    ) -> tuple[datetime.date | None, datetime.date | None]:
        """ "
        Extracts the start and end time from the given string
        optimized for the format of the loctime property
        Args:
            dateStr: string to extract the dates from
            durationThreshold: number of days allowed between two extracted dates
        """
        dateFrom = None
        dateTo = None
        if dateStr is None:
            return None, None
        # normalize certain foreign language month names that occur regularly
        if "novembro" in dateStr.lower():
            dateStr = dateStr.lower().replace("novembro", "november")
        loctimeParts = re.split("[,)(]", dateStr)
        if re.fullmatch(r"\d{4}", loctimeParts[-1].strip()):
            year = loctimeParts[-1].strip()
            rawDate = loctimeParts[-2].strip()
            if len(loctimeParts) >= 3 and loctimeParts[-3].lower().strip() in [
                cn.lower() for cn in calendar.month_name
            ]:
                rawDate = f"{loctimeParts[-3]} {rawDate}"
            dateParts: list = re.split("[-–‐&]| to | and ", rawDate)
            try:
                if len(dateParts) == 1:
                    dateFrom = dateutil.parser.parse(f"{dateParts[0]} {year}")
                    dateTo = dateFrom
                elif len(dateParts) == 2:
                    dateParts.sort(key=lambda r: len(r), reverse=True)
                    dateOne = dateutil.parser.parse(f"{dateParts[0]} {year}")
                    if len(dateParts[-1].strip()) <= 4:
                        dayMonthParts = dateParts[0].split(" ")
                        dayMonthParts.sort(key=lambda r: len(r), reverse=True)
                        endDate = dayMonthParts[0] + dateParts[1]
                        dateTwo = dateutil.parser.parse(f"{endDate} {year}")
                    else:
                        dateTwo = dateutil.parser.parse(f"{dateParts[1]} {year}")
                    dates = [dateOne, dateTwo]
                    dates.sort()
                    dateFrom = dates[0]
                    dateTo = dates[1]
            except Exception:
                pass
            if dateTo is not None and dateFrom is not None:
                delta = dateTo - dateFrom
                if delta < datetime.timedelta():
                    print("Error this should not be possible")
                elif delta > datetime.timedelta(days=durationThreshold):
                    print(
                        self.number,
                        f"Event with a duration of more than {durationThreshold} days seems suspicious",
                    )
                else:
                    return dateFrom.date(), dateTo.date()
            else:
                print(self.number, dateStr, "→ Dates could not be extracted")
            return None, None
        else:
            # corner case
            return None, None

    @staticmethod
    def removePartsMatching(value: str, pattern: str, separator=","):
        """
        Removes parts from the given value matching the pattern
        """
        parts = value.split(separator)
        resParts = []
        for part in parts:
            if re.search(pattern, part) is None:
                resParts.append(part)
        resValue = separator.join(resParts)
        return resValue

    @staticmethod
    def rankLocations(locationStr: str, locations: list[Location]):
        """
        rank the given locations to find the best match to the given location string
        Args:
            locationStr: location string
            locations: list of location objects
        """
        rankedLocations = []
        for location in locations:
            locationsToCheck = []
            if isinstance(location, City):
                locationsToCheck = [
                    location,
                    location.region,
                    location.country,
                ]
            elif isinstance(location, Region):
                locationsToCheck = [location, location.country]
            elif isinstance(location, Country):
                locationsToCheck = [location]
            score = 0
            for ltc in locationsToCheck:
                if ltc.name in locationStr:
                    score += 1
            rankedLocations.append((score, location))
        rankedLocations.sort(key=lambda scoreTuple: scoreTuple[0], reverse=True)
        return [location for score, location in rankedLocations]

    def __str__(self):
        text = f"Vol-{self.number}"
        return text

    @property
    def sessions(self):
        """
        sessions of this volume
        """
        return self._sessions

    @sessions.setter
    def sessions(self, session):
        # ToDo: Adjust to proper 1:n handling
        if hasattr(self, "_sessions") and isinstance(self._sessions, list):
            self._sessions.append(session)
        else:
            self._sessions = session

    @property
    def papers(self):
        """
        papers of this volume
        """
        return  # ToDo: not implemented yet - papers are not aggregated from sessions

    def extractValuesFromVolumePage(self, timeout: float = 3) -> tuple[dict | None, BeautifulSoup | None]:
        """
        extract values from the given volume page
        """
        self.desc = "?"
        self.h1 = "?"
        if self.url is None:
            return None, None
        volumeParser = VolumeParser(timeout=timeout)
        parseDict, soup = volumeParser.parse_volume(self.getVolumeNumber())
        self.fromDict(parseDict)
        return parseDict, soup

    def getSubmittingEditor(self):
        """
        Returns the Editor that submitted the volume
        """
        submitter = None
        if hasattr(self, "editors"):
            for editor in self.editors:
                if isinstance(editor, Editor) and getattr(editor, "submitted", False):
                    submitter = editor
                    break
        return submitter

papers property

papers of this volume

sessions property writable

sessions of this volume

__init__(number=None, url=None, title=None, fullTitle=None, acronym=None, lang=None, location=None, country=None, countryWikidataId=None, region=None, city=None, cityWikidataId=None, ordinal=None, date=None, dateFrom=None, dateTo=None, pubYear=None, pubDate=None, submitDate=None, valid=True, conference=None, editors=None, sessions=None, virtualEvent=False, submittedBy=None)

constructor

Source code in ceurws/ceur_ws.py, lines 26-81
def __init__(
    self,
    number: int | None = None,
    url: str | None = None,
    title: str | None = None,
    fullTitle: str | None = None,
    acronym: str | None = None,
    lang: str | None = None,
    location: str | None = None,
    country: str | None = None,
    countryWikidataId: str | None = None,
    region: str | None = None,
    city: str | None = None,
    cityWikidataId: str | None = None,
    ordinal: int | None = None,
    date: datetime.datetime | None = None,
    dateFrom: datetime.datetime | None = None,
    dateTo: datetime.datetime | None = None,
    pubYear: str | None = None,
    pubDate: datetime.datetime | None = None,
    submitDate: datetime.datetime | None = None,
    valid: bool = True,
    conference: Optional["Conference"] = None,
    editors: list["Editor"] | None = None,
    sessions: list["Session"] | None = None,
    virtualEvent: bool = False,
    submittedBy: str | None = None,
):
    """
    constructor
    """
    self.number = number
    self.url = url
    self.title = title
    self.fullTitle = fullTitle
    self.acronym = acronym
    self.lang = lang
    self.location = location
    self.country = country
    self.countryWikidataId = countryWikidataId
    self.region = region
    self.city = city
    self.cityWikidataId = cityWikidataId
    self.ordinal = ordinal
    self.date = date
    self.dateFrom = dateFrom
    self.dateTo = dateTo
    self.pubYear = pubYear
    self.pubDate = pubDate
    self.submitDate = submitDate
    self.valid = valid
    self.conference = conference
    self.editors = editors
    self.sessions = sessions
    self.virtualEvent = virtualEvent
    self.submittedBy = submittedBy

extractAndSetLocation(locationStr)

Extracts the location from the given string and returns the found city and country.
ToDo: Once the EventReferenceParser from cc is updated to support city country combinations, switch to it.

Parameters:

    locationStr (str): string to extract the locations from

Source code in ceurws/ceur_ws.py, lines 189-229
def extractAndSetLocation(self, locationStr: str):
    """
    Extracts the location from the given string and returns the found city and country
    ToDo: Once the EventReferenceParser from cc is updated to support city country combinations switch to it
    Args:
        locationStr: string to extract the locations from
    """
    parser = self.__class__.__dict__.get("locationparser")
    if parser is None:
        parser = LocationContext.fromCache()
        self.__class__.locationparser = parser
    locationStr = self.removePartsMatching(locationStr, pattern=r"\d")
    for month in calendar.month_name:
        if month == "":
            continue
        locationStr = locationStr.replace(month, " ")
    locations = parser.locateLocation(locationStr, verbose=True)
    locations = self.rankLocations(locationStr, locations)
    city = None
    cityWikidataId = None
    country = None
    countryWikidataId = None
    if locations is not None and len(locations) > 0:
        bestMatch = locations[0]
        if isinstance(bestMatch, City):
            city = bestMatch.name
            cityWikidataId = bestMatch.wikidataid
            country = bestMatch.country.name
            countryWikidataId = bestMatch.country.wikidataid
        elif isinstance(bestMatch, Country):
            country = bestMatch.name
            countryWikidataId = bestMatch.wikidataid
    virtualEventKeywords = ["virtual", "online"]
    for keyword in virtualEventKeywords:
        if keyword in locationStr.lower():
            self.virtualEvent = True
    if city is not None:
        self.city = city
        self.cityWikidataId = cityWikidataId
    if countryWikidataId is not None:
        self.country = country
        self.countryWikidataId = countryWikidataId

extractDates(dateStr, durationThreshold=11)

" Extracts the start and end time from the given string optimized for the format of the loctime property Args: dateStr: string to extract the dates from durationThreshold: number of days allowed between two extracted dates

Source code in ceurws/ceur_ws.py, lines 231-293
def extractDates(
    self, dateStr: str, durationThreshold: int = 11
) -> tuple[datetime.date | None, datetime.date | None]:
    """ "
    Extracts the start and end time from the given string
    optimized for the format of the loctime property
    Args:
        dateStr: string to extract the dates from
        durationThreshold: number of days allowed between two extracted dates
    """
    dateFrom = None
    dateTo = None
    if dateStr is None:
        return None, None
    # normalize certain foreign language month names that occur regularly
    if "novembro" in dateStr.lower():
        dateStr = dateStr.lower().replace("novembro", "november")
    loctimeParts = re.split("[,)(]", dateStr)
    if re.fullmatch(r"\d{4}", loctimeParts[-1].strip()):
        year = loctimeParts[-1].strip()
        rawDate = loctimeParts[-2].strip()
        if len(loctimeParts) >= 3 and loctimeParts[-3].lower().strip() in [
            cn.lower() for cn in calendar.month_name
        ]:
            rawDate = f"{loctimeParts[-3]} {rawDate}"
        dateParts: list = re.split("[-–‐&]| to | and ", rawDate)
        try:
            if len(dateParts) == 1:
                dateFrom = dateutil.parser.parse(f"{dateParts[0]} {year}")
                dateTo = dateFrom
            elif len(dateParts) == 2:
                dateParts.sort(key=lambda r: len(r), reverse=True)
                dateOne = dateutil.parser.parse(f"{dateParts[0]} {year}")
                if len(dateParts[-1].strip()) <= 4:
                    dayMonthParts = dateParts[0].split(" ")
                    dayMonthParts.sort(key=lambda r: len(r), reverse=True)
                    endDate = dayMonthParts[0] + dateParts[1]
                    dateTwo = dateutil.parser.parse(f"{endDate} {year}")
                else:
                    dateTwo = dateutil.parser.parse(f"{dateParts[1]} {year}")
                dates = [dateOne, dateTwo]
                dates.sort()
                dateFrom = dates[0]
                dateTo = dates[1]
        except Exception:
            pass
        if dateTo is not None and dateFrom is not None:
            delta = dateTo - dateFrom
            if delta < datetime.timedelta():
                print("Error this should not be possible")
            elif delta > datetime.timedelta(days=durationThreshold):
                print(
                    self.number,
                    f"Event with a duration of more than {durationThreshold} days seems suspicious",
                )
            else:
                return dateFrom.date(), dateTo.date()
        else:
            print(self.number, dateStr, "→ Dates could not be extracted")
        return None, None
    else:
        # corner case
        return None, None
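
A sketch of the happy path (assuming python-dateutil is installed); a single date yields identical start and end dates, while ranges such as "July 25-26" are split on the separators in the regex above:

from ceurws.ceur_ws import Volume

v = Volume(number=2436)
print(v.extractDates("Calgary, Alberta, Canada, May 4, 2019"))
# (datetime.date(2019, 5, 4), datetime.date(2019, 5, 4))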

extractValuesFromVolumePage(timeout=3)

extract values from the given volume page

Source code in ceurws/ceur_ws.py, lines 363-374
def extractValuesFromVolumePage(self, timeout: float = 3) -> tuple[dict | None, BeautifulSoup | None]:
    """
    extract values from the given volume page
    """
    self.desc = "?"
    self.h1 = "?"
    if self.url is None:
        return None, None
    volumeParser = VolumeParser(timeout=timeout)
    parseDict, soup = volumeParser.parse_volume(self.getVolumeNumber())
    self.fromDict(parseDict)
    return parseDict, soup

getSubmittingEditor()

Returns the Editor that submitted the volume

Source code in ceurws/ceur_ws.py, lines 376-386
def getSubmittingEditor(self):
    """
    Returns the Editor that submitted the volume
    """
    submitter = None
    if hasattr(self, "editors"):
        for editor in self.editors:
            if isinstance(editor, Editor) and getattr(editor, "submitted", False):
                submitter = editor
                break
    return submitter

getVolumeNumber()

get number of the volume

Source code in ceurws/ceur_ws.py, lines 112-117
def getVolumeNumber(self):
    """
    get number of the volume
    """
    number = getattr(self, "number", "Volume has no number")
    return number

getVolumeUrl()

get the url of the volume page

Source code in ceurws/ceur_ws.py, lines 119-127
def getVolumeUrl(self) -> str | None:
    """
    get the url of the volume page
    """
    number = self.number
    if number is None:
        return None
    url = self.getVolumeUrlOf(number)
    return url

getVolumeUrlOf(number) staticmethod

get the volume url of the given volume number

Parameters:

    number (str | int): volume number

Source code in ceurws/ceur_ws.py, lines 129-141
@staticmethod
def getVolumeUrlOf(
    number: str | int,
) -> str | None:
    """
    get the volume url of the given volume number
    Args:
        number: volume number
    """
    url = None
    if number is not None:
        url = f"http://ceur-ws.org/Vol-{number}/"
    return url
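
For example:

from ceurws.ceur_ws import Volume

print(Volume.getVolumeUrlOf(2436))    # http://ceur-ws.org/Vol-2436/
print(Volume.getVolumeUrlOf("2437"))  # string numbers work as well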

get_loctime()

get the loctime

Source code in ceurws/ceur_ws.py, lines 156-173
def get_loctime(self) -> str | None:
    """
    get the loctime
    """
    loctime = getattr(self, "loctime", None)
    if loctime is None:
        td_title = getattr(self, "tdtitle", None)
        if td_title:
            title_parts = td_title.split(",")
            del title_parts[0]
            loctime = ",".join(title_parts)
            loctime = loctime.strip(".")
            self.loctime = loctime
        else:
            pass
    elif not isinstance(loctime, str):
        loctime = None
    return loctime
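
When loctime is unset, the method falls back to the tdtitle attribute and drops its first comma-separated part (usually the event acronym). A sketch (tdtitle is normally set while parsing the index page):

from ceurws.ceur_ws import Volume

v = Volume(number=2436)
v.tdtitle = "EDML 2019, Calgary, Alberta, Canada, May 4, 2019"
print(v.get_loctime())  # " Calgary, Alberta, Canada, May 4, 2019" (leading space kept)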

isVirtualEvent()

Returns True if the event is a virtual event

Source code in ceurws/ceur_ws.py, lines 143-147
def isVirtualEvent(self) -> bool:
    """
    Returns True if the event is a virtual event
    """
    return getattr(self, "virtualEvent", False)

normalize()

Tries to normalize the properties, e.g. breaking loctime into designated location and time properties.
Example: 'Vienna, Austria, July 25th, 2022'

Source code in ceurws/ceur_ws.py, lines 149-154
def normalize(self):
    """
    Tries to normalize the properties e.g. breaking loctime into designated location and time properties
    Example: 'Vienna, Austria, July 25th, 2022'
    """
    pass

rankLocations(locationStr, locations) staticmethod

rank the given locations to find the best match to the given location string

Parameters:

    locationStr (str): location string
    locations (list[Location]): list of location objects

Source code in ceurws/ceur_ws.py, lines 308-335
@staticmethod
def rankLocations(locationStr: str, locations: list[Location]):
    """
    rank the given locations to find the best match to the given location string
    Args:
        locationStr: location string
        locations: list of location objects
    """
    rankedLocations = []
    for location in locations:
        locationsToCheck = []
        if isinstance(location, City):
            locationsToCheck = [
                location,
                location.region,
                location.country,
            ]
        elif isinstance(location, Region):
            locationsToCheck = [location, location.country]
        elif isinstance(location, Country):
            locationsToCheck = [location]
        score = 0
        for ltc in locationsToCheck:
            if ltc.name in locationStr:
                score += 1
        rankedLocations.append((score, location))
    rankedLocations.sort(key=lambda scoreTuple: scoreTuple[0], reverse=True)
    return [location for score, location in rankedLocations]

removePartsMatching(value, pattern, separator=',') staticmethod

Removes parts from the given value matching the pattern

Source code in ceurws/ceur_ws.py, lines 295-306
@staticmethod
def removePartsMatching(value: str, pattern: str, separator=","):
    """
    Removes parts from the given value matching the pattern
    """
    parts = value.split(separator)
    resParts = []
    for part in parts:
        if re.search(pattern, part) is None:
            resParts.append(part)
    resValue = separator.join(resParts)
    return resValue
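
extractAndSetLocation uses this to drop date fragments before the location lookup. For example:

from ceurws.ceur_ws import Volume

cleaned = Volume.removePartsMatching("Vienna, Austria, July 25, 2022", pattern=r"\d")
print(cleaned)  # Vienna, Austria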

resolveLoctime()

Resolve the loctime property by breaking it down to city, region, country, dateFrom, and dateTo

Source code in ceurws/ceur_ws.py, lines 175-187
def resolveLoctime(self):
    """
    Resolve the loctime property by breaking it down to city, region, country, dateFrom, and dateTo
    """
    loctime = self.get_loctime()
    if loctime is None:
        return None
    dateFrom, dateTo = self.extractDates(loctime)
    if dateFrom is not None:
        self.dateFrom = dateFrom
    if dateTo is not None:
        self.dateTo = dateTo
    self.extractAndSetLocation(locationStr=loctime)

VolumeManager

Bases: EntityManager

Contains multiple ceurws volumes

Source code in ceurws/ceur_ws.py, lines 389-532
class VolumeManager(EntityManager):
    """
    Contains multiple ceurws volumes
    """

    def __init__(self, tableName: str = "volumes"):
        super().__init__(
            listName="volumes",
            clazz=Volume,
            tableName=tableName,
            entityName=Volume.__name__,
            primaryKey="number",
            entityPluralName="volumes",
            config=CEURWS.CONFIG,
            handleInvalidListTypes=True,
            name=self.__class__.__name__,
        )
        self.volumes: list[Volume] = []

    def load(self):
        """
        load the volumeManager
        """
        if Download.needsDownload(CEURWS.CACHE_FILE):
            self.loadFromIndexHtml()
            self.store()
        else:
            self.loadFromBackup()

    def loadFromBackup(self):
        """
        load from the SQLITE Cache file
        """
        self.fromStore(cacheFile=CEURWS.CACHE_FILE)

    def update(self, parser_config: ParserConfig):
        """
        update me by checking for recently added volumes
        """
        self.set_down_to_volume(parser_config)
        self.update_or_recreate(parser_config)

    def set_down_to_volume(self, parser_config):
        volumeCount = len(self.volumes)
        if volumeCount > 0:
            max_vol = self.volumes[-1]
            parser_config.down_to_volume = max_vol.number + 1
        else:
            pass

    def recreate(self, parser_config: ParserConfig):
        """
        recreate me by a full parse of all volume files

        Args:
            parser_config: parser configuration
        """

        self.update_or_recreate(parser_config)

    def update_or_recreate(self, parser_config: ParserConfig):
        """
        recreate or update me by parsing the index.html file

        Args:
            parser_config: parser configuration
        """
        progress_bar = parser_config.progress_bar
        loctime_parser = LoctimeParser()
        pm = PaperManager()
        if parser_config.down_to_volume != 1:
            pm.fromStore(cacheFile=CEURWS.CACHE_FILE)
        paper_list = pm.getList()

        # first reload me from the main index
        self.loadFromIndexHtml(parser_config)
        invalid = 0
        for volume in self.volumes:
            if volume.number and volume.number < parser_config.down_to_volume:
                break
            _volume_record, soup = volume.extractValuesFromVolumePage()
            if soup:
                ptp = PaperTocParser(number=str(volume.number), soup=soup, debug=self.debug)
                paper_records = ptp.parsePapers()
                for paper_record in paper_records:
                    paper = Paper()
                    paper.fromDict(paper_record)
                    paper_list.append(paper)
            if not volume.valid:
                invalid += 1
            else:
                loctime = volume.get_loctime()
                if loctime:
                    loc_time_dict = loctime_parser.parse(loctime)
                    for key, value in loc_time_dict.items():
                        attr = f"loc_{key}"
                        setattr(volume, attr, value)
                    volume.resolveLoctime()
            # update progress bar
            if progress_bar:
                if volume.valid:
                    # print(f"{volume.url}:{volume.acronym}:{volume.desc}:{volume.h1}:{volume.title}")
                    description = volume.acronym[:20] if volume.acronym else "?"
                    progress_bar.set_description(f"{description}")
                progress_bar.update()
        print(f"storing recreated volume table for {len(self.volumes)} volumes ({invalid} invalid)")
        self.store(replace=True)
        print(f"storing {len(paper_list)} papers")
        pm.store(replace=True)

    def loadFromIndexHtml(self, parser_config: ParserConfig | None = None, vol_limit: int | None = None):
        """
        load my content from the index.html file

        Args:
            parser_config(ParserConfig): the parser Configuration to use
        """
        force = parser_config.force_download if parser_config else True
        htmlText = self.getIndexHtml(force)
        indexParser = IndexHtmlParser(htmlText, parser_config)
        volumeRecords = indexParser.parse(vol_limit)
        for volumeRecord in volumeRecords.values():
            volume = Volume()
            volume.fromDict(volumeRecord)
            for attr in ["desc", "h1"]:
                if not hasattr(volume, attr):
                    setattr(volume, attr, "?")
            self.volumes.append(volume)

    def getIndexHtml(self, force: bool = False):
        """
        get the index html
        """
        cacheHtml = CEURWS.CACHE_HTML
        if cacheHtml.is_file() and not force:
            with open(cacheHtml, encoding="utf-8") as file:
                html_page = file.read()
        else:
            req = Request(CEURWS.URL, headers={"User-Agent": "pyCEURMake"})
            html_page = urlopen(req).read().decode()
            CEURWS.CACHE_DIR.mkdir(parents=True, exist_ok=True)
            with open(cacheHtml, mode="w", encoding="utf-8") as htmlFile:
                print(html_page, file=htmlFile)
        return html_page

getIndexHtml(force=False)

get the index html

Source code in ceurws/ceur_ws.py, lines 518-532
def getIndexHtml(self, force: bool = False):
    """
    get the index html
    """
    cacheHtml = CEURWS.CACHE_HTML
    if cacheHtml.is_file() and not force:
        with open(cacheHtml, encoding="utf-8") as file:
            html_page = file.read()
    else:
        req = Request(CEURWS.URL, headers={"User-Agent": "pyCEURMake"})
        html_page = urlopen(req).read().decode()
        CEURWS.CACHE_DIR.mkdir(parents=True, exist_ok=True)
        with open(cacheHtml, mode="w", encoding="utf-8") as htmlFile:
            print(html_page, file=htmlFile)
    return html_page

load()

load the volumeManager

Source code in ceurws/ceur_ws.py, lines 408-416
def load(self):
    """
    load the volumeManager
    """
    if Download.needsDownload(CEURWS.CACHE_FILE):
        self.loadFromIndexHtml()
        self.store()
    else:
        self.loadFromBackup()

loadFromBackup()

load from the SQLITE Cache file

Source code in ceurws/ceur_ws.py, lines 418-422
def loadFromBackup(self):
    """
    load from the SQLITE Cache file
    """
    self.fromStore(cacheFile=CEURWS.CACHE_FILE)
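
A minimal sketch of reading previously cached volumes (this mirrors the --list handling in ceur_ws_web_cmd below):

from ceurws.ceur_ws import VolumeManager

manager = VolumeManager()
manager.loadFromBackup()  # read volumes from the SQLite cache file
for volume in manager.getList():
    print(volume)         # e.g. Vol-2436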

loadFromIndexHtml(parser_config=None, vol_limit=None)

load my content from the index.html file

Parameters:

    parser_config (ParserConfig): the parser Configuration to use (required)
Source code in ceurws/ceur_ws.py, lines 499-516
def loadFromIndexHtml(self, parser_config: ParserConfig | None = None, vol_limit: int | None = None):
    """
    load my content from the index.html file

    Args:
        parser_config(ParserConfig): the parser Configuration to use
    """
    force = parser_config.force_download if parser_config else True
    htmlText = self.getIndexHtml(force)
    indexParser = IndexHtmlParser(htmlText, parser_config)
    volumeRecords = indexParser.parse(vol_limit)
    for volumeRecord in volumeRecords.values():
        volume = Volume()
        volume.fromDict(volumeRecord)
        for attr in ["desc", "h1"]:
            if not hasattr(volume, attr):
                setattr(volume, attr, "?")
        self.volumes.append(volume)

recreate(parser_config)

recreate me by a full parse of all volume files

Parameters:

    parser_config (ParserConfig): parser configuration (required)
Source code in ceurws/ceur_ws.py, lines 439-447
def recreate(self, parser_config: ParserConfig):
    """
    recreate me by a full parse of all volume files

    Args:
        parser_config: parser configuration
    """

    self.update_or_recreate(parser_config)

update(parser_config)

update me by checking for recently added volumes

Source code in ceurws/ceur_ws.py, lines 424-429
def update(self, parser_config: ParserConfig):
    """
    update me by checking for recently added volumes
    """
    self.set_down_to_volume(parser_config)
    self.update_or_recreate(parser_config)

update_or_recreate(parser_config)

recreate or update me by parsing the index.html file

Parameters:

    parser_config (ParserConfig): parser configuration (required)
Source code in ceurws/ceur_ws.py, lines 449-497
def update_or_recreate(self, parser_config: ParserConfig):
    """
    recreate or update me by parsing the index.html file

    Args:
        parser_config: parser configuration
    """
    progress_bar = parser_config.progress_bar
    loctime_parser = LoctimeParser()
    pm = PaperManager()
    if parser_config.down_to_volume != 1:
        pm.fromStore(cacheFile=CEURWS.CACHE_FILE)
    paper_list = pm.getList()

    # first reload me from the main index
    self.loadFromIndexHtml(parser_config)
    invalid = 0
    for volume in self.volumes:
        if volume.number and volume.number < parser_config.down_to_volume:
            break
        _volume_record, soup = volume.extractValuesFromVolumePage()
        if soup:
            ptp = PaperTocParser(number=str(volume.number), soup=soup, debug=self.debug)
            paper_records = ptp.parsePapers()
            for paper_record in paper_records:
                paper = Paper()
                paper.fromDict(paper_record)
                paper_list.append(paper)
        if not volume.valid:
            invalid += 1
        else:
            loctime = volume.get_loctime()
            if loctime:
                loc_time_dict = loctime_parser.parse(loctime)
                for key, value in loc_time_dict.items():
                    attr = f"loc_{key}"
                    setattr(volume, attr, value)
                volume.resolveLoctime()
        # update progress bar
        if progress_bar:
            if volume.valid:
                # print(f"{volume.url}:{volume.acronym}:{volume.desc}:{volume.h1}:{volume.title}")
                description = volume.acronym[:20] if volume.acronym else "?"
                progress_bar.set_description(f"{description}")
            progress_bar.update()
    print(f"storing recreated volume table for {len(self.volumes)} volumes ({invalid} invalid)")
    self.store(replace=True)
    print(f"storing {len(paper_list)} papers")
    pm.store(replace=True)
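
A sketch of an incremental update run with a progress bar, mirroring the --update handling in ceur_ws_web_cmd below; the import path of ParserConfig is an assumption here:

from tqdm import tqdm
from ceurws.ceur_ws import VolumeManager
from ceurws.utils.webscrape import ParserConfig  # hypothetical import path

manager = VolumeManager()
manager.load()  # loads cached volumes, or parses the index if no cache exists
parser_config = ParserConfig(tqdm(total=len(manager.volumes)), debug=False)
manager.update(parser_config)  # parse only volumes newer than the cached maximum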

ceur_ws_web_cmd

Created on 2024-02-22

@author: wf

CeurWsCmd

Bases: WebserverCmd

command line handling for CEUR-WS Volume browser

Source code in ceurws/ceur_ws_web_cmd.py, lines 22-147
class CeurWsCmd(WebserverCmd):
    """
    command line handling for CEUR-WS Volume browser
    """

    def __init__(self):
        """
        constructor
        """
        config = CeurWsWebServer.get_config()
        WebserverCmd.__init__(self, config, CeurWsWebServer, DEBUG)
        pass

    def getArgParser(self, description: str, version_msg) -> ArgumentParser:
        """
        override the default argparser call
        """
        parser = super().getArgParser(description, version_msg)
        parser.add_argument(
            "-dbu",
            "--dblp_update",
            action="store_true",
            help="update dblp cache",
        )
        parser.add_argument(
            "-nq",
            "--namedqueries",
            action="store_true",
            help="generate named queries [default: %(default)s]",
        )
        parser.add_argument(
            "-den",
            "--dblp_endpoint_name",
            help="name of dblp endpoint to use %(default)s",
            default="qlever-dblp",
        )
        parser.add_argument(
            "-f",
            "--force",
            action="store_true",
            help="force update [default: %(default)s]",
        )
        parser.add_argument(
            "--list",
            action="store_true",
            help="list all volumes [default: %(default)s]",
        )
        parser.add_argument(
            "-rc",
            "--recreate",
            action="store_true",
            help="recreate caches e.g. volume table",
        )
        parser.add_argument(
            "-uv",
            "--update",
            action="store_true",
            help="update volumes by parsing index.html adding recently published volumes",
        )
        parser.add_argument(
            "-wen",
            "--wikidata_endpoint_name",
            help="name of wikidata endpoint to use %(default)s",
            default="wikidata",
        )
        parser.add_argument(
            "-wdu",
            "--wikidata_update",
            action="store_true",
            help="update tables from wikidata",
        )
        return parser

    def handle_args(self) -> bool:
        """
        handle the command line arguments
        """
        args = self.args
        if args.namedqueries:
            nq = NamedQueries()
            yaml = nq.toYaml()
            print(yaml)
        if args.list:
            manager = VolumeManager()
            manager.loadFromBackup()
            for volume in manager.getList():
                print(volume)
        if args.recreate or args.update:
            manager = VolumeManager()
            manager.load()
            progress_bar = tqdm(total=len(manager.volumes))
            parser_config = ParserConfig(progress_bar, debug=args.debug)

            if args.recreate:
                manager.recreate(parser_config)
            else:
                manager.update(parser_config)
        if args.wikidata_update:
            wdsync = WikidataSync.from_args(args)
            wdsync.update(withStore=True)
        if args.dblp_update:
            wdsync = WikidataSync.from_args(args)
            endpoint = wdsync.dblpEndpoint
            print(f"updating dblp cache from SPARQL endpoint {endpoint.sparql.url}")
            # Instantiate the progress bar
            pbar = tqdm(total=len(wdsync.dblpEndpoint.dblp_managers))
            for _step, (cache_name, dblp_manager) in enumerate(endpoint.dblp_managers.items(), start=1):
                # Call the corresponding function to refresh cache data
                dblp_manager.load(force_query=args.force)
                # Update the progress bar description with the cache name and increment
                pbar.set_description(f"{cache_name} updated ...")

                # Update the progress bar manually
                pbar.update(1)  # Increment the progress bar by 1 for each iteration

            # Close the progress bar after the loop
            pbar.close()
            table_data = []
            for _step, cache_name in enumerate(endpoint.dblp_managers.keys(), start=1):
                cache = endpoint.cache_manager.get_cache_by_name(cache_name)
                table_data.append(asdict(cache))
            table = tabulate(table_data, headers="keys", tablefmt="grid")
            print(table)
            pass
        handled = super().handle_args()
        return handled
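
The command line flags map one-to-one to the if-branches in handle_args. A minimal usage sketch, assuming an installed pyCEURmake package and prepared caches:

# sketch: drive the CEUR-WS command line programmatically
from ceurws.ceur_ws_web_cmd import main

# print the named queries as YAML, then list all volumes from the backup
exit_code = main(["--namedqueries"])
exit_code = main(["--list"])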

__init__()

constructor

Source code in ceurws/ceur_ws_web_cmd.py (lines 27-33)
def __init__(self):
    """
    constructor
    """
    config = CeurWsWebServer.get_config()
    WebserverCmd.__init__(self, config, CeurWsWebServer, DEBUG)
    pass

getArgParser(description, version_msg)

override the default argparser call

Source code in ceurws/ceur_ws_web_cmd.py (lines 35-93)
def getArgParser(self, description: str, version_msg) -> ArgumentParser:
    """
    override the default argparser call
    """
    parser = super().getArgParser(description, version_msg)
    parser.add_argument(
        "-dbu",
        "--dblp_update",
        action="store_true",
        help="update dblp cache",
    )
    parser.add_argument(
        "-nq",
        "--namedqueries",
        action="store_true",
        help="generate named queries [default: %(default)s]",
    )
    parser.add_argument(
        "-den",
        "--dblp_endpoint_name",
        help="name of dblp endpoint to use %(default)s",
        default="qlever-dblp",
    )
    parser.add_argument(
        "-f",
        "--force",
        action="store_true",
        help="force update [default: %(default)s]",
    )
    parser.add_argument(
        "--list",
        action="store_true",
        help="list all volumes [default: %(default)s]",
    )
    parser.add_argument(
        "-rc",
        "--recreate",
        action="store_true",
        help="recreate caches e.g. volume table",
    )
    parser.add_argument(
        "-uv",
        "--update",
        action="store_true",
        help="update volumes by parsing index.html adding recently published volumes",
    )
    parser.add_argument(
        "-wen",
        "--wikidata_endpoint_name",
        help="name of wikidata endpoint to use %(default)s",
        default="wikidata",
    )
    parser.add_argument(
        "-wdu",
        "--wikidata_update",
        action="store_true",
        help="update tables from wikidata",
    )
    return parser

handle_args()

handle the command line arguments

Source code in ceurws/ceur_ws_web_cmd.py (lines 95-147)
def handle_args(self) -> bool:
    """
    handle the command line arguments
    """
    args = self.args
    if args.namedqueries:
        nq = NamedQueries()
        yaml = nq.toYaml()
        print(yaml)
    if args.list:
        manager = VolumeManager()
        manager.loadFromBackup()
        for volume in manager.getList():
            print(volume)
    if args.recreate or args.update:
        manager = VolumeManager()
        manager.load()
        progress_bar = tqdm(total=len(manager.volumes))
        parser_config = ParserConfig(progress_bar, debug=args.debug)

        if args.recreate:
            manager.recreate(parser_config)
        else:
            manager.update(parser_config)
    if args.wikidata_update:
        wdsync = WikidataSync.from_args(args)
        wdsync.update(withStore=True)
    if args.dblp_update:
        wdsync = WikidataSync.from_args(args)
        endpoint = wdsync.dblpEndpoint
        print(f"updating dblp cache from SPARQL endpoint {endpoint.sparql.url}")
        # Instantiate the progress bar
        pbar = tqdm(total=len(wdsync.dblpEndpoint.dblp_managers))
        for _step, (cache_name, dblp_manager) in enumerate(endpoint.dblp_managers.items(), start=1):
            # Call the corresponding function to refresh cache data
            dblp_manager.load(force_query=args.force)
            # Update the progress bar description with the cache name and increment
            pbar.set_description(f"{cache_name} updated ...")

            # Update the progress bar manually
            pbar.update(1)  # Increment the progress bar by 1 for each iteration

        # Close the progress bar after the loop
        pbar.close()
        table_data = []
        for _step, cache_name in enumerate(endpoint.dblp_managers.keys(), start=1):
            cache = endpoint.cache_manager.get_cache_by_name(cache_name)
            table_data.append(asdict(cache))
        table = tabulate(table_data, headers="keys", tablefmt="grid")
        print(table)
        pass
    handled = super().handle_args()
    return handled

main(argv=None)

main call

Source code in ceurws/ceur_ws_web_cmd.py (lines 150-156)
def main(argv: list | None = None):
    """
    main call
    """
    cmd = CeurWsCmd()
    exit_code = cmd.cmd_main(argv)
    return exit_code

config

CEURWS

CEUR-WS

Source code in ceurws/config.py (lines 7-27)
class CEURWS:
    """
    CEUR-WS
    """

    @staticmethod
    def get_home_path() -> Path:
        """
        Get home path
        """
        home = Path.home()
        if "GITHUB_WORKSPACE" in os.environ:
            home = Path(os.environ["GITHUB_WORKSPACE"])
        return home

    URL = "http://ceur-ws.org"
    home = get_home_path()
    CACHE_DIR = home.joinpath(".ceurws")
    CACHE_FILE = CACHE_DIR.joinpath("ceurws.db")
    CACHE_HTML = CACHE_DIR.joinpath("index.html")
    CONFIG = StorageConfig(cacheFile=str(CACHE_FILE))

get_home_path() staticmethod

Get home path

Source code in ceurws/config.py (lines 12-20)
@staticmethod
def get_home_path() -> Path:
    """
    Get home path
    """
    home = Path.home()
    if "GITHUB_WORKSPACE" in os.environ:
        home = Path(os.environ["GITHUB_WORKSPACE"])
    return home
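
As a usage sketch, the cache locations derive from this home path, which GITHUB_WORKSPACE overrides in CI; the following merely restates the logic above:

# sketch: where the CEUR-WS caches end up, mirroring get_home_path()
import os
from pathlib import Path

home = Path(os.environ.get("GITHUB_WORKSPACE", Path.home()))
cache_dir = home.joinpath(".ceurws")
print(cache_dir.joinpath("ceurws.db"))   # SQLite cache file
print(cache_dir.joinpath("index.html"))  # cached CEUR-WS index page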

dblp

Created on 2024-03-09

@author: wf

DblpAuthorIdentifier dataclass

represents an author id available in dblp and the corresponding property in wikidata

Source code in ceurws/dblp.py (lines 436-520)
@dataclass
class DblpAuthorIdentifier:
    """
    represents an author id available in dblp
    and the corresponding property in wikidata
    """

    name: str  # the name should be usable as SPARQL variable
    dblp_property: str
    wikidata_property: str | None

    @classmethod
    def all(cls) -> list["DblpAuthorIdentifier"]:
        """
        returns all available identifiers
        """
        res = [
            DblpAuthorIdentifier("dblp", "datacite:dblp", "P2456"),
            DblpAuthorIdentifier("wikidata", "datacite:wikidata", None),
            DblpAuthorIdentifier("orcid", "datacite:orcid", "P496"),
            DblpAuthorIdentifier("googleScholar", "datacite:google-scholar", "P1960"),
            DblpAuthorIdentifier("acm", "datacite:acm", "P864"),
            DblpAuthorIdentifier("twitter", "datacite:twitter", "P2002"),
            DblpAuthorIdentifier("github", "datacite:github", "P2037"),
            DblpAuthorIdentifier("viaf", "datacite:viaf", "P214"),
            DblpAuthorIdentifier("scigraph", "datacite:scigraph", "P10861"),
            DblpAuthorIdentifier("zbmath", "datacite:zbmath", "P1556"),
            DblpAuthorIdentifier("researchGate", "datacite:research-gate", "P6023"),
            DblpAuthorIdentifier("mathGenealogy", "datacite:math-genealogy", "P549"),
            DblpAuthorIdentifier("loc", "datacite:loc", "P244"),
            DblpAuthorIdentifier("linkedin", "datacite:linkedin", "P6634"),
            DblpAuthorIdentifier("lattes", "datacite:lattes", "P1007"),
            DblpAuthorIdentifier("isni", "datacite:isni", "P213"),
            DblpAuthorIdentifier("ieee", "datacite:ieee", "P6479"),
            DblpAuthorIdentifier("gepris", "datacite:gepris", "P4872"),
            DblpAuthorIdentifier("gnd", "datacite:gnd", "P227"),
        ]
        return res

    @classmethod
    def getAllAsMap(cls) -> dict[str, "DblpAuthorIdentifier"]:
        """
        return all available identifiers as a map
        """
        res = dict()
        for identifier in cls.all():
            res[identifier.name] = identifier
        return res

    @classmethod
    def getWikidataIdQueryPart(cls, id_name: str, value: str, var: str):
        """
        Generates for the given identifier the wikidata query
        Args:
            id_name: name of the identifier
            value: the identifier value
            var: name of the variable which should have the id
        """
        if not var.startswith("?"):
            var = "?" + var
        query = None
        dblp_author_ids = cls.getAllAsMap().get(id_name)
        if dblp_author_ids is None:
            # unknown identifier
            return ""
        wd_prop = dblp_author_ids.wikidata_property
        values: str | list[str]
        if id_name == "wikidata":
            values = value
            if isinstance(value, str):
                values = [value]
            value_urls = " ".join([f"wd:{value}" for value in values])
            query = f"""{{ SELECT * WHERE {{ VALUES ?person {{ {value_urls} }} }} }}# {id_name}"""
        elif id_name in cls.getAllAsMap():
            if isinstance(value, list):
                values = " ".join([f'"{value}"' for value in value])
                query = f"""{{OPTIONAL{{
                            VALUES ?{id_name} {{ {values} }}
                            {var} wdt:{wd_prop} ?{id_name}.}} 
                            }}  # {id_name}"""
            else:
                query = f"""{{ {var} wdt:{wd_prop} "{value}". }}  # {id_name}"""
        else:
            pass
        return query

all() classmethod

returns all available identifiers

Source code in ceurws/dblp.py (lines 447-473)
@classmethod
def all(cls) -> list["DblpAuthorIdentifier"]:
    """
    returns all available identifiers
    """
    res = [
        DblpAuthorIdentifier("dblp", "datacite:dblp", "P2456"),
        DblpAuthorIdentifier("wikidata", "datacite:wikidata", None),
        DblpAuthorIdentifier("orcid", "datacite:orcid", "P496"),
        DblpAuthorIdentifier("googleScholar", "datacite:google-scholar", "P1960"),
        DblpAuthorIdentifier("acm", "datacite:acm", "P864"),
        DblpAuthorIdentifier("twitter", "datacite:twitter", "P2002"),
        DblpAuthorIdentifier("github", "datacite:github", "P2037"),
        DblpAuthorIdentifier("viaf", "datacite:viaf", "P214"),
        DblpAuthorIdentifier("scigraph", "datacite:scigraph", "P10861"),
        DblpAuthorIdentifier("zbmath", "datacite:zbmath", "P1556"),
        DblpAuthorIdentifier("researchGate", "datacite:research-gate", "P6023"),
        DblpAuthorIdentifier("mathGenealogy", "datacite:math-genealogy", "P549"),
        DblpAuthorIdentifier("loc", "datacite:loc", "P244"),
        DblpAuthorIdentifier("linkedin", "datacite:linkedin", "P6634"),
        DblpAuthorIdentifier("lattes", "datacite:lattes", "P1007"),
        DblpAuthorIdentifier("isni", "datacite:isni", "P213"),
        DblpAuthorIdentifier("ieee", "datacite:ieee", "P6479"),
        DblpAuthorIdentifier("gepris", "datacite:gepris", "P4872"),
        DblpAuthorIdentifier("gnd", "datacite:gnd", "P227"),
    ]
    return res

getAllAsMap() classmethod

return all available identifiers as a map

Source code in ceurws/dblp.py (lines 475-483)
@classmethod
def getAllAsMap(cls) -> dict[str, "DblpAuthorIdentifier"]:
    """
    return all available identifiers as a map
    """
    res = dict()
    for identifier in cls.all():
        res[identifier.name] = identifier
    return res

getWikidataIdQueryPart(id_name, value, var) classmethod

Generates for the given identifier the wikidata query

Args:
    id_name: name of the identifier
    value: the identifier value
    var: name of the variable which should have the id

Source code in ceurws/dblp.py (lines 485-520)
@classmethod
def getWikidataIdQueryPart(cls, id_name: str, value: str, var: str):
    """
    Generates for the given identifier the wikidata query
    Args:
        id_name: name of the identifier
        value: the identifier value
        var: name of the variable which should have the id
    """
    if not var.startswith("?"):
        var = "?" + var
    query = None
    dblp_author_ids = cls.getAllAsMap().get(id_name)
    if dblp_author_ids is None:
        # unknown identifier
        return ""
    wd_prop = dblp_author_ids.wikidata_property
    values: str | list[str]
    if id_name == "wikidata":
        values = value
        if isinstance(value, str):
            values = [value]
        value_urls = " ".join([f"wd:{value}" for value in values])
        query = f"""{{ SELECT * WHERE {{ VALUES ?person {{ {value_urls} }} }} }}# {id_name}"""
    elif id_name in cls.getAllAsMap():
        if isinstance(value, list):
            values = " ".join([f'"{value}"' for value in value])
            query = f"""{{OPTIONAL{{
                        VALUES ?{id_name} {{ {values} }}
                        {var} wdt:{wd_prop} ?{id_name}.}} 
                        }}  # {id_name}"""
        else:
            query = f"""{{ {var} wdt:{wd_prop} "{value}". }}  # {id_name}"""
    else:
        pass
    return query
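
A usage sketch of the query-part generation above; the ORCID value is a made-up example:

# sketch: build a wikidata query fragment for an ORCID identifier
from ceurws.dblp import DblpAuthorIdentifier

part = DblpAuthorIdentifier.getWikidataIdQueryPart(
    id_name="orcid", value="0000-0002-1825-0097", var="person"
)
print(part)
# expected shape: { ?person wdt:P496 "0000-0002-1825-0097". }  # orcid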

DblpAuthors

Bases: DblpManager

Manage all authors of DBLP indexed volumes.

Source code in ceurws/dblp.py (lines 55-74)
class DblpAuthors(DblpManager):
    """
    Manage all authors of DBLP indexed volumes.
    """

    def __init__(self, endpoint: "DblpEndpoint"):
        super().__init__(endpoint, "dblp/authors", "CEUR-WS Paper Authors")
        self.authors: list[DblpScholar] | None = None

    def load(self, force_query: bool = False):
        """
        load my authors
        """
        if self.authors is None:
            super().load(force_query=force_query)
            self.authors = []
            for d in self.lod:
                author = DblpScholar(**d)
                self.authors.append(author)
            self.authorsById = {a.dblp_author_id: a for a in self.authors}

load(force_query=False)

load my authors

Source code in ceurws/dblp.py (lines 64-74)
def load(self, force_query: bool = False):
    """
    load my authors
    """
    if self.authors is None:
        super().load(force_query=force_query)
        self.authors = []
        for d in self.lod:
            author = DblpScholar(**d)
            self.authors.append(author)
        self.authorsById = {a.dblp_author_id: a for a in self.authors}

DblpEditors

Bases: DblpManager

Manage all editors of DBLP indexed volumes.

Source code in ceurws/dblp.py (lines 77-96)
class DblpEditors(DblpManager):
    """
    Manage all editors of DBLP indexed volumes.
    """

    def __init__(self, endpoint: "DblpEndpoint"):
        super().__init__(endpoint, "dblp/editors", "CEUR-WS all Editors")
        self.editors: list[DblpScholar] | None = None

    def load(self, force_query: bool = False):
        """
        load my editors
        """
        if self.editors is None:
            super().load(force_query=force_query)
            self.editors = []
            for d in self.lod:
                editor = DblpScholar(**d)
                self.editors.append(editor)
            self.editorsById = {e.dblp_author_id: e for e in self.editors}

load(force_query=False)

load my editors

Source code in ceurws/dblp.py (lines 86-96)
def load(self, force_query: bool = False):
    """
    load my editors
    """
    if self.editors is None:
        super().load(force_query=force_query)
        self.editors = []
        for d in self.lod:
            editor = DblpScholar(**d)
            self.editors.append(editor)
        self.editorsById = {e.dblp_author_id: e for e in self.editors}

DblpEndpoint

provides queries and a dblp endpoint to execute them

Source code in ceurws/dblp.py (lines 211-433)
class DblpEndpoint:
    """
    provides queries and a dblp endpoint to execute them
    """

    DBLP_REC_PREFIX = "https://dblp.org/rec/"
    DBLP_EVENT_PREFIX = "https://dblp.org/db/"

    def __init__(self, endpoint, debug: bool = False):
        """
        constructor
        """
        self.debug = debug
        self.sparql = SPARQL(endpoint)
        path = os.path.dirname(__file__)
        qYamlFile = f"{path}/resources/queries/dblp.yaml"
        if os.path.isfile(qYamlFile):
            self.qm = QueryManager(lang="sparql", queriesPath=qYamlFile)
        # there is one cache manager for all our json caches
        self.cache_manager = CacheManager("ceurws")
        self.dblp_authors = DblpAuthors(endpoint=self)
        self.dblp_editors = DblpEditors(endpoint=self)
        self.dblp_papers = DblpPapers(endpoint=self)
        self.dblp_volumes = DblpVolumes(endpoint=self)
        self.dblp_managers = {
            "dblp/authors": self.dblp_authors,
            "dblp/editors": self.dblp_editors,
            "dblp/papers": self.dblp_papers,
            "dblp/volumes": self.dblp_volumes,
        }
        self.progress_bar = None

    def load_all(self, force_query: bool = False):
        """
        load all managers
        """
        for _key, manager in self.dblp_managers.items():
            manager.load(force_query=force_query)

    def get_lod(self, cache_name: str, query_name: str, force_query: bool = False) -> list:
        """
        Get the list of dictionaries for the given cache and query names,
        optionally forcing a query.

        Args:
            cache_name (str): The name of the cache to load or store the LOD.
            query_name (str): The name of the query to execute if the data is not cached or forced to query.
            force_query (bool): If True, forces the query execution even if the data is cached. Defaults to False.

        Returns:
            List[Dict]: The list of dictionaries loaded either from cache or by executing the SPARQL query.
        """
        start_time = time.time()  # Record the start time of the operation
        cache = self.cache_manager.get_cache_by_name(cache_name)
        if cache.is_stored and not force_query:
            if self.debug:
                print(f"loading {cache_name} from cache")
            lod = self.cache_manager.load(cache_name)
        else:
            query = self.qm.queriesByName[query_name]
            if self.debug:
                print(f"loading {cache_name} from SPARQL query {query_name}")
            lod = self.sparql.queryAsListOfDicts(query.query)
            self.cache_manager.store(cache_name, lod)
        end_time = time.time()  # Record the end time of the operation
        duration = end_time - start_time  # Calculate the duration of the loading process

        if self.debug:
            print(f"loaded {len(lod)} records for {cache_name} in {duration:.2f} seconds")
        if self.progress_bar:
            self.progress_bar.update(duration * 100 / 36)
        return lod

    def get_ceur_volume_papers(self, volume_number: int) -> list[DblpPaper]:
        """
        Get all papers published in CEUR-WS from dblp
        """
        cache_name = f"dblp/Vol-{volume_number}/papers"
        lod = self.cache_manager.load(cache_name)
        papers = [DblpPaper(**d) for d in lod]
        return papers

    def get_ceur_proceeding(self, volume_number: int) -> DblpProceeding:
        """
        get ceur proceeding by volume number from dblp
        Args:
            volume_number: number of the volume
        """
        cache_name = f"dblp/Vol-{volume_number}/metadata"
        volume = self.cache_manager.load(cache_name, cls=DblpProceeding)
        return volume

    def getDblpIdByVolumeNumber(self, number) -> list[str]:
        """
        Get the dblp entity id by given volume number
        Args:
            number: volume number
        """
        query = f"""PREFIX dblp: <https://dblp.org/rdf/schema#>
            SELECT *
            WHERE {{ 
                ?proceeding dblp:publishedIn "CEUR Workshop Proceedings";
                            dblp:publishedInSeriesVolume "{number}".
                }}
        """
        try:
            qres = self.sparql.queryAsListOfDicts(query)
        except HTTPError:
            print("dblp sparql endpoint unavailable")
            qres = None
        qIds = []
        if qres is not None and qres != []:
            qIds = [record.get("proceeding")[len(self.DBLP_REC_PREFIX) :] for record in qres]
        return qIds

    def getDblpUrlByDblpId(self, entityId: str | None = None) -> str | None:
        """
        Get the dblp url for given entity id
        Args:
            entityId: the dblp entity id
        """
        if entityId is None or entityId == "":
            return None
        entityUrl = self.DBLP_REC_PREFIX + entityId
        query = f"""PREFIX dblp: <https://dblp.org/rdf/schema#>
                SELECT *
                WHERE {{ 
                    <{entityUrl}> dblp:listedOnTocPage ?url .
                    }}
            """
        qres = self.sparql.queryAsListOfDicts(query)
        qIds = []
        if qres is not None and qres != []:
            qIds = [record.get("url")[len(self.DBLP_EVENT_PREFIX) :] for record in qres]
        qId = qIds[0] if qIds is not None and len(qIds) > 0 else None
        return qId

    def convertEntityIdToUrlId(self, entityId: str | None) -> str | None:
        """
        Convert the given entityId to the id used in the url
        Note: use with care this conversion does not always work
        Args:
            entityId: id of the entity
        Example:
            conf/aaai/2022 → conf/aaai/aaai2022

        Returns
            str - id used in the url
            None - if the given entityId can not be converted
        """
        return self.getDblpUrlByDblpId(entityId)

    def toDblpUrl(self, entityId: str, withPostfix: bool = False) -> str | None:
        """
        Convert the given id to the corresponding dblp url
        Args:
            entityId: dblp event id
            withPostfix: If True add the postfix ".html"

        Returns:
            dblp url, or None if the url cannot be generated for the given input
        """
        urlId = self.convertEntityIdToUrlId(entityId)
        if urlId is None:
            return None
        postfix = ".html"
        url = self.DBLP_EVENT_PREFIX + urlId
        if withPostfix:
            url += postfix
        return url

    def getEditorsOfVolume(self, number: int | str | None) -> list[dict]:
        """
        Get the editors for the given volume number
        Args:
            number: number of the volume if none query for all ceur-ws editors

        Returns:
            list of dictionaries where a dict represents one editor containing all identifiers of the editor
        """
        number_var = "?volumeNumber" if number is None else f'"{number}"'
        dblp_identifiers = DblpAuthorIdentifier.all()
        optional_clauses: list[str] = []
        id_vars: list[str] = []
        for identifier in dblp_identifiers:
            id_var = f"?{identifier.name}"
            optional_clauses.append(
                f"""OPTIONAL{{
                ?editor datacite:hasIdentifier {id_var}_blank.
                {id_var}_blank datacite:usesIdentifierScheme {identifier.dblp_property};
                litre:hasLiteralValue {id_var}Var.}}"""
            )
            id_vars.append(id_var)
        id_selects = "\n".join(
            [f"(group_concat(DISTINCT {id_var}Var;separator='|') as {id_var})" for id_var in id_vars]
        )
        id_queries = "\n".join(optional_clauses)
        query = f"""PREFIX datacite: <http://purl.org/spar/datacite/>
                    PREFIX dblp: <https://dblp.org/rdf/schema#>
                    PREFIX litre: <http://purl.org/spar/literal/>
                    SELECT DISTINCT (group_concat(DISTINCT ?nameVar;separator='|') as ?name) 
                                    (group_concat(DISTINCT ?homepageVar;separator='|') as ?homepage)
                                    (group_concat(DISTINCT ?affiliationVar;separator='|') as ?affiliation)
                                    {id_selects}
                    WHERE{{
                        ?proceeding dblp:publishedIn "CEUR Workshop Proceedings";
                                    dblp:publishedInSeriesVolume {number_var};
                                    dblp:editedBy ?editor.
                        ?editor dblp:primaryCreatorName ?nameVar.
                        OPTIONAL{{?editor dblp:primaryHomepage ?homepageVar.}}
                        OPTIONAL{{?editor dblp:primaryAffiliation ?affiliationVar.}}
                        {id_queries}
                    }}
                    GROUP BY ?editor
                """
        qres = self.sparql.queryAsListOfDicts(query)
        for record in qres:
            for key, value in record.items():
                if "|" in value:
                    record[key] = value.split(
                        '"|"'
                    )  # issue in qlever see https://github.com/ad-freiburg/qlever/discussions/806
        return qres

__init__(endpoint, debug=False)

constructor

Source code in ceurws/dblp.py (lines 219-241)
def __init__(self, endpoint, debug: bool = False):
    """
    constructor
    """
    self.debug = debug
    self.sparql = SPARQL(endpoint)
    path = os.path.dirname(__file__)
    qYamlFile = f"{path}/resources/queries/dblp.yaml"
    if os.path.isfile(qYamlFile):
        self.qm = QueryManager(lang="sparql", queriesPath=qYamlFile)
    # there is one cache manager for all our json caches
    self.cache_manager = CacheManager("ceurws")
    self.dblp_authors = DblpAuthors(endpoint=self)
    self.dblp_editors = DblpEditors(endpoint=self)
    self.dblp_papers = DblpPapers(endpoint=self)
    self.dblp_volumes = DblpVolumes(endpoint=self)
    self.dblp_managers = {
        "dblp/authors": self.dblp_authors,
        "dblp/editors": self.dblp_editors,
        "dblp/papers": self.dblp_papers,
        "dblp/volumes": self.dblp_volumes,
    }
    self.progress_bar = None

convertEntityIdToUrlId(entityId)

Convert the given entityId to the id used in the url.
Note: use with care, this conversion does not always work.

Args:
    entityId: id of the entity

Example:
    conf/aaai/2022 → conf/aaai/aaai2022

Returns:
    str: id used in the url
    None: if the given entityId cannot be converted

Source code in ceurws/dblp.py (lines 348-361)
def convertEntityIdToUrlId(self, entityId: str | None) -> str | None:
    """
    Convert the given entityId to the id used in the url
    Note: use with care this conversion does not always work
    Args:
        entityId: id of the entity
    Example:
        conf/aaai/2022 → conf/aaai/aaai2022

    Returns
        str - id used in the url
        None - if the given entityId can not be converted
    """
    return self.getDblpUrlByDblpId(entityId)

getDblpIdByVolumeNumber(number)

Get the dblp entity id by given volume number

Args:
    number: volume number

Source code in ceurws/dblp.py (lines 303-324)
def getDblpIdByVolumeNumber(self, number) -> list[str]:
    """
    Get the dblp entity id by given volume number
    Args:
        number: volume number
    """
    query = f"""PREFIX dblp: <https://dblp.org/rdf/schema#>
        SELECT *
        WHERE {{ 
            ?proceeding dblp:publishedIn "CEUR Workshop Proceedings";
                        dblp:publishedInSeriesVolume "{number}".
            }}
    """
    try:
        qres = self.sparql.queryAsListOfDicts(query)
    except HTTPError:
        print("dblp sparql endpoint unavailable")
        qres = None
    qIds = []
    if qres is not None and qres != []:
        qIds = [record.get("proceeding")[len(self.DBLP_REC_PREFIX) :] for record in qres]
    return qIds
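
A brief usage sketch; the endpoint URL and volume number are placeholder examples:

# sketch: look up the dblp record id(s) for a CEUR-WS volume
from ceurws.dblp import DblpEndpoint

endpoint = DblpEndpoint("https://qlever.cs.uni-freiburg.de/api/dblp")
ids = endpoint.getDblpIdByVolumeNumber(3262)
print(ids)  # a list of dblp record ids, empty if nothing was found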

getDblpUrlByDblpId(entityId=None)

Get the dblp url for the given entity id

Args:
    entityId: the dblp entity id

Source code in ceurws/dblp.py (lines 326-346)
def getDblpUrlByDblpId(self, entityId: str | None = None) -> str | None:
    """
    Get the dblp url for given entity id
    Args:
        entityId: the dblp entity id
    """
    if entityId is None or entityId == "":
        return None
    entityUrl = self.DBLP_REC_PREFIX + entityId
    query = f"""PREFIX dblp: <https://dblp.org/rdf/schema#>
            SELECT *
            WHERE {{ 
                <{entityUrl}> dblp:listedOnTocPage ?url .
                }}
        """
    qres = self.sparql.queryAsListOfDicts(query)
    qIds = []
    if qres is not None and qres != []:
        qIds = [record.get("url")[len(self.DBLP_EVENT_PREFIX) :] for record in qres]
    qId = qIds[0] if qIds is not None and len(qIds) > 0 else None
    return qId

getEditorsOfVolume(number)

Get the editors for the given volume number

Args:
    number: number of the volume; if None, query for all ceur-ws editors

Returns:
    list[dict]: list of dictionaries where a dict represents one editor containing all identifiers of the editor

Source code in ceurws/dblp.py (lines 382-433)
def getEditorsOfVolume(self, number: int | str | None) -> list[dict]:
    """
    Get the editors for the given volume number
    Args:
        number: number of the volume if none query for all ceur-ws editors

    Returns:
        list of dictionaries where a dict represents one editor containing all identifiers of the editor
    """
    number_var = "?volumeNumber" if number is None else f'"{number}"'
    dblp_identifiers = DblpAuthorIdentifier.all()
    optional_clauses: list[str] = []
    id_vars: list[str] = []
    for identifier in dblp_identifiers:
        id_var = f"?{identifier.name}"
        optional_clauses.append(
            f"""OPTIONAL{{
            ?editor datacite:hasIdentifier {id_var}_blank.
            {id_var}_blank datacite:usesIdentifierScheme {identifier.dblp_property};
            litre:hasLiteralValue {id_var}Var.}}"""
        )
        id_vars.append(id_var)
    id_selects = "\n".join(
        [f"(group_concat(DISTINCT {id_var}Var;separator='|') as {id_var})" for id_var in id_vars]
    )
    id_queries = "\n".join(optional_clauses)
    query = f"""PREFIX datacite: <http://purl.org/spar/datacite/>
                PREFIX dblp: <https://dblp.org/rdf/schema#>
                PREFIX litre: <http://purl.org/spar/literal/>
                SELECT DISTINCT (group_concat(DISTINCT ?nameVar;separator='|') as ?name) 
                                (group_concat(DISTINCT ?homepageVar;separator='|') as ?homepage)
                                (group_concat(DISTINCT ?affiliationVar;separator='|') as ?affiliation)
                                {id_selects}
                WHERE{{
                    ?proceeding dblp:publishedIn "CEUR Workshop Proceedings";
                                dblp:publishedInSeriesVolume {number_var};
                                dblp:editedBy ?editor.
                    ?editor dblp:primaryCreatorName ?nameVar.
                    OPTIONAL{{?editor dblp:primaryHomepage ?homepageVar.}}
                    OPTIONAL{{?editor dblp:primaryAffiliation ?affiliationVar.}}
                    {id_queries}
                }}
                GROUP BY ?editor
            """
    qres = self.sparql.queryAsListOfDicts(query)
    for record in qres:
        for key, value in record.items():
            if "|" in value:
                record[key] = value.split(
                    '"|"'
                )  # issue in qlever see https://github.com/ad-freiburg/qlever/discussions/806
    return qres

get_ceur_proceeding(volume_number)

get ceur proceeding by volume number from dblp

Args:
    volume_number: number of the volume

Source code in ceurws/dblp.py (lines 293-301)
def get_ceur_proceeding(self, volume_number: int) -> DblpProceeding:
    """
    get ceur proceeding by volume number from dblp
    Args:
        volume_number: number of the volume
    """
    cache_name = f"dblp/Vol-{volume_number}/metadata"
    volume = self.cache_manager.load(cache_name, cls=DblpProceeding)
    return volume

get_ceur_volume_papers(volume_number)

Get all papers published in CEUR-WS from dblp

Source code in ceurws/dblp.py (lines 284-291)
def get_ceur_volume_papers(self, volume_number: int) -> list[DblpPaper]:
    """
    Get all papers published in CEUR-WS from dblp
    """
    cache_name = f"dblp/Vol-{volume_number}/papers"
    lod = self.cache_manager.load(cache_name)
    papers = [DblpPaper(**d) for d in lod]
    return papers
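
Both cache readers, get_ceur_volume_papers and get_ceur_proceeding, assume the per-volume caches were materialized by a prior dblp update run; a short sketch with a placeholder volume number:

# sketch: read previously cached per-volume data
from ceurws.dblp import DblpEndpoint

endpoint = DblpEndpoint("https://qlever.cs.uni-freiburg.de/api/dblp")
papers = endpoint.get_ceur_volume_papers(3262)
proceeding = endpoint.get_ceur_proceeding(3262)
print(len(papers), proceeding.title)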

get_lod(cache_name, query_name, force_query=False)

Get the list of dictionaries for the given cache and query names, optionally forcing a query.

Parameters:
    cache_name (str, required): The name of the cache to load or store the LOD.
    query_name (str, required): The name of the query to execute if the data is not cached or forced to query.
    force_query (bool, default False): If True, forces the query execution even if the data is cached.

Returns:
    list[dict]: The list of dictionaries loaded either from cache or by executing the SPARQL query.

Source code in ceurws/dblp.py (lines 250-282)
def get_lod(self, cache_name: str, query_name: str, force_query: bool = False) -> list:
    """
    Get the list of dictionaries for the given cache and query names,
    optionally forcing a query.

    Args:
        cache_name (str): The name of the cache to load or store the LOD.
        query_name (str): The name of the query to execute if the data is not cached or forced to query.
        force_query (bool): If True, forces the query execution even if the data is cached. Defaults to False.

    Returns:
        List[Dict]: The list of dictionaries loaded either from cache or by executing the SPARQL query.
    """
    start_time = time.time()  # Record the start time of the operation
    cache = self.cache_manager.get_cache_by_name(cache_name)
    if cache.is_stored and not force_query:
        if self.debug:
            print(f"loading {cache_name} from cache")
        lod = self.cache_manager.load(cache_name)
    else:
        query = self.qm.queriesByName[query_name]
        if self.debug:
            print(f"loading {cache_name} from SPARQL query {query_name}")
        lod = self.sparql.queryAsListOfDicts(query.query)
        self.cache_manager.store(cache_name, lod)
    end_time = time.time()  # Record the end time of the operation
    duration = end_time - start_time  # Calculate the duration of the loading process

    if self.debug:
        print(f"loaded {len(lod)} records for {cache_name} in {duration:.2f} seconds")
    if self.progress_bar:
        self.progress_bar.update(duration * 100 / 36)
    return lod
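
The cache-or-query decision above reduces to the following sketch; the cache and query names are the ones the managers pass in their constructors:

# sketch of the get_lod cache-or-query behavior
from ceurws.dblp import DblpEndpoint

endpoint = DblpEndpoint("https://qlever.cs.uni-freiburg.de/api/dblp")
# first call: runs the SPARQL query and stores the result in the JSON cache
lod = endpoint.get_lod("dblp/authors", "CEUR-WS Paper Authors")
# later calls: served from the cache unless force_query=True
lod = endpoint.get_lod("dblp/authors", "CEUR-WS Paper Authors", force_query=False)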

load_all(force_query=False)

load all managers

Source code in ceurws/dblp.py (lines 243-248)
def load_all(self, force_query: bool = False):
    """
    load all managers
    """
    for _key, manager in self.dblp_managers.items():
        manager.load(force_query=force_query)

toDblpUrl(entityId, withPostfix=False)

Convert the given id to the corresponding dblp url

Args:
    entityId: dblp event id
    withPostfix: If True add the postfix ".html"

Returns:
    str | None: dblp url, or None if the url cannot be generated for the given input

Source code in ceurws/dblp.py (lines 363-380)
def toDblpUrl(self, entityId: str, withPostfix: bool = False) -> str | None:
    """
    Convert the given id to the corresponding dblp url
    Args:
        entityId: dblp event id
        withPostfix: If True add the postfix ".html"

    Returns:
        dblp url, or None if the url cannot be generated for the given input
    """
    urlId = self.convertEntityIdToUrlId(entityId)
    if urlId is None:
        return None
    postfix = ".html"
    url = self.DBLP_EVENT_PREFIX + urlId
    if withPostfix:
        url += postfix
    return url
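
Since the id-to-url conversion goes through a live SPARQL lookup, the result depends on the endpoint; a hedged sketch with an illustrative event id:

# sketch: convert a dblp event id to its toc-page url
from ceurws.dblp import DblpEndpoint

endpoint = DblpEndpoint("https://qlever.cs.uni-freiburg.de/api/dblp")
url = endpoint.toDblpUrl("conf/aaai/2022", withPostfix=True)
print(url)  # e.g. https://dblp.org/db/conf/aaai/aaai2022.html, or None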

DblpManager

Manage DBLP entities.

Attributes:
    endpoint (DblpEndpoint): The endpoint for DBLP queries.
    cache_name (str): The name of the cache to use.
    query_name (str): The name of the query to execute.

Source code in ceurws/dblp.py (lines 22-52)
class DblpManager:
    """
    Manage DBLP entities.

    Attributes:
        endpoint (DblpEndpoint): The endpoint for DBLP queries.
        cache_name (str): The name of the cache to use.
        query_name (str): The name of the query to execute.
    """

    def __init__(self, endpoint: "DblpEndpoint", cache_name: str, query_name: str):
        """
        Initializes the DBLP Manager with the given endpoint, cache name, and query name.

        Args:
            endpoint (DblpEndpoint): The endpoint for DBLP queries.
            cache_name (str): The name of the cache to use.
            query_name (str): The name of the query to execute.
        """
        self.endpoint = endpoint
        self.cache_name = cache_name
        self.query_name = query_name

    def load(self, force_query: bool = False):
        """
        Loads a list of dictionaries from the DBLP endpoint.

        Args:
            force_query (bool): If True, forces a new query to the endpoint. Defaults to False.
        """
        self.lod = self.endpoint.get_lod(self.cache_name, self.query_name, force_query=force_query)

__init__(endpoint, cache_name, query_name)

Initializes the DBLP Manager with the given endpoint, cache name, and query name.

Parameters:
    endpoint (DblpEndpoint, required): The endpoint for DBLP queries.
    cache_name (str, required): The name of the cache to use.
    query_name (str, required): The name of the query to execute.

Source code in ceurws/dblp.py (lines 32-43)
def __init__(self, endpoint: "DblpEndpoint", cache_name: str, query_name: str):
    """
    Initializes the DBLP Manager with the given endpoint, cache name, and query name.

    Args:
        endpoint (DblpEndpoint): The endpoint for DBLP queries.
        cache_name (str): The name of the cache to use.
        query_name (str): The name of the query to execute.
    """
    self.endpoint = endpoint
    self.cache_name = cache_name
    self.query_name = query_name

load(force_query=False)

Loads a list of dictionaries from the DBLP endpoint.

Parameters:
    force_query (bool, default False): If True, forces a new query to the endpoint.

Source code in ceurws/dblp.py (lines 45-52)
def load(self, force_query: bool = False):
    """
    Loads a list of dictionaries from the DBLP endpoint.

    Args:
        force_query (bool): If True, forces a new query to the endpoint. Defaults to False.
    """
    self.lod = self.endpoint.get_lod(self.cache_name, self.query_name, force_query=force_query)
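
The concrete managers (DblpAuthors, DblpEditors, DblpPapers, DblpVolumes) all follow this pattern: pass a cache name and query name to the base constructor, then cache the typed result of load(). A hypothetical subclass sketch with made-up cache and query names:

# sketch: a hypothetical DblpManager subclass following the same pattern
from ceurws.dblp import DblpEndpoint, DblpManager

class DblpAffiliations(DblpManager):
    """hypothetical manager for an affiliations query"""

    def __init__(self, endpoint: DblpEndpoint):
        super().__init__(endpoint, "dblp/affiliations", "CEUR-WS all Affiliations")
        self.affiliations: list[dict] | None = None

    def load(self, force_query: bool = False):
        if self.affiliations is None:
            super().load(force_query=force_query)  # fills self.lod
            self.affiliations = list(self.lod)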

DblpPapers

Bases: DblpManager

manage all CEUR-WS papers indexed by dblp

Source code in ceurws/dblp.py (lines 99-159)
class DblpPapers(DblpManager):
    """
    manage all CEUR-WS papers indexed by dblp
    """

    def __init__(self, endpoint: "DblpEndpoint"):
        super().__init__(endpoint, "dblp/papers", "CEUR-WS all Papers")
        self.papers: list[DblpPaper] | None = None
        self.papers_by_volume: dict[str, dict] = {}
        self.papersById: dict[str, DblpPaper] = {}
        self.papersByProceeding: dict[str, list[DblpPaper]] = {}

    def load(self, force_query: bool = False):
        """
        load my papers
        """
        if self.papers is None:
            super().load(force_query=force_query)
            dblp_authors = self.endpoint.dblp_authors
            dblp_authors.load(force_query=force_query)
            self.papers = []
            for d in self.lod:
                pdf_id = d.get("pdf_url", None)
                if pdf_id and isinstance(pdf_id, str):
                    pdf_id = pdf_id.replace("http://ceur-ws.org/", "")
                    pdf_id = pdf_id.replace("https://ceur-ws.org/", "")
                    pdf_id = pdf_id.replace(".pdf", "")
                authors = []
                # get the authors string
                authors_str = d.get("author", "")
                # >;<  qlever quirk until 2023-12
                delim = ">;<" if ">;<" in authors_str else ";"
                for dblp_author_id in authors_str.split(delim):  #
                    author = dblp_authors.authorsById.get(dblp_author_id, None)
                    if author:
                        authors.append(author)
                paper = DblpPaper(
                    dblp_publication_id=d.get("paper"),
                    volume_number=int(d.get("volume_number")),
                    dblp_proceeding_id=d.get("proceeding"),
                    title=d.get("title"),
                    pdf_id=pdf_id,
                    authors=authors,
                )  # type: ignore
                self.papers.append(paper)
            self.papers_by_volume = LOD.getLookup(self.papers, "volume_number", withDuplicates=True)
            self.papersByProceeding = {
                key: list(group) for key, group in groupby(self.papers, lambda paper: paper.dblp_proceeding_id)
            }
            self.papersById = {p.dblp_publication_id: p for p in self.papers} if self.papers is not None else {}
            # papers per volume
            for volume_number, vol_papers in sorted(self.papers_by_volume.items()):
                vol_paper_lod = [dataclasses.asdict(paper) for paper in vol_papers]
                cache_name = f"dblp/Vol-{volume_number}/papers"
                if self.endpoint.progress_bar:
                    self.endpoint.progress_bar.update(30 / 3650)
                    # print(f"caching {cache_name}")
                self.endpoint.cache_manager.store(
                    cache_name,
                    vol_paper_lod,
                )

load(force_query=False)

load my papers

Source code in ceurws/dblp.py (lines 111-159)
def load(self, force_query: bool = False):
    """
    load my papers
    """
    if self.papers is None:
        super().load(force_query=force_query)
        dblp_authors = self.endpoint.dblp_authors
        dblp_authors.load(force_query=force_query)
        self.papers = []
        for d in self.lod:
            pdf_id = d.get("pdf_url", None)
            if pdf_id and isinstance(pdf_id, str):
                pdf_id = pdf_id.replace("http://ceur-ws.org/", "")
                pdf_id = pdf_id.replace("https://ceur-ws.org/", "")
                pdf_id = pdf_id.replace(".pdf", "")
            authors = []
            # get the authors string
            authors_str = d.get("author", "")
            # >;<  qlever quirk until 2023-12
            delim = ">;<" if ">;<" in authors_str else ";"
            for dblp_author_id in authors_str.split(delim):  #
                author = dblp_authors.authorsById.get(dblp_author_id, None)
                if author:
                    authors.append(author)
            paper = DblpPaper(
                dblp_publication_id=d.get("paper"),
                volume_number=int(d.get("volume_number")),
                dblp_proceeding_id=d.get("proceeding"),
                title=d.get("title"),
                pdf_id=pdf_id,
                authors=authors,
            )  # type: ignore
            self.papers.append(paper)
        self.papers_by_volume = LOD.getLookup(self.papers, "volume_number", withDuplicates=True)
        self.papersByProceeding = {
            key: list(group) for key, group in groupby(self.papers, lambda paper: paper.dblp_proceeding_id)
        }
        self.papersById = {p.dblp_publication_id: p for p in self.papers} if self.papers is not None else {}
        # papers per volume
        for volume_number, vol_papers in sorted(self.papers_by_volume.items()):
            vol_paper_lod = [dataclasses.asdict(paper) for paper in vol_papers]
            cache_name = f"dblp/Vol-{volume_number}/papers"
            if self.endpoint.progress_bar:
                self.endpoint.progress_bar.update(30 / 3650)
                # print(f"caching {cache_name}")
            self.endpoint.cache_manager.store(
                cache_name,
                vol_paper_lod,
            )
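
The pdf_id normalization in load() just strips the CEUR-WS host prefix and the .pdf extension; for illustration with an example url:

# sketch of the pdf_id normalization done in DblpPapers.load()
pdf_url = "https://ceur-ws.org/Vol-3262/paper1.pdf"  # example value
pdf_id = pdf_url.replace("http://ceur-ws.org/", "")
pdf_id = pdf_id.replace("https://ceur-ws.org/", "")
pdf_id = pdf_id.replace(".pdf", "")
print(pdf_id)  # Vol-3262/paper1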

DblpVolumes

Bases: DblpManager

Manage all DBLP indexed volumes.

Source code in ceurws/dblp.py (lines 162-208)
class DblpVolumes(DblpManager):
    """
    Manage all DBLP indexed volumes.
    """

    def __init__(self, endpoint: "DblpEndpoint"):
        super().__init__(endpoint, "dblp/volumes", "CEUR-WS all Volumes")
        self.volumes = None

    def load(self, force_query: bool = False):
        """
        load my volumes
        """
        if self.volumes is None:
            super().load(force_query=force_query)
            volumes = []
            dblp_editors = self.endpoint.dblp_editors
            dblp_editors.load(force_query=force_query)
            dblp_papers = self.endpoint.dblp_papers
            dblp_papers.load(force_query=force_query)
            for d in self.lod:
                if int(d.get("volume_number")) == 3000:
                    pass
                vol_editors = []
                editor_str = d.get("editor", "")
                # >;<  qlever quirk until 2023-12
                delim = ">;<" if ">;<" in editor_str else ";"
                for dblp_author_id in editor_str.split(delim):
                    editor = dblp_editors.editorsById.get(dblp_author_id, None)
                    if editor:
                        vol_editors.append(editor)
                volume = DblpProceeding(
                    dblp_publication_id=d.get("proceeding"),
                    volume_number=int(d.get("volume_number")),
                    dblp_event_id=d.get("dblp_event_id"),
                    title=d.get("title"),
                    editors=vol_editors,
                    papers=dblp_papers.papersByProceeding.get(d.get("proceeding")),
                )  # type: ignore
                volumes.append(volume)
            # remember the volumes so that load() does not return None
            self.volumes = volumes
            volume_by_number, _errors = LOD.getLookup(volumes, "volume_number")
            for number, volume in sorted(volume_by_number.items()):
                cache_name = f"dblp/Vol-{number}/metadata"
                if self.endpoint.progress_bar:
                    self.endpoint.progress_bar.update(int(30 / 3650))
                self.endpoint.cache_manager.store(cache_name, volume)
        return self.volumes

load(force_query=False)

load my volumes

Source code in ceurws/dblp.py (lines 171-208)
def load(self, force_query: bool = False):
    """
    load my volumes
    """
    if self.volumes is None:
        super().load(force_query=force_query)
        volumes = []
        dblp_editors = self.endpoint.dblp_editors
        dblp_editors.load(force_query=force_query)
        dblp_papers = self.endpoint.dblp_papers
        dblp_papers.load(force_query=force_query)
        for d in self.lod:
            if int(d.get("volume_number")) == 3000:
                pass
            vol_editors = []
            editor_str = d.get("editor", "")
            # >;<  qlever quirk until 2023-12
            delim = ">;<" if ">;<" in editor_str else ";"
            for dblp_author_id in editor_str.split(delim):
                editor = dblp_editors.editorsById.get(dblp_author_id, None)
                if editor:
                    vol_editors.append(editor)
            volume = DblpProceeding(
                dblp_publication_id=d.get("proceeding"),
                volume_number=int(d.get("volume_number")),
                dblp_event_id=d.get("dblp_event_id"),
                title=d.get("title"),
                editors=vol_editors,
                papers=dblp_papers.papersByProceeding.get(d.get("proceeding")),
            )  # type: ignore
            volumes.append(volume)
        # remember the volumes so that load() does not return None
        self.volumes = volumes
        volume_by_number, _errors = LOD.getLookup(volumes, "volume_number")
        for number, volume in sorted(volume_by_number.items()):
            cache_name = f"dblp/Vol-{number}/metadata"
            if self.endpoint.progress_bar:
                self.endpoint.progress_bar.update(int(30 / 3650))
            self.endpoint.cache_manager.store(cache_name, volume)
    return self.volumes

indexparser

Created on 11.08.2022

@author: wf

IndexHtmlParser

Bases: Textparser

CEUR-WS Index.html parser

Source code in ceurws/indexparser.py (lines 49-341)
class IndexHtmlParser(Textparser):
    """
    CEUR-WS Index.html parser
    """

    def __init__(self, htmlText: str, config: ParserConfig | None = None):
        """
        Constructor

        Args:
            htmlText(str): the HTML text of the index page
        """
        if config is None:
            config = ParserConfig()
        self.config = config
        Textparser.__init__(self, debug=config.debug)
        self.htmlText = htmlText
        # soup (in memory is slow)
        # soup = BeautifulSoup(html_page, 'html.parser'
        self.lines = htmlText.split("\n")
        # trStart, trEnd = makeHTMLTags("tr")
        # self.tr = trStart + SkipTo(trEnd).setResultsName("tr") + trEnd.suppress()
        self.linkPattern = re.compile(r""".*href=[\'"]?([^\'" >]+).*""", re.I)
        self.volPattern = re.compile("http://ceur-ws.org/Vol-([0-9]+)")
        self.volLinkPattern = re.compile(
            r""".*<a\s+href=[\'"]http://ceur-ws.org/Vol-([0-9]+)[/]?[\'"]>([^<]*)</a>.*""",
            re.I | re.DOTALL,
        )
        # Pre-compile patterns used in find and findVolume
        self.thColspanPattern = re.compile(r"^.*<th\s*colspan", re.I)
        self.trStartPattern = re.compile(r"^\s*<tr>", re.I)
        self.trEndPattern = re.compile(r"^\s*</tr>", re.I)
        # Pre-compile patterns used in setVolumeTitle
        self.editedByPattern = re.compile("Edited by:")
        self.tdBgColorPattern = re.compile("<td bgcolor", re.I)

    def find(self, startLine: int, compiledPattern, step: int = 1) -> int | None:
        """
        find the next line with the given compiled regular expression pattern

        Args:
            startLine(int): index of the line to start search
            compiledPattern(re.Pattern): the compiled regular expression pattern to search for
            step(int): the steps to take e.g. +1 for forward -1 for backwards

        Return:
            int: the line number of the line or None if nothing was found
        """
        lineNo = startLine
        while 0 < lineNo < len(self.lines) + 1:
            line = self.lines[lineNo - 1]
            if compiledPattern.match(line):
                return lineNo
            lineNo += step
        return None

    def findVolume(
        self,
        volCount: int,
        startLine: int,
        expectedTr: int = 3,
        progress: int = 10,
    ) -> tuple[int | None, int | None]:
        """
        find Volume lines from the given startLine

        Args:
            volCount(int): the volumeCount before the startLine
            startLine(int): index of the line to search
            expectedTr(int): number of <tr> tags expected
            progress(int): how often to show the progress

        Returns:
            tuple: start and end line of the volume html, or (None, None) if not found
        """
        trStartLine = self.find(startLine, self.thColspanPattern)
        if trStartLine is not None:
            lineNo = trStartLine + 1
            trCount = 1
            while lineNo < len(self.lines):
                trLine = self.find(lineNo, self.trStartPattern)
                if trLine is None:
                    break
                else:
                    lineNo = trLine + 1
                    trCount += 1
                    if trCount == expectedTr:
                        trEndLine = self.find(lineNo + 1, self.trEndPattern)
                        if volCount % progress == 0 and self.config.verbose:
                            print(f"volume count {volCount+1:4}: lines {trStartLine:6}-{trEndLine:6}")
                        return trStartLine, trEndLine
        return None, None

    def setVolumeNumber(self, volume, href):
        """
        set the volume number
        """
        if href is None:
            return
        volNumber = self.getMatch(self.volPattern, href, 1)
        if volNumber is not None:
            volume["number"] = int(volNumber)

    def setVolumeName(self, volume, line):
        """
        set the volume name
        """
        volName = self.getMatch(self.volLinkPattern, line, 2)
        if volName is not None:
            valid = True
            if not volName.startswith("http:"):
                invalidKeys = ["deleted upon editor request", "Not used"]
                for invalidKey in invalidKeys:
                    if invalidKey in volName:
                        href = self.getMatch(self.linkPattern, line, 1)
                        self.setVolumeNumber(volume, href)
                        valid = False
                volume["valid"] = valid
                if valid:
                    volName = html.unescape(volName)
                    volName = Textparser.sanitize(volName)
                    volume["volname"] = volName

    def setVolumeTitle(self, volume: dict, lineIndex: int):
        """
        set the volume title

        Args:
            volume(dict): the volumeRecord to modify
            lineIndex: where to start setting the volumeTitle
        """
        editedByLine = self.find(lineIndex, self.editedByPattern)
        if editedByLine is not None:
            tdLine = self.find(editedByLine, self.tdBgColorPattern, step=-1)
            if tdLine is not None:
                tdIndex = tdLine - 1
                title = ""
                delim = ""
                while tdIndex < len(self.lines):
                    line = self.lines[tdIndex]
                    if line.startswith("Edited by:"):
                        break
                    for tag in [
                        '<TD bgcolor="#FFFFFF">&nbsp;</TD><TD bgcolor="#FFFFFF">',
                        '<TD bgcolor="#FFFFFF">',
                        '<td bgcolor="#FFFFFF">',
                        "<BR>",
                        "<br>",
                    ]:
                        line = line.replace(tag, "")
                    line = line.replace("\r", " ")
                    title += line + delim
                    delim = " "
                    tdIndex += 1
                volume["tdtitle"] = html.unescape(title).strip()

    def setSeeAlsoVolumes(self, volume: dict, firstLine: int, lastLine: int):
        """
        Extract and set the volume numbers from the see also list
        Example result {"seealso": ["Vol-3067"]}

        Args:
            volume: the volumeRecord to modify
            firstLine: the first line of the line range to scan
            lastLine: the last line of the line range to scan
        """
        volumes = []
        see_also = ""
        for line in range(firstLine, lastLine):
            see_also += self.lines[line]
        see_also_section = re.search(r"see also:(.*?)</font>", see_also, re.DOTALL | re.IGNORECASE)

        if see_also_section:
            # Extract the volumes using regex from the see also section
            volumes = re.findall(
                r'<a href="#(Vol-\d+)">',
                see_also_section.group(1),
                re.IGNORECASE,
            )
        volume["seealso"] = volumes

    def getInfo(self, volume: dict, info: str, pattern, line: str):
        """
        get the info for the given patterns trying to match the pattern on
        the given line

        Args:
            volume(dict): the result dict
            info(str): the name of the dict key to fill
            pattern(regexp): the regular expression to check
            line(str): the line to check
        """
        infoValue = self.getMatch(pattern, line, 1)
        if infoValue is not None:
            for delim in ["<BR>", "<br>"]:
                infoValue = infoValue.replace(delim, "")
            infoValue = infoValue.strip()
            if info in ["editors", "submittedBy"]:
                infoValue = html.unescape(infoValue)
            if info == "pubDate":
                try:
                    infoValue = datetime.datetime.strptime(infoValue, "%d-%b-%Y")
                    published = infoValue.strftime("%Y-%m-%d")
                    volume["published"] = published
                    volume["year"] = infoValue.year
                except ValueError as ve:
                    msg = f"pubDate: {infoValue} of {volume} parsing failed with {ve}"
                    self.log(msg)
            if info in ["urn", "url", "archive"]:
                href = self.getMatch(self.linkPattern, infoValue, 1)
                if href is not None:
                    infoValue = href
                    if info == "url":
                        self.setVolumeNumber(volume, href)
                    if info == "urn":
                        infoValue = href.replace("https://nbn-resolving.org/", "")
            volume[info] = infoValue

    def parseVolume(self, volCount: int, fromLine: int, toLine: int, verbose: bool):
        """
        parse a volume from the given line range
        """
        lineCount = toLine - fromLine
        volume = {
            "fromLine": fromLine,
            "toLine": toLine,
            "valid": None,
            "url": None,
            "acronym": None,
            "title": None,
            "loctime": None,
        }
        self.setVolumeTitle(volume, fromLine)
        self.setSeeAlsoVolumes(volume, fromLine, toLine)

        infoPattern = {}
        infoMappings = [
            ("URN", "urn"),
            ("ONLINE", "url"),
            ("ARCHIVE", "archive"),
            ("Edited by", "editors"),
            ("Submitted by", "submittedBy"),
            ("Published on CEUR-WS", "pubDate"),
        ]
        for prefix, info in infoMappings:
            infoPattern[info] = re.compile(rf"^\s*{prefix}:(.*)")
        for lineIndex in range(fromLine, toLine):
            line = self.lines[lineIndex]
            for info, pattern in infoPattern.items():
                self.getInfo(volume, info, pattern, line)
            self.setVolumeName(volume, line)
            if verbose:
                print(line)
        volumeNumber = volume.get("number", "?")
        acronym = volume.get("acronym", "?")
        self.log(f"{volumeNumber:4}-{volCount:4}:{fromLine}+{lineCount} {acronym}")
        return volume

    def parse(self, vol_limit: int | None = None):
        """
        parse my HTML code for Volume info
        """
        # Compile the regex pattern right before its usage
        mainTablePattern = re.compile(r'\s*<TABLE id="MAINTABLE"', re.I)
        lineNo = self.find(1, mainTablePattern)
        volCount = 0
        volumes = {}
        while self.lines and lineNo and lineNo < len(self.lines):
            if vol_limit and volCount >= vol_limit:
                break
            expectedTr = 3
            volStartLine, volEndLine = self.findVolume(volCount, lineNo, expectedTr=expectedTr)
            if volStartLine is None or volEndLine is None:
                break
            else:
                volCount += 1
                volume = self.parseVolume(
                    volCount,
                    volStartLine,
                    volEndLine,
                    verbose=self.config.verbose,
                )
                # synchronize on <tr><th and not on end since trailing TR might be missing
                lineNo = volStartLine + 1
                if "number" in volume:
                    volume_number = volume["number"]
                    if volume_number < self.config.down_to_volume:
                        break
                    volumes[volume_number] = volume
                    if self.config.progress_bar:
                        self.config.progress_bar.update()
                else:
                    self.log(f"volume not found for volume at {volStartLine}")
        return volumes
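
A minimal usage sketch, assuming the CEUR-WS index page has already been fetched; the URL and the vol_limit value are illustrative:

import urllib.request

from ceurws.indexparser import IndexHtmlParser, ParserConfig

# fetch the index page; a locally saved copy of index.html works as well
with urllib.request.urlopen("https://ceur-ws.org/") as response:
    html_text = response.read().decode("utf-8")

parser = IndexHtmlParser(html_text, config=ParserConfig(verbose=False))
volumes = parser.parse(vol_limit=10)  # stop after the first 10 volumes
for number, volume in volumes.items():
    print(number, volume.get("tdtitle"))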

__init__(htmlText, config=None)

Constructor

Parameters:

htmlText (str): the HTML text of the index page (required)
config (ParserConfig, optional): the parser configuration to use (default: None)
Source code in ceurws/indexparser.py
def __init__(self, htmlText: str, config: ParserConfig | None = None):
    """
    Constructor

    Args:
        htmlText(str): the HTML text of the index page
        config(ParserConfig): the parser configuration to use
    """
    if config is None:
        config = ParserConfig()
    self.config = config
    Textparser.__init__(self, debug=config.debug)
    self.htmlText = htmlText
    # parsing with BeautifulSoup in memory is slow, so we work on raw lines
    # soup = BeautifulSoup(html_page, 'html.parser')
    self.lines = htmlText.split("\n")
    # trStart, trEnd = makeHTMLTags("tr")
    # self.tr = trStart + SkipTo(trEnd).setResultsName("tr") + trEnd.suppress()
    self.linkPattern = re.compile(r""".*href=[\'"]?([^\'" >]+).*""", re.I)
    self.volPattern = re.compile("http://ceur-ws.org/Vol-([0-9]+)")
    self.volLinkPattern = re.compile(
        r""".*<a\s+href=[\'"]http://ceur-ws.org/Vol-([0-9]+)[/]?[\'"]>([^<]*)</a>.*""",
        re.I | re.DOTALL,
    )
    # Pre-compile patterns used in find and findVolume
    self.thColspanPattern = re.compile(r"^.*<th\s*colspan", re.I)
    self.trStartPattern = re.compile(r"^\s*<tr>", re.I)
    self.trEndPattern = re.compile(r"^\s*</tr>", re.I)
    # Pre-compile patterns used in setVolumeTitle
    self.editedByPattern = re.compile("Edited by:")
    self.tdBgColorPattern = re.compile("<td bgcolor", re.I)

find(startLine, compiledPattern, step=1)

find the next line with the given compiled regular expression pattern

Parameters:

startLine (int): index of the line to start the search at (required)
compiledPattern (re.Pattern): the compiled regular expression pattern to search for (required)
step (int): the step to take, e.g. +1 for forward, -1 for backwards (default: 1)

Returns:

int: the line number of the matching line or None if nothing was found

Source code in ceurws/indexparser.py
def find(self, startLine: int, compiledPattern, step: int = 1) -> int | None:
    """
    find the next line with the given compiled regular expression pattern

    Args:
        startLine(int): index of the line to start search
        compiledPattern(re.Pattern): the compiled regular expression pattern to search for
        step(int): the step to take, e.g. +1 for forward, -1 for backwards

    Returns:
        int: the line number of the matching line or None if nothing was found
    """
    lineNo = startLine
    while 0 < lineNo < len(self.lines) + 1:
        line = self.lines[lineNo - 1]
        if compiledPattern.match(line):
            return lineNo
        lineNo += step
    return None
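
Note that find works with 1-based line numbers. A small illustration using the pre-compiled table row patterns; the three-line HTML snippet is made up:

from ceurws.indexparser import IndexHtmlParser

parser = IndexHtmlParser("<tr>\n<td>x</td>\n</tr>")
assert parser.find(1, parser.trStartPattern) == 1  # forward search hits line 1
assert parser.find(1, parser.trEndPattern) == 3  # </tr> is on line 3
assert parser.find(3, parser.trStartPattern, step=-1) == 1  # backward search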

findVolume(volCount, startLine, expectedTr=3, progress=10)

find Volume lines from the given startLine

Parameters:

volCount (int): the volume count before the startLine (required)
startLine (int): index of the line to search (required)
expectedTr (int): number of <tr> tags expected (default: 3)
progress (int): how often to show the progress (default: 10)

Returns:

tuple[int | None, int | None]: the start and end line of the volume html, or (None, None) if not found

Source code in ceurws/indexparser.py
def findVolume(
    self,
    volCount: int,
    startLine: int,
    expectedTr: int = 3,
    progress: int = 10,
) -> tuple[int | None, int | None]:
    """
    find Volume lines from the given startLine

    Args:
        volCount(int): the volumeCount before the startLine
        startLine(int): index of the line to search
        expectedTr(int): number of <tr> tags expected
        progress(int): how often to show the progress

    Returns:
        tuple: start and end line of the volume html, or (None, None) if not found
    """
    trStartLine = self.find(startLine, self.thColspanPattern)
    if trStartLine is not None:
        lineNo = trStartLine + 1
        trCount = 1
        while lineNo < len(self.lines):
            trLine = self.find(lineNo, self.trStartPattern)
            if trLine is None:
                break
            else:
                lineNo = trLine + 1
                trCount += 1
                if trCount == expectedTr:
                    trEndLine = self.find(lineNo + 1, self.trEndPattern)
                    if volCount % progress == 0 and self.config.verbose:
                        print(f"volume count {volCount+1:4}: lines {trStartLine:6}-{trEndLine:6}")
                    return trStartLine, trEndLine
    return None, None

getInfo(volume, info, pattern, line)

get the info for the given patterns trying to match the pattern on the given line

Parameters:

volume (dict): the result dict (required)
info (str): the name of the dict key to fill (required)
pattern (re.Pattern): the regular expression to check (required)
line (str): the line to check (required)
Source code in ceurws/indexparser.py
def getInfo(self, volume: dict, info: str, pattern, line: str):
    """
    get the info for the given patterns trying to match the pattern on
    the given line

    Args:
        volume(dict): the result dict
        info(str): the name of the dict key to fill
        pattern(regexp): the regular expression to check
        line(str): the line to check
    """
    infoValue = self.getMatch(pattern, line, 1)
    if infoValue is not None:
        for delim in ["<BR>", "<br>"]:
            infoValue = infoValue.replace(delim, "")
        infoValue = infoValue.strip()
        if info in ["editors", "submittedBy"]:
            infoValue = html.unescape(infoValue)
        if info == "pubDate":
            try:
                infoValue = datetime.datetime.strptime(infoValue, "%d-%b-%Y")
                published = infoValue.strftime("%Y-%m-%d")
                volume["published"] = published
                volume["year"] = infoValue.year
            except ValueError as ve:
                msg = f"pubDate: {infoValue} of {volume} parsing failed with {ve}"
                self.log(msg)
        if info in ["urn", "url", "archive"]:
            href = self.getMatch(self.linkPattern, infoValue, 1)
            if href is not None:
                infoValue = href
                if info == "url":
                    self.setVolumeNumber(volume, href)
                if info == "urn":
                    infoValue = href.replace("https://nbn-resolving.org/", "")
        volume[info] = infoValue
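
A sketch of how getInfo fills a volume record from a single line; the "Published on CEUR-WS" line and its date are made up, and getMatch is inherited from Textparser:

import re

from ceurws.indexparser import IndexHtmlParser

volume: dict = {}
parser = IndexHtmlParser("")
pub_pattern = re.compile(r"^\s*Published on CEUR-WS:(.*)")
parser.getInfo(volume, "pubDate", pub_pattern, "Published on CEUR-WS: 24-Aug-2019")
print(volume["published"], volume["year"])  # 2019-08-24 2019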

parse(vol_limit=None)

parse my html code for Volume info

Source code in ceurws/indexparser.py
def parse(self, vol_limit: int | None = None):
    """
    parse my HTML code for Volume info
    """
    # Compile the regex pattern right before its usage
    mainTablePattern = re.compile(r'\s*<TABLE id="MAINTABLE"', re.I)
    lineNo = self.find(1, mainTablePattern)
    volCount = 0
    volumes = {}
    while self.lines and lineNo and lineNo < len(self.lines):
        if vol_limit and volCount >= vol_limit:
            break
        expectedTr = 3
        volStartLine, volEndLine = self.findVolume(volCount, lineNo, expectedTr=expectedTr)
        if volStartLine is None or volEndLine is None:
            break
        else:
            volCount += 1
            volume = self.parseVolume(
                volCount,
                volStartLine,
                volEndLine,
                verbose=self.config.verbose,
            )
            # synchronize on <tr><th and not on end since trailing TR might be missing
            lineNo = volStartLine + 1
            if "number" in volume:
                volume_number = volume["number"]
                if volume_number < self.config.down_to_volume:
                    break
                volumes[volume_number] = volume
                if self.config.progress_bar:
                    self.config.progress_bar.update()
            else:
                self.log(f"volume not found for volume at {volStartLine}")
    return volumes

parseVolume(volCount, fromLine, toLine, verbose)

parse a volume from the given line range

Source code in ceurws/indexparser.py
def parseVolume(self, volCount: int, fromLine: int, toLine: int, verbose: bool):
    """
    parse a volume from the given line range
    """
    lineCount = toLine - fromLine
    volume = {
        "fromLine": fromLine,
        "toLine": toLine,
        "valid": None,
        "url": None,
        "acronym": None,
        "title": None,
        "loctime": None,
    }
    self.setVolumeTitle(volume, fromLine)
    self.setSeeAlsoVolumes(volume, fromLine, toLine)

    infoPattern = {}
    infoMappings = [
        ("URN", "urn"),
        ("ONLINE", "url"),
        ("ARCHIVE", "archive"),
        ("Edited by", "editors"),
        ("Submitted by", "submittedBy"),
        ("Published on CEUR-WS", "pubDate"),
    ]
    for prefix, info in infoMappings:
        infoPattern[info] = re.compile(rf"^\s*{prefix}:(.*)")
    for lineIndex in range(fromLine, toLine):
        line = self.lines[lineIndex]
        for info, pattern in infoPattern.items():
            self.getInfo(volume, info, pattern, line)
        self.setVolumeName(volume, line)
        if verbose:
            print(line)
    volumeNumber = volume.get("number", "?")
    acronym = volume.get("acronym", "?")
    self.log(f"{volumeNumber:4}-{volCount:4}:{fromLine}+{lineCount} {acronym}")
    return volume

setSeeAlsoVolumes(volume, firstLine, lastLine)

Extract and set the volume numbers from the see also list. Example result: {"seealso": ["Vol-3067"]}

Parameters:

volume (dict): the volumeRecord to modify (required)
firstLine (int): the first line of the line range to scan (required)
lastLine (int): the last line of the line range to scan (required)
Source code in ceurws/indexparser.py
def setSeeAlsoVolumes(self, volume: dict, firstLine: int, lastLine: int):
    """
    Extract and set the volume numbers from the see also list
    Example result {"seealso": ["Vol-3067"]}

    Args:
        volume: the volumeRecord to modify
        firstLine: the first line of the line range to scan
        lastLine: the last line of the line range to scan
    """
    volumes = []
    see_also = ""
    for line in range(firstLine, lastLine):
        see_also += self.lines[line]
    see_also_section = re.search(r"see also:(.*?)</font>", see_also, re.DOTALL | re.IGNORECASE)

    if see_also_section:
        # Extract the volumes using regex from the see also section
        volumes = re.findall(
            r'<a href="#(Vol-\d+)">',
            see_also_section.group(1),
            re.IGNORECASE,
        )
    volume["seealso"] = volumes

setVolumeName(volume, line)

set the volume name

Source code in ceurws/indexparser.py
def setVolumeName(self, volume, line):
    """
    set the volume name
    """
    volName = self.getMatch(self.volLinkPattern, line, 2)
    if volName is not None:
        valid = True
        if not volName.startswith("http:"):
            invalidKeys = ["deleted upon editor request", "Not used"]
            for invalidKey in invalidKeys:
                if invalidKey in volName:
                    href = self.getMatch(self.linkPattern, line, 1)
                    self.setVolumeNumber(volume, href)
                    valid = False
            volume["valid"] = valid
            if valid:
                volName = html.unescape(volName)
                volName = Textparser.sanitize(volName)
                volume["volname"] = volName

setVolumeNumber(volume, href)

set the volume number

Source code in ceurws/indexparser.py
def setVolumeNumber(self, volume, href):
    """
    set the volume number
    """
    if href is None:
        return
    volNumber = self.getMatch(self.volPattern, href, 1)
    if volNumber is not None:
        volume["number"] = int(volNumber)

setVolumeTitle(volume, lineIndex)

set the volume title

Parameters:

volume (dict): the volumeRecord to modify (required)
lineIndex (int): where to start searching for the volume title (required)
Source code in ceurws/indexparser.py
def setVolumeTitle(self, volume: dict, lineIndex: int):
    """
    set the volume title

    Args:
        volume(dict): the volumeRecord to modify
        lineIndex: where to start setting the volumeTitle
    """
    editedByLine = self.find(lineIndex, self.editedByPattern)
    if editedByLine is not None:
        tdLine = self.find(editedByLine, self.tdBgColorPattern, step=-1)
        if tdLine is not None:
            tdIndex = tdLine - 1
            title = ""
            delim = ""
            while tdIndex < len(self.lines):
                line = self.lines[tdIndex]
                if line.startswith("Edited by:"):
                    break
                for tag in [
                    '<TD bgcolor="#FFFFFF">&nbsp;</TD><TD bgcolor="#FFFFFF">',
                    '<TD bgcolor="#FFFFFF">',
                    '<td bgcolor="#FFFFFF">',
                    "<BR>",
                    "<br>",
                ]:
                    line = line.replace(tag, "")
                line = line.replace("\r", " ")
                title += line + delim
                delim = " "
                tdIndex += 1
            volume["tdtitle"] = html.unescape(title).strip()

ParserConfig

parser configuration

Source code in ceurws/indexparser.py
class ParserConfig:
    """
    parser configuration
    """

    def __init__(
        self,
        progress_bar: tqdm | None = None,
        down_to_volume: int = 1,
        force_download: bool = False,
        verbose: bool = False,
        debug: bool = False,
    ):
        """
        Initializes the ParserConfig with a progress bar, volume threshold, and debug mode setting.

        Args:
            progress_bar : An instance of a Progressbar class to be used for showing progress
                during parsing.
            down_to_volume (int, optional): The volume threshold for parsing.
                Only volumes with a number greater than or equal to this value will be considered. Defaults to 1.
            force_download(bool): if True download the file to parse
            verbose(bool): if True give verbose feedback
            debug (bool, optional): Indicates whether debugging mode is enabled.
                If True, additional debug information will be provided during parsing. Defaults to False.
        """
        self.progress_bar = progress_bar
        self.down_to_volume = down_to_volume
        self.force_download = force_download
        self.verbose = verbose
        self.debug = debug

__init__(progress_bar=None, down_to_volume=1, force_download=False, verbose=False, debug=False)

Initializes the ParserConfig with a progress bar, volume threshold, and debug mode setting.

Parameters:

progress_bar (tqdm, optional): an instance of a progress bar to be used for showing progress during parsing (default: None)
down_to_volume (int, optional): the volume threshold for parsing; only volumes with a number greater than or equal to this value will be considered (default: 1)
force_download (bool): if True download the file to parse (default: False)
verbose (bool): if True give verbose feedback (default: False)
debug (bool, optional): whether debugging mode is enabled; if True, additional debug information will be provided during parsing (default: False)
Source code in ceurws/indexparser.py
def __init__(
    self,
    progress_bar: tqdm | None = None,
    down_to_volume: int = 1,
    force_download: bool = False,
    verbose: bool = False,
    debug: bool = False,
):
    """
    Initializes the ParserConfig with a progress bar, volume threshold, and debug mode setting.

    Args:
        progress_bar : An instance of a Progressbar class to be used for showing progress
            during parsing.
        down_to_volume (int, optional): The volume threshold for parsing.
            Only volumes with a number greater than or equal to this value will be considered. Defaults to 1.
        force_download(bool): if True download the file to parse
        verbose(bool): if True give verbose feedback
        debug (bool, optional): Indicates whether debugging mode is enabled.
            If True, additional debug information will be provided during parsing. Defaults to False.
    """
    self.progress_bar = progress_bar
    self.down_to_volume = down_to_volume
    self.force_download = force_download
    self.verbose = verbose
    self.debug = debug
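
For example, a configuration that shows a tqdm progress bar and only parses down to volume 3000 might look as follows; the total of 500 is a guess, not derived from the index:

from tqdm import tqdm

from ceurws.indexparser import ParserConfig

progress = tqdm(total=500, desc="volumes")
config = ParserConfig(progress_bar=progress, down_to_volume=3000, verbose=True)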

location

Created on 2023-07-15

@author: wf

LocationLookup

Class for location lookup.

Source code in ceurws/location.py
class LocationLookup:
    """
    Class for location lookup.
    """

    predefinedLocations: dict[str, str | None] = {}

    @classmethod
    def initPredefinedLocations(cls):
        """
        Initialize predefined locations.
        """
        locMap = {
            "Not Known": None,
            "Online": None,
            "Virtual": None,
            "Virtual, USA": None,
            "Virtual Event, USA": None,
            "Amsterdam": "Q727",
            "Amsterdam, Amsterdam": "Q727",
            "Amsterdam Netherlands": "Q727",
            "Amsterdam, Netherlands": "Q727",
            "Amsterdam, The Netherlands": "Q727",
            "Amsterdam The Netherlands": "Q727",
            # ... add more predefined locations ...
        }
        cls.predefinedLocations = locMap

    def __init__(self):
        """
        Constructor for LocationLookup.
        """
        LocationLookup.initPredefinedLocations()
        self.locationContext = LocationContext.fromCache()
        cacheRootDir = LocationContext.getDefaultConfig().cacheRootDir
        cacheDir = f"{cacheRootDir}/.nominatim"
        self.nominatimWrapper = NominatimWrapper(cacheDir=cacheDir)

    def getCityByWikiDataId(self, wikidataID: str):
        """
        Get the city for the given wikidataID.

        Args:
            wikidataID (str): The wikidata ID.

        Returns:
            City: The city with the given wikidataID.
        """
        citiesGen = self.locationContext.cityManager.getLocationsByWikidataId(wikidataID)
        if citiesGen is not None:
            cities = list(citiesGen)
            if len(cities) > 0:
                return cities[0]
        else:
            return None

    def lookupNominatim(self, locationText: str):
        """
        Lookup the location for the given locationText (if any).

        Args:
            locationText (str): The location text to search for.

        Returns:
            City: The location found by Nominatim.
        """
        location = None
        wikidataId = self.nominatimWrapper.lookupWikiDataId(locationText)
        if wikidataId is not None:
            location = self.getCityByWikiDataId(wikidataId)
        return location

    def lookup(self, locationText: str, logFile=sys.stdout):
        """
        Lookup a location based on the given locationText.

        Args:
            locationText (str): The location to lookup.
            logFile (file): The log file to write the output.

        Returns:
            City: The located city based on the locationText.
        """
        if locationText in LocationLookup.predefinedLocations:
            locationId = LocationLookup.predefinedLocations[locationText]
            if locationId is None:
                return None
            else:
                location = self.getCityByWikiDataId(locationId)
                if location is None:
                    print(
                        f"❌❌-predefinedLocation {locationText}{locationId} wikidataId not resolved",
                        file=logFile,
                    )
                return location
        lg = self.lookupGeograpy(locationText)
        ln = self.lookupNominatim(locationText)
        if ln is not None and lg is not None and ln.wikidataid != lg.wikidataid:
            print(f"❌❌{locationText}{lg}!={ln}", file=logFile)
            return None
        return lg

    def lookupGeograpy(self, locationText: str):
        """
        Lookup the given location by the given locationText.

        Args:
            locationText (str): The location to lookup.

        Returns:
            City: The located city based on the locationText.
        """
        locations = self.locationContext.locateLocation(locationText)
        if len(locations) > 0:
            return locations[0]
        else:
            return None
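
A usage sketch, assuming the location caches needed by LocationContext.fromCache() are available locally:

from ceurws.location import LocationLookup

lookup = LocationLookup()
# virtual locations are predefined to resolve to None
assert lookup.lookup("Online") is None
# "Amsterdam, Netherlands" is predefined as wikidata id Q727
city = lookup.lookup("Amsterdam, Netherlands")
if city is not None:
    print(city.wikidataid)  # Q727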

__init__()

Constructor for LocationLookup.

Source code in ceurws/location.py
def __init__(self):
    """
    Constructor for LocationLookup.
    """
    LocationLookup.initPredefinedLocations()
    self.locationContext = LocationContext.fromCache()
    cacheRootDir = LocationContext.getDefaultConfig().cacheRootDir
    cacheDir = f"{cacheRootDir}/.nominatim"
    self.nominatimWrapper = NominatimWrapper(cacheDir=cacheDir)

getCityByWikiDataId(wikidataID)

Get the city for the given wikidataID.

Parameters:

wikidataID (str): the wikidata ID (required)

Returns:

City: the city with the given wikidataID

Source code in ceurws/location.py
def getCityByWikiDataId(self, wikidataID: str):
    """
    Get the city for the given wikidataID.

    Args:
        wikidataID (str): The wikidata ID.

    Returns:
        City: The city with the given wikidataID.
    """
    citiesGen = self.locationContext.cityManager.getLocationsByWikidataId(wikidataID)
    if citiesGen is not None:
        cities = list(citiesGen)
        if len(cities) > 0:
            return cities[0]
    else:
        return None

initPredefinedLocations() classmethod

Initialize predefined locations.

Source code in ceurws/location.py
@classmethod
def initPredefinedLocations(cls):
    """
    Initialize predefined locations.
    """
    locMap = {
        "Not Known": None,
        "Online": None,
        "Virtual": None,
        "Virtual, USA": None,
        "Virtual Event, USA": None,
        "Amsterdam": "Q727",
        "Amsterdam, Amsterdam": "Q727",
        "Amsterdam Netherlands": "Q727",
        "Amsterdam, Netherlands": "Q727",
        "Amsterdam, The Netherlands": "Q727",
        "Amsterdam The Netherlands": "Q727",
        # ... add more predefined locations ...
    }
    cls.predefinedLocations = locMap

lookup(locationText, logFile=sys.stdout)

Lookup a location based on the given locationText.

Parameters:

locationText (str): the location to lookup (required)
logFile (file): the log file to write the output to (default: sys.stdout)

Returns:

City: the located city based on the locationText

Source code in ceurws/location.py
def lookup(self, locationText: str, logFile=sys.stdout):
    """
    Lookup a location based on the given locationText.

    Args:
        locationText (str): The location to lookup.
        logFile (file): The log file to write the output.

    Returns:
        City: The located city based on the locationText.
    """
    if locationText in LocationLookup.predefinedLocations:
        locationId = LocationLookup.predefinedLocations[locationText]
        if locationId is None:
            return None
        else:
            location = self.getCityByWikiDataId(locationId)
            if location is None:
                print(
                    f"❌❌-predefinedLocation {locationText}{locationId} wikidataId not resolved",
                    file=logFile,
                )
            return location
    lg = self.lookupGeograpy(locationText)
    ln = self.lookupNominatim(locationText)
    if ln is not None and lg is not None and ln.wikidataid != lg.wikidataid:
        print(f"❌❌{locationText}{lg}!={ln}", file=logFile)
        return None
    return lg

lookupGeograpy(locationText)

Lookup the given location by the given locationText.

Parameters:

locationText (str): the location to lookup (required)

Returns:

City: the located city based on the locationText

Source code in ceurws/location.py
def lookupGeograpy(self, locationText: str):
    """
    Lookup the given location by the given locationText.

    Args:
        locationText (str): The location to lookup.

    Returns:
        City: The located city based on the locationText.
    """
    locations = self.locationContext.locateLocation(locationText)
    if len(locations) > 0:
        return locations[0]
    else:
        return None

lookupNominatim(locationText)

Lookup the location for the given locationText (if any).

Parameters:

locationText (str): the location text to search for (required)

Returns:

City: the location found by Nominatim

Source code in ceurws/location.py
def lookupNominatim(self, locationText: str):
    """
    Lookup the location for the given locationText (if any).

    Args:
        locationText (str): The location text to search for.

    Returns:
        City: The location found by Nominatim.
    """
    location = None
    wikidataId = self.nominatimWrapper.lookupWikiDataId(locationText)
    if wikidataId is not None:
        location = self.getCityByWikiDataId(wikidataId)
    return location

loctime

Created on 2023-12-22

@author: wf

LoctimeParser

A parser class for handling loctime lookups. This class provides methods to load, parse, and update loctime data using a dictionary of dictionaries structure.

Attributes:

filepath (str): The file path to the loctime YAML configuration.
lookups (dict): The loaded lookup dictionaries from the YAML file.
multi_word (dict): A dictionary to handle multi-word keys.
multi_word_lookups (dict): A version of lookups with keys as concatenated words.
counters (dict): A dictionary of Counter objects for various categories.
year_pattern (re.Pattern): A compiled regex pattern to match 4-digit years.
total_loctimes (int): The total count of processed loctimes.

Source code in ceurws/loctime.py
class LoctimeParser:
    """
    A parser class for handling loctime lookups. This class provides methods to
    load, parse, and update loctime data using a dictionary of dictionaries structure.

    Attributes:
        filepath (str): The file path to the loctime YAML configuration.
        lookups (dict): The loaded lookup dictionaries from the YAML file.
        multi_word (dict): A dictionary to handle multi-word keys.
        multi_word_lookups (dict): A version of lookups with keys as concatenated words.
        counters (dict): A dictionary of Counter objects for various categories.
        year_pattern (re.Pattern): A compiled regex pattern to match 4-digit years.
        total_loctimes (int): The total count of processed loctimes.
    """

    def __init__(self, filepath: str | None = None):
        """
        Initializes the LoctimeParser object, setting up paths, loading lookups,
        and initializing counters and patterns.

        Args:
            filepath (str, optional): The path to the loctime YAML file.
                                      Defaults to a predefined path if None is provided.
        Raises:
            FileNotFoundError: Raises an error if the specified YAML file does not exist.
        """
        if filepath is None:
            self.ceurws_path = CEURWS.CACHE_DIR
            self.filepath: Path = self.ceurws_path.joinpath("loctime.yaml")
        else:
            self.filepath = Path(filepath)  # keep the attribute name consistent with load() and save()
        self.lookups = self.load()
        self.setup()
        self.counters: dict[str, Counter] = {"4digit-year": Counter()}
        for reverse_pos in range(1, 8):
            self.counters[str(reverse_pos)] = Counter()
        for key in self.lookups:
            self.counters[key] = Counter()

        # Compile a pattern to match a 4-digit year
        self.year_pattern = re.compile(r"\b\d{4}\b")
        self.total_loctimes = 0

    def setup(self):
        """
        Prepares the parser by initializing multi-word handling and creating
        a modified version of the lookup dictionaries with keys as concatenated words.
        This method sets up the 'multi_word' and 'multi_word_lookups' dictionaries
        to facilitate the parsing process, especially for multi-word keys.
        """
        self.multi_word = {}
        for lookup in self.lookups.values():
            for key in lookup:
                if " " in key:
                    self.multi_word[key] = key.replace(" ", "_")

        # Initialize a dictionary derived from self.lookups with underscored keys
        self.multi_word_lookups = {}
        for category, lookup in self.lookups.items():
            self.multi_word_lookups[category] = {key.replace(" ", "_"): value for key, value in lookup.items()}

    def load(
        self,
    ) -> dict:
        """
        Loads the lookup data from the YAML file specified by the filepath attribute.

        This method attempts to open and read the YAML file, converting its contents
        into a dictionary. If the file is empty or does not exist, it returns an empty dictionary.

        Returns:
            dict: A dictionary representing the loaded data from the YAML file. If the file
                  is empty or non-existent, an empty dictionary is returned.

        Raises:
            FileNotFoundError: If the specified file does not exist.
            yaml.YAMLError: If there is an error parsing the YAML file.
        """
        data_dict = {}
        if os.path.isfile(self.filepath) and os.path.getsize(self.filepath) > 0:
            with open(self.filepath) as yaml_file:
                data_dict = yaml.safe_load(yaml_file)
        return data_dict

    def save(self):
        """
        Saves the current lookup dictionary to a YAML file.
        """
        os.makedirs(os.path.dirname(self.filepath), exist_ok=True)  # Ensure directory exists
        with open(self.filepath, "w", encoding="utf-8") as yaml_file:
            yaml.dump(
                self.lookups,
                yaml_file,
                default_flow_style=False,
                allow_unicode=True,
            )

    def get_parts(self, loctime):
        """
        Splits the loctime string into parts and subparts, considering multi-word entries.

        Args:
            loctime (str): The loctime string to split.

        Returns:
            list: A list of parts and subparts.
        """
        # Replace known multi-word entries with their underscore versions
        for original, underscored in self.multi_word.items():
            loctime = loctime.replace(original, underscored)

        parts = loctime.split(",")  # First, split by comma
        all_parts = []
        for part in parts:
            # Further split each part by whitespace, considering underscore as part of the word
            subparts = part.strip().split()
            all_parts.extend(subparts)  # Add all subparts to the list

        return all_parts

    def parse(self, loctime: str) -> dict:
        """
        Alternative parse of CEUR-WS loctimes using lookups

        Args:
            loctime (str): The loctime string to parse.

        Returns:
            dict: the lookup categories matched in the given loctime string
        """
        result = {}
        self.total_loctimes += 1
        lt_parts = self.get_parts(loctime)

        # Process each part of loctime
        for index, part in enumerate(lt_parts):
            part = part.strip()
            reverse_pos = len(lt_parts) - index  # Position from end

            found_in_lookup = False
            # Check against each lookup and update corresponding counter
            for (
                lookup_key,
                lookup_dict,
            ) in self.multi_word_lookups.items():
                if part in lookup_dict:
                    self.counters[lookup_key][part] += 1  # Increment the lookup counter
                    found_in_lookup = True
                    # set result dict
                    result[lookup_key] = part
                    break  # Break if found, assuming part can't be in multiple lookups
            if not found_in_lookup:
                # Update counter for each part's position from end
                key = str(reverse_pos)
                if key in self.counters:
                    self.counters[key][part] += 1

            # Special handling for 4-digit years
            if index == len(lt_parts) - 1 and self.year_pattern.match(part):
                self.counters["4digit-year"][part] += 1
        return result

    def update_lookup_counts(self):
        """
        to be called after processing all loctimes:
        updates the lookup dicts with the new counts gathered in the counters
        """
        for category, counter in self.counters.items():
            if category in self.lookups:
                for underscore_key, count in counter.items():
                    # Convert underscore_key back to space-separated key
                    original_key = underscore_key.replace("_", " ")
                    if original_key in self.lookups[category]:
                        # Update the count for the original key
                        self.lookups[category][original_key] += count
                    else:
                        # Initialize count for the original key
                        self.lookups[category][original_key] = count

    def create_pareto_analysis(self, level: int = 3, outof: int = 5):
        """
        Creates a Pareto analysis for each category in the lookups and returns
        the percentage table for the distribution across the specified levels.

        Args:
            level (int): The number of segments to divide the data into within the top "outof" parts.
            outof (int): the 1-out-of-n split ratio; at level 1 a 1:5 split yields
                the classic Pareto 80:20 rule, at level 2 the 20% tail is split again
                into 16:4 (thresholds 80% and 96%), and at level 3 into 3.2:0.8
                (thresholds 80%, 96% and 99.2%)
        """
        pareto_dict = {}
        for category, counter in self.counters.items():
            # Sort items by count in descending order
            sorted_items = counter.most_common()
            total = sum(counter.values())

            # Calculate segment thresholds based on the diminishing series
            thresholds = []
            threshold = 0.0
            for _ in range(1, level + 1):
                # current range to calculate out of for
                trange = 100 - threshold  # 100/80/96/99.2 ...
                # right side of range
                right_range = trange / outof  # 20/4/0.8 ...
                # left threshold is new threshold
                threshold = 100 - right_range
                thresholds.append(threshold)
            thresholds.append(100)

            segment_counts = {threshold: 0 for threshold in thresholds}  # Initialize count dict for each segment
            segment_cutoff = {threshold: 0 for threshold in thresholds}  # Initialize count dict for each segment
            tindex = 0
            current_threshold = thresholds[tindex]
            total_pc = 0.0
            # Calculate cumulative counts for each segment
            for _, count in sorted_items:
                item_percentage = count / total * 100
                if total_pc + item_percentage > current_threshold + 0.000000000001:
                    segment_cutoff[current_threshold] = count
                    tindex += 1
                    if tindex >= len(thresholds):
                        break
                    current_threshold = thresholds[tindex]
                total_pc += item_percentage
                segment_counts[current_threshold] += count

            pareto_dict[category] = segment_cutoff
        return pareto_dict
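
A usage sketch; the loctime string is illustrative and the category names returned depend entirely on the lookup tables in loctime.yaml:

from ceurws.loctime import LoctimeParser

lt_parser = LoctimeParser()  # loads loctime.yaml from the cache directory
result = lt_parser.parse("Amsterdam, The Netherlands, 2019")
print(result)  # category -> matched part, e.g. {"country": "The_Netherlands"}
print(lt_parser.counters["4digit-year"])  # Counter({'2019': 1})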

__init__(filepath=None)

Initializes the LoctimeParser object, setting up paths, loading lookups, and initializing counters and patterns.

Parameters:

filepath (str, optional): The path to the loctime YAML file. Defaults to a predefined path if None is provided. (default: None)

Raises:

FileNotFoundError: Raises an error if the specified YAML file does not exist.

Source code in ceurws/loctime.py
def __init__(self, filepath: str | None = None):
    """
    Initializes the LoctimeParser object, setting up paths, loading lookups,
    and initializing counters and patterns.

    Args:
        filepath (str, optional): The path to the loctime YAML file.
                                  Defaults to a predefined path if None is provided.
    Raises:
        FileNotFoundError: Raises an error if the specified YAML file does not exist.
    """
    if filepath is None:
        self.ceurws_path = CEURWS.CACHE_DIR
        self.filepath: Path = self.ceurws_path.joinpath("loctime.yaml")
    else:
        self.filepath = Path(filepath)  # keep the attribute name consistent with load() and save()
    self.lookups = self.load()
    self.setup()
    self.counters: dict[str, Counter] = {"4digit-year": Counter()}
    for reverse_pos in range(1, 8):
        self.counters[str(reverse_pos)] = Counter()
    for key in self.lookups:
        self.counters[key] = Counter()

    # Compile a pattern to match a 4-digit year
    self.year_pattern = re.compile(r"\b\d{4}\b")
    self.total_loctimes = 0

create_pareto_analysis(level=3, outof=5)

Creates a Pareto analysis for each category in the lookups and returns the percentage table for the distribution across the specified levels.

Parameters:

level (int): The number of segments to divide the data into within the top "outof" parts. (default: 3)
outof (int): the 1-out-of-n split ratio; at level 1 a 1:5 split yields the classic Pareto 80:20 rule, at level 2 the 20% tail is split again into 16:4 (thresholds 80% and 96%), and at level 3 into 3.2:0.8 (thresholds 80%, 96% and 99.2%). (default: 5)
Source code in ceurws/loctime.py
def create_pareto_analysis(self, level: int = 3, outof: int = 5):
    """
    Creates a Pareto analysis for each category in the lookups and returns
    the percentage table for the distribution across the specified levels.

    Args:
        level (int): The number of segments to divide the data into within the top "outof" parts.
        outof (int): the 1-out-of-n split ratio; at level 1 a 1:5 split yields
            the classic Pareto 80:20 rule, at level 2 the 20% tail is split again
            into 16:4 (thresholds 80% and 96%), and at level 3 into 3.2:0.8
            (thresholds 80%, 96% and 99.2%)
    """
    pareto_dict = {}
    for category, counter in self.counters.items():
        # Sort items by count in descending order
        sorted_items = counter.most_common()
        total = sum(counter.values())

        # Calculate segment thresholds based on the diminishing series
        thresholds = []
        threshold = 0.0
        for _ in range(1, level + 1):
            # current range to calculate out of for
            trange = 100 - threshold  # 100/80/96/99.2 ...
            # right side of range
            right_range = trange / outof  # 20/4/0.8 ...
            # left threshold is new threshold
            threshold = 100 - right_range
            thresholds.append(threshold)
        thresholds.append(100)

        segment_counts = {threshold: 0 for threshold in thresholds}  # Initialize count dict for each segment
        segment_cutoff = {threshold: 0 for threshold in thresholds}  # Initialize count dict for each segment
        tindex = 0
        current_threshold = thresholds[tindex]
        total_pc = 0.0
        # Calculate cumulative counts for each segment
        for _, count in sorted_items:
            item_percentage = count / total * 100
            if total_pc + item_percentage > current_threshold + 0.000000000001:
                segment_cutoff[current_threshold] = count
                tindex += 1
                if tindex >= len(thresholds):
                    break
                current_threshold = thresholds[tindex]
            total_pc += item_percentage
            segment_counts[current_threshold] += count

        pareto_dict[category] = segment_cutoff
    return pareto_dict
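
The threshold series can be checked independently of the counters; for the defaults level=3 and outof=5 the loop yields the 80/96/99.2 percent thresholds mentioned above:

threshold = 0.0
thresholds = []
for _ in range(3):  # level = 3
    trange = 100 - threshold  # remaining range: 100, 20, 4 ...
    threshold = 100 - trange / 5  # outof = 5
    thresholds.append(threshold)
thresholds.append(100)
print(thresholds)  # [80.0, 96.0, 99.2, 100]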

get_parts(loctime)

Splits the loctime string into parts and subparts, considering multi-word entries.

Parameters:

loctime (str): The loctime string to split. (required)

Returns:

list: A list of parts and subparts.

Source code in ceurws/loctime.py
def get_parts(self, loctime):
    """
    Splits the loctime string into parts and subparts, considering multi-word entries.

    Args:
        loctime (str): The loctime string to split.

    Returns:
        list: A list of parts and subparts.
    """
    # Replace known multi-word entries with their underscore versions
    for original, underscored in self.multi_word.items():
        loctime = loctime.replace(original, underscored)

    parts = loctime.split(",")  # First, split by comma
    all_parts = []
    for part in parts:
        # Further split each part by whitespace, considering underscore as part of the word
        subparts = part.strip().split()
        all_parts.extend(subparts)  # Add all subparts to the list

    return all_parts
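
For illustration, overriding the multi_word map by hand shows how multi-word entries survive the whitespace split; real entries come from loctime.yaml:

from ceurws.loctime import LoctimeParser

lt_parser = LoctimeParser()
lt_parser.multi_word = {"The Netherlands": "The_Netherlands"}  # hypothetical entry
parts = lt_parser.get_parts("Amsterdam, The Netherlands, 2019")
print(parts)  # ['Amsterdam', 'The_Netherlands', '2019']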

load()

Loads the lookup data from the YAML file specified by the filepath attribute.

This method attempts to open and read the YAML file, converting its contents into a dictionary. If the file is empty or does not exist, it returns an empty dictionary.

Returns:

dict: A dictionary representing the loaded data from the YAML file. If the file is empty or non-existent, an empty dictionary is returned.

Raises:

FileNotFoundError: If the specified file does not exist.
yaml.YAMLError: If there is an error parsing the YAML file.

Source code in ceurws/loctime.py
def load(
    self,
) -> dict:
    """
    Loads the lookup data from the YAML file specified by the filepath attribute.

    This method attempts to open and read the YAML file, converting its contents
    into a dictionary. If the file is empty or does not exist, it returns an empty dictionary.

    Returns:
        dict: A dictionary representing the loaded data from the YAML file. If the file
              is empty or non-existent, an empty dictionary is returned.

    Raises:
        FileNotFoundError: If the specified file does not exist.
        yaml.YAMLError: If there is an error parsing the YAML file.
    """
    data_dict = {}
    if os.path.isfile(self.filepath) and os.path.getsize(self.filepath) > 0:
        with open(self.filepath) as yaml_file:
            data_dict = yaml.safe_load(yaml_file)
    return data_dict

parse(loctime)

Alternative parse of CEUR-WS loctimes using lookups

Parameters:

loctime (str): The loctime string to parse. (required)

Returns:

dict: the lookup categories matched in the given loctime string
Source code in ceurws/loctime.py
def parse(self, loctime: str) -> dict:
    """
    Alternative parse of CEUR-WS loctimes using lookups

    Args:
        loctime (str): The loctime string to parse.

    Returns:
        dict: the lookup categories matched in the given loctime string
    """
    result = {}
    self.total_loctimes += 1
    lt_parts = self.get_parts(loctime)

    # Process each part of loctime
    for index, part in enumerate(lt_parts):
        part = part.strip()
        reverse_pos = len(lt_parts) - index  # Position from end

        found_in_lookup = False
        # Check against each lookup and update corresponding counter
        for (
            lookup_key,
            lookup_dict,
        ) in self.multi_word_lookups.items():
            if part in lookup_dict:
                self.counters[lookup_key][part] += 1  # Increment the lookup counter
                found_in_lookup = True
                # set result dict
                result[lookup_key] = part
                break  # Break if found, assuming part can't be in multiple lookups
        if not found_in_lookup:
            # Update counter for each part's position from end
            key = str(reverse_pos)
            if key in self.counters:
                self.counters[key][part] += 1

        # Special handling for 4-digit years
        if index == len(lt_parts) - 1 and self.year_pattern.match(part):
            self.counters["4digit-year"][part] += 1
    return result

save()

Saves the current lookup dictionary to a YAML file.

Source code in ceurws/loctime.py
def save(self):
    """
    Saves the current lookup dictionary to a YAML file.
    """
    os.makedirs(os.path.dirname(self.filepath), exist_ok=True)  # Ensure directory exists
    with open(self.filepath, "w", encoding="utf-8") as yaml_file:
        yaml.dump(
            self.lookups,
            yaml_file,
            default_flow_style=False,
            allow_unicode=True,
        )
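
For illustration, the save()/load() pair boils down to the following PyYAML round trip; the path and payload are invented:

import os

import yaml

filepath = "/tmp/lookups.yaml"  # illustrative path
lookups = {"country": {"Germany": 42}, "month": {"September": 7}}

# save: ensure the directory exists, then dump with unicode support
os.makedirs(os.path.dirname(filepath), exist_ok=True)
with open(filepath, "w", encoding="utf-8") as yaml_file:
    yaml.dump(lookups, yaml_file, default_flow_style=False, allow_unicode=True)

# load: guard against missing or empty files, as load() does
data_dict = {}
if os.path.isfile(filepath) and os.path.getsize(filepath) > 0:
    with open(filepath) as yaml_file:
        data_dict = yaml.safe_load(yaml_file)
assert data_dict == lookups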

setup()

Prepares the parser by initializing multi-word handling and creating a modified version of the lookup dictionaries with keys as concatenated words. This method sets up the 'multi_word' and 'multi_word_lookups' dictionaries to facilitate the parsing process, especially for multi-word keys.

Source code in ceurws/loctime.py
def setup(self):
    """
    Prepares the parser by initializing multi-word handling and creating
    a modified version of the lookup dictionaries with keys as concatenated words.
    This method sets up the 'multi_word' and 'multi_word_lookups' dictionaries
    to facilitate the parsing process, especially for multi-word keys.
    """
    self.multi_word = {}
    for lookup in self.lookups.values():
        for key in lookup:
            if " " in key:
                self.multi_word[key] = key.replace(" ", "_")

    # Initialize a dictionary derived from self.lookups with underscored keys
    self.multi_word_lookups = {}
    for category, lookup in self.lookups.items():
        self.multi_word_lookups[category] = {key.replace(" ", "_"): value for key, value in lookup.items()}
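
A tiny worked example of the key transformation setup() performs, with made-up lookup content:

lookups = {"country": {"United States": 10, "Germany": 5}}

multi_word = {}
for lookup in lookups.values():
    for key in lookup:
        if " " in key:
            multi_word[key] = key.replace(" ", "_")

multi_word_lookups = {
    category: {key.replace(" ", "_"): value for key, value in lookup.items()}
    for category, lookup in lookups.items()
}
print(multi_word)          # {'United States': 'United_States'}
print(multi_word_lookups)  # {'country': {'United_States': 10, 'Germany': 5}}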

update_lookup_counts()

To be called after processing all loctimes and updating the counters; updates the lookup dicts with the new counts.

Source code in ceurws/loctime.py
def update_lookup_counts(self):
    """
    to be called after processing all loctimes
    and updating counters; updates the lookup dicts with new counts
    """
    for category, counter in self.counters.items():
        if category in self.lookups:
            for underscore_key, count in counter.items():
                # Convert underscore_key back to space-separated key
                original_key = underscore_key.replace("_", " ")
                if original_key in self.lookups[category]:
                    # Update the count for the original key
                    self.lookups[category][original_key] += count
                else:
                    # Initialize count for the original key
                    self.lookups[category][original_key] = count
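
A worked example of the merge with invented counts: underscored counter keys are folded back into their space-separated originals:

from collections import Counter

lookups = {"country": {"United States": 10}}
counters = {"country": Counter({"United_States": 3, "Germany": 2})}

for category, counter in counters.items():
    if category in lookups:
        for underscore_key, count in counter.items():
            original_key = underscore_key.replace("_", " ")
            previous = lookups[category].get(original_key, 0)
            lookups[category][original_key] = previous + count

print(lookups)  # {'country': {'United States': 13, 'Germany': 2}}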

PercentageTable

A class for creating a table that displays values and their corresponding percentages of a total.

Attributes:

total (float): The total value used for calculating percentages.
column_title (str): The title for the first column in the table.
digits (int): The number of decimal places for rounding percentages.
rows (list): A list of dictionaries representing rows in the table.

Source code in ceurws/loctime.py
class PercentageTable:
    """
    A class for creating a table that displays values and their corresponding percentages of a total.

    Attributes:
        total (float): The total value used for calculating percentages.
        column_title (str): The title for the first column in the table.
        digits (int): The number of decimal places for rounding percentages.
        rows (list): A list of dictionaries representing rows in the table.
    """

    def __init__(self, column_title: str, total: float, digits: int):
        """
        Initializes the PercentageTable with a title for the column,
        a total value, and specified precision for percentages.

        Args:
            column_title (str): The title for the first column.
            total (float): The total value for calculating percentages.
            digits (int): The precision for percentage values.
        """
        self.total = total
        self.column_title = column_title
        self.digits = digits
        self.rows = [{self.column_title: "Total", "#": total, "%": 100.0}]

    def add_value(self, row_title: str, value: float):
        """
        Adds a row to the table with the given title and value, calculating the percentage of the total.

        Args:
            row_title (str): The title for the row.
            value (float): The value for the row, which is used to calculate its percentage of the total.
        """
        percentage = round((value / self.total) * 100, self.digits) if self.total else 0
        self.rows.append({self.column_title: row_title, "#": value, "%": percentage})

    def generate_table(self, tablefmt="grid") -> str:
        """
        Generates a string representation of the table using the tabulate library.

        Returns:
            str: The string representation of the table with headers and formatted rows.
        """
        if not self.rows:
            return ""
        tabulate_markup = tabulate(
            self.rows,
            headers="keys",
            tablefmt=tablefmt,
            floatfmt=f".{self.digits}f",
        )
        return tabulate_markup

__init__(column_title, total, digits)

Initializes the PercentageTable with a title for the column, a total value, and specified precision for percentages.

Parameters:

column_title (str, required): The title for the first column.
total (float, required): The total value for calculating percentages.
digits (int, required): The precision for percentage values.
Source code in ceurws/loctime.py
def __init__(self, column_title: str, total: float, digits: int):
    """
    Initializes the PercentageTable with a title for the column,
    a total value, and specified precision for percentages.

    Args:
        column_title (str): The title for the first column.
        total (float): The total value for calculating percentages.
        digits (int): The precision for percentage values.
    """
    self.total = total
    self.column_title = column_title
    self.digits = digits
    self.rows = [{self.column_title: "Total", "#": total, "%": 100.0}]

add_value(row_title, value)

Adds a row to the table with the given title and value, calculating the percentage of the total.

Parameters:

row_title (str, required): The title for the row.
value (float, required): The value for the row, which is used to calculate its percentage of the total.
Source code in ceurws/loctime.py
def add_value(self, row_title: str, value: float):
    """
    Adds a row to the table with the given title and value, calculating the percentage of the total.

    Args:
        row_title (str): The title for the row.
        value (float): The value for the row, which is used to calculate its percentage of the total.
    """
    percentage = round((value / self.total) * 100, self.digits) if self.total else 0
    self.rows.append({self.column_title: row_title, "#": value, "%": percentage})

generate_table(tablefmt='grid')

Generates a string representation of the table using the tabulate library.

Returns:

str: The string representation of the table with headers and formatted rows.

Source code in ceurws/loctime.py
def generate_table(self, tablefmt="grid") -> str:
    """
    Generates a string representation of the table using the tabulate library.

    Returns:
        str: The string representation of the table with headers and formatted rows.
    """
    if not self.rows:
        return ""
    tabulate_markup = tabulate(
        self.rows,
        headers="keys",
        tablefmt=tablefmt,
        floatfmt=f".{self.digits}f",
    )
    return tabulate_markup
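
A usage sketch combining the methods above; the figures are invented:

from ceurws.loctime import PercentageTable

table = PercentageTable(column_title="parse result", total=1000.0, digits=1)
table.add_value("city resolved", 750)
table.add_value("country resolved", 900)
print(table.generate_table(tablefmt="github"))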

models

ceur

Created on 2024-03-17

CEUR Workshop Proceedings (CEUR-WS.org) Metamodel

@author: wf

Paper

Bases: SQLModel

Represents a paper with details such as authors, volume number, and title.

Source code in ceurws/models/ceur.py
class Paper(SQLModel, table=True):  # type: ignore
    """
    Represents a paper with details such as authors, volume number, and title.
    """

    __tablename__ = "papers"
    authors: str | None = Field(default=None, index=False)
    vol_number: int | None = Field(default=None, index=True)
    pdf_name: str | None = Field(default=None, index=False)
    id: str = Field(primary_key=True)
    title: str | None = Field(default=None, index=False)
    pages: str | None = Field(default=None, index=False)
    fail: str | None = Field(default=None, index=False)

Volume

Bases: SQLModel

a single CEUR-WS Volume

Source code in ceurws/models/ceur.py
class Volume(SQLModel, table=True):  # type: ignore
    """
    a single CEUR-WS Volume
    """

    __tablename__ = "volumes"

    fromLine: int | None = Field(default=None)
    toLine: int | None = Field(default=None)
    valid: int | None = Field(default=None)
    url: str | None = Field(default=None)
    acronym: str | None = Field(default=None)
    title: str | None = Field(default=None)
    seealso: str | None = Field(default=None)
    editors: str | None = Field(default=None)
    submittedBy: str | None = Field(default=None)
    published: str | None = Field(default=None)
    pubDate: datetime | None = Field(default=None)
    number: int = Field(primary_key=True)
    archive: str | None = Field(default=None)
    desc: str | None = Field(alias="description", default=None)  # 'desc' is a SQL keyword, so it's aliased
    h1: str | None = Field(default=None)
    h3: str | None = Field(default=None)
    volname: str | None = Field(default=None)
    homepage: str | None = Field(default=None)
    year: str | None = Field(default=None)
    urn: str | None = Field(default=None)
    # vol_number: Optional[int] = Field(default=None)
    loctime: str | None = Field(default=None)
    volume_number: str | None = Field(default=None)
    voltitle: str | None = Field(default=None)
    dateFrom: date | None = Field(default=None)
    dateTo: date | None = Field(default=None)
    city: str | None = Field(default=None)
    cityWikidataId: str | None = Field(default=None)
    country: str | None = Field(default=None)
    countryWikidataId: str | None = Field(default=None)
    urn_check_digit: int | None = Field(default=None)
    urn_ok: int | None = Field(default=None)
    ceurpubdate: str | None = Field(default=None)
    colocated: str | None = Field(default=None)
    virtualEvent: int | None = Field(default=None)
    tdtitle: str | None = Field(default=None)
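
A hedged sketch of persisting these models with SQLModel; the SQLite URL and field values are illustrative, not the project's actual database setup:

from sqlmodel import Session, SQLModel, create_engine, select

from ceurws.models.ceur import Paper, Volume

engine = create_engine("sqlite:///ceurws_example.db")  # illustrative URL
SQLModel.metadata.create_all(engine)

with Session(engine) as session:
    session.add(Volume(number=2436, acronym="SDM 2019", year="2019"))
    session.add(Paper(id="Vol-2436/paper1", vol_number=2436, title="An example paper"))
    session.commit()
    papers = session.exec(select(Paper).where(Paper.vol_number == 2436)).all()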

dblp

Created on 2023

@author: Tim Holzheim

refactored 2024-03-09 by wf

DblpPaper

a paper indexed by dblp.org

Source code in ceurws/models/dblp.py
@lod_storable
class DblpPaper:
    """
    a paper indexed by dblp.org
    """

    dblp_publication_id: str
    dblp_proceeding_id: str
    volume_number: int
    title: str
    authors: list[DblpScholar] | None = field(default_factory=list)
    pdf_id: str | None = None

    def __post_init__(self):
        for i, author in enumerate(self.authors):
            if isinstance(author, dict):
                self.authors[i] = DblpScholar(**author)

DblpProceeding

a proceeding indexed by dblp.org

Source code in ceurws/models/dblp.py
@lod_storable
class DblpProceeding:
    """
    a proceeding indexed by dblp.org
    """

    dblp_publication_id: str
    volume_number: int
    title: str
    dblp_event_id: str | None = None
    papers: list[DblpPaper] | None = field(default_factory=list)
    editors: list[DblpScholar] | None = field(default_factory=list)

    def __post_init__(self):
        if self.editors:
            for i, editor in enumerate(self.editors):
                if isinstance(editor, dict):
                    self.editors[i] = DblpScholar(**editor)
        if self.papers:
            for i, paper in enumerate(self.papers):
                if isinstance(paper, dict):
                    self.papers[i] = DblpPaper(**paper)

DblpScholar

a scholar indexed by dblp.org

example: Tim Berners-Lee https://dblp.org/pid/b/TimBernersLee.html

Source code in ceurws/models/dblp.py
@lod_storable
class DblpScholar:
    """
    a scholar indexed by dblp.org

    example: Tim Berners-Lee
    https://dblp.org/pid/b/TimBernersLee.html

    """

    dblp_author_id: str
    label: str | None = None
    wikidata_id: str | None = None
    orcid_id: str | None = None
    gnd_id: str | None = None
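
For illustration, constructing a DblpPaper from plain dictionaries shows the __post_init__ coercion at work; all field values are invented:

from ceurws.models.dblp import DblpPaper, DblpScholar

paper = DblpPaper(
    dblp_publication_id="conf/example/Doe23",  # invented ids
    dblp_proceeding_id="conf/example/2023",
    volume_number=2436,
    title="An example paper",
    authors=[{"dblp_author_id": "pid/00/0000", "label": "Jane Doe"}],
)
assert isinstance(paper.authors[0], DblpScholar)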

dblp2

Created on 2024-03-16

@author: wf

Authorship

Bases: SQLModel

Represents the relationship between a scholar and a paper, capturing the authorship details.

Source code in ceurws/models/dblp2.py
class Authorship(SQLModel, table=True):  # type: ignore
    """
    Represents the relationship between a scholar and a paper, capturing the authorship details.
    """

    paper: str = Field(foreign_key="paper.paper", primary_key=True)
    dblp_author_id: str = Field(foreign_key="scholar.dblp_author_id", primary_key=True)

Editorship

Bases: SQLModel

Represents the relationship between a scholar and a proceeding, indicating the scholar's role as an editor.

Source code in ceurws/models/dblp2.py
class Editorship(SQLModel, table=True):  # type: ignore
    """
    Represents the relationship between a scholar and a proceeding, indicating the scholar's role as an editor.
    """

    volume_number: int = Field(foreign_key="proceeding.volume_number", primary_key=True)
    dblp_author_id: str = Field(foreign_key="scholar.dblp_author_id", primary_key=True)

Paper

Bases: SQLModel

A paper indexed in DBLP with additional details. The paper URL is used as the unique identifier.

Source code in ceurws/models/dblp2.py
class Paper(SQLModel, table=True):  # type: ignore
    """
    A paper indexed in DBLP with additional details. The paper URL is used as the unique identifier.
    """

    paper: str = Field(primary_key=True)
    proceeding: str | None = Field(foreign_key="proceeding.proceeding")
    volume_number: str = Field(index=True)
    title: str
    pdf_url: str | None = None

Proceeding

Bases: SQLModel

A proceeding indexed in DBLP with additional details.

Source code in ceurws/models/dblp2.py
class Proceeding(SQLModel, table=True):  # type: ignore
    """
    A proceeding indexed in DBLP with additional details.
    """

    proceeding: str = Field(primary_key=True)
    volume_number: int = Field(index=True)
    title: str
    dblp_event_id: str | None = None

Scholar

Bases: SQLModel

Represents a scholar with information fetched from DBLP and possibly other sources.

Source code in ceurws/models/dblp2.py
class Scholar(SQLModel, table=True):  # type: ignore
    """
    Represents a scholar with information fetched from DBLP and possibly other sources.
    """

    dblp_author_id: str = Field(primary_key=True)
    label: str | None = None
    wikidata_id: str | None = None
    orcid_id: str | None = None
    gnd_id: str | None = None
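
A hedged sketch of joining these tables with SQLModel to list the scholars who authored a given paper; the database URL and paper id are invented:

from sqlmodel import Session, create_engine, select

from ceurws.models.dblp2 import Authorship, Scholar

engine = create_engine("sqlite:///dblp_example.db")  # illustrative URL
with Session(engine) as session:
    statement = (
        select(Scholar)
        .join(Authorship, Authorship.dblp_author_id == Scholar.dblp_author_id)
        .where(Authorship.paper == "https://dblp.org/rec/conf/example/Doe23")
    )
    scholars = session.exec(statement).all()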

namedqueries

Created on 2023-03-21

@author: wf

NamedQueries

get named queries

Source code in ceurws/namedqueries.py
class NamedQueries:
    """
    get named queries
    """

    def __init__(self, wikiId: str = "cr"):
        """ """
        self.wikiId = wikiId
        self.wikiClient = WikiClient.ofWikiId(wikiId)
        if self.wikiClient.needsLogin():
            self.wikiClient.login()
        self.smw = SMWClient(self.wikiClient.getSite())
        self.qm: QueryManager | None = None

    def query(self):
        """
        run query
        """
        ask_query = """
        {{#ask: [[Concept:Query]]
|mainlabel=Query
|?Query id = id
|?Query name=name
|?Query title = title
|?Query tryiturl = tryiturl
|?Query wdqsurl = wdqsurl
|?Query sparql=sparql
|?Query relevance = relevance
|?Query task = task
|limit=200
|sort=Query task,Query id
|order=ascending
}}"""
        self.q_records = self.smw.query(ask_query)

    def toQueryManager(self) -> QueryManager:
        """
        convert me to a QueryManager
        """
        self.qm = QueryManager(lang="sparql")
        self.qm.queriesByName = {}
        for q_record in self.q_records.values():
            name = q_record["name"]
            sparql = q_record["sparql"]
            if name and sparql:
                query = Query(name, query=sparql)
                self.qm.queriesByName[name] = query
        return self.qm

    def toYaml(self) -> str:
        if self.qm is None:
            self.query()
            qm = self.toQueryManager()
        else:
            qm = self.qm
        yaml_str = "# named queries\n"
        for query in qm.queriesByName.values():
            yaml_str += f"""'{query.name}':
    sparql: |
"""
            for line in query.query.split("\n"):
                yaml_str += f"      {line}\n"
        return yaml_str

__init__(wikiId='cr')

Source code in ceurws/namedqueries.py
def __init__(self, wikiId: str = "cr"):
    """ """
    self.wikiId = wikiId
    self.wikiClient = WikiClient.ofWikiId(wikiId)
    if self.wikiClient.needsLogin():
        self.wikiClient.login()
    self.smw = SMWClient(self.wikiClient.getSite())
    self.qm: QueryManager | None = None

query()

run query

Source code in ceurws/namedqueries.py
    def query(self):
        """
        run query
        """
        ask_query = """
        {{#ask: [[Concept:Query]]
|mainlabel=Query
|?Query id = id
|?Query name=name
|?Query title = title
|?Query tryiturl = tryiturl
|?Query wdqsurl = wdqsurl
|?Query sparql=sparql
|?Query relevance = relevance
|?Query task = task
|limit=200
|sort=Query task,Query id
|order=ascending
}}"""
        self.q_records = self.smw.query(ask_query)

toQueryManager()

convert me to a QueryManager

Source code in ceurws/namedqueries.py
def toQueryManager(self) -> QueryManager:
    """
    convert me to a QueryManager
    """
    self.qm = QueryManager(lang="sparql")
    self.qm.queriesByName = {}
    for q_record in self.q_records.values():
        name = q_record["name"]
        sparql = q_record["sparql"]
        if name and sparql:
            query = Query(name, query=sparql)
            self.qm.queriesByName[name] = query
    return self.qm
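
A usage sketch, assuming a wiki with id "cr" is configured for the underlying WikiClient:

from ceurws.namedqueries import NamedQueries

nq = NamedQueries(wikiId="cr")
yaml_str = nq.toYaml()  # runs query() and toQueryManager() on demand
print(yaml_str)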

papertocparser

Created on 2023-03-22

@author: wf

PaperTocParser

Bases: Textparser

parser for paper table of contents

Source code in ceurws/papertocparser.py
class PaperTocParser(Textparser):
    """
    parser for paper table of contents
    """

    def __init__(self, number: str, soup: BeautifulSoup, debug: bool = False):
        """
        constructor

        Args:
            number(str): the volume number
            soup(BeautifulSoup): the parser input
            debug(bool): if True print out debug info
        """
        Textparser.__init__(self, debug=debug)
        self.number = number
        self.soup = soup
        self.scrape = WebScrape()
        self.scrapeDescr = [
            ScrapeDescription(key="title", tag="span", attribute="class", value="CEURTITLE"),
            ScrapeDescription(
                key="authors",
                tag="span",
                attribute="class",
                value="CEURAUTHOR",
                multi=True,
            ),
            ScrapeDescription(key="pages", tag="span", attribute="class", value="CEURPAGES"),
            # ScrapeDescription(key='submitted_papers', tag='span', attribute='class', value='CEURSUBMITTEDPAPERS'),
            # ScrapeDescription(key='accepted_papers', tag='span', attribute='class', value='CEURACCEPTEDPAPERS'),
        ]

    def parsePapers(self):
        """
        parse the toc to papers
        """
        paper_records = []
        toc = self.soup.find(attrs={"class": "CEURTOC"})
        if toc:
            paper_ids = []
            for index, paper_li in enumerate(toc.findAll("li")):
                paper_record = self.scrape.parseWithScrapeDescription(paper_li, self.scrapeDescr)
                paper_record["vol_number"] = self.number
                href_node = paper_li.find("a", href=True)
                if href_node:
                    href = href_node.attrs["href"]
                    href = Textparser.sanitize(href)
                    paper_record["pdf_name"] = href
                if "id" in paper_li.attrs:
                    paper_id = paper_li.attrs["id"]
                    if paper_id in paper_ids:
                        paper_id = f"{paper_id}-duplicate-{index}"
                    paper_ids.append(paper_id)
                    key = f"Vol-{self.number}/{paper_id}"
                    paper_record["id"] = key
                paper_records.append(paper_record)
                pass
        else:
            toc = self.soup.find("h2", string=re.compile(".*Contents.*"))
            if toc:
                index = 0
                for paper_li in self.soup.find_all("li", recursive=True):
                    href_node = paper_li.find("a", href=True)
                    if href_node:
                        href = href_node.attrs["href"]
                        href = Textparser.sanitize(href)
                        if ".pdf" in href:
                            title = Textparser.sanitize(href_node.text)
                            index += 1
                            key = f"Vol-{self.number}/paper-{index}"
                            paper_record = {
                                "vol_number": self.number,
                                "title": title,
                                "pdf_name": href,
                                "id": key,
                            }
                            authors = ""
                            # authors are after next br tag
                            br = paper_li.find("br")
                            if not br:
                                paper_record["fail"] = "authors br not found"
                            else:
                                author_part = br.next_sibling
                                if not author_part:
                                    paper_record["fail"] = "authors br not found"
                                else:
                                    authors = author_part.text
                            authors = Textparser.sanitize(authors)
                            author_list = authors.split(",")
                            for i, author in enumerate(author_list):
                                author_list[i] = author.strip()
                            paper_record["authors"] = author_list
                            paper_records.append(paper_record)
            else:
                if self.debug:
                    print(f"no toc for {self.number}")
        return paper_records

__init__(number, soup, debug=False)

constructor

Parameters:

number (str, required): the volume number
soup (BeautifulSoup, required): the parser input
debug (bool, default False): if True print out debug info
Source code in ceurws/papertocparser.py
def __init__(self, number: str, soup: BeautifulSoup, debug: bool = False):
    """
    constructor

    Args:
        number(str): the volume number
        soup(BeautifulSoup): the parser input
        debug(bool): if True print out debug info
    """
    Textparser.__init__(self, debug=debug)
    self.number = number
    self.soup = soup
    self.scrape = WebScrape()
    self.scrapeDescr = [
        ScrapeDescription(key="title", tag="span", attribute="class", value="CEURTITLE"),
        ScrapeDescription(
            key="authors",
            tag="span",
            attribute="class",
            value="CEURAUTHOR",
            multi=True,
        ),
        ScrapeDescription(key="pages", tag="span", attribute="class", value="CEURPAGES"),
        # ScrapeDescription(key='submitted_papers', tag='span', attribute='class', value='CEURSUBMITTEDPAPERS'),
        # ScrapeDescription(key='accepted_papers', tag='span', attribute='class', value='CEURACCEPTEDPAPERS'),
    ]

parsePapers()

parse the toc to papers

Source code in ceurws/papertocparser.py
def parsePapers(self):
    """
    parse the toc to papers
    """
    paper_records = []
    toc = self.soup.find(attrs={"class": "CEURTOC"})
    if toc:
        paper_ids = []
        for index, paper_li in enumerate(toc.findAll("li")):
            paper_record = self.scrape.parseWithScrapeDescription(paper_li, self.scrapeDescr)
            paper_record["vol_number"] = self.number
            href_node = paper_li.find("a", href=True)
            if href_node:
                href = href_node.attrs["href"]
                href = Textparser.sanitize(href)
                paper_record["pdf_name"] = href
            if "id" in paper_li.attrs:
                paper_id = paper_li.attrs["id"]
                if paper_id in paper_ids:
                    paper_id = f"{paper_id}-duplicate-{index}"
                paper_ids.append(paper_id)
                key = f"Vol-{self.number}/{paper_id}"
                paper_record["id"] = key
            paper_records.append(paper_record)
            pass
    else:
        toc = self.soup.find("h2", string=re.compile(".*Contents.*"))
        if toc:
            index = 0
            for paper_li in self.soup.find_all("li", recursive=True):
                href_node = paper_li.find("a", href=True)
                if href_node:
                    href = href_node.attrs["href"]
                    href = Textparser.sanitize(href)
                    if ".pdf" in href:
                        title = Textparser.sanitize(href_node.text)
                        index += 1
                        key = f"Vol-{self.number}/paper-{index}"
                        paper_record = {
                            "vol_number": self.number,
                            "title": title,
                            "pdf_name": href,
                            "id": key,
                        }
                        authors = ""
                        # authors are after next br tag
                        br = paper_li.find("br")
                        if not br:
                            paper_record["fail"] = "authors br not found"
                        else:
                            author_part = br.next_sibling
                            if not author_part:
                                paper_record["fail"] = "authors br not found"
                            else:
                                authors = author_part.text
                        authors = Textparser.sanitize(authors)
                        author_list = authors.split(",")
                        for i, author in enumerate(author_list):
                            author_list[i] = author.strip()
                        paper_record["authors"] = author_list
                        paper_records.append(paper_record)
        else:
            if self.debug:
                print(f"no toc for {self.number}")
    return paper_records
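
A usage sketch against a minimal, invented CEURTOC fragment:

from bs4 import BeautifulSoup

from ceurws.papertocparser import PaperTocParser

html = """<div class="CEURTOC">
  <li id="paper1">
    <a href="paper1.pdf"><span class="CEURTITLE">An Example Paper</span></a>
    <span class="CEURAUTHOR">Jane Doe</span>
    <span class="CEURPAGES">1-10</span>
  </li>
</div>"""
soup = BeautifulSoup(html, "html.parser")
parser = PaperTocParser(number="2436", soup=soup, debug=True)
paper_records = parser.parsePapers()
print(paper_records)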

services

entity_fishing

CeurEntityFishing

EntityFishing component for the spaCy pipeline; a modified version of https://github.com/Lucaterre/spacyfishing/blob/main/spacyfishing/entity_fishing_linker.py

Source code in ceurws/services/entity_fishing.py
@Language.factory(
    name=ENTITY_FISHING_PIPELINE,
    default_config={
        "api_ef_base": f"{ENTITY_FISHING_ENDPOINT}/service",
        "language": "en",
        "extra_info": False,
        "filter_statements": [],
        "verbose": False,
    },
)
class CeurEntityFishing:
    """
    EntityFishing component for the spaCy pipeline.
    Modified version of https://github.com/Lucaterre/spacyfishing/blob/main/spacyfishing/entity_fishing_linker.py
    """

    def __init__(
        self,
        nlp: Language,
        name: str,
        api_ef_base: str,
        language: str,
        extra_info: bool,
        filter_statements: list,
        verbose: bool,
    ):
        """
        `EntityFishing` main class component.

        Note:
            See the default config for the default attribute values.

        Parameters:
            api_ef_base (str): describes the url of the entity-fishing API used.
            language (str): matches the language of the resources to
            be disambiguated (matches the language model for the NER task).
            extra_info (bool): attach extra information to spans such as the
            normalised term, description, and other knowledge base ids.
            filter_statements (list): filter other KB ids
            that rely on QIDs, e.g. ['P214', 'P244'].
            verbose (bool): display logging messages.

        Attributes:
            api_ef_base (str): cf. `api_ef_base` in parameters section.
            language (dict): cf. `language` in parameters section;
            prepares the language argument for the query.
            wikidata_url_base (str): wikidata base url (to concatenate QID identifiers).
            flag_extra (bool): cf. `extra_info` in parameters section.
            filter_statements (list): cf. `filter_statements` in parameters section.
            verbose (bool): cf. `verbose` in parameters section.
        """
        if not api_ef_base.endswith("/"):
            api_ef_base += "/"
        self.api_ef_base = api_ef_base
        self.language = dict(lang=language)
        self.wikidata_url_base = "https://www.wikidata.org/wiki/"

        self.flag_extra = extra_info
        self.filter_statements = filter_statements
        self.verbose = verbose

        # Set doc extensions to attaches raw response from Entity-Fishing API to doc
        Doc.set_extension("annotations", default={}, force=True)
        Doc.set_extension("metadata", default={}, force=True)

        # Set spans extensions to enhance spans with new information
        # come from Wikidata knowledge base.
        # default spans :
        Span.set_extension("kb_qid", default=None, force=True)
        Span.set_extension("wikipedia_page_ref", default=None, force=True)
        Span.set_extension("url_wikidata", default=None, force=True)
        Span.set_extension("nerd_score", default=None, force=True)

        # spans if extra_info set to True
        Span.set_extension("normal_term", default=None, force=True)
        Span.set_extension("description", default=None, force=True)
        Span.set_extension("src_description", default=None, force=True)
        Span.set_extension("other_ids", default=None, force=True)

    @staticmethod
    def generic_client_batch(
        method: str,
        url_batch: list[str],
        verbose: bool,
        params: dict | None = None,
        files_batch: list[dict] | None = None,
    ) -> list[requests.Response]:
        """
        It takes a list of urls and a list of files, and it sends a request to each url with the
        corresponding file

        :param method: str,
        :type method: str
        :param url_batch: a list of urls to send requests to
        :type url_batch: list[str]
        :param verbose: if True, the client will print out the status of each request
        :type verbose: bool
        :param params: dict = None,
        :type params: dict
        :param files_batch: a list of dictionaries, each dictionary containing the file to be annotated
        :type files_batch: list[dict]
        :return: A list of responses.
        """
        if params is None:
            params = {}
        if files_batch is None:
            files_batch = [{} for url in url_batch]

        def load_url(type_url, type_files):
            if method == "POST":
                return requests.post(
                    url=type_url, headers={"Accept": "application/json"}, files=type_files, params=params
                )
            else:
                return requests.get(
                    url=type_url, headers={"Accept": "application/json"}, files=type_files, params=params
                )

        response_batch = []
        resp_err, resp_ok = 0, 0
        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
            future_to_url = {
                executor.submit(load_url, type_url, type_files): (type_url, type_files)
                for type_url, type_files in zip(url_batch, files_batch, strict=False)
            }
            for future in concurrent.futures.as_completed(future_to_url):
                # url = future_to_url[future]
                try:
                    response_batch.append(future.result())
                except Exception:
                    resp_err = resp_err + 1
                else:
                    resp_ok = resp_ok + 1

        def client_log(msg: str) -> None:
            if verbose:
                logging.warning(msg)

        # Manage response status code :
        # cf. https://nerd.readthedocs.io/en/latest/restAPI.html#response-status-codes
        for idx, response in enumerate(response_batch):
            if response.status_code == 400:
                client_log(
                    f"Request {idx}. Wrong request, missing parameters, "
                    "missing header, text too short (<= 5 characters). (400)"
                )
            elif response.status_code == 500:
                client_log(f"Request {idx}. Entity-Fishing API service seems broken. (500)")
            elif response.status_code == 404:
                client_log(f"Request {idx}. Property was not found in request body. (404)")
            elif response.status_code == 406:
                client_log(f"Request {idx}. Language is not supported by Entity-Fishing. (406)")

        return response_batch

    @staticmethod
    def process_response(response: requests.models.Response) -> tuple[dict, dict]:
        """
        It takes a response object from the `requests` library and returns a tuple of two dictionaries.
        The first dictionary is the JSON response from the API, and the second dictionary contains
        metadata about the response

        :param response: The response object returned by the requests library
        :type response: requests.models.Response
        :return: A tuple of two dictionaries.
        """
        try:
            res_json = response.json()
        except json.decoder.JSONDecodeError:
            res_json = {}

        metadata = {
            "status_code": response.status_code,
            "reason": response.reason,
            "ok": response.ok,
            "encoding": response.encoding,
        }

        return res_json, metadata

    @staticmethod
    def prepare_data(text: str, terms: str, entities: list, language: dict, full: bool = False) -> dict:
        """
        > The function takes in a text, a list of entities, a language dictionary and a boolean value.
        It then returns a dictionary with a key called "query" and a value that is a JSON object

        :param text: The text to be analyzed
        :type text: str
        :param terms: the terms to be searched for
        :type terms: str
        :param entities: list of entities in the text
        :type entities: list
        :param language: the language of the text
        :type language: dict
        :param full: if True, the response will contain the full text of the article, defaults to False
        :type full: bool (optional)
        :return: A dictionary with a key of "query" and a value of a json object.
        """
        return {
            "query": json.dumps(
                {
                    "text": text,
                    "shortText": terms,
                    "language": language,
                    "entities": [
                        {
                            "rawName": ent.text,
                            "offsetStart": ent.start_char,
                            "offsetEnd": ent.end_char,
                        }
                        for ent in entities
                    ],
                    "mentions": [],
                    "customisation": "generic",
                    "full": "true" if full else "false",
                },
                ensure_ascii=False,
            )
        }

    def updated_entities(self, doc: Doc, response: list) -> None:
        """
        > The function `updated_entities` takes a `Doc` object and a list of entities as input. It then
        iterates over the list of entities and updates the `Doc` object with the information contained
        in the list of entities

        :param doc: the document to be processed
        :type doc: Doc
        :param response: the response from the NERD API
        :type response: list
        """
        for entity in response:
            with contextlib.suppress(AttributeError):
                span = doc.char_span(start_idx=entity["offsetStart"], end_idx=entity["offsetEnd"])
                with contextlib.suppress(KeyError):
                    span._.kb_qid = str(entity["wikidataId"])
                    span._.url_wikidata = self.wikidata_url_base + span._.kb_qid
                with contextlib.suppress(KeyError):
                    span._.wikipedia_page_ref = str(entity["wikipediaExternalRef"])
                    # if flag_extra : search other info on entity
                    # => attach extra entity info to span
                    if self.flag_extra:
                        self.look_extra_informations_on_entity(span, entity)
                with contextlib.suppress(KeyError):
                    span._.nerd_score = entity["confidence_score"]

    # ~ Entity-fishing call service methods ~:
    def concept_look_up_batch(self, wiki_id_batch: str) -> list[requests.Response]:
        """
        > This function takes a list of wikipedia ids and returns a list of responses from the API

        :param wiki_id_batch: a list of wikipedia ids
        :type wiki_id_batch: str
        :return: A list of requests.Response objects.
        """
        url_concept_lookup_batch = [self.api_ef_base + "kb/concept/" + wiki_id for wiki_id in wiki_id_batch]
        return self.generic_client_batch(
            method="GET", url_batch=url_concept_lookup_batch, params=self.language, verbose=self.verbose
        )

    def disambiguate_text_batch(self, files_batch: list[dict]) -> list[requests.Response]:
        """
        > The function `disambiguate_text_batch` takes a list of dictionaries as input, where each
        dictionary contains the text to be disambiguated and the corresponding language. The function
        returns a list of responses, where each response contains the disambiguated text

        :param files_batch: a list of dictionaries, each dictionary containing the following keys:
        :type files_batch: list[dict]
        :return: A list of responses.
        """
        url_disambiguate = self.api_ef_base + "disambiguate"
        url_disambiguate_batch = [url_disambiguate for file in files_batch]
        return self.generic_client_batch(
            method="POST", url_batch=url_disambiguate_batch, files_batch=files_batch, verbose=self.verbose
        )

    def look_extra_informations_on_entity(self, span: Span, res_desc: dict) -> None:
        """
        It takes a span and a dictionary of information about the entity, and adds the information to
        the span

        :param span: The Span object that the extension is being applied to
        :type span: Span
        :param res_desc: the result of the query to Wikidata
        :type res_desc: dict
        """
        # normalised term name
        with contextlib.suppress(KeyError):
            span._.normal_term = res_desc["preferredTerm"]
        # description and source description (filter by language)
        with contextlib.suppress(KeyError, IndexError):
            span._.description = res_desc["definitions"][0]["definition"]
            span._.src_description = res_desc["definitions"][0]["source"]
        # others identifiers attach to QID
        # in Wikidata KB with filter properties or not
        try:
            ids = []
            for content in res_desc["statements"]:
                new_id = {k: content[k] for k in ["propertyName", "propertyId", "value"]}
                if len(self.filter_statements) != 0:
                    if content["propertyId"] in self.filter_statements:
                        ids.append(new_id)
                else:
                    ids.append(new_id)

            span._.other_ids = ids
        except KeyError:
            pass
        except json.decoder.JSONDecodeError:
            pass

    def main_disambiguation_process_batch(
        self, text_batch: list[str], terms_batch: list[str], entities_batch: list[list]
    ) -> list[tuple[dict, dict, list]]:
        """
        It takes a batch of text, terms and entities, and returns a batch of disambiguated entities

        :param text_batch: a list of strings, each string is a text to be disambiguated
        :type text_batch: list[str]
        :param terms_batch: a list of strings, each string is a list of terms separated by a space
        :type terms_batch: list[str]
        :param entities_batch: a list of lists of entities, where each entity is a dictionary with the
        following keys:
        :type entities_batch: list[list]
        :return: A list of tuples, each tuple containing the response, metadata, and entities_enhanced.
        """
        data_to_post_batch = [
            self.prepare_data(text=text, terms=terms, entities=entities, language=self.language, full=self.flag_extra)
            for text, terms, entities in zip(text_batch, terms_batch, entities_batch, strict=False)
        ]
        reqs = self.disambiguate_text_batch(files_batch=data_to_post_batch)

        response_tuples = []
        for req in reqs:
            res, metadata = self.process_response(response=req)
            try:
                entities_enhanced = res["entities"]
            except KeyError:
                entities_enhanced = []
            response_tuples.append((res, metadata, entities_enhanced))
        return response_tuples

    def process_single_doc_after_call(self, doc: Doc, result_from_ef_text) -> Doc:
        """
        - The function takes a document and a list of entities from the Entity-Fishing service.
        - It then checks if there are any entities in the document that were not disambiguated by the
        Entity-Fishing service.
        - If there are, it passes the text of these entities to the Entity-Fishing service again, but
        this time without the text of the document.
        - It then merges the results of the two calls to the Entity-Fishing service and attaches the
        information from the Entity-Fishing service to the entities in the document

        :param doc: The document to be processed
        :type doc: Doc
        :param result_from_ef_text: a list of three elements:
        :return: A list of dictionaries, each dictionary contains the information of a single entity.
        """
        entities_from_text = result_from_ef_text[2]

        # 1a. Attach raw response (with text method in Entity-Fishing service) to doc
        if len(result_from_ef_text[0]) > 0:
            doc._.annotations["disambiguation_text_service"] = result_from_ef_text[0]

        doc._.metadata["disambiguation_text_service"] = result_from_ef_text[1]

        # 2 .Because some named entities have not been disambiguated,
        # create a list with these unrelated entities ("nil clustering").
        # Pass them back in Entity-fishing without the text but with all
        # the named entities surrounding these entities, to create a context
        # of neighboring terms.
        # nil_clustering = named entities in doc - actual disambiguated entities by EF
        nil_clustering = []
        if len(result_from_ef_text[0]) > 0:
            with contextlib.suppress(KeyError):
                nil_clustering = [
                    doc.char_span(start_idx=ent[1], end_idx=ent[2])
                    for ent in [(ent.text, ent.start_char, ent.end_char) for ent in doc.ents]
                    if ent
                    not in [
                        (ent_ef["rawName"], ent_ef["offsetStart"], ent_ef["offsetEnd"])
                        for ent_ef in result_from_ef_text[0]["entities"]
                    ]
                ]
        entities_from_terms = []
        if len(nil_clustering) != 0:
            # prepare query for Entity-Fishing terms disambiguation
            terms = " ".join([ent.text for ent in doc.ents])
            result_from_ef_terms = self.main_disambiguation_process_batch(
                text_batch=[""], terms_batch=[terms], entities_batch=[nil_clustering]
            )[0]

            entities_from_terms = result_from_ef_terms[2]

            # 2b. Attach raw response (with terms method in Entity-Fishing service) to doc
            if len(result_from_ef_terms[0]) > 0:
                doc._.annotations["disambiguation_terms_service"] = result_from_ef_terms[0]
            doc._.metadata["disambiguation_terms_service"] = result_from_ef_terms[1]

        # 3. Merge two list of entities (first and second pass in EF service)
        # and attach information from Entity-Fishing to spans
        result = (
            entities_from_text
            + [entity_term for entity_term in entities_from_terms if entity_term not in entities_from_text]
            if len(entities_from_terms) > 0
            else entities_from_text
        )

        if len(result) > 0:
            with contextlib.suppress(KeyError):
                self.updated_entities(doc, result)
        return doc

    def __call__(self, doc: Doc) -> Doc:
        """
        > The function takes a spaCy Doc object, and returns a Doc object with the entities
        disambiguated and linked

        :param doc: Doc
        :type doc: Doc
        :return: A Doc object with the entities linked to the corresponding Wikipedia page.
        """
        # 1. Disambiguate and linking named entities in Doc object with Entity-Fishing
        result_from_ef_text = self.main_disambiguation_process_batch(
            text_batch=[doc.text], terms_batch=[""], entities_batch=[doc.ents]
        )[0]
        return self.process_single_doc_after_call(doc, result_from_ef_text)

    def pipe(self, stream: Iterable, batch_size: int = 128) -> Doc:
        """
        For each batch of documents, we disambiguate the named entities in the documents, and then yield
        the results

        :param stream: a generator that yields Doc objects
        :type stream: iterator
        :param batch_size: The number of documents to process at a time, defaults to 128 (optional)
        :type batch_size: int
        """
        for docs in util.minibatch(stream, size=batch_size):
            text_batch = [doc.text for doc in docs]
            entities_batch = [doc.ents for doc in docs]
            terms_batch = ["" for _ in text_batch]

            # 1. Disambiguate and linking named entities in Doc object with Entity-Fishing
            result_from_ef_text_batch = self.main_disambiguation_process_batch(
                text_batch=text_batch, terms_batch=terms_batch, entities_batch=entities_batch
            )

            for doc, result_from_ef_text in zip(docs, result_from_ef_text_batch, strict=False):
                yield self.process_single_doc_after_call(doc, result_from_ef_text)
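
A hedged usage sketch of wiring the component into a spaCy pipeline; it assumes ENTITY_FISHING_PIPELINE is the registered factory name exported by the module and that an entity-fishing endpoint is reachable:

import spacy

# importing the module registers the factory with spaCy
from ceurws.services.entity_fishing import ENTITY_FISHING_PIPELINE

nlp = spacy.load("en_core_web_sm")  # assumes the model is installed
nlp.add_pipe(ENTITY_FISHING_PIPELINE, config={"language": "en", "extra_info": True})

doc = nlp("Tim Berners-Lee invented the World Wide Web.")
for ent in doc.ents:
    print(ent.text, ent._.kb_qid, ent._.url_wikidata)
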
__call__(doc)

The function takes a spaCy Doc object, and returns a Doc object with the entities disambiguated and linked

:param doc: Doc
:type doc: Doc
:return: A Doc object with the entities linked to the corresponding Wikipedia page.

Source code in ceurws/services/entity_fishing.py
def __call__(self, doc: Doc) -> Doc:
    """
    > The function takes a spaCy Doc object, and returns a Doc object with the entities
    disambiguated and linked

    :param doc: Doc
    :type doc: Doc
    :return: A Doc object with the entities linked to the corresponding Wikipedia page.
    """
    # 1. Disambiguate and linking named entities in Doc object with Entity-Fishing
    result_from_ef_text = self.main_disambiguation_process_batch(
        text_batch=[doc.text], terms_batch=[""], entities_batch=[doc.ents]
    )[0]
    return self.process_single_doc_after_call(doc, result_from_ef_text)
__init__(nlp, name, api_ef_base, language, extra_info, filter_statements, verbose)

EntityFishing main class component.

Note

See the default config for the default attribute values.

Parameters:

api_ef_base (str, required): describes the url of the entity-fishing API used.
language (str, required): matches the language of the resources to be disambiguated (matches the language model for the NER task).
extra_info (bool, required): attach extra information to spans such as the normalised term, description, and other knowledge base ids.
filter_statements (list, required): filter other KB ids that rely on QIDs, e.g. ['P214', 'P244'].
verbose (bool, required): display logging messages.

Attributes:

api_ef_base (str): cf. api_ef_base in parameters section.
language (dict): cf. language in parameters section; prepares the language argument for the query.
wikidata_url_base (str): wikidata base url (to concatenate QID identifiers).
flag_extra (bool): cf. extra_info in parameters section.
filter_statements (list): cf. filter_statements in parameters section.
verbose (bool): cf. verbose in parameters section.

Source code in ceurws/services/entity_fishing.py
def __init__(
    self,
    nlp: Language,
    name: str,
    api_ef_base: str,
    language: str,
    extra_info: bool,
    filter_statements: list,
    verbose: bool,
):
    """
    `EntityFishing` main class component.

    Note:
        See the default config for the default attribute values.

    Parameters:
        api_ef_base (str): describes the url of the entity-fishing API used.
        language (str): matches the language of the resources to
        be disambiguated (matches the language model for the NER task).
        extra_info (bool): attach extra information to spans such as the
        normalised term, description, and other knowledge base ids.
        filter_statements (list): filter other KB ids
        that rely on QIDs, e.g. ['P214', 'P244'].
        verbose (bool): display logging messages.

    Attributes:
        api_ef_base (str): cf. `api_ef_base` in parameters section.
        language (dict): cf. `language` in parameters section;
        prepares the language argument for the query.
        wikidata_url_base (str): wikidata base url (to concatenate QID identifiers).
        flag_extra (bool): cf. `extra_info` in parameters section.
        filter_statements (list): cf. `filter_statements` in parameters section.
        verbose (bool): cf. `verbose` in parameters section.
    """
    if not api_ef_base.endswith("/"):
        api_ef_base += "/"
    self.api_ef_base = api_ef_base
    self.language = dict(lang=language)
    self.wikidata_url_base = "https://www.wikidata.org/wiki/"

    self.flag_extra = extra_info
    self.filter_statements = filter_statements
    self.verbose = verbose

    # Set doc extensions to attaches raw response from Entity-Fishing API to doc
    Doc.set_extension("annotations", default={}, force=True)
    Doc.set_extension("metadata", default={}, force=True)

    # Set spans extensions to enhance spans with new information
    # come from Wikidata knowledge base.
    # default spans :
    Span.set_extension("kb_qid", default=None, force=True)
    Span.set_extension("wikipedia_page_ref", default=None, force=True)
    Span.set_extension("url_wikidata", default=None, force=True)
    Span.set_extension("nerd_score", default=None, force=True)

    # spans if extra_info set to True
    Span.set_extension("normal_term", default=None, force=True)
    Span.set_extension("description", default=None, force=True)
    Span.set_extension("src_description", default=None, force=True)
    Span.set_extension("other_ids", default=None, force=True)
concept_look_up_batch(wiki_id_batch)

This function takes a list of wikipedia ids and returns a list of responses from the API

:param wiki_id_batch: a list of wikipedia ids
:type wiki_id_batch: str
:return: A list of requests.Response objects.

Source code in ceurws/services/entity_fishing.py
def concept_look_up_batch(self, wiki_id_batch: str) -> list[requests.Response]:
    """
    > This function takes a list of wikipedia ids and returns a list of responses from the API

    :param wiki_id_batch: a list of wikipedia ids
    :type wiki_id_batch: str
    :return: A list of requests.Response objects.
    """
    url_concept_lookup_batch = [self.api_ef_base + "kb/concept/" + wiki_id for wiki_id in wiki_id_batch]
    return self.generic_client_batch(
        method="GET", url_batch=url_concept_lookup_batch, params=self.language, verbose=self.verbose
    )
disambiguate_text_batch(files_batch)

The function disambiguate_text_batch takes a list of dictionaries as input, where each dictionary contains the text to be disambiguated and the corresponding language. The function returns a list of responses, where each response contains the disambiguated text

:param files_batch: a list of dictionaries, each dictionary containing the following keys:
:type files_batch: list[dict]
:return: A list of responses.

Source code in ceurws/services/entity_fishing.py
def disambiguate_text_batch(self, files_batch: list[dict]) -> list[requests.Response]:
    """
    > The function `disambiguate_text_batch` takes a list of dictionaries as input, where each
    dictionary contains the text to be disambiguated and the corresponding language. The function
    returns a list of responses, where each response contains the disambiguated text

    :param files_batch: a list of dictionaries, each dictionary containing the following keys:
    :type files_batch: list[dict]
    :return: A list of responses.
    """
    url_disambiguate = self.api_ef_base + "disambiguate"
    url_disambiguate_batch = [url_disambiguate for file in files_batch]
    return self.generic_client_batch(
        method="POST", url_batch=url_disambiguate_batch, files_batch=files_batch, verbose=self.verbose
    )
generic_client_batch(method, url_batch, verbose, params=None, files_batch=None) staticmethod

It takes a list of urls and a list of files, and sends a request to each url with the corresponding file, using a pool of worker threads.

:param method: the HTTP method to use ("GET" or "POST")
:type method: str
:param url_batch: a list of urls to send requests to
:type url_batch: list[str]
:param verbose: if True, the client will log the status of each request
:type verbose: bool
:param params: query parameters to pass with each request, defaults to None
:type params: dict
:param files_batch: a list of dictionaries, each dictionary containing the file to be annotated
:type files_batch: list[dict]
:return: A list of responses.

Source code in ceurws/services/entity_fishing.py
@staticmethod
def generic_client_batch(
    method: str,
    url_batch: list[str],
    verbose: bool,
    params: dict | None = None,
    files_batch: list[dict] | None = None,
) -> list[requests.Response]:
    """
    It takes a list of urls and a list of files, and it sends a request to each url with the
    corresponding file

    :param method: str,
    :type method: str
    :param url_batch: a list of urls to send requests to
    :type url_batch: list[str]
    :param verbose: if True, the client will print out the status of each request
    :type verbose: bool
    :param params: dict = None,
    :type params: dict
    :param files_batch: a list of dictionaries, each dictionary containing the file to be annotated
    :type files_batch: list[dict]
    :return: A list of responses.
    """
    if params is None:
        params = {}
    if files_batch is None:
        files_batch = [{} for url in url_batch]

    def load_url(type_url, type_files):
        if method == "POST":
            return requests.post(
                url=type_url, headers={"Accept": "application/json"}, files=type_files, params=params
            )
        else:
            return requests.get(
                url=type_url, headers={"Accept": "application/json"}, files=type_files, params=params
            )

    response_batch = []
    resp_err, resp_ok = 0, 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        future_to_url = {
            executor.submit(load_url, type_url, type_files): (type_url, type_files)
            for type_url, type_files in zip(url_batch, files_batch, strict=False)
        }
        for future in concurrent.futures.as_completed(future_to_url):
            # url = future_to_url[future]
            try:
                response_batch.append(future.result())
            except Exception:
                resp_err = resp_err + 1
            else:
                resp_ok = resp_ok + 1

    def client_log(msg: str) -> None:
        if verbose:
            logging.warning(msg)

    # Manage response status code :
    # cf. https://nerd.readthedocs.io/en/latest/restAPI.html#response-status-codes
    for idx, response in enumerate(response_batch):
        if response.status_code == 400:
            client_log(
                f"Request {idx}. Wrong request, missing parameters, "
                "missing header, text too short (<= 5 characters). (400)"
            )
        elif response.status_code == 500:
            client_log(f"Request {idx}. Entity-Fishing API service seems broken. (500)")
        elif response.status_code == 404:
            client_log(f"Request {idx}. Property was not found in request body. (404)")
        elif response.status_code == 406:
            client_log(f"Request {idx}. Language is not supported by Entity-Fishing. (406)")

    return response_batch
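
Since generic_client_batch is a static method it can also be exercised on its own. A sketch against the public entity-fishing endpoint; the class name EntityFishing and the endpoint URL are assumptions, not confirmed by this page:

urls = [
    "https://cloud.science-miner.com/nerd/service/kb/concept/Q535",
    "https://cloud.science-miner.com/nerd/service/kb/concept/Q90",
]
# EntityFishing stands in for the class defined in this module
responses = EntityFishing.generic_client_batch(
    method="GET", url_batch=urls, verbose=True, params={"lang": "en"}
)
print([r.status_code for r in responses])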
look_extra_informations_on_entity(span, res_desc)

It takes a span and a dictionary of information about the entity, and adds the information to the span

:param span: The Span object that the extension is being applied to
:type span: Span
:param res_desc: the result of the query to Wikidata
:type res_desc: dict

Source code in ceurws/services/entity_fishing.py
def look_extra_informations_on_entity(self, span: Span, res_desc: dict) -> None:
    """
    It takes a span and a dictionary of information about the entity, and adds the information to
    the span

    :param span: The Span object that the extension is being applied to
    :type span: Span
    :param res_desc: the result of the query to Wikidata
    :type res_desc: dict
    """
    # normalised term name
    with contextlib.suppress(KeyError):
        span._.normal_term = res_desc["preferredTerm"]
    # description and source description (filter by language)
    with contextlib.suppress(KeyError, IndexError):
        span._.description = res_desc["definitions"][0]["definition"]
        span._.src_description = res_desc["definitions"][0]["source"]
    # other identifiers attached to the QID in the Wikidata KB,
    # optionally filtered by the configured properties
    try:
        ids = []
        for content in res_desc["statements"]:
            new_id = {k: content[k] for k in ["propertyName", "propertyId", "value"]}
            if len(self.filter_statements) != 0:
                if content["propertyId"] in self.filter_statements:
                    ids.append(new_id)
            else:
                ids.append(new_id)

        span._.other_ids = ids
    except KeyError:
        pass
    except json.decoder.JSONDecodeError:
        pass
main_disambiguation_process_batch(text_batch, terms_batch, entities_batch)

It takes a batch of text, terms and entities, and returns a batch of disambiguated entities

:param text_batch: a list of strings, each string is a text to be disambiguated
:type text_batch: list[str]
:param terms_batch: a list of strings, each string is a list of terms separated by a space
:type terms_batch: list[str]
:param entities_batch: a list of lists of entities (e.g. spaCy spans), each exposing text, start_char and end_char
:type entities_batch: list[list]
:return: A list of tuples, each tuple containing the response, metadata, and entities_enhanced.

Source code in ceurws/services/entity_fishing.py
def main_disambiguation_process_batch(
    self, text_batch: list[str], terms_batch: list[str], entities_batch: list[list]
) -> list[tuple[dict, dict, list]]:
    """
    It takes a batch of text, terms and entities, and returns a batch of disambiguated entities

    :param text_batch: a list of strings, each string is a text to be disambiguated
    :type text_batch: list[str]
    :param terms_batch: a list of strings, each string is a list of terms separated by a space
    :type terms_batch: list[str]
    :param entities_batch: a list of lists of entities (e.g. spaCy spans), each
        exposing text, start_char and end_char
    :type entities_batch: list[list]
    :return: A list of tuples, each tuple containing the response, metadata, and entities_enhanced.
    """
    data_to_post_batch = [
        self.prepare_data(text=text, terms=terms, entities=entities, language=self.language, full=self.flag_extra)
        for text, terms, entities in zip(text_batch, terms_batch, entities_batch, strict=False)
    ]
    reqs = self.disambiguate_text_batch(files_batch=data_to_post_batch)

    response_tuples = []
    for req in reqs:
        res, metadata = self.process_response(response=req)
        try:
            entities_enhanced = res["entities"]
        except KeyError:
            entities_enhanced = []
        response_tuples.append((res, metadata, entities_enhanced))
    return response_tuples
pipe(stream, batch_size=128)

For each batch of documents, we disambiguate the named entities in the documents, and then yield the results

:param stream: a generator that yields Doc objects
:type stream: iterator
:param batch_size: The number of documents to process at a time, defaults to 128 (optional)
:type batch_size: int

Source code in ceurws/services/entity_fishing.py
def pipe(self, stream: Iterable, batch_size: int = 128) -> Doc:
    """
    For each batch of documents, we disambiguate the named entities in the documents, and then yield
    the results

    :param stream: a generator that yields Doc objects
    :type stream: iterator
    :param batch_size: The number of documents to process at a time, defaults to 128 (optional)
    :type batch_size: int
    """
    for docs in util.minibatch(stream, size=batch_size):
        text_batch = [doc.text for doc in docs]
        entities_batch = [doc.ents for doc in docs]
        terms_batch = ["" for _ in text_batch]

        # 1. Disambiguate and linking named entities in Doc object with Entity-Fishing
        result_from_ef_text_batch = self.main_disambiguation_process_batch(
            text_batch=text_batch, terms_batch=terms_batch, entities_batch=entities_batch
        )

        for doc, result_from_ef_text in zip(docs, result_from_ef_text_batch, strict=False):
            yield self.process_single_doc_after_call(doc, result_from_ef_text)
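
Inside a spaCy pipeline `pipe` is invoked automatically, but it can also be driven directly. A sketch, assuming `linker` is an instance of this component and `nlp` an upstream pipeline that fills `doc.ents`:

texts = [
    "Victor Hugo was born in Besançon.",
    "CEUR-WS publishes workshop proceedings.",
]
docs = nlp.pipe(texts)  # base docs whose .ents come from an upstream NER
for doc in linker.pipe(docs, batch_size=64):
    # metadata attached in process_single_doc_after_call
    print(doc._.metadata.get("disambiguation_text_service"))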
prepare_data(text, terms, entities, language, full=False) staticmethod

The function takes a text, a terms string, a list of entities, a language dictionary and a boolean flag. It returns a dictionary with a single key "query" whose value is a JSON string.

:param text: The text to be analyzed
:type text: str
:param terms: the terms to be searched for
:type terms: str
:param entities: list of entities in the text
:type entities: list
:param language: the language of the text
:type language: dict
:param full: if True, request the full description of each entity in the response, defaults to False
:type full: bool (optional)
:return: A dictionary with a key of "query" and a value that is a JSON string.

Source code in ceurws/services/entity_fishing.py
@staticmethod
def prepare_data(text: str, terms: str, entities: list, language: dict, full: bool = False) -> dict:
    """
    The function takes a text, a terms string, a list of entities, a language
    dictionary and a boolean flag. It returns a dictionary with a single key
    "query" whose value is a JSON string.

    :param text: The text to be analyzed
    :type text: str
    :param terms: the terms to be searched for
    :type terms: str
    :param entities: list of entities in the text
    :type entities: list
    :param language: the language of the text
    :type language: dict
    :param full: if True, request the full description of each entity in the
        response, defaults to False
    :type full: bool (optional)
    :return: A dictionary with a key of "query" and a value that is a JSON string.
    """
    return {
        "query": json.dumps(
            {
                "text": text,
                "shortText": terms,
                "language": language,
                "entities": [
                    {
                        "rawName": ent.text,
                        "offsetStart": ent.start_char,
                        "offsetEnd": ent.end_char,
                    }
                    for ent in entities
                ],
                "mentions": [],
                "customisation": "generic",
                "full": "true" if full else "false",
            },
            ensure_ascii=False,
        )
    }
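
Because prepare_data only reads .text, .start_char and .end_char from each entity, its output can be inspected without a full spaCy pipeline. The stand-in entity type below is purely illustrative, as is the class name EntityFishing:

from collections import namedtuple

# stand-in for a spaCy Span: only the attributes prepare_data reads
FakeEnt = namedtuple("FakeEnt", ["text", "start_char", "end_char"])

payload = EntityFishing.prepare_data(  # class name is an assumption
    text="Victor Hugo was born in Besançon.",
    terms="",
    entities=[FakeEnt("Victor Hugo", 0, 11)],
    language={"lang": "en"},
    full=False,
)
# a JSON string with the keys text, shortText, language, entities,
# mentions, customisation and full
print(payload["query"])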
process_response(response) staticmethod

It takes a response object from the requests library and returns a tuple of two dictionaries. The first dictionary is the JSON response from the API, and the second dictionary contains metadata about the response

:param response: The response object returned by the requests library
:type response: requests.models.Response
:return: A tuple of two dictionaries.

Source code in ceurws/services/entity_fishing.py
@staticmethod
def process_response(response: requests.models.Response) -> tuple[dict, dict]:
    """
    It takes a response object from the `requests` library and returns a tuple of two dictionaries.
    The first dictionary is the JSON response from the API, and the second dictionary contains
    metadata about the response

    :param response: The response object returned by the requests library
    :type response: requests.models.Response
    :return: A tuple of two dictionaries.
    """
    try:
        res_json = response.json()
    except json.decoder.JSONDecodeError:
        res_json = {}

    metadata = {
        "status_code": response.status_code,
        "reason": response.reason,
        "ok": response.ok,
        "encoding": response.encoding,
    }

    return res_json, metadata
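
A hedged round trip through process_response; the URL is a placeholder and the class name EntityFishing is assumed:

import requests

resp = requests.get("https://example.org/api")  # placeholder endpoint
res_json, metadata = EntityFishing.process_response(resp)
print(metadata["status_code"], metadata["ok"], bool(res_json))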
process_single_doc_after_call(doc, result_from_ef_text)
  • The function takes a document and a list of entities from the Entity-Fishing service.
  • It then checks if there are any entities in the document that were not disambiguated by the Entity-Fishing service.
  • If there are, it passes the text of these entities to the Entity-Fishing service again, but this time without the text of the document.
  • It then merges the results of the two calls to the Entity-Fishing service and attaches the information from the Entity-Fishing service to the entities in the document

:param doc: The document to be processed
:type doc: Doc
:param result_from_ef_text: a tuple of three elements: the raw response, the response metadata, and the list of disambiguated entities
:return: the processed Doc with entity information attached to its spans

Source code in ceurws/services/entity_fishing.py
def process_single_doc_after_call(self, doc: Doc, result_from_ef_text) -> Doc:
    """
    - The function takes a document and a list of entities from the Entity-Fishing service.
    - It then checks if there are any entities in the document that were not disambiguated by the
    Entity-Fishing service.
    - If there are, it passes the text of these entities to the Entity-Fishing service again, but
    this time without the text of the document.
    - It then merges the results of the two calls to the Entity-Fishing service and attaches the
    information from the Entity-Fishing service to the entities in the document

    :param doc: The document to be processed
    :type doc: Doc
    :param result_from_ef_text: a tuple of three elements: the raw response,
        the response metadata, and the list of disambiguated entities
    :return: the processed Doc with entity information attached to its spans
    """
    entities_from_text = result_from_ef_text[2]

    # 1a. Attach raw response (with text method in Entity-Fishing service) to doc
    if len(result_from_ef_text[0]) > 0:
        doc._.annotations["disambiguation_text_service"] = result_from_ef_text[0]

    doc._.metadata["disambiguation_text_service"] = result_from_ef_text[1]

    # 2. Because some named entities have not been disambiguated,
    # create a list with these unrelated entities ("nil clustering").
    # Pass them back in Entity-fishing without the text but with all
    # the named entities surrounding these entities, to create a context
    # of neighboring terms.
    # nil_clustering = named entities in doc - actual disambiguated entities by EF
    nil_clustering = []
    if len(result_from_ef_text[0]) > 0:
        with contextlib.suppress(KeyError):
            nil_clustering = [
                doc.char_span(start_idx=ent[1], end_idx=ent[2])
                for ent in [(ent.text, ent.start_char, ent.end_char) for ent in doc.ents]
                if ent
                not in [
                    (ent_ef["rawName"], ent_ef["offsetStart"], ent_ef["offsetEnd"])
                    for ent_ef in result_from_ef_text[0]["entities"]
                ]
            ]
    entities_from_terms = []
    if len(nil_clustering) != 0:
        # prepare query for Entity-Fishing terms disambiguation
        terms = " ".join([ent.text for ent in doc.ents])
        result_from_ef_terms = self.main_disambiguation_process_batch(
            text_batch=[""], terms_batch=[terms], entities_batch=[nil_clustering]
        )[0]

        entities_from_terms = result_from_ef_terms[2]

        # 2b. Attach raw response (with terms method in Entity-Fishing service) to doc
        if len(result_from_ef_terms[0]) > 0:
            doc._.annotations["disambiguation_terms_service"] = result_from_ef_terms[0]
        doc._.metadata["disambiguation_terms_service"] = result_from_ef_terms[1]

    # 3. Merge two list of entities (first and second pass in EF service)
    # and attach information from Entity-Fishing to spans
    result = (
        entities_from_text
        + [entity_term for entity_term in entities_from_terms if entity_term not in entities_from_text]
        if len(entities_from_terms) > 0
        else entities_from_text
    )

    if len(result) > 0:
        with contextlib.suppress(KeyError):
            self.updated_entities(doc, result)
    return doc
updated_entities(doc, response)

The function updated_entities takes a Doc object and a list of entities as input. It iterates over the entities and attaches their information to the corresponding spans of the Doc.

:param doc: the document to be processed
:type doc: Doc
:param response: the list of entities from the NERD API response
:type response: list

Source code in ceurws/services/entity_fishing.py
def updated_entities(self, doc: Doc, response: list) -> None:
    """
    The function `updated_entities` takes a `Doc` object and a list of entities
    as input. It iterates over the entities and attaches their information to
    the corresponding spans of the `Doc`.

    :param doc: the document to be processed
    :type doc: Doc
    :param response: the response from the NERD API
    :type response: list
    """
    for entity in response:
        with contextlib.suppress(AttributeError):
            span = doc.char_span(start_idx=entity["offsetStart"], end_idx=entity["offsetEnd"])
            with contextlib.suppress(KeyError):
                span._.kb_qid = str(entity["wikidataId"])
                span._.url_wikidata = self.wikidata_url_base + span._.kb_qid
            with contextlib.suppress(KeyError):
                span._.wikipedia_page_ref = str(entity["wikipediaExternalRef"])
                # if flag_extra : search other info on entity
                # => attach extra entity info to span
                if self.flag_extra:
                    self.look_extra_informations_on_entity(span, entity)
            with contextlib.suppress(KeyError):
                span._.nerd_score = entity["confidence_score"]

opentapioca

@author: https://github.com/UB-Mannheim/spacyopentapioca/blob/main/spacyopentapioca/entity_linker.py

EntityLinker

Sends raw data to the OpenTapioca API. Attaches entities to the document. Based on: https://github.com/UB-Mannheim/spacyopentapioca/blob/main/spacyopentapioca/entity_linker.py

Source code in ceurws/services/opentapioca.py
@Language.factory(OPENTAPIOCA_PIPELINE, default_config={"url": f"{OPENTAPIOCA_ENDPOINT}/api/annotate"})
class EntityLinker:
    """
    Sends raw data to the OpenTapioca API. Attaches entities to the document.
    Based on: https://github.com/UB-Mannheim/spacyopentapioca/blob/main/spacyopentapioca/entity_linker.py
    """

    def __init__(self, nlp, name, url):
        """Passes url. Registers OpenTapioca extensions for Doc and Span."""
        self.url = url
        Doc.set_extension("annotations", default=None, force=True)
        Doc.set_extension("metadata", default=None, force=True)
        Span.set_extension("annotations", default=None, force=True)
        Span.set_extension("description", default=None, force=True)
        Span.set_extension("aliases", default=None, force=True)
        Span.set_extension("rank", default=None, force=True)
        Span.set_extension("score", default=None, force=True)
        Span.set_extension("types", default=None, force=True)
        Span.set_extension("label", default=None, force=True)
        Span.set_extension("extra_aliases", default=None, force=True)
        Span.set_extension("nb_sitelinks", default=None, force=True)
        Span.set_extension("nb_statements", default=None, force=True)

    def process_single_doc_after_call(self, doc: Doc, r) -> Doc:
        r.raise_for_status()
        data = r.json()

        # Attaches raw data to doc
        doc._.annotations = data.get("annotations")
        doc._.metadata = {"status_code": r.status_code, "reason": r.reason, "ok": r.ok, "encoding": r.encoding}

        # Attaches indexes, label and QID to spans
        # Processes annotations: if 'best_qid'==None, then no annotation
        ents = []
        for ent in data.get("annotations"):
            start, end = ent["start"], ent["end"]
            if ent.get("best_qid"):
                ent_kb_id = ent["best_qid"]
                try:  # to identify the type of entities
                    t = ent["tags"][0]["types"]
                    types = {
                        "PERSON": t["Q5"] + t["P496"],
                        "ORG": t["Q43229"] + t["P2427"],
                        "LOC": t["Q618123"] + t["P1566"],
                    }
                    m = max(types.values())
                    etype = "".join([k for k, v in types.items() if v == m])
                except Exception as e:
                    log.error(e, extra=ent)
                    etype = ""
                span = doc.char_span(start, end, etype, ent_kb_id)
            else:
                etype, ent_kb_id = "", ""
                span = doc.char_span(start, end, etype)
            if not span:
                span = doc.char_span(start, end, etype, ent_kb_id, alignment_mode="expand")
                log.warning(
                    'The OpenTapioca-entity "%s" %s does not fit the span "%s" %s in spaCy. EXPANDED!',
                    ent["tags"][0]["label"][0],
                    (start, end),
                    span.text,
                    (span.start_char, span.end_char),
                )
            span._.annotations = ent
            span._.description = ent["tags"][0]["desc"]
            span._.aliases = ent["tags"][0]["aliases"]
            span._.rank = ent["tags"][0]["rank"]
            span._.score = ent["tags"][0]["score"]
            span._.types = ent["tags"][0]["types"]
            span._.label = ent["tags"][0]["label"]
            span._.extra_aliases = ent["tags"][0]["extra_aliases"]
            span._.nb_sitelinks = ent["tags"][0]["nb_sitelinks"]
            span._.nb_statements = ent["tags"][0]["nb_statements"]
            ents.append(span)

        # Attach processed entities to doc.ents
        try:
            # this works with non-overlapping spans
            doc.ents = list(doc.ents) + ents
        except Exception:
            # filter the overlapping spans, keep the (first) longest one
            doc.ents = spacy.util.filter_spans(ents)
        # Attach all entities found by OpenTapioca to spans
        doc.spans["all_entities_opentapioca"] = ents
        return doc

    def make_request(self, doc: Doc):
        return requests.post(url=self.url, data={"query": doc.text}, headers={"User-Agent": "spaCyOpenTapioca"})

    def __call__(self, doc):
        """Requests the OpenTapioca API. Attaches entities to spans and doc."""

        # Post request to the OpenTapioca API
        r = self.make_request(doc)

        return self.process_single_doc_after_call(doc, r)

    def pipe(self, stream, batch_size=128):
        """
        It takes a stream of documents, and for each batch of documents, it makes a request to the API
        for each document in the batch, and then yields the processed results of each document

        :param stream: the stream of documents to be processed
        :param batch_size: The number of documents to send to the API in a single request, defaults to
        128 (optional)
        """
        for docs in util.minibatch(stream, size=batch_size):
            with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
                future_to_url = {executor.submit(self.make_request, doc): doc for doc in docs}
                for future in concurrent.futures.as_completed(future_to_url):
                    doc = future_to_url[future]
                    yield self.process_single_doc_after_call(doc, future.result())
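
A usage sketch: the class registers itself as a spaCy factory, so it can be added to a pipeline by name. The string "opentapioca" is an assumption based on the upstream spacyopentapioca package; the actual name is whatever OPENTAPIOCA_PIPELINE evaluates to:

import spacy

# assumes importing ceurws.services.opentapioca registered the factory
nlp = spacy.blank("en")
nlp.add_pipe("opentapioca")  # factory name assumed

doc = nlp("Christian Drosten works at the Charité in Berlin.")
for ent in doc.ents:
    # kb_id_ plus the Span extensions registered in __init__
    print(ent.text, ent.kb_id_, ent.label_, ent._.score)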
__call__(doc)

Requests the OpenTapioca API. Attaches entities to spans and doc.

Source code in ceurws/services/opentapioca.py
def __call__(self, doc):
    """Requests the OpenTapioca API. Attaches entities to spans and doc."""

    # Post request to the OpenTapioca API
    r = self.make_request(doc)

    return self.process_single_doc_after_call(doc, r)
__init__(nlp, name, url)

Passes url. Registers OpenTapioca extensions for Doc and Span.

Source code in ceurws/services/opentapioca.py
def __init__(self, nlp, name, url):
    """Passes url. Registers OpenTapioca extensions for Doc and Span."""
    self.url = url
    Doc.set_extension("annotations", default=None, force=True)
    Doc.set_extension("metadata", default=None, force=True)
    Span.set_extension("annotations", default=None, force=True)
    Span.set_extension("description", default=None, force=True)
    Span.set_extension("aliases", default=None, force=True)
    Span.set_extension("rank", default=None, force=True)
    Span.set_extension("score", default=None, force=True)
    Span.set_extension("types", default=None, force=True)
    Span.set_extension("label", default=None, force=True)
    Span.set_extension("extra_aliases", default=None, force=True)
    Span.set_extension("nb_sitelinks", default=None, force=True)
    Span.set_extension("nb_statements", default=None, force=True)
pipe(stream, batch_size=128)

It takes a stream of documents, and for each batch of documents, it makes a request to the API for each document in the batch, and then yields the processed results of each document

:param stream: the stream of documents to be processed
:param batch_size: The number of documents to send to the API in a single request, defaults to 128 (optional)

Source code in ceurws/services/opentapioca.py
def pipe(self, stream, batch_size=128):
    """
    It takes a stream of documents, and for each batch of documents, it makes a request to the API
    for each document in the batch, and then yields the processed results of each document

    :param stream: the stream of documents to be processed
    :param batch_size: The number of documents to send to the API in a single request, defaults to
    128 (optional)
    """
    for docs in util.minibatch(stream, size=batch_size):
        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
            future_to_url = {executor.submit(self.make_request, doc): doc for doc in docs}
            for future in concurrent.futures.as_completed(future_to_url):
                doc = future_to_url[future]
                yield self.process_single_doc_after_call(doc, future.result())

sql_cache

Created on 2024-03-16

@author: wf

Cached

Manage cached entities.

Source code in ceurws/sql_cache.py
class Cached:
    """
    Manage cached entities.
    """

    def __init__(
        self, clazz: type[Any], sparql: SPARQL, sql_db: SqlDB, query_name: str, max_errors: int = 0, debug: bool = False
    ):
        """
        Initializes the Cached manager with a class reference, a SPARQL endpoint,
        an SQL database, a query name, an error tolerance and an optional debug flag.
        Args:
            clazz (type[Any]): The class reference for the type of objects managed by this manager.
            sparql (SPARQL): a SPARQL endpoint.
            sql_db (SqlDB): SQL database object
            query_name (str): The name of the query to be executed.
            max_errors (int, optional): Maximum allowed validation errors. Defaults to 0.
            debug (bool, optional): Flag to enable debug mode. Defaults to False.
        """
        self.clazz = clazz
        self.sparql = sparql
        self.sql_db = sql_db
        self.query_name = query_name
        self.max_errors = max_errors
        self.debug = debug
        self.entities: list[object] = []
        self.errors: list[Exception] = []
        # Ensure the table for the class exists
        clazz.metadata.create_all(self.sql_db.engine)

    def fetch_or_query(self, qm, force_query=False):
        """
        Fetches data from the local cache if available.
        If the data is not in the cache or if force_query is True,
        it queries via SPARQL and caches the results.

        Args:
            qm (QueryManager): The query manager object used for making SPARQL queries.
            force_query (bool, optional): A flag to force querying via SPARQL even
                if the data exists in the local cache. Defaults to False.
        """
        if not force_query and self.check_local_cache():
            self.fetch_from_local()
        else:
            self.get_lod(qm)
            self.store()

    def check_local_cache(self) -> bool:
        """
        Checks if there is data in the local cache (SQL database).

        Returns:
            bool: True if there is at least one record in the local SQL cache table
        """
        with self.sql_db.get_session() as session:
            result = session.exec(select(self.clazz)).first()
            return result is not None

    def fetch_from_local(self):
        """
        Fetches data from the local SQL database.
        """
        profiler = Profiler(f"fetch {self.query_name} from local", profile=self.debug)
        with self.sql_db.get_session() as session:
            self.entities = session.exec(select(self.clazz)).all()
            self.lod = [entity.dict() for entity in self.entities]
            if self.debug:
                print(f"Loaded {len(self.entities)} records from local cache")
        profiler.time()

    def get_lod(self, qm: QueryManager) -> list[dict]:
        """
        Fetches data using the SPARQL query specified by my query_name.

        Args:
            qm (QueryManager): The query manager object used for making SPARQL queries.
        Returns:
            list[dict]: A list of dictionaries representing the data fetched.
        """
        profiler = Profiler(f"fetch {self.query_name} from SPARQL endpoint {self.sparql.url}", profile=self.debug)
        query = qm.queriesByName[self.query_name]
        self.lod = self.sparql.queryAsListOfDicts(query.query)
        profiler.time()
        if self.debug:
            print(f"Found {len(self.lod)} records for {self.query_name}")
        return self.lod

    def to_entities(self, max_errors: int | None = None) -> list[Any]:
        """
        Converts records fetched from the LOD into entity instances, applying validation.
        Args:
            max_errors (int, optional): Maximum allowed validation errors. Defaults to the instance's max_errors.
        Returns:
            list[Any]: A list of entity instances that have passed validation.
        """
        self.entities = []
        self.errors = []
        error_records = []
        if max_errors is None:
            max_errors = self.max_errors
        for record in self.lod:
            try:
                entity = self.clazz.model_validate(record)
                self.entities.append(entity)
            except Exception as e:
                self.errors.append(e)
                error_records.append(record)
        error_count = len(self.errors)
        if error_count > max_errors:
            msg = f"found {error_count} errors > maximum allowed {max_errors} errors"
            if self.debug:
                print(msg)
                for i, error in enumerate(self.errors):
                    print(f"{i}:{error} for \n{error_records[i]}")
            raise Exception(msg)
        return self.entities

    def store(self, max_errors: int | None = None) -> list[Any]:
        """
        Stores the fetched data into the local SQL database.

        Args:
            max_errors (int, optional): Maximum allowed validation errors. Defaults to the instance's max_errors.
        Returns:
            list[Any]: A list of entity instances that were stored in the database.

        """
        profiler = Profiler(f"store {self.query_name}", profile=self.debug)
        self.to_entities(max_errors=max_errors)
        with self.sql_db.get_session() as session:
            session.add_all(self.entities)
            session.commit()
            if self.debug:
                print(f"Stored {len(self.entities)} records in local cache")
        profiler.time()
        return self.entities
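
A usage sketch under stated assumptions: Volume is a hypothetical SQLModel entity, "Volumes" must be a query name known to the QueryManager, and the QueryManager constructor arguments shown are assumptions about the pyLodStorage API:

from lodstorage.query import QueryManager
from lodstorage.sparql import SPARQL
from sqlmodel import Field, SQLModel

class Volume(SQLModel, table=True):  # hypothetical entity class
    number: int | None = Field(default=None, primary_key=True)
    title: str | None = None

sparql = SPARQL("https://query.wikidata.org/sparql")
sql_db = SqlDB("/tmp/ceurws_cache.db")
qm = QueryManager(lang="sparql", queriesPath="queries.yaml")  # illustrative

cache = Cached(Volume, sparql, sql_db, query_name="Volumes", max_errors=5, debug=True)
cache.fetch_or_query(qm)  # SPARQL on the first run, local SQL cache afterwards
print(f"{len(cache.entities)} entities available")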

__init__(clazz, sparql, sql_db, query_name, max_errors=0, debug=False)

Initializes the Cached manager with a class reference, a SPARQL endpoint, an SQL database, a query name, an error tolerance and an optional debug flag.

Parameters:

- clazz (type[Any], required): The class reference for the type of objects managed by this manager.
- sparql (SPARQL, required): a SPARQL endpoint.
- sql_db (SqlDB, required): SQL database object.
- query_name (str, required): The name of the query to be executed.
- max_errors (int, default 0): Maximum allowed validation errors.
- debug (bool, default False): Flag to enable debug mode.

Source code in ceurws/sql_cache.py
def __init__(
    self, clazz: type[Any], sparql: SPARQL, sql_db: SqlDB, query_name: str, max_errors: int = 0, debug: bool = False
):
    """
    Initializes the Cached manager with a class reference, a SPARQL endpoint,
    an SQL database, a query name, an error tolerance and an optional debug flag.
    Args:
        clazz (type[Any]): The class reference for the type of objects managed by this manager.
        sparql (SPARQL): a SPARQL endpoint.
        sql_db (SqlDB): SQL database object
        query_name (str): The name of the query to be executed.
        max_errors (int, optional): Maximum allowed validation errors. Defaults to 0.
        debug (bool, optional): Flag to enable debug mode. Defaults to False.
    """
    self.clazz = clazz
    self.sparql = sparql
    self.sql_db = sql_db
    self.query_name = query_name
    self.max_errors = max_errors
    self.debug = debug
    self.entities: list[object] = []
    self.errors: list[Exception] = []
    # Ensure the table for the class exists
    clazz.metadata.create_all(self.sql_db.engine)

check_local_cache()

Checks if there is data in the local cache (SQL database).

Returns:

- bool: True if there is at least one record in the local SQL cache table

Source code in ceurws/sql_cache.py
def check_local_cache(self) -> bool:
    """
    Checks if there is data in the local cache (SQL database).

    Returns:
        bool: True if there is at least one record in the local SQL cache table
    """
    with self.sql_db.get_session() as session:
        result = session.exec(select(self.clazz)).first()
        return result is not None

fetch_from_local()

Fetches data from the local SQL database.

Source code in ceurws/sql_cache.py
def fetch_from_local(self):
    """
    Fetches data from the local SQL database.
    """
    profiler = Profiler(f"fetch {self.query_name} from local", profile=self.debug)
    with self.sql_db.get_session() as session:
        self.entities = session.exec(select(self.clazz)).all()
        self.lod = [entity.dict() for entity in self.entities]
        if self.debug:
            print(f"Loaded {len(self.entities)} records from local cache")
    profiler.time()

fetch_or_query(qm, force_query=False)

Fetches data from the local cache if available. If the data is not in the cache or if force_query is True, it queries via SPARQL and caches the results.

Parameters:

- qm (QueryManager, required): The query manager object used for making SPARQL queries.
- force_query (bool, default False): A flag to force querying via SPARQL even if the data exists in the local cache.
Source code in ceurws/sql_cache.py
def fetch_or_query(self, qm, force_query=False):
    """
    Fetches data from the local cache if available.
    If the data is not in the cache or if force_query is True,
    it queries via SPARQL and caches the results.

    Args:
        qm (QueryManager): The query manager object used for making SPARQL queries.
        force_query (bool, optional): A flag to force querying via SPARQL even
            if the data exists in the local cache. Defaults to False.
    """
    if not force_query and self.check_local_cache():
        self.fetch_from_local()
    else:
        self.get_lod(qm)
        self.store()

get_lod(qm)

Fetches data using the SPARQL query specified by my query_name.

Parameters:

- qm (QueryManager, required): The query manager object used for making SPARQL queries.

Returns:

- list[dict]: A list of dictionaries representing the data fetched.

Source code in ceurws/sql_cache.py
def get_lod(self, qm: QueryManager) -> list[dict]:
    """
    Fetches data using the SPARQL query specified by my query_name.

    Args:
        qm (QueryManager): The query manager object used for making SPARQL queries.
    Returns:
        list[dict]: A list of dictionaries representing the data fetched.
    """
    profiler = Profiler(f"fetch {self.query_name} from SPARQL endpoint {self.sparql.url}", profile=self.debug)
    query = qm.queriesByName[self.query_name]
    self.lod = self.sparql.queryAsListOfDicts(query.query)
    profiler.time()
    if self.debug:
        print(f"Found {len(self.lod)} records for {self.query_name}")
    return self.lod

store(max_errors=None)

Stores the fetched data into the local SQL database.

Parameters:

- max_errors (int, default None): Maximum allowed validation errors; defaults to the instance's max_errors.

Returns:

- list[Any]: A list of entity instances that were stored in the database.

Source code in ceurws/sql_cache.py
def store(self, max_errors: int | None = None) -> list[Any]:
    """
    Stores the fetched data into the local SQL database.

    Args:
        max_errors (int, optional): Maximum allowed validation errors. Defaults to 0.
    Returns:
        list[Any]: A list of entity instances that were stored in the database.

    """
    profiler = Profiler(f"store {self.query_name}", profile=self.debug)
    self.to_entities(max_errors=max_errors)
    with self.sql_db.get_session() as session:
        session.add_all(self.entities)
        session.commit()
        if self.debug:
            print(f"Stored {len(self.entities)} records in local cache")
    profiler.time()
    return self.entities

to_entities(max_errors=None)

Converts records fetched from the LOD into entity instances, applying validation.

Parameters:

- max_errors (int, default None): Maximum allowed validation errors; defaults to the instance's max_errors.

Returns:

- list[Any]: A list of entity instances that have passed validation.

Source code in ceurws/sql_cache.py
def to_entities(self, max_errors: int | None = None) -> list[Any]:
    """
    Converts records fetched from the LOD into entity instances, applying validation.
    Args:
        max_errors (int, optional): Maximum allowed validation errors. Defaults to the instance's max_errors.
    Returns:
        list[Any]: A list of entity instances that have passed validation.
    """
    self.entities = []
    self.errors = []
    error_records = []
    if max_errors is None:
        max_errors = self.max_errors
    for record in self.lod:
        try:
            entity = self.clazz.model_validate(record)
            self.entities.append(entity)
        except Exception as e:
            self.errors.append(e)
            error_records.append(record)
    error_count = len(self.errors)
    if error_count > max_errors:
        msg = f"found {error_count} errors > maximum allowed {max_errors} errors"
        if self.debug:
            print(msg)
            for i, error in enumerate(self.errors):
                print(f"{i}:{error} for \n{error_records[i]}")
        raise Exception(msg)
    return self.entities

SqlDB

general SQL database

Source code in ceurws/sql_cache.py
class SqlDB:
    """
    general SQL database
    """

    def __init__(self, sqlite_file_path: str, debug: bool = False):
        self.debug = debug
        sqlite_url = f"sqlite:///{sqlite_file_path}"
        connect_args = {"check_same_thread": False}
        self.engine = create_engine(sqlite_url, echo=debug, connect_args=connect_args)

    def get_session(self) -> Session:
        """
        Provide a session for database operations.

        Returns:
            Session: A SQLAlchemy Session object bound to the engine for database operations.
        """
        return Session(bind=self.engine)
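
A minimal sketch of using SqlDB with a hypothetical SQLModel table; model, path and data are illustrative:

from sqlmodel import Field, SQLModel, select

class Item(SQLModel, table=True):  # hypothetical model
    id: int | None = Field(default=None, primary_key=True)
    name: str

db = SqlDB("/tmp/example.db")
SQLModel.metadata.create_all(db.engine)  # create the table
with db.get_session() as session:
    session.add(Item(name="Vol-2436"))
    session.commit()
    print(session.exec(select(Item)).all())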

get_session()

Provide a session for database operations.

Returns:

- Session: A SQLAlchemy Session object bound to the engine for database operations.

Source code in ceurws/sql_cache.py
def get_session(self) -> Session:
    """
    Provide a session for database operations.

    Returns:
        Session: A SQLAlchemy Session object bound to the engine for database operations.
    """
    return Session(bind=self.engine)

textparser

Created on 2022-08-15

@author: wf

Textparser

general text parser

Source code in ceurws/textparser.py
class Textparser:
    """
    general text parser
    """

    def __init__(self, debug: bool):
        """
        Constructor

        Args:
            debug(bool): if True, switch debugging on
        """
        self.debug = debug

    @classmethod
    def sanitize(cls, text, replaceList=None) -> str:
        """
        sanitize given text

        Args:
            text: text to sanitize
            replaceList: list of strings to remove from the given text

        Returns:
            str: sanitized string
        """
        if replaceList is None:
            replaceList = []
        if text is not None:
            sanitizeChars = "\n\t\r., "
            text = text.strip(sanitizeChars)
            text = text.replace("\n", " ")
            text = text.replace("\r", "")
            for replace in replaceList:
                text = text.replace(replace, "")
            # compress multiple spaces
            text = " ".join(text.split())
        return text

    def log(self, msg: str):
        """
        log the given message if debug is on

        Args:
            msg(str): the message to log
        """
        if self.debug:
            print(msg)

    def hasValue(self, d, key):
        """
        check that the given attribute in the given dict is available and not None

        Args:
            d(dict): the dict
            key(str): the key

        Returns:
            bool: True if a not-None value is available
        """
        result = key in d and d[key] is not None
        return result

    def getMatch(self, pattern, text, groupNo: int = 1):
        """
        get the match for the given regular expression for the given text returning the given group number

        Args:
            pattern(re.Pattern): the compiled regular expression to match
            text(str): the text to check
            groupNo(int): the number of the regular expression group to return

        Returns:
            str: the matching result or None if no match was found
        """
        matchResult = pattern.match(text)
        if matchResult:
            return matchResult.group(groupNo)
        else:
            return None

__init__(debug)

Constructor

Parameters:

- debug (bool, required): if True, switch debugging on
Source code in ceurws/textparser.py
def __init__(self, debug: bool):
    """
    Constructor

    Args:
        debug(bool): if True, switch debugging on
    """
    self.debug = debug

getMatch(pattern, text, groupNo=1)

get the match for the given regular expression for the given text returning the given group number

Parameters:

- pattern (re.Pattern, required): the compiled regular expression to match
- text (str, required): the text to check
- groupNo (int, default 1): the number of the regular expression group to return

Returns:

- str: the matching result or None if no match was found

Source code in ceurws/textparser.py
def getMatch(self, pattern, text, groupNo: int = 1):
    """
    get the match for the given regular expression for the given text returning the given group number

    Args:
        pattern(re.Pattern): the compiled regular expression to match
        text(str): the text to check
        groupNo(int): the number of the regular expression group to return

    Returns:
        str: the matching result or None if no match was found
    """
    matchResult = pattern.match(text)
    if matchResult:
        return matchResult.group(groupNo)
    else:
        return None
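
getMatch expects a pre-compiled pattern rather than a raw string; a small illustration:

import re

parser = Textparser(debug=False)
volPattern = re.compile(r"Vol-(\d+)")  # illustrative pattern
print(parser.getMatch(volPattern, "Vol-2436"))  # 2436
print(parser.getMatch(volPattern, "no volume here"))  # None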

hasValue(d, key)

check that the given attribute in the given dict is available and not none

Parameters:

- d (dict, required): the dict
- key (str, required): the key

Returns:

- bool: True if a not-None value is available

Source code in ceurws/textparser.py
def hasValue(self, d, key):
    """
    check that the given attribute in the given dict is available and not None

    Args:
        d(dict): the dict
        key(str): the key

    Returns:
        bool: True if a not-None value is available
    """
    result = key in d and d[key] is not None
    return result

log(msg)

log the given message if debug is on

Parameters:

- msg (str, required): the message to log
Source code in ceurws/textparser.py
def log(self, msg: str):
    """
    log the given message if debug is on

    Args:
        msg(str): the message to log
    """
    if self.debug:
        print(msg)

sanitize(text, replaceList=None) classmethod

sanitize given text

Parameters:

- text (required): text to sanitize
- replaceList (default None): list of strings to remove from the given text

Returns:

- str: sanitized string

Source code in ceurws/textparser.py
@classmethod
def sanitize(cls, text, replaceList=None) -> str:
    """
    sanitize given text

    Args:
        text: text to sanitize
        replaceList: list of strings to remove from the given text

    Returns:
        str: sanitized string
    """
    if replaceList is None:
        replaceList = []
    if text is not None:
        sanitizeChars = "\n\t\r., "
        text = text.strip(sanitizeChars)
        text = text.replace("\n", " ")
        text = text.replace("\r", "")
        for replace in replaceList:
            text = text.replace(replace, "")
        # compress multiple spaces
        text = " ".join(text.split())
    return text
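
For example, sanitize strips surrounding whitespace and punctuation, removes the given substrings, and collapses runs of whitespace:

text = "\n  Proceedings of the 1st Workshop,  \n"
print(Textparser.sanitize(text, replaceList=["1st "]))
# Proceedings of the Workshop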

urn

Created on 2023-12-28

@author: wf / ChatGPT-4 as instructed

Class URN is designed to verify and calculate check digits for URNs (Uniform Resource Names) as used in the DNB URN service. The class provides methods for both verifying a full URN's check digit (check_urn_checksum) and calculating the check digit for a given URN (calc_urn_checksum). It's adapted from PHP and JavaScript sources, following the guidelines and methods outlined by the DNB (German National Library) URN service.

URN

URN check digit calculator for DNB URN service:

see https://www.dnb.de/DE/Professionell/Services/URN-Service/urn-service_node.html

and https://d-nb.info/1045320641/34 http://nbn-resolving.de/nbnpruefziffer.php

Source code in ceurws/urn.py
class URN:
    """
    URN check digit calculator for DNB URN service:

    see https://www.dnb.de/DE/Professionell/Services/URN-Service/urn-service_node.html

    and
        https://d-nb.info/1045320641/34
        http://nbn-resolving.de/nbnpruefziffer.php

    """

    @classmethod
    def check_urn_checksum(cls, urn: str, debug: bool = False) -> bool:
        urn_check_digit_str = urn[-1]
        urn_prefix = urn[:-1]
        check_digit = cls.calc_urn_checksum(urn_prefix, debug)
        urn_ok = str(check_digit) == urn_check_digit_str
        return urn_ok

    @classmethod
    def calc_urn_checksum(cls, test_urn: str, debug: bool = False) -> int:
        """
        converted from PHP and JavaScript code,
        see https://github.com/bohnelang/URN-Pruefziffer

        Args:
            test_urn(str): the URN without its check digit
            debug(bool): if True, show the internal values while calculating
        """
        # Code string provided in the original PHP function
        code = "3947450102030405060708094117############1814191516212223242542262713282931123233113435363738########43"

        # Initialization of variables
        _sum = 0
        pos = 1

        # Iterating through each character in the URN
        for i, char in enumerate(test_urn.upper()):
            # Getting the ASCII value and adjusting it based on the character '-' (45 in ASCII)
            x = ord(char) - 45
            # Extracting two consecutive values from the code string
            v1 = int(code[x * 2]) if code[x * 2] != "#" else 0
            v2 = int(code[x * 2 + 1]) if code[x * 2 + 1] != "#" else 0

            if v1 == 0:
                # If v1 is 0, increment pos after multiplying v2 with its current value
                _sum += v2 * pos
                pos += 1  # post-increment equivalent in Python
            else:
                # If v1 is not 0, use pos for the first term, increment pos,
                # then use the new value of pos for the second term
                # This effectively increases pos by 2 in this branch
                _sum += pos * v1
                pos += 1  # increment for the first term
                _sum += v2 * pos  # use incremented pos for the second term
                pos += 1  # increment for the second term

            if debug:
                print(f"i: {i:2} pos: {pos:2} x: {x:2} v1: {v1:2} v2: {v2:2} sum: {_sum:4}")

        # Assumes v2 is not 0 after the final iteration of the loop
        check_digit = (_sum // v2) % 10  # Using integer division for floor behavior

        return check_digit
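
Both methods are classmethods, so no instance is needed. The URN below is illustrative, so the printed digit is simply whatever the algorithm yields:

urn_base = "urn:nbn:de:0183-mbi0003541"  # illustrative URN without check digit
digit = URN.calc_urn_checksum(urn_base)
full_urn = f"{urn_base}{digit}"
print(full_urn)
print(URN.check_urn_checksum(full_urn))  # True by construction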

calc_urn_checksum(test_urn, debug=False) classmethod

converted from PHP and JavaScript code; see https://github.com/bohnelang/URN-Pruefziffer

Source code in ceurws/urn.py
@classmethod
def calc_urn_checksum(cls, test_urn: str, debug: bool = False) -> int:
    """
    converted from PHP and JavaScript code,
    see https://github.com/bohnelang/URN-Pruefziffer

    Args:
        test_urn(str): the URN without its check digit
        debug(bool): if True, show the internal values while calculating
    """
    # Code string provided in the original PHP function
    code = "3947450102030405060708094117############1814191516212223242542262713282931123233113435363738########43"

    # Initialization of variables
    _sum = 0
    pos = 1

    # Iterating through each character in the URN
    for i, char in enumerate(test_urn.upper()):
        # Getting the ASCII value and adjusting it based on the character '-' (45 in ASCII)
        x = ord(char) - 45
        # Extracting two consecutive values from the code string
        v1 = int(code[x * 2]) if code[x * 2] != "#" else 0
        v2 = int(code[x * 2 + 1]) if code[x * 2 + 1] != "#" else 0

        if v1 == 0:
            # If v1 is 0, increment pos after multiplying v2 with its current value
            _sum += v2 * pos
            pos += 1  # post-increment equivalent in Python
        else:
            # If v1 is not 0, use pos for the first term, increment pos,
            # then use the new value of pos for the second term
            # This effectively increases pos by 2 in this branch
            _sum += pos * v1
            pos += 1  # increment for the first term
            _sum += v2 * pos  # use incremented pos for the second term
            pos += 1  # increment for the second term

        if debug:
            print(f"i: {i:2} pos: {pos:2} x: {x:2} v1: {v1:2} v2: {v2:2} sum: {_sum:4}")

    # Assumes v2 is not 0 after the final iteration of the loop
    check_digit = (_sum // v2) % 10  # Using integer division for floor behavior

    return check_digit

utils

download

Created on 2021-08-21

this is a redundant copy; see e.g. https://github.com/WolfgangFahl/ConferenceCorpus/blob/main/corpus/utils/download.py

@author: wf

Download

Utility functions for downloading data

Source code in ceurws/utils/download.py
class Download:
    """
    Utility functions for downloading data
    """

    @staticmethod
    def getURLContent(url: str):
        with urllib.request.urlopen(url) as urlResponse:
            content = urlResponse.read().decode()
            return content

    @staticmethod
    def getFileContent(path: str):
        with open(path) as file:
            content = file.read()
            return content

    @staticmethod
    def needsDownload(filePath: Path, force: bool = False) -> bool:
        """
        check if a download of the given filePath is necessary, that is, the file
        does not exist, has a size of zero, or the download should be forced

        Args:
            filePath(Path): the path of the file to be checked
            force(bool): True if the result should be forced to True

        Returns:
            bool: True if a download for this file is needed
        """
        if not filePath.is_file():
            result = True
        else:
            stats = filePath.stat()
            size = stats.st_size
            result = force or size == 0
        return result

    @staticmethod
    def downloadBackupFile(
        url: str,
        fileName: str,
        targetDirectory: Path,
        force: bool = False,
        profile: bool = True,
    ):
        """
        Downloads from the given url the zip-file and extracts the file corresponding to the given fileName.

        Args:
            url: url linking to a downloadable gzip file
            fileName: Name of the file that should be extracted from gzip file
            targetDirectory(Path): download the file to this directory
            force (bool): True if the download should be forced
            profile(bool): if True show profiling information

        Returns:
            Name of the extracted file with path to the backup directory
        """
        extractTo = targetDirectory.joinpath(fileName)
        zipped = targetDirectory.joinpath(f"{fileName}.gz")
        # we might want to check whether a new version is available
        if Download.needsDownload(extractTo, force=force):
            if not targetDirectory.is_dir():
                # create the target directory itself, not just its parent
                targetDirectory.mkdir(parents=True, exist_ok=True)
            msg = f"Downloading {zipped} from {url} ... this might take a few seconds ..."
            profiler = Profiler(msg=msg, profile=profile)
            urllib.request.urlretrieve(url, zipped)
            profiler.time(extraMsg=f" unzipping {extractTo} from {zipped}")
            with gzip.open(zipped, "rb") as gzipped, open(extractTo, "wb") as unzipped:
                shutil.copyfileobj(gzipped, unzipped)
            if not extractTo.is_file():
                raise Exception(f"could not extract {fileName} from {zipped}")
        return extractTo
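
A hedged usage sketch for the Download helpers above; the URL and file names are placeholders, not real endpoints:

from pathlib import Path

target = Path("/tmp/ceurws")
# download and unzip only when the target file is missing or empty
if Download.needsDownload(target / "volumes.json"):
    extracted = Download.downloadBackupFile(
        url="https://example.org/backup/volumes.json.gz",  # placeholder gzip URL
        fileName="volumes.json",
        targetDirectory=target,
    )
    print(f"extracted to {extracted}")
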
downloadBackupFile(url, fileName, targetDirectory, force=False, profile=True) staticmethod

Downloads the gzip file from the given url and extracts the file corresponding to the given fileName.

Parameters:

    url (str): url linking to a downloadable gzip file (required)
    fileName (str): name of the file that should be extracted from the gzip file (required)
    targetDirectory (Path): download the file to this directory (required)
    force (bool): True if the download should be forced (default: False)
    profile (bool): if True show profiling information (default: True)

Returns:

    Path: name of the extracted file with path to the backup directory

Source code in ceurws/utils/download.py
@staticmethod
def downloadBackupFile(
    url: str,
    fileName: str,
    targetDirectory: Path,
    force: bool = False,
    profile: bool = True,
):
    """
    Downloads the gzip file from the given url and extracts the file corresponding to the given fileName.

    Args:
        url: url linking to a downloadable gzip file
        fileName: Name of the file that should be extracted from gzip file
        targetDirectory(Path): download the file to this directory
        force (bool): True if the download should be forced
        profile(bool): if True show profiling information

    Returns:
        Name of the extracted file with path to the backup directory
    """
    extractTo = targetDirectory.joinpath(fileName)
    zipped = targetDirectory.joinpath(f"{fileName}.gz")
    # we might want to check whether a new version is available
    if Download.needsDownload(extractTo, force=force):
        if not targetDirectory.is_dir():
            # create the target directory itself, not just its parent
            targetDirectory.mkdir(parents=True, exist_ok=True)
        msg = f"Downloading {zipped} from {url} ... this might take a few seconds ..."
        profiler = Profiler(msg=msg, profile=profile)
        urllib.request.urlretrieve(url, zipped)
        profiler.time(extraMsg=f" unzipping {extractTo} from {zipped}")
        with gzip.open(zipped, "rb") as gzipped, open(extractTo, "wb") as unzipped:
            shutil.copyfileobj(gzipped, unzipped)
        if not extractTo.is_file():
            raise Exception(f"could not extract {fileName} from {zipped}")
    return extractTo
needsDownload(filePath, force=False) staticmethod

check if a download of the given filePath is necessary, i.e. the file does not exist, has a size of zero, or the download is forced

Parameters:

    filePath (Path): the path of the file to be checked (required)
    force (bool): True if the result should be forced to True (default: False)

Returns:

    bool: True if a download for this file is needed

Source code in ceurws/utils/download.py
@staticmethod
def needsDownload(filePath: Path, force: bool = False) -> bool:
    """
    check if a download of the given filePath is necessary that is the file
    does not exist has a size of zero or the download should be forced

    Args:
        filePath(str): the path of the file to be checked
        force(bool): True if the result should be forced to True

    Return:
        bool: True if  a download for this file needed
    """
    if not filePath.is_file():
        result = True
    else:
        stats = filePath.stat()
        size = stats.st_size
        result = force or size == 0
    return result

Profiler

simple profiler

Source code in ceurws/utils/download.py
class Profiler:
    """
    simple profiler
    """

    def __init__(self, msg: str | None = None, profile: bool = True):
        """
        construct me with the given msg and profile active flag

        Args:
            msg(str): the message to show if profiling is active
            profile(bool): True if messages should be shown
        """
        if msg is not None:
            self.msg = msg
        else:
            self.msg = ""
        self.profile = profile
        self.starttime = time.time()
        if profile:
            print(f"Starting {msg} ...")

    def time(self, extraMsg=""):
        """
        time the action and print if profile is active
        """
        elapsed = time.time() - self.starttime
        if self.profile:
            print(f"{self.msg}{extraMsg} took {elapsed:5.1f} s")
        return elapsed
__init__(msg=None, profile=True)

construct me with the given msg and profile active flag

Parameters:

    msg (str): the message to show if profiling is active (default: None)
    profile (bool): True if messages should be shown (default: True)
Source code in ceurws/utils/download.py
def __init__(self, msg: str | None = None, profile: bool = True):
    """
    construct me with the given msg and profile active flag

    Args:
        msg(str): the message to show if profiling is active
        profile(bool): True if messages should be shown
    """
    if msg is not None:
        self.msg = msg
    else:
        self.msg = ""
    self.profile = profile
    self.starttime = time.time()
    if profile:
        print(f"Starting {msg} ...")
time(extraMsg='')

time the action and print if profile is active

Source code in ceurws/utils/download.py
def time(self, extraMsg=""):
    """
    time the action and print if profile is active
    """
    elapsed = time.time() - self.starttime
    if self.profile:
        print(f"{self.msg}{extraMsg} took {elapsed:5.1f} s")
    return elapsed
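
A minimal usage sketch for the Profiler above; the message text is illustrative:

import time

profiler = Profiler(msg="loading volumes")  # prints "Starting loading volumes ..."
time.sleep(0.1)  # stand-in for the work being timed
elapsed = profiler.time(extraMsg=" (from cache)")  # prints the elapsed seconds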

webscrape

Created on 2020-08-20

@author: wf

this is a redundant copy of the sources at https://github.com/WolfgangFahl/ConferenceCorpus/blob/main/corpus/datasources/webscrape.py

ScrapeDescription dataclass

Description of rdfa elements to scrape

Source code in ceurws/utils/webscrape.py
@dataclass
class ScrapeDescription:
    """
    Description of rdfa elements to scrape
    """

    key: str
    tag: str  # the tag to search
    attribute: str  # the attribute to expect
    value: str  # the value to expect
    multi: bool = False  # do we expect multiple elements?
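
A small sketch of a ScrapeDescription instance, using the CEUR-WS acronym span that fromTag below uses as its example:

# describe the volume acronym element, e.g. <span class="CEURVOLACRONYM">DL4KG2020</span>
acronym_descr = ScrapeDescription(
    key="acronym",
    tag="span",
    attribute="class",
    value="CEURVOLACRONYM",
)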

WebScrape

WebScraper with a rudimentary Parser for https://en.wikipedia.org/wiki/RDFa extended for CEUR-WS and WikiCFP specific scraping

https://stackoverflow.com/questions/21876602/what-does-the-html-typeof-attribute-do
https://de.wikipedia.org/wiki/RDFa
https://stackoverflow.com/questions/20767903/parsing-rdfa-in-html-xhtml
https://www.w3.org/MarkUp/2009/rdfa-for-html-authors

Source code in ceurws/utils/webscrape.py
class WebScrape:
    """
    WebScraper
    with a rudimentary Parser for https://en.wikipedia.org/wiki/RDFa
    extended for CEUR-WS and WikiCFP specific scraping

    https://stackoverflow.com/questions/21876602/what-does-the-html-typeof-attribute-do
    https://de.wikipedia.org/wiki/RDFa
    https://stackoverflow.com/questions/20767903/parsing-rdfa-in-html-xhtml
    https://www.w3.org/MarkUp/2009/rdfa-for-html-authors
    """

    def __init__(
        self,
        debug: bool = False,
        showHtml: bool = False,
        timeout: float = 20,
        agent: str = "Mozilla/5.0",
    ):
        """
        Constructor

        Args:
            debug(bool): if True show debugging information
            showHtml(bool): if True show the HTML retrieved
            timeout(float): the default timeout
            agent(str): the agent to mimic
        """
        self.err: Exception | None = None
        self.valid = False
        self.debug = debug
        self.showHtml = showHtml
        self.timeout = timeout
        self.agent = agent

    def findLinkForRegexp(self, regex: str):
        """
        find a link for the given regular expression

        Args:
            regex(str): the regular expression to find a link for

        Return:
            m(object),text(str): the match/text tuple or None,None
        """
        m = None
        text = None
        link = self.soup.find("a", href=re.compile(regex))
        if link:
            href = link["href"]
            m = re.match(regex, href)
            if hasattr(link, "text"):
                text = link.text
        return m, text

    def fromTag(
        self,
        soup: BeautifulSoup,
        tag: str,
        attr: str | None = None,
        value: str | None = None,
        multi: bool = False,
    ):
        """
        get metadata from a given tag, attribute and value
        e.g. <span class="CEURVOLACRONYM">DL4KG2020</span>

        tag=span, attr=class, value=CEURVOLACRONYM

        Args:
           soup(BeautifulSoup): the parser to work with
           tag(string): the tag to search
           attr(string): the attribute to expect
           value(string): the value to expect
           multi(bool): if True - return multiple values
        """
        # https://stackoverflow.com/a/16248908/1497139
        # find a list of all tag elements
        if attr is not None and value is not None:
            nodes = soup.find_all(tag, {attr: value})
        else:
            nodes = soup.find_all(tag)
        lines = [node.get_text() for node in nodes]
        if multi:
            return lines
        if len(lines) > 0:
            return lines[0]
        else:
            return None

    def getSoup(self, url: str, showHtml: bool = False, debug: bool = False) -> BeautifulSoup | None:
        """
        get the beautiful Soup parser

        Args:
           url(str): the url to open
           showHtml(bool): if True the html code should be pretty printed and shown
           debug(bool): if True debug info should be printed
        Return:
            BeautifulSoup: the html parser
        """
        html = self.get_html_from_url(url, debug=debug)
        soup = self.get_soup_from_string(html, show_html=showHtml) if html is not None else None
        return soup

    def get_soup_from_string(self, html: str | bytes, show_html: bool = False) -> BeautifulSoup:
        """
        get the beautiful Soup parser for the given html string

        Args:
            html: html content to parse
            show_html: True if the html code should be pretty printed and shown

        Returns:
            BeautifulSoup: the html parser
        """
        soup = BeautifulSoup(html, "html.parser")
        if show_html:
            self.printPrettyHtml(soup)
        return soup

    def printPrettyHtml(self, soup):
        """
        print the prettified html for the given soup

        Args:
            soup(BeautifulSoup): the parsed html to print
        """
        prettyHtml = soup.prettify()
        print(prettyHtml)

    def parseWithScrapeDescription(
        self,
        soup: BeautifulSoup,
        scrapeDescr: list["ScrapeDescription"] | None = None,
    ) -> dict:
        """
        parse the given soup with the given scrape descriptions

        Args:
            soup: html parser to parse the content from
            scrapeDescr: description of the elements to scrape

        Return:
             a dict with the results
        """
        scrapeDict = dict()
        if isinstance(scrapeDescr, list):
            for scrapeItem in scrapeDescr:
                value = self.fromTag(
                    soup,
                    scrapeItem.tag,
                    scrapeItem.attribute,
                    scrapeItem.value,
                    multi=scrapeItem.multi,
                )
                scrapeDict[scrapeItem.key] = value
        self.valid = True
        return scrapeDict

    def parseRDFa(self, url):
        """
        rudimentary RDFa parsing
        """
        triples = []
        try:
            self.soup = self.getSoup(url, self.showHtml)
            subjectNodes = self.soup.find_all(True, {"typeof": True})
            for subjectNode in subjectNodes:
                subject = subjectNode.attrs["typeof"]
                if self.debug:
                    print(subjectNode)
                for predicateNode in subjectNode.find_all():
                    value = None
                    name = None
                    if "content" in predicateNode.attrs:
                        value = predicateNode.attrs["content"]
                    else:
                        value = predicateNode.get_text()
                    if "property" in predicateNode.attrs:
                        name = predicateNode.attrs["property"]
                    if name is not None and value is not None:
                        triples.append((subject, name, value))
            self.valid = True
        except HTTPError as herr:
            self.err = herr
        except urllib.error.URLError as terr:
            self.err = terr
        return triples

    def get_html_from_url(self, url: str, debug: bool = False) -> str | bytes | None:
        """
        Get the html response from the given url
        Args:
            url: url to get the content from
            debug(bool): if True show non available volumes

        Returns:
            str: content of the url as string
            bytes: If the content of the url contains encoding errors
            None: If the url is not reachable
        """
        req = urllib.request.Request(url, headers={"User-Agent": f"{self.agent}"})
        # handle cookies
        opener = build_opener(HTTPCookieProcessor())
        try:
            response = opener.open(req, timeout=self.timeout)
        except HTTPError as herr:
            self.err = herr
            if debug:
                print(f"{url.split('/')[-1]} not available")
            return None
        html = response.read()
        try:
            # fall back to utf-8 when no charset is declared to avoid decode(None)
            charset = response.headers.get_content_charset() or "utf-8"
            html = html.decode(charset)
        except UnicodeDecodeError as ex:
            print(f"ERROR: Could not properly decode the html code of <{url}>")
            print(ex)
        return html
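
A hedged usage sketch tying WebScrape together with ScrapeDescription; the volume URL is an example and live network access is assumed:

scraper = WebScrape(timeout=10)
soup = scraper.getSoup("https://ceur-ws.org/Vol-2436/")  # example volume page
if soup is not None:
    descr = [
        ScrapeDescription(key="acronym", tag="span", attribute="class", value="CEURVOLACRONYM")
    ]
    record = scraper.parseWithScrapeDescription(soup, descr)
    print(record)  # e.g. {'acronym': ...}
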
__init__(debug=False, showHtml=False, timeout=20, agent='Mozilla/5.0')

Constructor

Parameters:

    debug (bool): if True show debugging information (default: False)
    showHtml (bool): if True show the HTML retrieved (default: False)
    timeout (float): the default timeout (default: 20)
    agent (str): the agent to mimic (default: 'Mozilla/5.0')
Source code in ceurws/utils/webscrape.py
def __init__(
    self,
    debug: bool = False,
    showHtml: bool = False,
    timeout: float = 20,
    agent: str = "Mozilla/5.0",
):
    """
    Constructor

    Args:
        debug(bool): if True show debugging information
        showHtml(bool): if True show the HTML retrieved
        timeout(float): the default timeout
        agent(str): the agent to mimic
    """
    self.err: Exception | None = None
    self.valid = False
    self.debug = debug
    self.showHtml = showHtml
    self.timeout = timeout
    self.agent = agent
findLinkForRegexp(regex)

find a link for the given regular expression

Parameters:

    regex (str): the regular expression to find a link for (required)

Returns:

    m (object), text (str): the match/text tuple or None, None

Source code in ceurws/utils/webscrape.py
def findLinkForRegexp(self, regex: str):
    """
    find a link for the given regular expression

    Args:
        regex(str): the regular expression to find a link for

    Return:
        m(object),text(str): the match/text tuple or None,None
    """
    m = None
    text = None
    link = self.soup.find("a", href=re.compile(regex))
    if link:
        href = link["href"]
        m = re.match(regex, href)
        if hasattr(link, "text"):
            text = link.text
    return m, text
fromTag(soup, tag, attr=None, value=None, multi=False)

get metadata from a given tag, attribute and value, e.g. <span class="CEURVOLACRONYM">DL4KG2020</span>

tag=span, attr=class, value=CEURVOLACRONYM

Parameters:

    soup (BeautifulSoup): the parser to work with (required)
    tag (str): the tag to search (required)
    attr (str): the attribute to expect (default: None)
    value (str): the value to expect (default: None)
    multi (bool): if True return multiple values (default: False)
Source code in ceurws/utils/webscrape.py
def fromTag(
    self,
    soup: BeautifulSoup,
    tag: str,
    attr: str | None = None,
    value: str | None = None,
    multi: bool = False,
):
    """
    get metadata from a given tag, attribute and value
    e.g. <span class="CEURVOLACRONYM">DL4KG2020</span>

    tag=span, attr=class, value=CEURVOLACRONYM

    Args:
       soup(BeautifulSoup): the parser to work with
       tag(string): the tag to search
       attr(string): the attribute to expect
       value(string): the value to expect
       multi(bool): if True - return multiple values
    """
    # https://stackoverflow.com/a/16248908/1497139
    # find a list of all tag elements
    if attr is not None and value is not None:
        nodes = soup.find_all(tag, {attr: value})
    else:
        nodes = soup.find_all(tag)
    lines = [node.get_text() for node in nodes]
    if multi:
        return lines
    if len(lines) > 0:
        return lines[0]
    else:
        return None
getSoup(url, showHtml=False, debug=False)

get the beautiful Soup parser

Parameters:

    url (str): the url to open (required)
    showHtml (bool): if True the html code should be pretty printed and shown (default: False)
    debug (bool): if True debug info should be printed (default: False)

Returns:

    BeautifulSoup: the html parser

Source code in ceurws/utils/webscrape.py
def getSoup(self, url: str, showHtml: bool = False, debug: bool = False) -> BeautifulSoup | None:
    """
    get the beautiful Soup parser

    Args:
       url(str): the url to open
       showHtml(bool): if True the html code should be pretty printed and shown
       debug(bool): if True debug info should be printed
    Return:
        BeautifulSoup: the html parser
    """
    html = self.get_html_from_url(url, debug=debug)
    soup = self.get_soup_from_string(html, show_html=showHtml) if html is not None else None
    return soup
get_html_from_url(url, debug=False)

Get the html response from the given url

Parameters:

    url (str): url to get the content from (required)
    debug (bool): if True show non-available volumes (default: False)

Returns:

    str: content of the url as string
    bytes: if the content of the url contains encoding errors
    None: if the url is not reachable

Source code in ceurws/utils/webscrape.py
def get_html_from_url(self, url: str, debug: bool = False) -> str | bytes | None:
    """
    Get the html response from the given url
    Args:
        url: url to get the content from
        debug(bool): if True show non available volumes

    Returns:
        str: content of the url as string
        bytes: If the content of the url contains encoding errors
        None: If the url is not reachable
    """
    req = urllib.request.Request(url, headers={"User-Agent": f"{self.agent}"})
    # handle cookies
    opener = build_opener(HTTPCookieProcessor())
    try:
        response = opener.open(req, timeout=self.timeout)
    except HTTPError as herr:
        self.err = herr
        if debug:
            print(f"{url.split('/')[-1]} not available")
        return None
    html = response.read()
    try:
        # fall back to utf-8 when no charset is declared to avoid decode(None)
        charset = response.headers.get_content_charset() or "utf-8"
        html = html.decode(charset)
    except UnicodeDecodeError as ex:
        print(f"ERROR: Could not properly decode the html code of <{url}>")
        print(ex)
    return html
get_soup_from_string(html, show_html=False)

get the beautiful Soup parser for the given html string

Parameters:

    html (str | bytes): html content to parse (required)
    show_html (bool): True if the html code should be pretty printed and shown (default: False)

Returns:

    BeautifulSoup: the html parser

Source code in ceurws/utils/webscrape.py
def get_soup_from_string(self, html: str | bytes, show_html: bool = False) -> BeautifulSoup:
    """
    get the beautiful Soup parser for the given html string

    Args:
        html: html content to parse
        show_html: True if the html code should be pretty printed and shown

    Returns:
        BeautifulSoup: the html parser
    """
    soup = BeautifulSoup(html, "html.parser")
    if show_html:
        self.printPrettyHtml(soup)
    return soup
parseRDFa(url)

rudimentary RDFa parsing

Source code in ceurws/utils/webscrape.py
def parseRDFa(self, url):
    """
    rudimentary RDFa parsing
    """
    triples = []
    try:
        self.soup = self.getSoup(url, self.showHtml)
        subjectNodes = self.soup.find_all(True, {"typeof": True})
        for subjectNode in subjectNodes:
            subject = subjectNode.attrs["typeof"]
            if self.debug:
                print(subjectNode)
            for predicateNode in subjectNode.find_all():
                value = None
                name = None
                if "content" in predicateNode.attrs:
                    value = predicateNode.attrs["content"]
                else:
                    value = predicateNode.get_text()
                if "property" in predicateNode.attrs:
                    name = predicateNode.attrs["property"]
                if name is not None and value is not None:
                    triples.append((subject, name, value))
        self.valid = True
    except HTTPError as herr:
        self.err = herr
    except urllib.error.URLError as terr:
        self.err = terr
    return triples
parseWithScrapeDescription(soup, scrapeDescr=None)

parse the given soup with the given scrape descriptions

Parameters:

    soup (BeautifulSoup): html parser to parse the content from (required)
    scrapeDescr (list[ScrapeDescription]): description of the elements to scrape (default: None)

Returns:

    dict: a dict with the results

Source code in ceurws/utils/webscrape.py
def parseWithScrapeDescription(
    self,
    soup: BeautifulSoup,
    scrapeDescr: list["ScrapeDescription"] | None = None,
) -> dict:
    """
    parse the given soup with the given scrape descriptions

    Args:
        soup: html parser to parse the content from
        scrapeDescr: description of the elements to scrape

    Return:
         a dict with the results
    """
    scrapeDict = dict()
    if isinstance(scrapeDescr, list):
        for scrapeItem in scrapeDescr:
            value = self.fromTag(
                soup,
                scrapeItem.tag,
                scrapeItem.attribute,
                scrapeItem.value,
                multi=scrapeItem.multi,
            )
            scrapeDict[scrapeItem.key] = value
    self.valid = True
    return scrapeDict
printPrettyHtml(soup)

print the prettified html for the given soup

Parameters:

    soup (BeautifulSoup): the parsed html to print (required)
Source code in ceurws/utils/webscrape.py
def printPrettyHtml(self, soup):
    """
    print the prettified html for the given soup

    Args:
        soup(BeautifulSoup): the parsed html to print
    """
    prettyHtml = soup.prettify()
    print(prettyHtml)

version

Created on 2022-09-11

@author: wf

Version dataclass

Version handling for VolumeBrowser

Source code in ceurws/version.py
@dataclass
class Version:
    """
    Version handling for VolumeBrowser
    """

    name = "CEUR-WS Volume Browser"
    version = ceurws.__version__
    date = "2022-08-14"
    updated = "2024-07-31"
    description = "CEUR-WS Volume browser"

    authors = "Tim Holzheim, Wolfgang Fahl"

    doc_url = "https://wiki.bitplan.com/index.php/pyCEURmake"
    chat_url = "https://github.com/WolfgangFahl/pyCEURmake/discussions"
    cm_url = "https://github.com/WolfgangFahl/pyCEURmake"

    license = """Copyright 2022 contributors. All rights reserved.

  Licensed under the Apache License 2.0
  http://www.apache.org/licenses/LICENSE-2.0

  Distributed on an "AS IS" basis without warranties
  or conditions of any kind, either express or implied."""
    longDescription = f"""{name} version {version}
{description}

  Created by {authors} on {date} last updated {updated}"""

view

Created on 2024-02-23

@author: wf

View

generic View

Source code in ceurws/view.py
class View:
    """
    generic View
    """

    noneValue = "-"
    wdPrefix = "http://www.wikidata.org/entity/"

    def getValue(self, obj, attr):
        value = getattr(obj, attr, View.noneValue)
        if value is None:
            value = View.noneValue
        return value

    def getRowValue(self, row, key):
        value = None
        if key in row:
            value = row[key]
        if value is None:
            value = View.noneValue
        return value

    def createLink(self, url: str, text: str):
        """
        create a link from the given url and text

        Args:
            url(str): the url to create a link for
            text(str): the text to add for the link
        """
        link = Link.create(url, text, target="_blank")
        return link

    def createWdLink(self, qid: str, text: str):
        wd_url = f"{View.wdPrefix}/{qid}"
        link = self.createLink(wd_url, text)
        return link

    def get_dict_as_html_table(self, data_dict) -> str:
        # Convert the dictionary to a list of lists for tabulate
        data_list = [[key, value] for key, value in data_dict.items()]

        # Generate the HTML table
        html_table = tabulate(data_list, tablefmt="html", headers=["Key", "Value"])
        return html_table

    def createExternalLink(
        self,
        row: dict,
        key: str,
        text: str,
        formatterUrl: str,
        emptyIfNone: bool = False,
    ) -> str:
        """
        create an ExternalLink for the given row entry with the given key, text and formatterUrl

        Args:
            row(dict): the row to extract the value from
            key(str): the key
            text(str): the text to display for the link
            formatterUrl(str): the prefix for the url to use
            emptyIfNone(bool): if True return empty string if value is View.noneValue

        Returns:
            str - html link for external id
        """
        value = self.getRowValue(row, key)
        if not value or value == View.noneValue:
            if emptyIfNone:
                return ""
            else:
                return View.noneValue

        if value.startswith(View.wdPrefix):
            value = value.replace(View.wdPrefix, "")
        url = formatterUrl + value
        link = self.createLink(url, text)
        return link

    def createItemLink(self, row: dict, key: str, separator: str | None = None) -> str:
        """
        create an item link
        Args:
            row: row object with the data
            key: key of the value for which the link is created
            separator: If not None split the value on the separator and create multiple links
        """
        value = self.getRowValue(row, key)
        if value == View.noneValue:
            return value
        item = row[key]
        itemLabel = row[f"{key}Label"]
        itemLink = ""
        if separator is not None:
            item_parts = item.split(separator)
            itemLabel_parts = itemLabel.split(separator)
            links = []
            for url, label in zip(item_parts, itemLabel_parts, strict=False):
                link = self.createLink(url, label)
                links.append(link)
            itemLink = "<br>".join(links)
        else:
            itemLink = self.createLink(item, itemLabel)
        return itemLink
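
A minimal sketch of the dict-to-table helper above; the key/value pairs are illustrative:

view = View()
html_table = view.get_dict_as_html_table({"acronym": "SDM 2019", "country": "Germany"})
print(html_table)  # an HTML <table> rendered by tabulate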

createExternalLink(row, key, text, formatterUrl, emptyIfNone=False)

create an ExternalLink for the given row entry with the given key, text and formatterUrl

Parameters:

    row (dict): the row to extract the value from (required)
    key (str): the key (required)
    text (str): the text to display for the link (required)
    formatterUrl (str): the prefix for the url to use (required)
    emptyIfNone (bool): if True return an empty string if the value is View.noneValue (default: False)

Returns:

    str: html link for the external id

Source code in ceurws/view.py
def createExternalLink(
    self,
    row: dict,
    key: str,
    text: str,
    formatterUrl: str,
    emptyIfNone: bool = False,
) -> str:
    """
    create an ExternalLink for the given row entry with the given key, text and formatterUrl

    Args:
        row(dict): the row to extract the value from
        key(str): the key
        text(str): the text to display for the link
        formatterUrl(str): the prefix for the url to use
        emptyIfNone(bool): if True return empty string if value is View.noneValue

    Returns:
        str - html link for external id
    """
    value = self.getRowValue(row, key)
    if not value or value == View.noneValue:
        if emptyIfNone:
            return ""
        else:
            return View.noneValue

    if value.startswith(View.wdPrefix):
        value = value.replace(View.wdPrefix, "")
    url = formatterUrl + value
    link = self.createLink(url, text)
    return link

createItemLink(row, key, separator=None)

create an item link

Parameters:

    row (dict): row object with the data (required)
    key (str): key of the value for which the link is created (required)
    separator (str): if not None split the value on the separator and create multiple links (default: None)

Source code in ceurws/view.py
def createItemLink(self, row: dict, key: str, separator: str | None = None) -> str:
    """
    create an item link
    Args:
        row: row object with the data
        key: key of the value for which the link is created
        separator: If not None split the value on the separator and create multiple links
    """
    value = self.getRowValue(row, key)
    if value == View.noneValue:
        return value
    item = row[key]
    itemLabel = row[f"{key}Label"]
    itemLink = ""
    if separator is not None:
        item_parts = item.split(separator)
        itemLabel_parts = itemLabel.split(separator)
        links = []
        for url, label in zip(item_parts, itemLabel_parts, strict=False):
            link = self.createLink(url, label)
            links.append(link)
        itemLink = "<br>".join(links)
    else:
        itemLink = self.createLink(item, itemLabel)
    return itemLink

createLink(url, text)

create a link from the given url and text

Parameters:

    url (str): the url to create a link for (required)
    text (str): the text to add for the link (required)
Source code in ceurws/view.py
def createLink(self, url: str, text: str):
    """
    create a link from the given url and text

    Args:
        url(str): the url to create a link for
        text(str): the text to add for the link
    """
    link = Link.create(url, text, target="_blank")
    return link

volume_neo4j

Editor dataclass

Represents an editor with their name and ORCID.

Source code in ceurws/volume_neo4j.py
@dataclass
class Editor:
    """
    Represents an editor with their name and ORCID.
    """

    name: str
    orcid: str | None = None
    likelihood: float | None = None

    @classmethod
    def from_json(cls, json_data):
        """
        Create an Editor instance from JSON data.

        Args:
            json_data (dict): The JSON data representing the editor.

        Returns:
            Editor: The Editor instance created from the JSON data.
        """
        return cls(name=json_data.get("name"), orcid=json_data.get("orcid"))

    def search_by_name(self):
        """
        Search the editor by name using the ORCID API and calculate the likelihood.
        """
        if self.name:
            url = f"https://pub.orcid.org/v3.0/search/?q={self.name}"
            headers = {"Accept": "application/json"}
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                data = response.json()
                num_results = data.get("num-found", 0)
                self.likelihood = num_results / 10  # Arbitrary calculation, adjust as needed

    def create_node(self, tx, volume_node_id: int) -> int | None:
        """
        Create an Editor node in Neo4j and establish a relationship with a Volume node.

        Args:
            tx: The Neo4j transaction.
            volume_node_id (int): The ID of the volume node.

        Returns:
            int: The ID of the created Editor node.
            None: if the editor could not be created
        """
        query = """
        MATCH (v:Volume)
        WHERE id(v) = $volume_node_id
        CREATE (v)-[:HAS_EDITOR]->(e:Editor {name: $name, orcid: $orcid, likelihood: $likelihood})
        RETURN id(e) as node_id
        """
        parameters = {
            "volume_node_id": volume_node_id,
            "name": self.name,
            "orcid": self.orcid,
            "likelihood": self.likelihood,
        }
        result = tx.run(query, parameters)
        record = result.single()
        if record is not None:
            return record["node_id"]
        else:
            return None

create_node(tx, volume_node_id)

Create an Editor node in Neo4j and establish a relationship with a Volume node.

Parameters:

    tx: the Neo4j transaction (required)
    volume_node_id (int): the ID of the volume node (required)

Returns:

    int: the ID of the created Editor node
    None: if the editor could not be created

Source code in ceurws/volume_neo4j.py
def create_node(self, tx, volume_node_id: int) -> int | None:
    """
    Create an Editor node in Neo4j and establish a relationship with a Volume node.

    Args:
        tx: The Neo4j transaction.
        volume_node_id (int): The ID of the volume node.

    Returns:
        int: The ID of the created Editor node.
        None: if the editor could not be created
    """
    query = """
    MATCH (v:Volume)
    WHERE id(v) = $volume_node_id
    CREATE (v)-[:HAS_EDITOR]->(e:Editor {name: $name, orcid: $orcid, likelihood: $likelihood})
    RETURN id(e) as node_id
    """
    parameters = {
        "volume_node_id": volume_node_id,
        "name": self.name,
        "orcid": self.orcid,
        "likelihood": self.likelihood,
    }
    result = tx.run(query, parameters)
    record = result.single()
    if record is not None:
        return record["node_id"]
    else:
        return None

from_json(json_data) classmethod

Create an Editor instance from JSON data.

Parameters:

    json_data (dict): the JSON data representing the editor (required)

Returns:

    Editor: the Editor instance created from the JSON data

Source code in ceurws/volume_neo4j.py
@classmethod
def from_json(cls, json_data):
    """
    Create an Editor instance from JSON data.

    Args:
        json_data (dict): The JSON data representing the editor.

    Returns:
        Editor: The Editor instance created from the JSON data.
    """
    return cls(name=json_data.get("name"), orcid=json_data.get("orcid"))

search_by_name()

Search the editor by name using the ORCID API and calculate the likelihood.

Source code in ceurws/volume_neo4j.py
def search_by_name(self):
    """
    Search the editor by name using the ORCID API and calculate the likelihood.
    """
    if self.name:
        url = f"https://pub.orcid.org/v3.0/search/?q={self.name}"
        headers = {"Accept": "application/json"}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            data = response.json()
            num_results = data.get("num-found", 0)
            self.likelihood = num_results / 10  # Arbitrary calculation, adjust as needed
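
A hedged sketch of the Editor workflow above; the name is an example and search_by_name performs a live ORCID API request:

editor = Editor(name="Jane Doe")  # example name
editor.search_by_name()  # queries https://pub.orcid.org/v3.0/search/
print(editor.likelihood)  # num-found / 10, or None if the request failed
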

Location dataclass

Source code in ceurws/volume_neo4j.py
@dataclass
class Location:
    city: str
    country: str
    date: str

    @staticmethod
    def parse(location_str: str) -> Optional["Location"]:
        """
        Parse a location string of the format "City, Country, Date"

        Args:
            location_str: The location string to parse.

        Returns:
            A Location object or None if the string could not be parsed.
        """
        match = re.match(r"^(.*), (.*), (.*)$", location_str)
        if match:
            city, country, date = match.groups()
            return Location(city, country, date)
        else:
            return None

parse(location_str) staticmethod

Parse a location string of the format "City, Country, Date"

Parameters:

    location_str (str): the location string to parse (required)

Returns:

    Optional[Location]: a Location object or None if the string could not be parsed

Source code in ceurws/volume_neo4j.py
@staticmethod
def parse(location_str: str) -> Optional["Location"]:
    """
    Parse a location string of the format "City, Country, Date"

    Args:
        location_str: The location string to parse.

    Returns:
        A Location object or None if the string could not be parsed.
    """
    match = re.match(r"^(.*), (.*), (.*)$", location_str)
    if match:
        city, country, date = match.groups()
        return Location(city, country, date)
    else:
        return None
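
A short sketch of Location.parse on a well-formed and a malformed string:

loc = Location.parse("Hannover, Germany, 2019")
# -> Location(city='Hannover', country='Germany', date='2019')
bad = Location.parse("Hannover 2019")  # no ", " separators
# -> None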

Neo4j

Neo4j wrapper class

Source code in ceurws/volume_neo4j.py
class Neo4j:
    """
    Neo4j wrapper class
    """

    def __init__(
        self,
        host: str = socket.gethostbyname(socket.gethostname()),
        bolt_port: int = 7687,
        auth=("neo4j", "password"),
        scheme: str = "bolt",
        encrypted: bool = False,
    ):
        """
        constructor
        """
        self.driver = None
        self.error = None
        self.host = host
        self.bolt_port = bolt_port
        self.encrypted = encrypted
        self.scheme = scheme
        try:
            uri = f"{scheme}://{host}:{bolt_port}"
            if not Neo4j.is_port_available(host, bolt_port):
                raise ValueError(f"port at {uri} not available")
            self.driver = GraphDatabase.driver(uri, auth=auth, encrypted=encrypted)
        except (ServiceUnavailable, AuthError, ConfigurationError) as e:
            self.error = e

    @classmethod
    def is_port_available(cls, host, port: int) -> bool:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(1)  # 1 Second Timeout
        try:
            sock.connect((host, port))
        except OSError:
            return False
        finally:
            sock.close()
        return True

    def close(self):
        if self.driver is not None:
            self.driver.close()
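
A hedged connection sketch for the Neo4j wrapper above; host and credentials are placeholders, and note that an unreachable port raises ValueError from the constructor rather than being stored in self.error:

neo4j = Neo4j(host="localhost", bolt_port=7687, auth=("neo4j", "password"))
if neo4j.error:
    print(f"connection failed: {neo4j.error}")
else:
    neo4j.close()
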

__init__(host=socket.gethostbyname(socket.gethostname()), bolt_port=7687, auth=('neo4j', 'password'), scheme='bolt', encrypted=False)

constructor

Source code in ceurws/volume_neo4j.py
def __init__(
    self,
    host: str = socket.gethostbyname(socket.gethostname()),
    bolt_port: int = 7687,
    auth=("neo4j", "password"),
    scheme: str = "bolt",
    encrypted: bool = False,
):
    """
    constructor
    """
    self.driver = None
    self.error = None
    self.host = host
    self.bolt_port = bolt_port
    self.encrypted = encrypted
    self.scheme = scheme
    try:
        uri = f"{scheme}://{host}:{bolt_port}"
        if not Neo4j.is_port_available(host, bolt_port):
            raise ValueError(f"port at {uri} not available")
        self.driver = GraphDatabase.driver(uri, auth=auth, encrypted=encrypted)
    except (ServiceUnavailable, AuthError, ConfigurationError) as e:
        self.error = e

Volume dataclass

Represents a volume with its attributes.

Source code in ceurws/volume_neo4j.py
@dataclass
class Volume:
    """
    Represents a volume with its attributes.
    """

    acronym: str
    title: str
    loctime: str
    editors: list["Editor"] = field(default_factory=list)

    @classmethod
    def from_json(cls, json_data):
        """
        Create a Volume instance from JSON data.

        Args:
            json_data (dict): The JSON data representing the volume.

        Returns:
            Volume: The Volume instance created from the JSON data.
        """
        editor_names = json_data.get("editors")
        editor_names = editor_names.split(",") if editor_names else []
        editors = [Editor(name=name.strip()) for name in editor_names]
        return cls(
            acronym=json_data.get("acronym"),
            title=json_data.get("title"),
            loctime=json_data.get("loctime"),
            editors=editors,
        )

    def create_node(self, tx) -> int | None:
        """
        Create a Volume node in Neo4j.

        Args:
            tx: The Neo4j transaction.

        Returns:
            int: The ID of the created node.
            None: if the node was not created
        """
        query = """
        CREATE (v:Volume {acronym: $acronym, title: $title, loctime: $loctime})
        RETURN id(v) as node_id
        """
        parameters = {
            "acronym": self.acronym,
            "title": self.title,
            "loctime": self.loctime,
        }
        result = tx.run(query, parameters)
        record = result.single()
        if record is not None:
            return record["node_id"]
        else:
            return None

    @staticmethod
    def load_json_file(source: str) -> list["Volume"]:
        """
        Load volumes from the source JSON file.

        Args:
            source (str): Path to the source JSON file.

        Returns:
            List[Volume]: The list of loaded volumes.
        """
        with open(source) as file:
            json_data = json.load(file)

        volumes = [Volume.from_json(volume_data) for volume_data in json_data]
        return volumes

    @classmethod
    def default_source(cls) -> Path:
        """
        get the default source
        """
        default_source = CEURWS.CACHE_DIR / "volumes.json"
        return default_source

    @classmethod
    def parse_args(cls, argv: list | None = None):
        """
        Parse command line arguments.

        Args:
            argv(list): command line arguments

        Returns:
            argparse.Namespace: The parsed command line arguments.
        """

        default_source = cls.default_source()
        parser = argparse.ArgumentParser(description="Volume/Editor/Location Information")
        parser.add_argument("--source", default=str(default_source), help="Source JSON file path")
        # Add progress option
        parser.add_argument(
            "--progress",
            action="store_true",
            help="Display progress information",
        )

        return parser.parse_args(argv)

    @staticmethod
    def main(argv=None):
        if argv is None:
            argv = sys.argv[1:]
        args = Volume.parse_args(argv)
        volumes = Volume.load_json_file(args.source)

        # Connect to Neo4j
        driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))
        with driver.session() as session:
            for volume in volumes:
                volume_node_id = volume.create_node(session)
                for editor in volume.editors:
                    editor.search_by_name()
                    editor.create_node(session, volume_node_id)
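
A sketch of the JSON shape consumed by Volume.from_json above; the loctime value is hypothetical:

vol = Volume.from_json(
    {
        "acronym": "SDM 2019",
        "title": "SIAM International Conference on Data Mining",
        "loctime": "Calgary, Canada, 2019",  # hypothetical loctime string
        "editors": "John Doe,Jane Doe",  # comma-separated editor names
    }
)
# vol.editors -> [Editor(name='John Doe'), Editor(name='Jane Doe')]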

create_node(tx)

Create a Volume node in Neo4j.

Parameters:

    tx: the Neo4j transaction (required)

Returns:

    int: the ID of the created node
    None: if the node was not created

Source code in ceurws/volume_neo4j.py
def create_node(self, tx) -> int | None:
    """
    Create a Volume node in Neo4j.

    Args:
        tx: The Neo4j transaction.

    Returns:
        int: The ID of the created node.
        None: if the node was not created
    """
    query = """
    CREATE (v:Volume {acronym: $acronym, title: $title, loctime: $loctime})
    RETURN id(v) as node_id
    """
    parameters = {
        "acronym": self.acronym,
        "title": self.title,
        "loctime": self.loctime,
    }
    result = tx.run(query, parameters)
    record = result.single()
    if record is not None:
        return record["node_id"]
    else:
        return None

default_source() classmethod

get the default source

Source code in ceurws/volume_neo4j.py
@classmethod
def default_source(cls) -> Path:
    """
    get the default source
    """
    default_source = CEURWS.CACHE_DIR / "volumes.json"
    return default_source

from_json(json_data) classmethod

Create a Volume instance from JSON data.

Parameters:

    json_data (dict): the JSON data representing the volume (required)

Returns:

    Volume: the Volume instance created from the JSON data

Source code in ceurws/volume_neo4j.py
@classmethod
def from_json(cls, json_data):
    """
    Create a Volume instance from JSON data.

    Args:
        json_data (dict): The JSON data representing the volume.

    Returns:
        Volume: The Volume instance created from the JSON data.
    """
    editor_names = json_data.get("editors")
    editor_names = editor_names.split(",") if editor_names else []
    editors = [Editor(name=name.strip()) for name in editor_names]
    return cls(
        acronym=json_data.get("acronym"),
        title=json_data.get("title"),
        loctime=json_data.get("loctime"),
        editors=editors,
    )

load_json_file(source) staticmethod

Load volumes from the source JSON file.

Parameters:

    source (str): path to the source JSON file (required)

Returns:

    list[Volume]: the list of loaded volumes

Source code in ceurws/volume_neo4j.py
@staticmethod
def load_json_file(source: str) -> list["Volume"]:
    """
    Load volumes from the source JSON file.

    Args:
        source (str): Path to the source JSON file.

    Returns:
        List[Volume]: The list of loaded volumes.
    """
    with open(source) as file:
        json_data = json.load(file)

    volumes = [Volume.from_json(volume_data) for volume_data in json_data]
    return volumes

parse_args(argv=None) classmethod

Parse command line arguments.

Parameters:

    argv (list): command line arguments (default: None)

Returns:

    argparse.Namespace: the parsed command line arguments

Source code in ceurws/volume_neo4j.py
@classmethod
def parse_args(cls, argv: list | None = None):
    """
    Parse command line arguments.

    Args:
        argv(list): command line arguments

    Returns:
        argparse.Namespace: The parsed command line arguments.
    """

    default_source = cls.default_source()
    parser = argparse.ArgumentParser(description="Volume/Editor/Location Information")
    parser.add_argument("--source", default=str(default_source), help="Source JSON file path")
    # Add progress option
    parser.add_argument(
        "--progress",
        action="store_true",
        help="Display progress information",
    )

    return parser.parse_args(argv)
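
A short sketch of parse_args with explicit arguments; the file path is an example:

args = Volume.parse_args(["--source", "volumes.json", "--progress"])
print(args.source)    # volumes.json
print(args.progress)  # True
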

volume_view

Created on 2024-02-23

@author: wf

VolumeListView

Bases: View

show a list of volumes as a table

Source code in ceurws/volume_view.py
class VolumeListView(View):
    """
    show a list of volumes as a table
    """

    def __init__(self, solution, parent):
        """
        constructor

        Args:
            solution: the solution
            parent: the parent UI container

        """
        self.solution = solution
        self.parent = parent
        self.wdSync = self.solution.wdSync
        self.dry_run = True
        self.ignore_errors = False
        self.get_volume_lod()
        self.setup_ui()

    def setup_ui(self):
        """
        show my volumes as a list
        """
        try:
            with ui.row() as self.button_row:
                self.check_recently_added_volumes_button = (
                    ui.button(
                        icon="cloud_download",
                        on_click=self.on_check_recently_update_volumes_button_click,
                    )
                    .classes("btn btn-primary btn-sm col-1")
                    .tooltip("check for recently added volumes")
                )
                self.wikidataButton = (
                    ui.button(
                        icon="web",
                        on_click=self.onWikidataButtonClick,
                    )
                    .classes("btn btn-primary btn-sm col-1")
                    .tooltip("Export to Wikidata")
                )
                self.dry_run_switch = ui.switch("dry run").bind_value(self, "dry_run")
                self.ignore_errors_check_box = ui.checkbox("ignore_errors", value=self.ignore_errors).bind_value(
                    self, "ignore_errors"
                )
                self.progress_bar = NiceguiProgressbar(total=100, desc="added", unit="volume")
            with ui.row() as self.log_row:
                self.log_view = ui.html()
            with ui.row() as self.grid_row:
                grid_config = GridConfig(key_col="Vol", multiselect=True)
                self.lod_grid = ListOfDictsGrid(lod=self.lod, config=grid_config)
                # Modify the columnDefs for the "Title" column after grid initialization
                for col_def in self.lod_grid.ag_grid.options["columnDefs"]:
                    if col_def["field"] == "Title":  # Identify the "Title" column
                        col_def["maxWidth"] = 400  # width in pixels
                self.lod_grid.sizeColumnsToFit()
        except Exception as ex:
            self.solution.handle_exception(ex)

    def clear_msg(self, msg: str = ""):
        """
        clear the log_view with the given message

        Args:
            msg(str): the message to display
        """
        with self.log_row:
            self.log_view.content = msg

    def add_msg(self, html_markup: str):
        """
        add the given html_markup message to the log_view

        Args:
            html_markup(str): the html formatted message to add
        """
        with self.log_row:
            self.log_view.content += html_markup

    def updateWikidataVolumes(self, selected_rows):
        """
        update wikidata volumes for the selected rows
        """
        try:
            msg = f"{len(selected_rows)} Volumes selected<br>"
            self.clear_msg(msg)
            # First, sort selected_rows by the volume number in ascending order
            sorted_rows = sorted(selected_rows, key=lambda row: row["#"])
            for row in sorted_rows:
                vol_number = row["#"]
                volume = self.wdSync.volumesByNumber[vol_number]
                self.add_or_update_volume_in_wikidata(volume)
        except Exception as ex:
            self.solution.handle_exception(ex)

    async def onWikidataButtonClick(self, _args):
        """
        handle wikidata sync request
        """
        selected_rows = await self.lod_grid.get_selected_rows()
        await run.io_bound(self.updateWikidataVolumes, selected_rows)

    def check_recently_updated_volumes(self):
        """
        check recently updated volumes
        """
        try:
            text = "checking CEUR-WS index.html for recently added volumes ..."
            self.clear_msg(text)
            (
                volumesByNumber,
                addedVolumeNumberList,
            ) = self.wdSync.getRecentlyAddedVolumeList()
            self.add_msg(f"<br>found {len(addedVolumeNumberList)} new volumes")
            total = len(addedVolumeNumberList)
            self.progress_bar.total = total
            for i, volumeNumber in enumerate(addedVolumeNumberList):
                if i % 100 == 0 and i != 0:
                    self.wdSync.storeVolumes()
                    time.sleep(60)
                volume = volumesByNumber[volumeNumber]
                self.updateRecentlyAddedVolume(volume, i + 1, total)
                url = f"/volume/{volume.number}"
                text = f"{volume}:{volume.acronym}"
                link = self.createLink(url, text)
                self.add_msg(f":{link}")
            self.wdSync.storeVolumes()
            with self.parent:
                self.progress_bar.reset()
            with self.grid_row:
                self.lod_grid.update()
        except Exception as ex:
            self.solution.handle_exception(ex)

    async def on_check_recently_update_volumes_button_click(self, args):
        """
        handle clicking of the refresh button to get recently added volumes
        """
        await run.io_bound(self.check_recently_updated_volumes)

    def updateRecentlyAddedVolume(self, volume, index, total):
        """
        update a recently added Volume

        Args:
            volume(Volume): the volume to update
            index(int): the relative index of the volume currently being added
            total(int): the total number of volumes currently being added
        """
        html_msg = f"<br>reading {index}/{total} from {volume.url}"
        self.add_msg(html_msg)
        volume.extractValuesFromVolumePage()
        self.wdSync.addVolume(volume)
        self.progress_bar.update_value(index)

    def get_volume_lod(self):
        """
        get the list of dicts of all volumes
        """
        self.lod = []
        volumeList = self.wdSync.vm.getList()
        reverseVolumeList = sorted(volumeList, key=lambda volume: volume.number, reverse=True)
        for volume in reverseVolumeList:
            validMark = "✅" if volume.valid else "❌"
            self.lod.append(
                {
                    "#": volume.number,
                    "Vol": self.createLink(volume.url, f"Vol-{volume.number:04}"),
                    "Acronym": self.getValue(volume, "acronym"),
                    "Title": self.getValue(volume, "title"),
                    "Loctime": self.getValue(volume, "loctime"),
                    "Published": self.getValue(volume, "published"),
                    "SubmittedBy": self.getValue(volume, "submittedBy"),
                    "valid": validMark,
                }
            )

    def add_or_update_volume_in_wikidata(self, volume: Volume):
        """
        add the given volume to wikidata or update it if it already exists

        Args:
            volume(Volume): the CEUR-WS volume to update proceedings and event entries for
        """
        try:
            msg = f"trying to add Volume {volume.number} to wikidata"
            with self.parent:
                ui.notify(msg)
            self.add_msg(msg + "<br>")
            proceedingsWikidataId = self.createProceedingsItemFromVolume(volume)
            if proceedingsWikidataId is not None:
                self.createEventItemAndLinkProceedings(volume, proceedingsWikidataId)
            else:
                msg = f"<br>adding Volume {volume.number} proceedings to wikidata failed"
                self.add_msg(msg)
                with self.parent:
                    ui.notify(msg)
        except Exception as ex:
            self.solution.handle_exception(ex)

    def optional_login(self) -> bool:
        """
        check if we need to login

        Returns:
            bool: True if write is enabled
        """
        write = not self.dry_run
        if write:
            self.wdSync.login()
        return write

    def createProceedingsItemFromVolume(self, volume: Volume):
        """
        Create wikidata item for proceedings of given volume
        """
        qId = None
        try:
            write = self.optional_login()
            # check if already in wikidata → use URN
            urn = volume.urn
            wdItems = self.wdSync.getProceedingWdItemsByUrn(urn)
            if len(wdItems) > 0:
                html = f"Volume {volume.number} already in Wikidata see "
                delim = ""
                for wdItem in wdItems:
                    qId = wdItem.split("/")[-1]
                    link = self.createLink(wdItem, qId)
                    html += f"{link}{delim}"
                    delim = ","
                self.add_msg(html + "<br>")
            else:
                # A proceedings volume for the URN is not known → create wd entry
                wdRecord = self.wdSync.getWikidataProceedingsRecord(volume)
                if self.dry_run:
                    markup = self.get_dict_as_html_table(wdRecord)
                    self.add_msg(markup)
                result = self.wdSync.addProceedingsToWikidata(wdRecord, write=write, ignoreErrors=self.ignore_errors)
                qId = result.qid
                if qId is not None:
                    proc_link = self.createWdLink(
                        qId,
                        f"Proceedings entry for Vol {volume.number} {qId} was created",
                    )
                    self.add_msg(proc_link)
                else:
                    self.add_msg(f"Creating wikidata Proceedings entry for Vol {volume.number} failed")
                    for key, value in result.errors.items():
                        msg = f"{key}:{value}"
                        self.add_msg(msg)
        except Exception as ex:
            self.solution.handle_exception(ex)
        return qId

    def createEventItemAndLinkProceedings(self, volume: Volume, proceedingsWikidataId: str | None = None):
        """
        Create event wikidata item for given volume and link
        the proceedings with the event

        Args:
            volume(Volume): the volume for which to create the event item
            proceedingsWikidataId: wikidata id of the proceedings
        """
        try:
            write = self.optional_login()
            results = self.wdSync.doCreateEventItemAndLinkProceedings(volume, proceedingsWikidataId, write=write)
            if write:
                self.wdSync.logout()
            for key, result in results.items():
                if result.qid:
                    if key == "dblp":
                        url = f"https://dblp.org/db/{result.qid}.html"
                        link = self.createLink(url, f"dblp {result.qid}")
                    else:
                        link = self.createWdLink(
                            result.qid,
                            f"{key} for Vol {volume.number} {result.qid}",
                        )
                    self.add_msg("<br>" + link)
                if result.msg:
                    self.add_msg("<br>" + result.msg)
                if len(result.errors) > 0:
                    for error in result.errors.values():
                        self.add_msg(f"error {str(error)}")
        except Exception as ex:
            self.solution.handle_exception(ex)
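
Example: a minimal usage sketch (not part of the library) showing how the view might be embedded in a NiceGUI page; my_solution is a hypothetical object that provides wdSync and handle_exception:

from nicegui import ui

@ui.page("/volumes")
def volumes_page():
    # the view builds its buttons, log and grid into the enclosing container
    with ui.column() as parent:
        VolumeListView(solution=my_solution, parent=parent)  # my_solution is hypothetical

ui.run()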

__init__(solution, parent)

constructor

Parameters:

    solution: the solution (required)
    parent: the parent UI container (required)
Source code in ceurws/volume_view.py
def __init__(self, solution, parent):
    """
    constructor

    Args:
        solution: the solution
        parent: the parent UI container

    """
    self.solution = solution
    self.parent = parent
    self.wdSync = self.solution.wdSync
    self.dry_run = True
    self.ignore_errors = False
    self.get_volume_lod()
    self.setup_ui()

add_msg(html_markup)

add the given html_markup message to the log_view

Parameters:

    html_markup(str): the html formatted message to add (required)
Source code in ceurws/volume_view.py
def add_msg(self, html_markup: str):
    """
    add the given html_markup message to the log_view

    Args:
        html_markup(str): the html formatted message to add
    """
    with self.log_row:
        self.log_view.content += html_markup

add_or_update_volume_in_wikidata(volume)

add the given volume to wikidata or update it if it already exists

Parameters:

    volume(Volume): the CEUR-WS volume to update proceedings and event entries for (required)
Source code in ceurws/volume_view.py
def add_or_update_volume_in_wikidata(self, volume: Volume):
    """
    add the given volume to wikidata or update it if it already exists

    Args:
        volume(Volume): the CEUR-WS volume to update proceedings and event entries for
    """
    try:
        msg = f"trying to add Volume {volume.number} to wikidata"
        with self.parent:
            ui.notify(msg)
        self.add_msg(msg + "<br>")
        proceedingsWikidataId = self.createProceedingsItemFromVolume(volume)
        if proceedingsWikidataId is not None:
            self.createEventItemAndLinkProceedings(volume, proceedingsWikidataId)
        else:
            msg = f"<br>adding Volume {volume.number} proceedings to wikidata failed"
            self.add_msg(msg)
            with self.parent:
                ui.notify(msg)
    except Exception as ex:
        self.solution.handle_exception(ex)

check_recently_updated_volumes()

check recently updated volumes

Source code in ceurws/volume_view.py
def check_recently_updated_volumes(self):
    """
    check recently updated volumes
    """
    try:
        text = "checking CEUR-WS index.html for recently added volumes ..."
        self.clear_msg(text)
        (
            volumesByNumber,
            addedVolumeNumberList,
        ) = self.wdSync.getRecentlyAddedVolumeList()
        self.add_msg(f"<br>found {len(addedVolumeNumberList)} new volumes")
        total = len(addedVolumeNumberList)
        self.progress_bar.total = total
        for i, volumeNumber in enumerate(addedVolumeNumberList):
            if i % 100 == 0 and i != 0:
                self.wdSync.storeVolumes()
                time.sleep(60)
            volume = volumesByNumber[volumeNumber]
            self.updateRecentlyAddedVolume(volume, i + 1, total)
            url = f"/volume/{volume.number}"
            text = f"{volume}:{volume.acronym}"
            link = self.createLink(url, text)
            self.add_msg(f":{link}")
        self.wdSync.storeVolumes()
        with self.parent:
            self.progress_bar.reset()
        with self.grid_row:
            self.lod_grid.update()
    except Exception as ex:
        self.solution.handle_exception(ex)
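
The loop above checkpoints and throttles itself: every 100 volumes it persists progress via storeVolumes and sleeps for a minute before continuing. The same pattern in isolation (illustrative sketch; added_numbers and process are hypothetical names, with process standing in for updateRecentlyAddedVolume):

import time

for i, number in enumerate(added_numbers):
    if i % 100 == 0 and i != 0:
        wdSync.storeVolumes()  # checkpoint progress every 100 volumes
        time.sleep(60)         # back off before fetching the next batch
    process(number)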

clear_msg(msg='')

clear the log_view with the given message

Parameters:

    msg(str): the message to display (default: '')
Source code in ceurws/volume_view.py
def clear_msg(self, msg: str = ""):
    """
    clear the log_view with the given message

    Args:
        msg(str): the message to display
    """
    with self.log_row:
        self.log_view.content = msg

createEventItemAndLinkProceedings(volume, proceedingsWikidataId=None)

Create event wikidata item for given volume and link the proceedings with the event

Parameters:

    volume(Volume): the volume for which to create the event item (required)
    proceedingsWikidataId (str | None): wikidata id of the proceedings (default: None)
Source code in ceurws/volume_view.py
def createEventItemAndLinkProceedings(self, volume: Volume, proceedingsWikidataId: str | None = None):
    """
    Create event wikidata item for given volume and link
    the proceedings with the event

    Args:
        volume(Volume): the volume for which to create the event item
        proceedingsWikidataId: wikidata id of the proceedings
    """
    try:
        write = self.optional_login()
        results = self.wdSync.doCreateEventItemAndLinkProceedings(volume, proceedingsWikidataId, write=write)
        if write:
            self.wdSync.logout()
        for key, result in results.items():
            if result.qid:
                if key == "dblp":
                    url = f"https://dblp.org/db/{result.qid}.html"
                    link = self.createLink(url, f"dblp {result.qid}")
                else:
                    link = self.createWdLink(
                        result.qid,
                        f"{key} for Vol {volume.number} {result.qid}",
                    )
                self.add_msg("<br>" + link)
            if result.msg:
                self.add_msg("<br>" + result.msg)
            if len(result.errors) > 0:
                for error in result.errors.values():
                    self.add_msg(f"error {str(error)}")
    except Exception as ex:
        self.solution.handle_exception(ex)

createProceedingsItemFromVolume(volume)

Create wikidata item for proceedings of given volume

Source code in ceurws/volume_view.py
def createProceedingsItemFromVolume(self, volume: Volume):
    """
    Create wikidata item for proceedings of given volume
    """
    qId = None
    try:
        write = self.optional_login()
        # check if already in wikidata → use URN
        urn = volume.urn
        wdItems = self.wdSync.getProceedingWdItemsByUrn(urn)
        if len(wdItems) > 0:
            html = f"Volume {volume.number} already in Wikidata see "
            delim = ""
            for wdItem in wdItems:
                qId = wdItem.split("/")[-1]
                link = self.createLink(wdItem, qId)
                html += f"{link}{delim}"
                delim = ","
            self.add_msg(html + "<br>")
        else:
            # A proceedings volume for the URN is not known → create wd entry
            wdRecord = self.wdSync.getWikidataProceedingsRecord(volume)
            if self.dry_run:
                markup = self.get_dict_as_html_table(wdRecord)
                self.add_msg(markup)
            result = self.wdSync.addProceedingsToWikidata(wdRecord, write=write, ignoreErrors=self.ignore_errors)
            qId = result.qid
            if qId is not None:
                proc_link = self.createWdLink(
                    qId,
                    f"Proceedings entry for Vol {volume.number} {qId} was created",
                )
                self.add_msg(proc_link)
            else:
                self.add_msg(f"Creating wikidata Proceedings entry for Vol {volume.number} failed")
                for key, value in result.errors.items():
                    msg = f"{key}:{value}"
                    self.add_msg(msg)
    except Exception as ex:
        self.solution.handle_exception(ex)
    return qId
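
The URN serves as the deduplication key here: getProceedingWdItemsByUrn returns the URLs of existing proceedings items, and a new record is only written when none are found. Illustrative check (volume is assumed to be a loaded Volume instance):

existing = wdSync.getProceedingWdItemsByUrn(volume.urn)
if existing:
    qId = existing[0].split("/")[-1]  # extract the Q identifier from the item URL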

get_volume_lod()

get the list of dicts of all volumes

Source code in ceurws/volume_view.py
def get_volume_lod(self):
    """
    get the list of dicts of all volumes
    """
    self.lod = []
    volumeList = self.wdSync.vm.getList()
    reverseVolumeList = sorted(volumeList, key=lambda volume: volume.number, reverse=True)
    for volume in reverseVolumeList:
        validMark = "✅" if volume.valid else "❌"
        self.lod.append(
            {
                "#": volume.number,
                "Vol": self.createLink(volume.url, f"Vol-{volume.number:04}"),
                "Acronym": self.getValue(volume, "acronym"),
                "Title": self.getValue(volume, "title"),
                "Loctime": self.getValue(volume, "loctime"),
                "Published": self.getValue(volume, "published"),
                "SubmittedBy": self.getValue(volume, "submittedBy"),
                "valid": validMark,
            }
        )
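
Each grid row built above is a flat dict keyed by column name, with the Vol cell rendered as an HTML anchor by createLink; a representative row with made-up values:

row = {
    "#": 1234,
    "Vol": "<a href='http://ceur-ws.org/Vol-1234'>Vol-1234</a>",
    "Acronym": "ACRO 2024",
    "Title": "Example Workshop Proceedings",
    "Loctime": "Example City, Exampleland, January 1, 2024",
    "Published": "2024-01-15",
    "SubmittedBy": "editor",
    "valid": "✅",
}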

onWikidataButtonClick(_args) async

handle wikidata sync request

Source code in ceurws/volume_view.py
async def onWikidataButtonClick(self, _args):
    """
    handle wikidata sync request
    """
    selected_rows = await self.lod_grid.get_selected_rows()
    await run.io_bound(self.updateWikidataVolumes, selected_rows)

on_check_recently_update_volumes_button_click(args) async

handle clicking of the refresh button to get recently added volumes

Source code in ceurws/volume_view.py
async def on_check_recently_update_volumes_button_click(self, args):
    """
    handle clicking of the refresh button to get recently added volumes
    """
    await run.io_bound(self.check_recently_updated_volumes)

optional_login()

check if we need to login

Returns:

    bool: True if write is enabled

Source code in ceurws/volume_view.py
def optional_login(self) -> bool:
    """
    check if we need to login

    Returns:
        bool: True if write is enabled
    """
    write = not self.dry_run
    if write:
        self.wdSync.login()
    return write
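
Illustration of the dry-run gate (view is an existing VolumeListView instance): with dry_run left at its default of True no login happens and write stays False, so downstream calls only simulate the export:

view.dry_run = True
write = view.optional_login()  # no login, write == False → simulate only
view.dry_run = False
write = view.optional_login()  # logs in to Wikidata, write == True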

setup_ui()

show my volumes as a list

Source code in ceurws/volume_view.py
def setup_ui(self):
    """
    show my volumes as a list
    """
    try:
        with ui.row() as self.button_row:
            self.check_recently_added_volumes_button = (
                ui.button(
                    icon="cloud_download",
                    on_click=self.on_check_recently_update_volumes_button_click,
                )
                .classes("btn btn-primary btn-sm col-1")
                .tooltip("check for recently added volumes")
            )
            self.wikidataButton = (
                ui.button(
                    icon="web",
                    on_click=self.onWikidataButtonClick,
                )
                .classes("btn btn-primary btn-sm col-1")
                .tooltip("Export to Wikidata")
            )
            self.dry_run_switch = ui.switch("dry run").bind_value(self, "dry_run")
            self.ignore_errors_check_box = ui.checkbox("ignore_errors", value=self.ignore_errors).bind_value(
                self, "ignore_errors"
            )
            self.progress_bar = NiceguiProgressbar(total=100, desc="added", unit="volume")
        with ui.row() as self.log_row:
            self.log_view = ui.html()
        with ui.row() as self.grid_row:
            grid_config = GridConfig(key_col="Vol", multiselect=True)
            self.lod_grid = ListOfDictsGrid(lod=self.lod, config=grid_config)
            # Modify the columnDefs for the "Title" column after grid initialization
            for col_def in self.lod_grid.ag_grid.options["columnDefs"]:
                if col_def["field"] == "Title":  # Identify the "Title" column
                    col_def["maxWidth"] = 400  # width in pixels
            self.lod_grid.sizeColumnsToFit()
    except Exception as ex:
        self.solution.handle_exception(ex)

updateRecentlyAddedVolume(volume, index, total)

update a recently added Volume

Parameters:

    volume(Volume): the volume to update (required)
    index(int): the relative index of the volume currently being added (required)
    total(int): the total number of volumes currently being added (required)
Source code in ceurws/volume_view.py
def updateRecentlyAddedVolume(self, volume, index, total):
    """
    update a recently added Volume

    Args:
        volume(Volume): the volume to update
        index(int): the relative index of the volume currently being added
        total(int): the total number of volumes currently being added
    """
    html_msg = f"<br>reading {index}/{total} from {volume.url}"
    self.add_msg(html_msg)
    volume.extractValuesFromVolumePage()
    self.wdSync.addVolume(volume)
    self.progress_bar.update_value(index)

updateWikidataVolumes(selected_rows)

update wikidata volumes for the selected rows

Source code in ceurws/volume_view.py
def updateWikidataVolumes(self, selected_rows):
    """
    update wikidata volumes for the selected rows
    """
    try:
        msg = f"{len(selected_rows)} Volumes selected<br>"
        self.clear_msg(msg)
        # First, sort selected_rows by the volume number in ascending order
        sorted_rows = sorted(selected_rows, key=lambda row: row["#"])
        for row in sorted_rows:
            vol_number = row["#"]
            volume = self.wdSync.volumesByNumber[vol_number]
            self.add_or_update_volume_in_wikidata(volume)
    except Exception as ex:
        self.solution.handle_exception(ex)

VolumeView

Bases: View

displays a single volume

Source code in ceurws/volume_view.py
class VolumeView(View):
    """
    displays a single volume
    """

    def __init__(self, solution, parent):
        """
        constructor

        Args:
            solution: the solution
            parent: the parent UI container

        """
        self.solution = solution
        self.parent = parent
        self.volumeToolBar = None
        self.wdSync = self.solution.wdSync
        self.wdSpan = None

    def setup_ui(self):
        """
        setup my User Interface elements
        """
        with self.parent:
            with ui.row() as self.volumeToolBar:
                self.volumeRefreshButton = (
                    ui.button(
                        icon="refresh",
                        on_click=self.onRefreshButtonClick,
                    )
                    .classes("btn btn-primary btn-sm col-1")
                    .tooltip("Refresh from CEUR-WS Volume page")
                )
                self.wikidataButton = (
                    ui.button(
                        icon="web",
                        on_click=self.onWikidataButtonClick,
                    )
                    .classes("btn btn-primary btn-sm col-1")
                    .tooltip("Export to Wikidata")
                )
            self.header_view = ui.html()
            self.iframe_view = ui.html().classes("w-full").style("height: 80vh;")

    def updateWikidataSpan(self, qId: str, volume: Volume):
        """
        create a Wikidata Export span

        Args:
            qId(str): wikidata item Q Identifier
            volume(Volume): the Volume
        """
        if self.wdSpan is None:
            self.wdSpan = ui.html()
        volume_link = Link.create(url=self.volume.url, text=f"{volume.number}:{volume.acronym}")
        wd_url = self.wdSync.itemUrl(qId)
        wd_link = Link.create(url=wd_url, text=f"{qId} ")
        self.wdSpan.content = f"{volume_link}{wd_link}"

    def showVolume(self, volume: Volume):
        """
        show the given volume

        Args:
            volume(Volume): the volume to show
        """
        try:
            self.volume = volume
            if self.volumeToolBar is None:
                self.setup_ui()

            wdProc = self.wdSync.getProceedingsForVolume(volume.number)
            self.wikidataButton.disabled = wdProc is not None
            links = ""
            if wdProc is not None:
                # wikidata proceedings link
                itemLink = self.createLink(wdProc["item"], "wikidataitem")
                # dblp proceedings link
                dblpLink = self.createExternalLink(
                    wdProc,
                    "dblpEventId",
                    "dblp",
                    DblpEndpoint.DBLP_EVENT_PREFIX,
                    emptyIfNone=True,
                )
                # k10plus proceedings link
                k10PlusLink = self.createExternalLink(
                    wdProc,
                    "ppnId",
                    "k10plus",
                    "https://opac.k10plus.de/DB=2.299/PPNSET?PPN=",
                    emptyIfNone=True,
                )
                # scholia proceedings link
                scholiaLink = self.createExternalLink(
                    wdProc,
                    "item",
                    "scholia",
                    "https://scholia.toolforge.org/venue/",
                    emptyIfNone=True,
                )
                # scholia event link
                scholiaEventLink = self.createExternalLink(
                    wdProc,
                    "event",
                    "event",
                    "https://scholia.toolforge.org/event/",
                    emptyIfNone=True,
                )
                # scholia event series link
                scholiaEventSeriesLink = self.createExternalLink(
                    wdProc,
                    "eventSeries",
                    "series",
                    "https://scholia.toolforge.org/event-series/",
                    emptyIfNone=True,
                )
                # scholia colocated with link
                delim = ""
                for link in [
                    itemLink,
                    dblpLink,
                    k10PlusLink,
                    scholiaLink,
                    scholiaEventLink,
                    scholiaEventSeriesLink,
                ]:
                    if link:
                        links += delim + link
                        delim = "&nbsp;"

            headerHtml = f"""
    {links}<h3 style='font-size: 24px; font-weight: normal; margin-top: 20px; margin-bottom: 10px;'>{volume.h1}</h3>
    <a href='{volume.url}'>{volume.acronym}</a>
    {volume.title}<br>
    {volume.desc}
    published: {volume.pubDate}
    submitted By: {volume.submittedBy}"""
            iframeHtml = f"""
            <iframe src='{volume.url}' style='width: 100%; height: 80vh; border: none;'></iframe>"""
            self.header_view.content = headerHtml
            self.iframe_view.content = iframeHtml

        except Exception as ex:
            self.solution.handle_exception(ex)

    async def onRefreshButtonClick(self, _args):
        try:
            self.volume.extractValuesFromVolumePage()
            msg = f"updated from {self.volume.url}"
            ui.notify(msg)
            self.showVolume(self.volume)
            # self.wdSync.storeVolumes()
        except Exception as ex:
            self.solution.handle_exception(ex)

    async def onWikidataButtonClick(self, _args):
        """
        handle wikidata sync request
        """
        try:
            wdRecord = self.wdSync.getWikidataProceedingsRecord(self.volume)
            result = self.wdSync.addProceedingsToWikidata(wdRecord, write=True, ignoreErrors=False)
            qId = result.qid
            if qId is not None:
                msg = f"wikidata export of {self.volume.number} to {qId} done"
                ui.notify(msg)
                self.updateWikidataSpan(qId=qId, volume=self.volume)
            else:
                err_msg = f"error:{result.error}"
                self.solution.log_view.push(err_msg)
        except Exception as ex:
            self.solution.handle_exception(ex)

__init__(solution, parent)

constructor

Parameters:

    solution: the solution (required)
    parent: the parent UI container (required)
Source code in ceurws/volume_view.py
def __init__(self, solution, parent):
    """
    constructor

    Args:
        solution: the solution
        parent: the parent UI container

    """
    self.solution = solution
    self.parent = parent
    self.volumeToolBar = None
    self.wdSync = self.solution.wdSync
    self.wdSpan = None

onWikidataButtonClick(_args) async

handle wikidata sync request

Source code in ceurws/volume_view.py
async def onWikidataButtonClick(self, _args):
    """
    handle wikidata sync request
    """
    try:
        wdRecord = self.wdSync.getWikidataProceedingsRecord(self.volume)
        result = self.wdSync.addProceedingsToWikidata(wdRecord, write=True, ignoreErrors=False)
        qId = result.qid
        if qId is not None:
            msg = f"wikidata export of {self.volume.number} to {qId} done"
            ui.notify(msg)
            self.updateWikidataSpan(qId=qId, volume=self.volume)
        else:
            err_msg = f"error:{result.error}"
            self.solution.log_view.push(err_msg)
    except Exception as ex:
        self.solution.handle_exception(ex)

setup_ui()

setup my User Interface elements

Source code in ceurws/volume_view.py
def setup_ui(self):
    """
    setup my User Interface elements
    """
    with self.parent:
        with ui.row() as self.volumeToolBar:
            self.volumeRefreshButton = (
                ui.button(
                    icon="refresh",
                    on_click=self.onRefreshButtonClick,
                )
                .classes("btn btn-primary btn-sm col-1")
                .tooltip("Refresh from CEUR-WS Volume page")
            )
            self.wikidataButton = (
                ui.button(
                    icon="web",
                    on_click=self.onWikidataButtonClick,
                )
                .classes("btn btn-primary btn-sm col-1")
                .tooltip("Export to Wikidata")
            )
        self.header_view = ui.html()
        self.iframe_view = ui.html().classes("w-full").style("height: 80vh;")

showVolume(volume)

show the given volume

Parameters:

    volume(Volume): the volume to show (required)
Source code in ceurws/volume_view.py
def showVolume(self, volume: Volume):
    """
    show the given volume

    Args:
        volume(Volume): the volume to show
    """
    try:
        self.volume = volume
        if self.volumeToolBar is None:
            self.setup_ui()

        wdProc = self.wdSync.getProceedingsForVolume(volume.number)
        self.wikidataButton.disabled = wdProc is not None
        links = ""
        if wdProc is not None:
            # wikidata proceedings link
            itemLink = self.createLink(wdProc["item"], "wikidataitem")
            # dblp proceedings link
            dblpLink = self.createExternalLink(
                wdProc,
                "dblpEventId",
                "dblp",
                DblpEndpoint.DBLP_EVENT_PREFIX,
                emptyIfNone=True,
            )
            # k10plus proceedings link
            k10PlusLink = self.createExternalLink(
                wdProc,
                "ppnId",
                "k10plus",
                "https://opac.k10plus.de/DB=2.299/PPNSET?PPN=",
                emptyIfNone=True,
            )
            # scholia proceedings link
            scholiaLink = self.createExternalLink(
                wdProc,
                "item",
                "scholia",
                "https://scholia.toolforge.org/venue/",
                emptyIfNone=True,
            )
            # scholia event link
            scholiaEventLink = self.createExternalLink(
                wdProc,
                "event",
                "event",
                "https://scholia.toolforge.org/event/",
                emptyIfNone=True,
            )
            # scholia event series link
            scholiaEventSeriesLink = self.createExternalLink(
                wdProc,
                "eventSeries",
                "series",
                "https://scholia.toolforge.org/event-series/",
                emptyIfNone=True,
            )
            # scholia colocated with link
            delim = ""
            for link in [
                itemLink,
                dblpLink,
                k10PlusLink,
                scholiaLink,
                scholiaEventLink,
                scholiaEventSeriesLink,
            ]:
                if link:
                    links += delim + link
                    delim = "&nbsp;"

        headerHtml = f"""
{links}<h3 style='font-size: 24px; font-weight: normal; margin-top: 20px; margin-bottom: 10px;'>{volume.h1}</h3>
<a href='{volume.url}'>{volume.acronym}</a>
{volume.title}<br>
{volume.desc}
published: {volume.pubDate}
submitted By: {volume.submittedBy}"""
        iframeHtml = f"""
        <iframe src='{volume.url}' style='width: 100%; height: 80vh; border: none;'></iframe>"""
        self.header_view.content = headerHtml
        self.iframe_view.content = iframeHtml

    except Exception as ex:
        self.solution.handle_exception(ex)

updateWikidataSpan(qId, volume)

create a Wikidata Export span

Parameters:

    qId(str): wikidata item Q Identifier (required)
    volume(Volume): the Volume (required)
Source code in ceurws/volume_view.py
def updateWikidataSpan(self, qId: str, volume: Volume):
    """
    create a Wikidata Export span

    Args:
        qId(str): wikidata item Q Identifier
        volume(Volume): the Volume
    """
    if self.wdSpan is None:
        self.wdSpan = ui.html()
    volume_link = Link.create(url=self.volume.url, text=f"{volume.number}:{volume.acronym}")
    wd_url = self.wdSync.itemUrl(qId)
    wd_link = Link.create(url=wd_url, text=f"{qId} ")
    self.wdSpan.content = f"{volume_link}{wd_link}"
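
A minimal usage sketch for VolumeView (illustrative; my_solution and the volume number are hypothetical):

with ui.column() as parent:
    volume_view = VolumeView(solution=my_solution, parent=parent)
    volume = my_solution.wdSync.volumesByNumber[3000]  # made-up volume number
    volume_view.showVolume(volume)  # builds the toolbar on first call, then renders the page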

volumeparser

Created on 2022-08-14

@author: wf

VolumePageCache

Cache interface for ceur-ws volume pages

Source code in ceurws/volumeparser.py
class VolumePageCache:
    """
    Cache interface for ceur-ws volume pages
    """

    cache_location: Path = CEURWS.CACHE_DIR / "volumes"

    @classmethod
    def is_cached(cls, number: int) -> bool:
        """
        Check if the volume page of the given volume number is cached
        Args:
            number: volume number of the volume page

        Returns:
            True if the corresponding volume page is cached
        """
        return cls._get_volume_cache_path(number).is_file()

    @classmethod
    def cache(cls, number: int, html: str | bytes):
        """
        cache the volume page corresponding to the given number
        Args:
            number: number of the volume to cache
            html: html of the volume page to cache
        """
        if html is None:
            return
        Path(cls.cache_location).mkdir(parents=True, exist_ok=True)
        filename = cls._get_volume_cache_path(number)
        mode = "w"
        if isinstance(html, bytes):
            mode += "b"
        with open(filename, mode=mode) as f:
            f.write(html)

    @classmethod
    def _get_volume_cache_path(cls, number: int) -> Path:
        """
        get the name of the volume cache file
        """
        return cls.cache_location / f"Vol-{number}.html"

    @classmethod
    def get(cls, number: int) -> str | bytes | None:
        """
        Get the cached volume page of the given volume number.
        If the volume page is not cached None is returned.
        Args:
            number: volume number to retrieve

        Returns:
            str: cached volume page
            bytes: if the cached volume page contains encoding errors
            None: if no volume with the given number is cached
        """
        volume_page: str | bytes | None = None
        if cls.is_cached(number):
            filepath = cls._get_volume_cache_path(number)
            try:
                volume_page = filepath.read_text()
            except UnicodeDecodeError as _ex:
                volume_page = filepath.read_bytes()
        return volume_page

    @classmethod
    def delete(cls, number: int):
        """
        Delete the cache corresponding to the given volume number
        Args:
            number: volume number
        """
        if cls.is_cached(number):
            filepath = cls._get_volume_cache_path(number)
            os.remove(filepath)

cache(number, html) classmethod

cache the volume page corresponding to the given number

Parameters:

    number: number of the volume to cache
    html: html of the volume page to cache

Source code in ceurws/volumeparser.py
@classmethod
def cache(cls, number: int, html: str | bytes):
    """
    cache the volume page corresponding to the given number
    Args:
        number: number of the volume to cache
        html: html of the volume page to cache
    """
    if html is None:
        return
    Path(cls.cache_location).mkdir(parents=True, exist_ok=True)
    filename = cls._get_volume_cache_path(number)
    mode = "w"
    if isinstance(html, bytes):
        mode += "b"
    with open(filename, mode=mode) as f:
        f.write(html)

delete(number) classmethod

Delete the cache corresponding to the given volume number

Parameters:

    number: volume number

Source code in ceurws/volumeparser.py
@classmethod
def delete(cls, number: int):
    """
    Delete the cache corresponding to the given volume number
    Args:
        number: volume number
    """
    if cls.is_cached(number):
        filepath = cls._get_volume_cache_path(number)
        os.remove(filepath)

get(number) classmethod

Get the cached volume page of the given volume number. If the volume page is not cached None is returned.

Parameters:

    number: volume number to retrieve

Returns:

    str: cached volume page
    bytes: if the cached volume page contains encoding errors
    None: if no volume with the given number is cached

Source code in ceurws/volumeparser.py
@classmethod
def get(cls, number: int) -> str | bytes | None:
    """
    Get the cached volume page of the given volume number.
    If the volume page is not cached None is returned.
    Args:
        number: volume number to retrieve

    Returns:
        str: cached volume page
        bytes: if the cached volume page contains encoding errors
        None: if no volume with the given number is cached
    """
    volume_page: str | bytes | None = None
    if cls.is_cached(number):
        filepath = cls._get_volume_cache_path(number)
        try:
            volume_page = filepath.read_text()
        except UnicodeDecodeError as _ex:
            volume_page = filepath.read_bytes()
    return volume_page

is_cached(number) classmethod

Check if the volume page of the given volume number is cached

Parameters:

    number: volume number of the volume page

Returns:

    bool: True if the corresponding volume page is cached

Source code in ceurws/volumeparser.py
@classmethod
def is_cached(cls, number: int) -> bool:
    """
    Check if the volume page of the given volume number is cached
    Args:
        number: volume number of the volume page

    Returns:
        True if the corresponding volume page is cached
    """
    return cls._get_volume_cache_path(number).is_file()
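
Putting the classmethods together (illustrative sketch; the volume number and html are made up):

number = 3000
if not VolumePageCache.is_cached(number):
    VolumePageCache.cache(number, "<html>...</html>")  # store a fetched page
html = VolumePageCache.get(number)  # str, bytes on decode errors, or None
VolumePageCache.delete(number)      # drop the cached page again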

VolumeParser

Bases: Textparser

CEUR-WS VolumeParser

Source code in ceurws/volumeparser.py
class VolumeParser(Textparser):
    """
    CEUR-WS VolumeParser
    """

    def __init__(
        self,
        baseurl: str = "http://ceur-ws.org",
        timeout: float = 3,
        showHtml: bool = False,
        debug: bool = False,
    ):
        """
        Constructor

        Args:
            baseurl(str): the baseurl of the CEUR-WS website
            timeout(float): the number of seconds to wait
            showHtml(bool): if True show the HTML code
            debug(bool): if True switch debugging on
        """
        Textparser.__init__(self, debug=debug)
        self.showHtml = showHtml
        self.baseurl = baseurl
        self.timeout = timeout
        self.scrape = WebScrape(timeout=timeout)

    def volumeUrl(self, volnumber: str | int):
        """
        get the url for the given volume number

        Args:
            volnumber(str): the volume number

        Returns:
            str: url - the url of the volume
        """
        # e.g. http://ceur-ws.org/Vol-2635/
        url = f"{self.baseurl}/Vol-{volnumber}"
        return url

    def getSoup(self, url: str) -> BeautifulSoup | None:
        """
        get the beautiful Soup parser for the given url
        Args:
            url: url to parse

        Returns:
            parsed webpage
        """
        return self.scrape.getSoup(url, showHtml=self.showHtml, debug=self.debug)

    def get_volume_soup(self, number: int, use_cache: bool = True) -> BeautifulSoup | None:
        """
        Get Soup of the volume page for the given volume number
        Args:
            number: volume number of the volume to parse
            use_cache: If True use volume page from cache if present otherwise load from web and cache

        Returns:
            BeautifulSoup: soup of the volume page
            None: soup can not be loaded from cache or from web
        """
        html = self.get_volume_page(number, recache=not use_cache)
        if html is None:
            if self.debug:
                print(f"Vol-{number} could not be retrieved")
            return None
        soup = self.scrape.get_soup_from_string(html, show_html=self.showHtml)
        return soup

    def get_volume_page(self, number: int, recache: bool = False) -> str | bytes | None:
        """
        Get the html content of the given volume number.
        Retrieves the volume page from cache or from ceur-ws.org
        Caches the volume page if not already cached
        Args:
            number: volume number
            recache: If True update the cache with a new fetch from the web. Otherwise, cache is used if present

        Returns:
            html of volume page or None if the volume page is not found
        """
        if not recache and VolumePageCache.is_cached(number):
            volume_page = VolumePageCache.get(number)
        else:
            url = self.volumeUrl(number)
            volume_page = self.scrape.get_html_from_url(url)
            if volume_page:
                VolumePageCache.cache(number, volume_page)
        return volume_page

    def parse_volume(self, number: int, use_cache: bool = True) -> tuple[dict, BeautifulSoup | None]:
        """
        parse the given volume
        caches the volume pages at ~/.ceurws/volumes

        Args:
            number: volume number of the volume to parse
            use_cache: If True use volume page from cache if present otherwise load from web and cache

        Returns:
            dict: extracted information
        """
        soup = self.get_volume_soup(number, use_cache=use_cache)
        parsed_dict = self.parse_soup(number=str(number), soup=soup) if soup else {}
        self.check_parsed_dict(parsed_dict)
        return parsed_dict, soup

    def check_parsed_dict(self, parsed_dict: dict):
        """
        check parsed_dict content e.g. urn check digit
        """
        if "urn" in parsed_dict:
            urn = parsed_dict["urn"]
            if urn:
                urn_prefix = urn[:-1]
                check_digit = URN.calc_urn_checksum(urn_prefix)
                parsed_dict["urn_check_digit"] = check_digit
                urn_ok = URN.check_urn_checksum(urn)
                parsed_dict["urn_ok"] = urn_ok

    def parse(self, url: str) -> dict:
        """
        parse the given url
        Args:
             url: URL to parse the volume information from

        Returns:
            dict: extracted information
        """
        soup = self.getSoup(url)
        parsed_dict = self.parse_soup(soup=soup) if soup else {}
        return parsed_dict

    def parse_soup(self, soup: BeautifulSoup, number: str | None = None) -> dict:
        """
        parse the volume page data from the given soup

        Args:
            number(str): the volume number
            soup(BeautifulSoup): html parser to extract the content from

        Returns:
            dict: parsed content
        """
        if soup is None:
            return {"vol_number": number}
        # first try RDFa annotations
        scrapedDict = self.parseRDFa(soup)
        for key in scrapedDict:
            scrapedDict[key] = Textparser.sanitize(scrapedDict[key])

        # second part
        for descValue in ["description", "descripton"]:
            # descripton is a typo in the Volume index files not here!
            firstDesc = soup.find("meta", {"name": descValue})
            if isinstance(firstDesc, Tag):
                desc = firstDesc["content"]
                desc = Textparser.sanitize(desc, ["CEUR Workshop Proceedings "])
                scrapedDict["desc"] = desc
                break

        # first H1 has title info
        firstH1 = soup.find("h1")
        if firstH1 is not None:
            h1 = firstH1.text
            h1 = Textparser.sanitize(h1, ['<TD bgcolor="#FFFFFF">'])
            scrapedDict["h1"] = h1
            link = firstH1.find("a")
            if link is not None and isinstance(link, Tag) and len(link.text) < 20:
                acronym = link.text.strip()
                if not acronym:
                    acronym = h1 if len(h1) < 28 else h1.split()[0]

                eventHomepage = link.attrs.get("href")
                scrapedDict["acronym"] = acronym
                scrapedDict["homepage"] = eventHomepage

        # first h3 has loctime
        firstH3 = soup.find("h3")
        if firstH3 is not None:
            h3 = firstH3.text
            h3 = Textparser.sanitize(h3)
            scrapedDict["h3"] = h3

        if self.hasValue(scrapedDict, "desc") and not self.hasValue(scrapedDict, "acronym"):
            scrapedDict["acronym"] = scrapedDict["desc"]
        if self.hasValue(scrapedDict, "h1") and not self.hasValue(scrapedDict, "title"):
            scrapedDict["title"] = scrapedDict["h1"]
        if (
            self.hasValue(scrapedDict, "h1")
            and self.hasValue(scrapedDict, "title")
            and not self.hasValue(scrapedDict, "acronym")
        ):
            scrapedDict["acronym"] = scrapedDict["h1"]
        # editorsRecords = self.parseEditors(soup)
        # scrapedDict["editors"] = editorsRecords
        return scrapedDict

    def parseEditors(self, soup: BeautifulSoup):
        """
        parse all editor information contained in the given soup
        parse all information between <b> Edited by </b> ... <hr>
        Args:
            soup: volume web page
        """
        if soup is None:
            return None
        possible_start_elements = soup.find_all("b")
        # find start
        start_elements = []
        for e in possible_start_elements:
            start_tags = ["edited by", "program committee"]
            for tag in start_tags:
                if tag in e.text.lower():
                    start_elements.append(e)
        if len(start_elements) == 0:
            return None
        edited_by = start_elements[0]
        editor_h3 = edited_by.find_next("h3")
        editor_records: dict[str, dict] = dict()
        if editor_h3 is None:
            return None
        editor_spans = editor_h3.find_all(attrs={"class": "CEURVOLEDITOR"})
        if editor_spans is not None and len(editor_spans) > 0:
            for editor_span in editor_spans:
                editor_name = editor_span.text
                editor = {"name": editor_name}
                if editor_span.parent.name == "a":
                    homepage = editor_span.parent.attrs.get("href", None)
                    editor["homepage"] = homepage
                    if editor_span.parent.next_sibling is not None:
                        affiliation_keys = editor_span.parent.next_sibling.text.strip()
                    else:
                        affiliation_keys = None
                else:
                    if editor_span.next_sibling is not None:
                        affiliation_keys = editor_span.next_sibling.text.strip()
                    else:
                        affiliation_keys = None
                if affiliation_keys is None or affiliation_keys == "":
                    sup = editor_span.find_next("sup")
                    if sup is not None:
                        affiliation_keys = sup.text.strip()
                editor["affiliation_keys"] = affiliation_keys
                editor_records[editor_name] = editor
        else:
            editor_elements = []
            group_elements: list[PageElement] = []
            if (
                editor_h3.next_sibling
                and editor_h3.next_sibling.next_sibling
                and editor_h3.next_sibling.next_sibling.name == "h3"
            ):
                while editor_h3.next_sibling.next_sibling.name == "h3" and editor_h3.text.strip() != "":
                    editor_elements.append(editor_h3.contents)
                    editor_h3 = editor_h3.next_sibling.next_sibling
            else:
                for child in editor_h3.childGenerator():
                    if child.name == "br":
                        editor_elements.append(group_elements)
                        group_elements = []
                    else:
                        group_elements.append(child)
            for elements in editor_elements:
                text = "".join([e.text for e in elements]).strip()
                affiliation_key = text.split(" ")[-1]
                editor_name = text[: -len(affiliation_key)]
                links = [e for e in elements if e.name == "a"]
                homepage = links[0].attrs.get("href", None) if len(links) > 0 else None
                editor = {
                    "name": editor_name,
                    "homepage": homepage,
                    "affiliation_key": affiliation_key,
                }
                editor_records[editor_name] = editor
        affiliation_keys = {
            editor.get("affiliation_key")
            for editor in editor_records.values()
            if editor.get("affiliation_key", None) is not None
        }
        affiliation_map = self.parseAffiliationMap(editor_h3.next_sibling)
        for editor_record in editor_records.values():
            editor_keys = editor_record.get("affiliation_keys", "")
            if editor_keys is not None:
                keys = re.split("[, ]", editor_keys)
                editor_affiliations = []
                for key in keys:
                    if key in affiliation_map:
                        editor_affiliations.append(affiliation_map.get(key.strip()))
                editor_record["affiliation"] = editor_affiliations
        return editor_records

    def parseAffiliationMap(self, start: PageElement) -> dict:
        """
        Parse out the affiliations and their reference key
        Args:
            start:

        Returns:
            dict
        """
        if start is None:
            return dict()
        end = start.find_next("hr")
        affiliations_elements = []
        group_elements: list[PageElement] = []
        if isinstance(start.previous, Tag | NavigableString):
            for element in start.previous.nextGenerator():
                if isinstance(element, Tag | NavigableString) and element.name in ["br", "hr"]:
                    affiliations_elements.append(group_elements)
                    group_elements = []
                elif isinstance(element, NavigableString) and element.text.strip() == "":
                    pass
                elif isinstance(element, Tag | NavigableString) and element.name == "h3":
                    # elements inside the element are included through the nextGenerator
                    pass
                else:
                    group_elements.append(element)
                if element == end:
                    break
        affiliations_elements = [x for x in affiliations_elements if x != []]
        affiliation_map = dict()
        for elements in affiliations_elements:
            if isinstance(elements[0], NavigableString) and " " in elements[0].text.strip():
                text_containing_key = elements[0].text.strip()
                key = text_containing_key.split(" ")[0]
                key_element = NavigableString(value=key)
                text_element = NavigableString(value=text_containing_key[len(key) :])
                elements = [key_element, text_element, *elements[1:]]
            key = elements[0].text.strip()
            text_elements = []
            link_elements = []
            for element in elements[1:]:
                if isinstance(element, NavigableString):
                    text_elements.append(element)
                elif isinstance(element, Tag | NavigableString) and element.name == "a":
                    link_elements.append(element)
            affiliation = "".join([elem.text for elem in text_elements])
            affiliation = affiliation.replace("\n", "").replace("\t", "").replace("\r", "")
            if affiliation.startswith(key):
                affiliation = affiliation[len(key) :]
            homepages = []
            for element in link_elements:
                if hasattr(element, "attrs") and element.attrs.get("href", None) is not None:
                    homepage = element.attrs.get("href", None)
                    homepages.append(homepage)
            if key is not None and key != "":
                key = key.strip(".")
                affiliation_map[key] = {
                    "name": affiliation,
                    "homepage": homepages,
                }
        return affiliation_map

    def parseRDFa(self, soup: BeautifulSoup) -> dict:
        """
        tries to parse rdfa content from the given soup
        Args:
            soup: html parser to extract the content from

        Returns:
            dict: dict with the extracted content
        """
        scrapeDescr = [
            ScrapeDescription(
                key="volume_number",
                tag="span",
                attribute="class",
                value="CEURVOLNR",
            ),
            ScrapeDescription(key="urn", tag="span", attribute="class", value="CEURURN"),
            ScrapeDescription(key="year", tag="span", attribute="class", value="CEURPUBYEAR"),
            ScrapeDescription(
                key="ceurpubdate",
                tag="span",
                attribute="class",
                value="CEURPUBDATE",
            ),
            ScrapeDescription(
                key="acronym",
                tag="span",
                attribute="class",
                value="CEURVOLACRONYM",
            ),
            ScrapeDescription(
                key="voltitle",
                tag="span",
                attribute="class",
                value="CEURVOLTITLE",
            ),
            ScrapeDescription(
                key="title",
                tag="span",
                attribute="class",
                value="CEURFULLTITLE",
            ),
            ScrapeDescription(
                key="loctime",
                tag="span",
                attribute="class",
                value="CEURLOCTIME",
            ),
            ScrapeDescription(
                key="colocated",
                tag="span",
                attribute="class",
                value="CEURCOLOCATED",
            ),
        ]
        scrapedDict = self.scrape.parseWithScrapeDescription(soup, scrapeDescr)
        return scrapedDict

__init__(baseurl='http://ceur-ws.org', timeout=3, showHtml=False, debug=False)

Constructor

Parameters:

Name      Type   Description                          Default
baseurl   str    the baseurl of the CEUR-WS website   'http://ceur-ws.org'
timeout   float  the number of seconds to wait        3
showHtml  bool   if True show the HTML code           False
debug     bool   if True switch debugging on          False
Source code in ceurws/volumeparser.py
def __init__(
    self,
    baseurl: str = "http://ceur-ws.org",
    timeout: float = 3,
    showHtml: bool = False,
    debug: bool = False,
):
    """
    Constructor

    Args:
        baseurl(str): the baseurl of the CEUR-WS website
        timeout(float): the number of seconds to wait
        showHtml(bool): if True show the HTML code
        debug(bool): if True switch debugging on
    """
    Textparser.__init__(self, debug=debug)
    self.showHtml = showHtml
    self.baseurl = baseurl
    self.timeout = timeout
    self.scrape = WebScrape(timeout=timeout)

check_parsed_dict(parsed_dict)

check parsed_dict content e.g. urn check digit

Source code in ceurws/volumeparser.py
def check_parsed_dict(self, parsed_dict: dict):
    """
    check parsed_dict content e.g. urn check digit
    """
    if "urn" in parsed_dict:
        urn = parsed_dict["urn"]
        if urn:
            urn_prefix = urn[:-1]
            check_digit = URN.calc_urn_checksum(urn_prefix)
            parsed_dict["urn_check_digit"] = check_digit
            urn_ok = URN.check_urn_checksum(urn)
            parsed_dict["urn_ok"] = urn_ok

getSoup(url)

get the BeautifulSoup parser for the given url

Args:
    url: url to parse

Returns:

Type                  Description
BeautifulSoup | None  parsed webpage

Source code in ceurws/volumeparser.py
def getSoup(self, url: str) -> BeautifulSoup | None:
    """
    get the BeautifulSoup parser for the given url
    Args:
        url: url to parse

    Returns:
        parsed webpage
    """
    return self.scrape.getSoup(url, showHtml=self.showHtml, debug=self.debug)

get_volume_page(number, recache=False)

Get the html content of the given volume number. Retrieves the volume page from cache or from ceur-ws.org. Caches the volume page if not already cached.

Args:
    number: volume number
    recache: If True update the cache with a new fetch from the web. Otherwise, cache is used if present

Returns:

Type                Description
str | bytes | None  html of volume page or None if the volume page is not found

Source code in ceurws/volumeparser.py
def get_volume_page(self, number: int, recache: bool = False) -> str | bytes | None:
    """
    Get the html content of the given volume number.
    Retrieves the volume page from cache or from ceur-ws.org
    Caches the volume page if not already cached
    Args:
        number: volume number
        recache: If True update the cache with a new fetch from the web. Otherwise, cache is used if present

    Returns:
        html of volume page or None if the volume page is not found
    """
    if not recache and VolumePageCache.is_cached(number):
        volume_page = VolumePageCache.get(number)
    else:
        url = self.volumeUrl(number)
        volume_page = self.scrape.get_html_from_url(url)
        if volume_page:
            VolumePageCache.cache(number, volume_page)
    return volume_page

get_volume_soup(number, use_cache=True)

Get Soup of the volume page for the given volume number

Args:
    number: volume number of the volume to parse
    use_cache: If True use volume page from cache if present otherwise load from web and cache

Returns:

Type                  Description
BeautifulSoup | None  soup of the volume page, or None if the soup cannot be loaded from cache or from web

Source code in ceurws/volumeparser.py
def get_volume_soup(self, number: int, use_cache: bool = True) -> BeautifulSoup | None:
    """
    Get Soup of the volume page for the given volume number
    Args:
        number: volume number of the volume to parse
        use_cache: If True use volume page from cache if present otherwise load from web and cache

    Returns:
        BeautifulSoup: soup of the volume page
        None: soup can not be loaded from cache or from web
    """
    html = self.get_volume_page(number, recache=not use_cache)
    if html is None:
        if self.debug:
            print(f"Vol-{number} could not be retrieved")
        return None
    soup = self.scrape.get_soup_from_string(html, show_html=self.showHtml)
    return soup

parse(url)

parse the given url

Args:
    url: URL to parse the volume information from

Returns:

Type  Description
dict  extracted information

Source code in ceurws/volumeparser.py
def parse(self, url: str) -> dict:
    """
    parse the given url
    Args:
         url: URL to parse the volume information from

    Returns:
        dict: extracted information
    """
    soup = self.getSoup(url)
    parsed_dict = self.parse_soup(soup=soup) if soup else {}
    return parsed_dict
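
A minimal usage sketch; it assumes the class documented here is exported as VolumeParser from ceurws.volumeparser, and the volume URL is just an example.

from ceurws.volumeparser import VolumeParser  # assumed export name

parser = VolumeParser(timeout=10)
record = parser.parse("http://ceur-ws.org/Vol-2436")  # example volume URL
print(record.get("acronym"), record.get("title"))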

parseAffiliationMap(start)

Parse out the affiliations and their reference keys

Args:
    start: the page element at which the affiliation list starts

Returns:

Type  Description
dict  mapping from reference key to affiliation record

Source code in ceurws/volumeparser.py
def parseAffiliationMap(self, start: PageElement) -> dict:
    """
    Parse out the affiliations and their reference key
    Args:
        start: the page element at which the affiliation list starts

    Returns:
        dict: mapping from reference key to affiliation record
    """
    if start is None:
        return dict()
    end = start.find_next("hr")
    affiliations_elements = []
    group_elements: list[PageElement] = []
    if isinstance(start.previous, Tag | NavigableString):
        for element in start.previous.nextGenerator():
            if isinstance(element, Tag | NavigableString) and element.name in ["br", "hr"]:
                affiliations_elements.append(group_elements)
                group_elements = []
            elif isinstance(element, NavigableString) and element.text.strip() == "":
                pass
            elif isinstance(element, Tag | NavigableString) and element.name == "h3":
                # elements inside the element are included through the nextGenerator
                pass
            else:
                group_elements.append(element)
            if element == end:
                break
    affiliations_elements = [x for x in affiliations_elements if x != []]
    affiliation_map = dict()
    for elements in affiliations_elements:
        if isinstance(elements[0], NavigableString) and " " in elements[0].text.strip():
            text_containing_key = elements[0].text.strip()
            key = text_containing_key.split(" ")[0]
            key_element = NavigableString(value=key)
            text_element = NavigableString(value=text_containing_key[len(key) :])
            elements = [key_element, text_element, *elements[1:]]
        key = elements[0].text.strip()
        text_elements = []
        link_elements = []
        for element in elements[1:]:
            if isinstance(element, NavigableString):
                text_elements.append(element)
            elif isinstance(element, Tag | NavigableString) and element.name == "a":
                link_elements.append(element)
        affiliation = "".join([elem.text for elem in text_elements])
        affiliation = affiliation.replace("\n", "").replace("\t", "").replace("\r", "")
        if affiliation.startswith(key):
            affiliation = affiliation[len(key) :]
        homepages = []
        for element in link_elements:
            if hasattr(element, "attrs") and element.attrs.get("href", None) is not None:
                homepage = element.attrs.get("href", None)
                homepages.append(homepage)
        if key is not None and key != "":
            key = key.strip(".")
            affiliation_map[key] = {
                "name": affiliation,
                "homepage": homepages,
            }
    return affiliation_map
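
For orientation, the shape of the returned mapping; the keys and values below are invented sample data.

# invented sample of a parseAffiliationMap result:
# reference key -> affiliation record with name and homepage list
affiliation_map = {
    "1": {"name": "Example University", "homepage": ["http://www.example.org/uni"]},
    "2": {"name": "Example Research Center", "homepage": []},
}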

parseEditors(soup)

parse all editor information contained in the given soup: parse all information between <b>Edited by</b> and the following <hr>

Args:
    soup: volume web page

Source code in ceurws/volumeparser.py
def parseEditors(self, soup: BeautifulSoup):
    """
    parse all editor information contained in the given soup
    parse all information between <b> Edited by </b> ... <hr>
    Args:
        soup: volume web page
    """
    if soup is None:
        return None
    possible_start_elements = soup.find_all("b")
    # find start
    start_elements = []
    for e in possible_start_elements:
        start_tags = ["edited by", "program committee"]
        for tag in start_tags:
            if tag in e.text.lower():
                start_elements.append(e)
    if len(start_elements) == 0:
        return None
    edited_by = start_elements[0]
    editor_h3 = edited_by.find_next("h3")
    editor_records: dict[str, dict] = dict()
    if editor_h3 is None:
        return None
    editor_spans = editor_h3.find_all(attrs={"class": "CEURVOLEDITOR"})
    if editor_spans is not None and len(editor_spans) > 0:
        for editor_span in editor_spans:
            editor_name = editor_span.text
            editor = {"name": editor_name}
            if editor_span.parent.name == "a":
                homepage = editor_span.parent.attrs.get("href", None)
                editor["homepage"] = homepage
                if editor_span.parent.next_sibling is not None:
                    affiliation_keys = editor_span.parent.next_sibling.text.strip()
                else:
                    affiliation_keys = None
            else:
                if editor_span.next_sibling is not None:
                    affiliation_keys = editor_span.next_sibling.text.strip()
                else:
                    affiliation_keys = None
            if affiliation_keys is None or affiliation_keys == "":
                sup = editor_span.find_next("sup")
                if sup is not None:
                    affiliation_keys = sup.text.strip()
            editor["affiliation_keys"] = affiliation_keys
            editor_records[editor_name] = editor
    else:
        editor_elements = []
        group_elements: list[PageElement] = []
        if (
            editor_h3.next_sibling
            and editor_h3.next_sibling.next_sibling
            and editor_h3.next_sibling.next_sibling.name == "h3"
        ):
            while editor_h3.next_sibling.next_sibling.name == "h3" and editor_h3.text.strip() != "":
                editor_elements.append(editor_h3.contents)
                editor_h3 = editor_h3.next_sibling.next_sibling
        else:
            for child in editor_h3.childGenerator():
                if child.name == "br":
                    editor_elements.append(group_elements)
                    group_elements = []
                else:
                    group_elements.append(child)
        for elements in editor_elements:
            text = "".join([e.text for e in elements]).strip()
            affiliation_key = text.split(" ")[-1]
            editor_name = text[: -len(affiliation_key)]
            links = [e for e in elements if e.name == "a"]
            homepage = links[0].attrs.get("href", None) if len(links) > 0 else None
            editor = {
                "name": editor_name,
                "homepage": homepage,
                "affiliation_key": affiliation_key,
            }
            editor_records[editor_name] = editor
    affiliation_keys = {
        editor.get("affiliation_key")
        for editor in editor_records.values()
        if editor.get("affiliation_key", None) is not None
    }
    affiliation_map = self.parseAffiliationMap(editor_h3.next_sibling)
    for editor_record in editor_records.values():
        editor_keys = editor_record.get("affiliation_keys", "")
        if editor_keys is not None:
            keys = re.split("[, ]", editor_keys)
            editor_affiliations = []
            for key in keys:
                if key in affiliation_map:
                    editor_affiliations.append(affiliation_map.get(key.strip()))
            editor_record["affiliation"] = editor_affiliations
    return editor_records
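
For orientation, the shape of the returned editor records; names and values are invented sample data.

# invented sample of a parseEditors result: records keyed by editor name,
# with "affiliation" resolved against the parseAffiliationMap result
editor_records = {
    "Alice Example": {
        "name": "Alice Example",
        "homepage": "http://www.example.org/alice",
        "affiliation_keys": "1,2",
        "affiliation": [
            {"name": "Example University", "homepage": []},
            {"name": "Example Research Center", "homepage": []},
        ],
    },
}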

parseRDFa(soup)

tries to parse rdfa content from the given soup

Args:
    soup: html parser to extract the content from

Returns:

Type  Description
dict  dict with the extracted content

Source code in ceurws/volumeparser.py
def parseRDFa(self, soup: BeautifulSoup) -> dict:
    """
    tries to parse rdfa content from the given soup
    Args:
        soup: html parser to extract the content from

    Returns:
        dict: dict with the extracted content
    """
    scrapeDescr = [
        ScrapeDescription(
            key="volume_number",
            tag="span",
            attribute="class",
            value="CEURVOLNR",
        ),
        ScrapeDescription(key="urn", tag="span", attribute="class", value="CEURURN"),
        ScrapeDescription(key="year", tag="span", attribute="class", value="CEURPUBYEAR"),
        ScrapeDescription(
            key="ceurpubdate",
            tag="span",
            attribute="class",
            value="CEURPUBDATE",
        ),
        ScrapeDescription(
            key="acronym",
            tag="span",
            attribute="class",
            value="CEURVOLACRONYM",
        ),
        ScrapeDescription(
            key="voltitle",
            tag="span",
            attribute="class",
            value="CEURVOLTITLE",
        ),
        ScrapeDescription(
            key="title",
            tag="span",
            attribute="class",
            value="CEURFULLTITLE",
        ),
        ScrapeDescription(
            key="loctime",
            tag="span",
            attribute="class",
            value="CEURLOCTIME",
        ),
        ScrapeDescription(
            key="colocated",
            tag="span",
            attribute="class",
            value="CEURCOLOCATED",
        ),
    ]
    scrapedDict = self.scrape.parseWithScrapeDescription(soup, scrapeDescr)
    return scrapedDict
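
The spans being targeted look roughly like the snippet below; this minimal BeautifulSoup sketch performs the same class-based lookup that the ScrapeDescription entries drive (the HTML values are invented).

from bs4 import BeautifulSoup

html = """
<span class="CEURVOLNR">Vol-2436</span>
<span class="CEURVOLACRONYM">SDM 2019</span>
<span class="CEURPUBYEAR">2019</span>
"""
soup = BeautifulSoup(html, "html.parser")
for key, marker in [("volume_number", "CEURVOLNR"), ("acronym", "CEURVOLACRONYM"), ("year", "CEURPUBYEAR")]:
    span = soup.find("span", attrs={"class": marker})
    print(key, "=", span.text if span else None)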

parse_soup(soup, number=None)

parse the volume page data from the given soup

Parameters:

Name    Type           Description                              Default
number  str | None     the volume number                        None
soup    BeautifulSoup  html parser to extract the content from  required

Returns:

Type  Description
dict  parsed content

Source code in ceurws/volumeparser.py
def parse_soup(self, soup: BeautifulSoup, number: str | None = None) -> dict:
    """
    parse the volume page data from the given soup

    Args:
        number(str): the volume number
        soup(BeautifulSoup): html parser to extract the content from

    Returns:
        dict: parsed content
    """
    if soup is None:
        return {"vol_number": number}
    # first try RDFa annotations
    scrapedDict = self.parseRDFa(soup)
    for key in scrapedDict:
        scrapedDict[key] = Textparser.sanitize(scrapedDict[key])

    # second part
    for descValue in ["description", "descripton"]:
        # descripton is a typo in the Volume index files not here!
        firstDesc = soup.find("meta", {"name": descValue})
        if isinstance(firstDesc, Tag):
            desc = firstDesc["content"]
            desc = Textparser.sanitize(desc, ["CEUR Workshop Proceedings "])
            scrapedDict["desc"] = desc
            break

    # first H1 has title info
    firstH1 = soup.find("h1")
    if firstH1 is not None:
        h1 = firstH1.text
        h1 = Textparser.sanitize(h1, ['<TD bgcolor="#FFFFFF">'])
        scrapedDict["h1"] = h1
        link = firstH1.find("a")
        if link is not None and isinstance(link, Tag) and len(link.text) < 20:
            acronym = link.text.strip()
            if not acronym:
                acronym = h1 if len(h1) < 28 else h1.split()[0]

            eventHomepage = link.attrs.get("href")
            scrapedDict["acronym"] = acronym
            scrapedDict["homepage"] = eventHomepage

    # first h3 has loctime
    firstH3 = soup.find("h3")
    if firstH3 is not None:
        h3 = firstH3.text
        h3 = Textparser.sanitize(h3)
        scrapedDict["h3"] = h3

    if self.hasValue(scrapedDict, "desc") and not self.hasValue(scrapedDict, "acronym"):
        scrapedDict["acronym"] = scrapedDict["desc"]
    if self.hasValue(scrapedDict, "h1") and not self.hasValue(scrapedDict, "title"):
        scrapedDict["title"] = scrapedDict["h1"]
    if (
        self.hasValue(scrapedDict, "h1")
        and self.hasValue(scrapedDict, "title")
        and not self.hasValue(scrapedDict, "acronym")
    ):
        scrapedDict["acronym"] = scrapedDict["h1"]
    # editorsRecords = self.parseEditors(soup)
    # scrapedDict["editors"] = editorsRecords
    return scrapedDict
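
To see the fallback rules in isolation, a small self-contained sketch; has_value stands in for the hasValue helper used above, and the scraped values are invented.

scraped = {"h1": "FOIS 2024", "desc": "Formal Ontology in Information Systems"}

def has_value(d: dict, key: str) -> bool:
    # stand-in for the hasValue helper
    return bool(d.get(key))

if has_value(scraped, "desc") and not has_value(scraped, "acronym"):
    scraped["acronym"] = scraped["desc"]
if has_value(scraped, "h1") and not has_value(scraped, "title"):
    scraped["title"] = scraped["h1"]
if has_value(scraped, "h1") and has_value(scraped, "title") and not has_value(scraped, "acronym"):
    scraped["acronym"] = scraped["h1"]
print(scraped)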

parse_volume(number, use_cache=True)

parse the given volume; caches the volume pages at ~/.ceurws/volumes

Parameters:

Name       Type  Description                                                                       Default
number     int   volume number of the volume to parse                                              required
use_cache  bool  If True use volume page from cache if present otherwise load from web and cache  True

Returns:

Type                               Description
tuple[dict, BeautifulSoup | None]  the extracted information dict and the soup it was parsed from

Source code in ceurws/volumeparser.py
def parse_volume(self, number: int, use_cache: bool = True) -> tuple[dict, BeautifulSoup | None]:
    """
    parse the given volume
    caches the volume pages at ~/.ceurws/volumes

    Args:
        number: volume number of the volume to parse
        use_cache: If True use volume page from cache if present otherwise load from web and cache

    Returns:
        tuple: the extracted information dict and the soup it was parsed from
    """
    soup = self.get_volume_soup(number, use_cache=use_cache)
    parsed_dict = self.parse_soup(number=str(number), soup=soup) if soup else {}
    self.check_parsed_dict(parsed_dict)
    return parsed_dict, soup
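
A minimal end-to-end sketch, again assuming the VolumeParser export name; 3262 is just an example volume number.

from ceurws.volumeparser import VolumeParser  # assumed export name

parser = VolumeParser(debug=True)
# the first call fetches from ceur-ws.org and caches under ~/.ceurws/volumes;
# later calls with use_cache=True reuse the cached page
parsed, soup = parser.parse_volume(3262, use_cache=True)
print(parsed.get("urn"), parsed.get("urn_ok"))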

volumeUrl(volnumber)

get the url for the given volume number

Parameters:

Name       Type       Description        Default
volnumber  str | int  the volume number  required

Returns:

Type  Description
str   the url of the volume

Source code in ceurws/volumeparser.py
def volumeUrl(self, volnumber: str | int):
    """
    get the url for the given volume number

    Args:
        volnumber(str): the volume number

    Returns:
        str: url - the url of the volume
    """
    # e.g. http://ceur-ws.org/Vol-2635/
    url = f"{self.baseurl}/Vol-{volnumber}"
    return url

webserver

Created on 2024-02-22

@author: wf

CeurWsSolution

Bases: InputWebSolution

CEUR-WS Volume browser solution

Source code in ceurws/webserver.py
class CeurWsSolution(InputWebSolution):
    """
    CEUR-WS Volume browser solution

    """

    def __init__(self, webserver: CeurWsWebServer, client: Client):
        """
        Initialize the solution

        Calls the constructor of the base solution
        Args:
            webserver (CeurWsWebServer): The webserver instance associated with this context.
            client (Client): The client instance this context is associated with.
        """
        super().__init__(webserver, client)  # Call to the superclass constructor
        self.wdSync = self.webserver.wdSync

    def configure_menu(self):
        InputWebSolution.configure_menu(self)
        self.link_button(name="volumes", icon_name="table", target="/volumes", new_tab=False)
        self.link_button(name="wikidata", icon_name="cloud_sync", target="/wikidatasync", new_tab=False)

    def prepare_ui(self):
        """
        prepare the user interface
        """
        InputWebSolution.prepare_ui(self)
        # does not work as expected ...
        # self.add_css()

    def add_css(self):
        # Get the correct path to the 'css' directory
        css_directory_path = Path(__file__).parent.parent / "css"
        # Check if the directory exists before trying to serve it
        if css_directory_path.is_dir():
            # Serve files from the 'css' directory at the '/css' route
            app.add_static_files("/css", str(css_directory_path))

            # Iterate over all .css files in the directory
            for css_file in os.listdir(css_directory_path):
                if css_file.endswith(".css"):
                    # Add the link tag for the css file to the head of the HTML document
                    ui.add_head_html(f'<link rel="stylesheet" type="text/css" href="/css/{css_file}">')

    async def wikidatasync(self):
        """
        show the wikidata sync table
        """

        def show():
            self.wikidata_view = WikidataView(self, self.container)

        await self.setup_content_div(show)

    async def volumes(self):
        """
        show the volumes table
        """

        def show():
            self.volume_list_view = VolumeListView(self, self.container)

        await self.setup_content_div(show)

    async def home(self):
        """
        home page selection
        """

        def show():
            try:
                with self.container:
                    with ui.row() as self.select_container:
                        self.volume_select = self.add_select(
                            "Volume",
                            selection=self.wdSync.volumeOptions,
                            with_input=True,
                            on_change=self.volume_selected,
                        ).props("size=120")
                    self.volume_view = VolumeView(self, self.container)
            except Exception as ex:
                self.handle_exception(ex)

        await self.setup_content_div(show)

    async def volume_selected(self, args: ValueChangeEventArguments):
        """
        when a volume is selected show the details in the Volume View
        """
        vol_number = args.value
        volume = self.wdSync.volumesByNumber[vol_number]
        self.volume_view.showVolume(volume)
        pass

__init__(webserver, client)

Initialize the solution

Calls the constructor of the base solution.

Args:
    webserver (CeurWsWebServer): The webserver instance associated with this context.
    client (Client): The client instance this context is associated with.

Source code in ceurws/webserver.py
def __init__(self, webserver: CeurWsWebServer, client: Client):
    """
    Initialize the solution

    Calls the constructor of the base solution
    Args:
        webserver (CeurWsWebServer): The webserver instance associated with this context.
        client (Client): The client instance this context is associated with.
    """
    super().__init__(webserver, client)  # Call to the superclass constructor
    self.wdSync = self.webserver.wdSync

home() async

home page selection

Source code in ceurws/webserver.py
async def home(self):
    """
    home page selection
    """

    def show():
        try:
            with self.container:
                with ui.row() as self.select_container:
                    self.volume_select = self.add_select(
                        "Volume",
                        selection=self.wdSync.volumeOptions,
                        with_input=True,
                        on_change=self.volume_selected,
                    ).props("size=120")
                self.volume_view = VolumeView(self, self.container)
        except Exception as ex:
            self.handle_exception(ex)

    await self.setup_content_div(show)

prepare_ui()

prepare the user interface

Source code in ceurws/webserver.py
def prepare_ui(self):
    """
    prepare the user interface
    """
    InputWebSolution.prepare_ui(self)

volume_selected(args) async

when a volume is selected show the details in the Volume View

Source code in ceurws/webserver.py
async def volume_selected(self, args: ValueChangeEventArguments):
    """
    when a volume is selected show the details in the Volume View
    """
    vol_number = args.value
    volume = self.wdSync.volumesByNumber[vol_number]
    self.volume_view.showVolume(volume)
    pass

volumes() async

show the volumes table

Source code in ceurws/webserver.py
async def volumes(self):
    """
    show the volumes table
    """

    def show():
        self.volume_list_view = VolumeListView(self, self.container)

    await self.setup_content_div(show)

wikidatasync() async

show the wikidata sync table

Source code in ceurws/webserver.py
async def wikidatasync(self):
    """
    show the wikidata sync table
    """

    def show():
        self.wikidata_view = WikidataView(self, self.container)

    await self.setup_content_div(show)

CeurWsWebServer

Bases: InputWebserver

webserver

Source code in ceurws/webserver.py
class CeurWsWebServer(InputWebserver):
    """
    webserver
    """

    @classmethod
    def get_config(cls) -> WebserverConfig:
        copy_right = "(c)2023-2024 Wolfgang Fahl"
        config = WebserverConfig(
            copy_right=copy_right,
            version=Version(),
            default_port=9998,
            timeout=10.0,
            short_name="spf",
        )
        server_config = WebserverConfig.get(config)
        server_config.solution_class = CeurWsSolution
        return server_config

    def __init__(self):
        """
        constructor
        """
        InputWebserver.__init__(self, config=CeurWsWebServer.get_config())

        @ui.page("/volumes")
        async def show_volumes(client: Client):
            return await self.page(client, CeurWsSolution.volumes)

        @ui.page("/volume/{volnumber}")
        async def show_volume_page(client: Client, vol_number):
            return await self.page(client, CeurWsSolution.volumePage, vol_number)

        @ui.page("/wikidatasync")
        async def wikidatasync(client: Client):
            return await self.page(client, CeurWsSolution.wikidatasync)

        @app.get("/volumes.json")
        async def volumes():
            """
            direct fastapi return of volumes
            """
            volumeList = self.wdSync.vm.getList()
            return volumeList

        @app.get("/proceedings.json")
        async def proceedings():
            """
            direct fastapi return of proceedings
            """
            proceedingsList = self.wdSync.loadProceedingsFromCache()
            return ORJSONResponse(proceedingsList)

        @app.get("/papers.json")
        async def papers():
            """
            direct fastapi return of papers
            """
            paperList = self.wdSync.pm.getList()
            return paperList

        @app.get(
            "/papers_dblp.json",
            tags=["dblp complete dataset"],
            # response_model= List[DblpPaper]
        )
        async def papers_dblp():
            """
            direct fastapi return of paper information from dblp
            """
            self.wdSync.dblpEndpoint.dblp_papers.load()
            papers = self.wdSync.dblpEndpoint.dblp_papers.papers
            records = [p.to_json() for p in papers]
            lod = [orjson.loads(json_str) for json_str in records]
            return ORJSONResponse(lod)

        @app.get(
            "/authors_dblp.json",
            tags=["dblp complete dataset"],
            # response_model=List[DblpAuthor]
        )
        async def authors_papers_dblp():
            """
            direct fastapi return of paper information from dblp
            """
            authors = self.wdSync.dblpEndpoint.get_all_ceur_authors()
            return ORJSONResponse(content=authors)

        @app.get("/dblp/papers", tags=["dblp complete dataset"])
        async def dblp_papers(limit: int = 100, offset: int = 0) -> list[DblpPaper]:
            """
            Get ceur-ws papers from dblp
            Args:
                limit: max number of returned papers
                offset:

            Returns:
            """
            papers = self.wdSync.dblpEndpoint.get_all_ceur_papers()
            return papers[offset:limit]

        @app.get("/dblp/editors", tags=["dblp complete dataset"])
        async def dblp_editors(limit: int = 100, offset: int = 0) -> list[DblpScholar]:
            """
            Get ceur-ws volume editors from dblp
            Args:
                limit: max number of returned editors
                offset:

            Returns:
            """
            editors = self.wdSync.dblpEndpoint.get_all_ceur_editors()
            return editors[offset:limit]

        @app.get("/dblp/volumes", tags=["dblp complete dataset"])
        async def dblp_volumes(limit: int = 100, offset: int = 0) -> list[DblpPaper]:
            """
            Get ceur-ws volumes from dblp
            Args:
                limit: max number of returned volumes
                offset:

            Returns:
            """
            proceedings = self.wdSync.dblpEndpoint.get_all_ceur_proceedings()
            return proceedings[offset:limit]

        @app.get("/dblp/volume/{volume_number}", tags=["dblp"])
        async def dblp_volume(volume_number: int) -> DblpProceeding:
            """
            Get ceur-ws volume from dblp
            """
            try:
                proceeding = self.wdSync.dblpEndpoint.get_ceur_proceeding(volume_number)
            except Exception as e:
                raise HTTPException(status_code=404, detail=str(e)) from e
            if proceeding:
                return proceeding
            else:
                raise HTTPException(status_code=404, detail="Volume not found")

        @app.get("/dblp/volume/{volume_number}/editor", tags=["dblp"])
        async def dblp_volume_editors(volume_number: int) -> list[DblpScholar]:
            """
            Get ceur-ws volume editors from dblp
            """
            try:
                proceeding = self.wdSync.dblpEndpoint.get_ceur_proceeding(volume_number)
            except Exception as e:
                raise HTTPException(status_code=404, detail=str(e)) from e
            if proceeding:
                return proceeding.editors
            else:
                raise HTTPException(status_code=404, detail="Volume not found")

        @app.get("/dblp/volume/{volume_number}/paper", tags=["dblp"])
        async def dblp_volume_papers(volume_number: int) -> list[DblpPaper]:
            """
            Get ceur-ws volume papers from dblp
            Args:
                volume_number: number of the volume

            Returns:
            """
            papers = self.wdSync.dblpEndpoint.get_ceur_volume_papers(volume_number)
            return papers

        @app.get("/dblp/volume/{volume_number}/paper/{paper_id}", tags=["dblp"])
        async def dblp_paper(volume_number: int, paper_id: str) -> DblpPaper:
            """
            Get ceur-ws volume paper from dblp
            """
            papers = self.wdSync.dblpEndpoint.get_ceur_volume_papers(volume_number)
            if papers:
                for paper in papers:
                    if paper.pdf_id == f"Vol-{volume_number}/{paper_id}":
                        return paper
                raise HTTPException(status_code=404, detail="Paper not found")
            else:
                raise HTTPException(status_code=404, detail="Volume not found")

        @app.get(
            "/dblp/volume/{volume_number}/paper/{paper_id}/author",
            tags=["dblp"],
        )
        async def dblp_paper_authors(volume_number: int, paper_id: str) -> list[DblpScholar]:
            """
            Get the authors of a ceur-ws volume paper from dblp
            """
            papers = self.wdSync.dblpEndpoint.get_ceur_volume_papers(volume_number)
            if papers:
                for paper in papers:
                    if paper.pdf_id == f"Vol-{volume_number}/{paper_id}":
                        return paper.authors
                raise HTTPException(status_code=404, detail="Paper not found")
            else:
                raise HTTPException(status_code=404, detail="Volume not found")

    def configure_run(self):
        """
        configure command line specific details
        """
        InputWebserver.configure_run(self)
        self.wdSync = WikidataSync.from_args(self.args)
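
Once the server is running (default port 9998 per get_config), the JSON routes can be exercised with any HTTP client; a small standard-library sketch, with host and query values as examples.

import json
from urllib.request import urlopen

base = "http://localhost:9998"  # default_port from get_config
with urlopen(f"{base}/volumes.json") as response:
    volumes = json.load(response)
print(f"{len(volumes)} volumes")

# the dblp routes accept limit/offset query parameters
with urlopen(f"{base}/dblp/papers?limit=5&offset=0") as response:
    papers = json.load(response)
for paper in papers:
    print(paper.get("title"))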

__init__()

constructor

Source code in ceurws/webserver.py
def __init__(self):
    """
    constructor
    """
    InputWebserver.__init__(self, config=CeurWsWebServer.get_config())

    @ui.page("/volumes")
    async def show_volumes(client: Client):
        return await self.page(client, CeurWsSolution.volumes)

    @ui.page("/volume/{volnumber}")
    async def show_volume_page(client: Client, vol_number):
        return await self.page(client, CeurWsSolution.volumePage, vol_number)

    @ui.page("/wikidatasync")
    async def wikidatasync(client: Client):
        return await self.page(client, CeurWsSolution.wikidatasync)

    @app.get("/volumes.json")
    async def volumes():
        """
        direct fastapi return of volumes
        """
        volumeList = self.wdSync.vm.getList()
        return volumeList

    @app.get("/proceedings.json")
    async def proceedings():
        """
        direct fastapi return of proceedings
        """
        proceedingsList = self.wdSync.loadProceedingsFromCache()
        return ORJSONResponse(proceedingsList)

    @app.get("/papers.json")
    async def papers():
        """
        direct fastapi return of papers
        """
        paperList = self.wdSync.pm.getList()
        return paperList

    @app.get(
        "/papers_dblp.json",
        tags=["dblp complete dataset"],
        # response_model= List[DblpPaper]
    )
    async def papers_dblp():
        """
        direct fastapi return of paper information from dblp
        """
        self.wdSync.dblpEndpoint.dblp_papers.load()
        papers = self.wdSync.dblpEndpoint.dblp_papers.papers
        records = [p.to_json() for p in papers]
        lod = [orjson.loads(json_str) for json_str in records]
        return ORJSONResponse(lod)

    @app.get(
        "/authors_dblp.json",
        tags=["dblp complete dataset"],
        # response_model=List[DblpAuthor]
    )
    async def authors_papers_dblp():
        """
        direct fastapi return of paper information from dblp
        """
        authors = self.wdSync.dblpEndpoint.get_all_ceur_authors()
        return ORJSONResponse(content=authors)

    @app.get("/dblp/papers", tags=["dblp complete dataset"])
    async def dblp_papers(limit: int = 100, offset: int = 0) -> list[DblpPaper]:
        """
        Get ceur-ws papers from dblp
        Args:
            limit: max number of returned papers
            offset:

        Returns:
        """
        papers = self.wdSync.dblpEndpoint.get_all_ceur_papers()
        return papers[offset:limit]

    @app.get("/dblp/editors", tags=["dblp complete dataset"])
    async def dblp_editors(limit: int = 100, offset: int = 0) -> list[DblpScholar]:
        """
        Get ceur-ws volume editors from dblp
        Args:
            limit: max number of returned editors
            offset:

        Returns:
        """
        editors = self.wdSync.dblpEndpoint.get_all_ceur_editors()
        return editors[offset:limit]

    @app.get("/dblp/volumes", tags=["dblp complete dataset"])
    async def dblp_volumes(limit: int = 100, offset: int = 0) -> list[DblpPaper]:
        """
        Get ceur-ws volumes from dblp
        Args:
            limit: max number of returned volumes
            offset:

        Returns:
        """
        proceedings = self.wdSync.dblpEndpoint.get_all_ceur_proceedings()
        return proceedings[offset:limit]

    @app.get("/dblp/volume/{volume_number}", tags=["dblp"])
    async def dblp_volume(volume_number: int) -> DblpProceeding:
        """
        Get ceur-ws volume from dblp
        """
        try:
            proceeding = self.wdSync.dblpEndpoint.get_ceur_proceeding(volume_number)
        except Exception as e:
            raise HTTPException(status_code=404, detail=str(e)) from e
        if proceeding:
            return proceeding
        else:
            raise HTTPException(status_code=404, detail="Volume not found")

    @app.get("/dblp/volume/{volume_number}/editor", tags=["dblp"])
    async def dblp_volume_editors(volume_number: int) -> list[DblpScholar]:
        """
        Get ceur-ws volume editors from dblp
        """
        try:
            proceeding = self.wdSync.dblpEndpoint.get_ceur_proceeding(volume_number)
        except Exception as e:
            raise HTTPException(status_code=404, detail=str(e)) from e
        if proceeding:
            return proceeding.editors
        else:
            raise HTTPException(status_code=404, detail="Volume not found")

    @app.get("/dblp/volume/{volume_number}/paper", tags=["dblp"])
    async def dblp_volume_papers(volume_number: int) -> list[DblpPaper]:
        """
        Get ceur-ws volume papers from dblp
        Args:
            volume_number: number of the volume

        Returns:
        """
        papers = self.wdSync.dblpEndpoint.get_ceur_volume_papers(volume_number)
        return papers

    @app.get("/dblp/volume/{volume_number}/paper/{paper_id}", tags=["dblp"])
    async def dblp_paper(volume_number: int, paper_id: str) -> DblpPaper:
        """
        Get ceur-ws volume paper from dblp
        """
        papers = self.wdSync.dblpEndpoint.get_ceur_volume_papers(volume_number)
        if papers:
            for paper in papers:
                if paper.pdf_id == f"Vol-{volume_number}/{paper_id}":
                    return paper
            raise HTTPException(status_code=404, detail="Paper not found")
        else:
            raise HTTPException(status_code=404, detail="Volume not found")

    @app.get(
        "/dblp/volume/{volume_number}/paper/{paper_id}/author",
        tags=["dblp"],
    )
    async def dblp_paper_authors(volume_number: int, paper_id: str) -> list[DblpScholar]:
        """
        Get the authors of a ceur-ws volume paper from dblp
        """
        papers = self.wdSync.dblpEndpoint.get_ceur_volume_papers(volume_number)
        if papers:
            for paper in papers:
                if paper.pdf_id == f"Vol-{volume_number}/{paper_id}":
                    return paper.authors
            raise HTTPException(status_code=404, detail="Paper not found")
        else:
            raise HTTPException(status_code=404, detail="Volume not found")

configure_run()

configure command line specific details

Source code in ceurws/webserver.py
def configure_run(self):
    """
    configure command line specific details
    """
    InputWebserver.configure_run(self)
    self.wdSync = WikidataSync.from_args(self.args)

wikidata_view

Created on 2024-02-23

@author: wf

WikidataView

Bases: View

Wikidata View

Source code in ceurws/wikidata_view.py
class WikidataView(View):
    """
    Wikidata View
    """

    def __init__(self, solution, parent):
        """
        constructor

        Args:
            solution: the solution
            parent: the parent UI container

        """
        self.solution = solution
        self.parent = parent
        self.setup_ui()

    async def update_proceedings(self):
        """
        update the cached proceedings
        """
        try:
            self.proceedings_records = self.solution.wdSync.loadProceedingsFromCache()
            with self.parent:
                ui.notify(f"found {len(self.proceedings_records)} cached wikidata proceedings records")
                self.reload_aggrid(self.proceedings_records)
        except Exception as ex:
            self.solution.handle_exception(ex)

    def reload_aggrid(self, olod: list):
        """
        reload my aggrid with the list of Volumes
        """
        reverseLod = sorted(
            olod,
            key=lambda row: int(row.get("sVolume") or row.get("Volume") or 0),
            reverse=True,
        )
        lod = []
        for row in reverseLod:
            volume = self.getRowValue(row, "sVolume")
            if volume == self.noneValue:
                volume = self.getRowValue(row, "Volume")
            if volume != self.noneValue:
                try:
                    vol_no = int(volume)
                    volumeLink = self.createLink(
                        f"http://ceur-ws.org/Vol-{volume}",
                        f"Vol-{vol_no:04}",
                    )
                except Exception as _ex:
                    volumeLink = self.noneValue
            else:
                volumeLink = self.noneValue
            itemLink = self.createItemLink(row, "item")
            eventLink = self.createItemLink(row, "event", separator="|")
            eventSeriesLink = self.createItemLink(row, "eventSeries", separator="|")
            dblpLink = self.createExternalLink(row, "dblpProceedingsId", "dblp", DblpEndpoint.DBLP_REC_PREFIX)
            k10PlusLink = self.createExternalLink(
                row, "ppnId", "k10plus", "https://opac.k10plus.de/DB=2.299/PPNSET?PPN="
            )
            lod.append(
                {
                    "#": volume,
                    "item": itemLink,
                    "volume": volumeLink,
                    "acronym": self.getRowValue(row, "short_name"),
                    "dblp": dblpLink,
                    "k10plus": k10PlusLink,
                    "event": eventLink,
                    "series": eventSeriesLink,
                    "ordinal": self.getRowValue(row, "eventSeriesOrdinal"),
                    # "title":row.get("title","?"),
                }
            )
        self.lod_grid.load_lod(lod)
        # set max width of Item column
        self.lod_grid.set_column_def("item", "maxWidth", 380)
        self.lod_grid.set_column_def("event", "maxWidth", 380)
        self.lod_grid.sizeColumnsToFit()

    async def on_refresh_button_click(self):
        """
        handle the refreshing of the proceedings from wikidata
        """
        await run.io_bound(self.refresh_wikidata)

    def refresh_wikidata(self):
        try:
            with self.solution.container:
                ui.notify("wikidata refresh button clicked")
            wd_records = self.solution.wdSync.update()
            with self.solution.container:
                ui.notify(f"read {len(wd_records)} proceeding records from wikidata")
            with self.parent:
                self.reload_aggrid(wd_records)
            pass
        except Exception as ex:
            self.solution.handle_exception(ex)

    def setup_ui(self):
        """
        setup my User Interface elements
        """
        with self.parent:
            with ui.row() as self.tool_bar:
                self.refresh_button = (
                    ui.button(
                        icon="refresh",
                        on_click=self.on_refresh_button_click,
                    )
                    .classes("btn btn-primary btn-sm col-1")
                    .tooltip("Refresh from Wikidata SPARQL endpoint")
                )
                self.query_view = QueryView(
                    self.solution,
                    name="CEUR-WS wikidata sync",
                    sparql_endpoint=self.solution.wdSync.wikidata_endpoint,
                )
                self.query_view.show_query(self.solution.wdSync.wdQuery.query)

            # grid_config = GridConfig(
            #        key_col="Vol",
            #        multiselect=True)

            self.lod_grid = ListOfDictsGrid()
            ui.timer(0, self.update_proceedings, once=True)
            pass

__init__(solution, parent)

constructor

Parameters:

Name      Description              Default
solution  the solution             required
parent    the parent UI container  required
Source code in ceurws/wikidata_view.py
def __init__(self, solution, parent):
    """
    constructor

    Args:
        solution: the solution
        parent: the parent UI container

    """
    self.solution = solution
    self.parent = parent
    self.setup_ui()

on_refresh_button_click() async

handle the refreshing of the proceedings from wikidata

Source code in ceurws/wikidata_view.py
async def on_refresh_button_click(self):
    """
    handle the refreshing of the proceedings from wikidata
    """
    await run.io_bound(self.refresh_wikidata)

reload_aggrid(olod)

reload my aggrid with the list of Volumes

Source code in ceurws/wikidata_view.py
def reload_aggrid(self, olod: list):
    """
    reload my aggrid with the list of Volumes
    """
    reverseLod = sorted(
        olod,
        key=lambda row: int(row.get("sVolume") or row.get("Volume") or 0),
        reverse=True,
    )
    lod = []
    for row in reverseLod:
        volume = self.getRowValue(row, "sVolume")
        if volume == self.noneValue:
            volume = self.getRowValue(row, "Volume")
        if volume != self.noneValue:
            try:
                vol_no = int(volume)
                volumeLink = self.createLink(
                    f"http://ceur-ws.org/Vol-{volume}",
                    f"Vol-{vol_no:04}",
                )
            except Exception as _ex:
                volumeLink = self.noneValue
        else:
            volumeLink = self.noneValue
        itemLink = self.createItemLink(row, "item")
        eventLink = self.createItemLink(row, "event", separator="|")
        eventSeriesLink = self.createItemLink(row, "eventSeries", separator="|")
        dblpLink = self.createExternalLink(row, "dblpProceedingsId", "dblp", DblpEndpoint.DBLP_REC_PREFIX)
        k10PlusLink = self.createExternalLink(
            row, "ppnId", "k10plus", "https://opac.k10plus.de/DB=2.299/PPNSET?PPN="
        )
        lod.append(
            {
                "#": volume,
                "item": itemLink,
                "volume": volumeLink,
                "acronym": self.getRowValue(row, "short_name"),
                "dblp": dblpLink,
                "k10plus": k10PlusLink,
                "event": eventLink,
                "series": eventSeriesLink,
                "ordinal": self.getRowValue(row, "eventSeriesOrdinal"),
                # "title":row.get("title","?"),
            }
        )
    self.lod_grid.load_lod(lod)
    # set max width of Item column
    self.lod_grid.set_column_def("item", "maxWidth", 380)
    self.lod_grid.set_column_def("event", "maxWidth", 380)
    self.lod_grid.sizeColumnsToFit()
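
The descending volume sort tolerates rows where either volume column is missing; a tiny self-contained sketch of that key function with invented rows.

rows = [
    {"sVolume": "3262"},
    {"Volume": "12"},
    {},  # rows without any volume value sort as 0
]
ordered = sorted(
    rows,
    key=lambda row: int(row.get("sVolume") or row.get("Volume") or 0),
    reverse=True,
)
print(ordered)  # [{'sVolume': '3262'}, {'Volume': '12'}, {}]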

setup_ui()

setup my User Interface elements

Source code in ceurws/wikidata_view.py
def setup_ui(self):
    """
    setup my User Interface elements
    """
    with self.parent:
        with ui.row() as self.tool_bar:
            self.refresh_button = (
                ui.button(
                    icon="refresh",
                    on_click=self.on_refresh_button_click,
                )
                .classes("btn btn-primary btn-sm col-1")
                .tooltip("Refresh from Wikidata SPARQL endpoint")
            )
            self.query_view = QueryView(
                self.solution,
                name="CEUR-WS wikidata sync",
                sparql_endpoint=self.solution.wdSync.wikidata_endpoint,
            )
            self.query_view.show_query(self.solution.wdSync.wdQuery.query)

        # grid_config = GridConfig(
        #        key_col="Vol",
        #        multiselect=True)

        self.lod_grid = ListOfDictsGrid()
        ui.timer(0, self.update_proceedings, once=True)
        pass

update_proceedings() async

update the cached proceedings

Source code in ceurws/wikidata_view.py
async def update_proceedings(self):
    """
    update the cached proceedings
    """
    try:
        self.proceedings_records = self.solution.wdSync.loadProceedingsFromCache()
        with self.parent:
            ui.notify(f"found {len(self.proceedings_records)} cached wikidata proceedings records")
            self.reload_aggrid(self.proceedings_records)
    except Exception as ex:
        self.solution.handle_exception(ex)

wikidatasync

Created on 2022-08-14

@author: wf

WikidataSync

synchronize with wikidata

Source code in ceurws/wikidatasync.py
class WikidataSync:
    """
    synchronize with wikidata
    """

    def __init__(
        self,
        baseurl: str = "https://www.wikidata.org",
        debug: bool = False,
        dblp_endpoint_url: str | None = None,
    ):
        """
        Constructor

        Args:
            baseurl(str): the baseurl of the wikidata endpoint
            debug(bool): if True switch on debugging
            dblp_endpoint_url: sparql endpoint url of dblp
        """
        if dblp_endpoint_url is None:
            dblp_endpoint_url = DBLP_ENDPOINT.endpoint
        self.debug = debug
        self.prepareVolumeManager()
        self.preparePaperManager()
        self.prepareRDF()
        self.wdQuery = self.qm.queriesByName["Proceedings"]
        self.baseurl = baseurl
        self.wd = Wikidata(debug=debug)
        self.sqldb = SQLDB(CEURWS.CACHE_FILE, check_same_thread=False)
        self.procRecords = None
        self.procsByVolnumber = None
        self.dblpEndpoint = DblpEndpoint(endpoint=dblp_endpoint_url)
        self.wikidata_endpoint: Endpoint | None = None

    @classmethod
    def from_args(cls, args) -> "WikidataSync":
        """
        create a WikidataSync object from the given command line arguments

        Args:
            args(Namespace): the command line arguments
        """
        wd_en = args.wikidata_endpoint_name
        dblp_en = args.dblp_endpoint_name
        wd_sync = cls.from_endpoint_names(wd_en, dblp_en, debug=args.debug)
        return wd_sync

    @classmethod
    def from_endpoint_names(cls, wd_en: str, dblp_en: str, debug: bool = False) -> "WikidataSync":
        """
        create a WikidataSync object from the given endpoint names

        Args:
            wd_en(str): wikidata endpoint name
            dblp_en(str): dblp endpoint name
        """
        endpoints = EndpointManager.getEndpoints()
        if wd_en not in endpoints:
            raise Exception(f"invalid wikidata endpoint name {wd_en}\nsee sparqlquery -le ")
        if dblp_en not in endpoints:
            raise Exception(f"invalid dblp endpoint name {dblp_en}\nsee sparqlquery -le ")
        dblp_ep = endpoints[dblp_en]
        wd_ep = endpoints[wd_en]
        wd_sync = cls(
            baseurl=wd_ep.endpoint,
            dblp_endpoint_url=dblp_ep.endpoint,
            debug=debug,
        )
        wd_sync.wikidata_endpoint = wd_ep
        return wd_sync

    def login(self):
        self.wd.loginWithCredentials()

    def logout(self):
        self.wd.logout()

    def itemUrl(self, qId):
        url = f"{self.baseurl}/wiki/{qId}"
        return url

    def prepareRDF(self):
        # SPARQL setup
        self.endpoints = EndpointManager.getEndpoints(lang="sparql")
        self.endpointConf = self.endpoints.get("wikidata")
        self.sparql = SPARQL(self.endpointConf.endpoint)
        path = os.path.dirname(__file__)
        qYamlFile = f"{path}/resources/queries/ceurws.yaml"
        if os.path.isfile(qYamlFile):
            self.qm = QueryManager(lang="sparql", queriesPath=qYamlFile)

    def preparePaperManager(self):
        """
        prepare my paper Manager
        """
        self.pm = PaperManager()
        if self.pm.isCached():
            self.pm.fromStore(cacheFile=CEURWS.CACHE_FILE)
        else:
            print(
                "PaperManager not cached you might want to run ceur-ws --recreate",
                file=sys.stderr,
            )

    def prepareVolumeManager(self):
        """
        prepare my volume manager
        """
        self.vm = VolumeManager()
        self.vm.load()
        self.volumesByNumber, _duplicates = LOD.getLookup(self.vm.getList(), "number")
        self.volumeList = self.vm.getList()
        self.volumeCount = len(self.volumeList)
        self.volumeOptions = {}
        reverse_keys = sorted(self.volumesByNumber.keys(), reverse=True)
        for volume_number in reverse_keys:
            volume = self.volumesByNumber[volume_number]
            self.volumeOptions[volume.number] = f"Vol-{volume.number}:{volume.title}"

    def addVolume(self, volume: Volume):
        """
        add the given volume

        Args:
            volume(Volume): the volume to add
        """
        self.volumeList.append(volume)
        self.volumesByNumber[volume.number] = volume
        self.volumeCount += 1

    def getRecentlyAddedVolumeList(self) -> tuple[dict[int, dict], list[int]]:
        """
        get the list of volumes that have recently been added;
        we do not expect deletions

        Returns:
            tuple[dict[int, dict], list[int]]: the refreshed volumes by number
            and the list of newly added volume numbers

        """
        self.prepareVolumeManager()
        refreshVm = VolumeManager()
        parser_config = ParserConfig()
        parser_config.force_download = True
        self.vm.set_down_to_volume(parser_config)
        refreshVm.loadFromIndexHtml(parser_config=parser_config)
        refreshVolumesByNumber, _duplicates = LOD.getLookup(refreshVm.getList(), "number")
        # https://stackoverflow.com/questions/3462143/get-difference-between-two-lists
        newVolumes = list(set(list(refreshVolumesByNumber.keys())) - set(list(self.volumesByNumber.keys())))
        return refreshVolumesByNumber, newVolumes

    def storeVolumes(self):
        """
        store my volumes
        """
        self.vm.store()

    def getWikidataProceedingsRecord(self, volume):
        """
        get the wikidata Record for the given volume
        """
        record = {
            "title": getattr(volume, "title", None),
            "label": getattr(volume, "title", None),
            "description": f"Proceedings of {getattr(volume, 'acronym', None)} workshop",
            "urn": getattr(volume, "urn", None),
            "short name": getattr(volume, "acronym", None),
            "volume": getattr(volume, "number", None),
            "pubDate": getattr(volume, "pubDate", None),
            "ceurwsUrl": getattr(volume, "url", None),
            "language of work or name": "Q1860",
            "fullWorkUrl": getattr(volume, "url", None),
        }
        if isinstance(record.get("pubDate"), datetime.datetime):
            record["pubDate"] = record["pubDate"].isoformat()
        return record

    def getWikidataEventRecord(self, volume: Volume):
        """
        get the wikidata Record for the given volume
        """
        volumeTitle = volume.title
        volumeNumber = volume.number
        dblpEntityIds = self.dblpEndpoint.getDblpIdByVolumeNumber(number=volumeNumber)
        title = label = instanceOf = description = None
        if volumeTitle:
            instanceOf, description = self.getEventTypeFromTitle(volumeTitle)
            title = label = self.getEventNameFromTitle(volumeTitle)
        start_time = volume.dateFrom
        end_time = volume.dateTo
        record = {
            "title": title,
            "label": label,
            "description": description,
            "instanceOf": instanceOf,
            "short name": volume.acronym,
            "locationWikidataId": volume.cityWikidataId,
            "countryWikidataId": volume.countryWikidataId,
            "start time": start_time.isoformat() if start_time is not None else start_time,
            "end time": end_time.isoformat() if end_time is not None else end_time,
            "referenceUrl": volume.getVolumeUrl(),
        }
        if dblpEntityIds is not None and len(dblpEntityIds) > 0:
            dblpEntityId = dblpEntityIds[0]
            record["describedAt"] = self.dblpEndpoint.toDblpUrl(dblpEntityId)
            record["language of work or name"] = "Q1860"
            record["dblpEventId"] = self.dblpEndpoint.convertEntityIdToUrlId(entityId=dblpEntityId)
        # the modeling of virtual events has changed in wikidata
        # virtual event (Q7935096) is discontinued for conferences
        # if volume.isVirtualEvent():
        #     record["instanceOf"] = [instanceOf, "Q7935096"]
        return record

    def update(self, withStore: bool = True):
        """
        update my table from the Wikidata Proceedings SPARQL query
        """
        if self.debug:
            print(f"Querying proceedings from {self.baseurl} ...")
        # query proceedings
        wd_proceedings_records: list[dict] = self.sparql.queryAsListOfDicts(self.wdQuery.query)
        # query events
        event_query = self.qm.queriesByName["EventsByProceeding"]
        wd_event_records: list[dict] = self.sparql.queryAsListOfDicts(event_query.query)
        # add events to proceeding records
        proceedings_event_map, _duplicates = LOD.getLookup(wd_event_records, "item")
        for proceedings_record in wd_proceedings_records:
            item = proceedings_record.get("item")
            if item in proceedings_event_map:
                event_record = proceedings_event_map.get(item)
                proceedings_record.update(**event_record)
        primaryKey = "URN_NBN"
        withCreate = True
        withDrop = True
        entityInfo = self.sqldb.createTable(
            wd_proceedings_records,
            "Proceedings",
            primaryKey,
            withCreate,
            withDrop,
            sampleRecordCount=5000,
            failIfTooFew=False,
        )
        procsByURN, duplicates = LOD.getLookup(wd_proceedings_records, "URN_NBN")
        if withStore:
            self.sqldb.store(procsByURN.values(), entityInfo, executeMany=True, fixNone=True)
        if self.debug:
            print(f"stored {len(procsByURN.values())} proceedings records")
        if len(duplicates) > 0:
            print(f"found {len(duplicates)} duplicates URN entries")
            if len(duplicates) < 10:
                print(duplicates)
        return wd_proceedings_records

    def loadProceedingsFromCache(self):
        """
        load the proceedings records from the cache
        """
        sqlQuery = "SELECT * from Proceedings"
        self.procRecords = self.sqldb.query(sqlQuery)
        return self.procRecords

    def getProceedingsForVolume(self, searchVolnumber: int) -> dict | None:
        """
        get the proceedings record for the given searchVolnumber

        Args:
            searchVolnumber(int): the number of the volume to search

        Returns:
            dict: the record for the proceedings in wikidata
            None: if no proceedings record is found for the given searchVolnumber
        """
        if self.procRecords is None:
            self.loadProceedingsFromCache()
        if self.procsByVolnumber is None:
            self.procsByVolnumber: dict[int, dict] = {}
            if isinstance(self.procRecords, list):
                for procRecord in self.procRecords:
                    volnumber = procRecord.get("sVolume", None)
                    if volnumber is None:
                        volnumber = procRecord.get("Volume", None)
                    if volnumber is not None:
                        self.procsByVolnumber[int(volnumber)] = procRecord
        volProcRecord = self.procsByVolnumber.get(searchVolnumber, None)
        return volProcRecord

    def getProceedingWdItemsByUrn(self, urn: str) -> list[str]:
        """
        queries the wikidata items that have the given urn for the property P4109
        Args:
            urn: URN id to query for

        Returns:
            List of corresponding wikidata item ids, or an empty list if no matching item is found
        """
        query = f"""SELECT ?proceeding WHERE{{ ?proceeding wdt:P4109 "{urn}"}}"""
        qres = self.sparql.queryAsListOfDicts(query)
        wdItems = [record.get("proceeding") for record in qres]
        return wdItems

    def getEventWdItemsByUrn(self, urn: str) -> list[str]:
        """
        queries the wikidata proceedings that have the given urn assigned to P4109 and returns the assigned event
        Args:
            urn: URN id to query for

        Returns:
            List of corresponding wikidata item ids, or an empty list if no matching item is found
        """
        query = f"""SELECT ?event WHERE{{ ?proceeding wdt:P4109 "{urn}"; wdt:P4745 ?event .}}"""
        qres = self.sparql.queryAsListOfDicts(query)
        wdItems = [record.get("event") for record in qres]
        return wdItems

    def getEventsOfProceedings(self, itemId: str) -> list[str]:
        """
        get the item ids of the events that the given proceedings item is the proceedings of
        Args:
            itemId: Qid of the proceedings

        Returns:
            List of the events
        """
        query = f"""SELECT ?event WHERE {{ wd:{itemId} wdt:P4745 ?event.}}"""
        qres = self.sparql.queryAsListOfDicts(query)
        wdItems = [record.get("event")[len("http://www.wikidata.org/entity/") :] for record in qres]
        return wdItems

    def getEventsOfProceedingsByVolnumber(self, volnumber: int | str) -> list[str]:
        """
        get the item ids of the events that the given proceedings item is the proceedings of
        Args:
            volnumber: Volume number of the proceedings

        Returns:
            List of the events
        """
        query = f"""SELECT ?event 
                    WHERE {{
                    ?proceeding wdt:P31 wd:Q1143604; 
                                p:P179 [ps:P179 wd:Q27230297; pq:P478 "{volnumber}"]; 
                                wdt:P4745 ?event.}}
        """
        qres = self.sparql.queryAsListOfDicts(query)
        wdItems = [record.get("event")[len("http://www.wikidata.org/entity/") :] for record in qres]
        return wdItems

    def addProceedingsToWikidata(self, record: dict, write: bool = True, ignoreErrors: bool = False):
        """
        Creates a wikidata entry for the given record

        Args:
            record(dict): the data to add
            write(bool): if True actually write
            ignoreErrors(bool): if True ignore errors

        """
        if write:
            self.login()
        result = self.doAddProceedingsToWikidata(record, write, ignoreErrors)
        if write:
            self.logout()
        return result

    def doAddProceedingsToWikidata(
        self, record: dict, write: bool = True, ignoreErrors: bool = False
    ) -> WikidataResult:
        """
        Creates a wikidata proceedings entry for the given record

        Args:
            record(dict): the data to add
            write(bool): if True actually write
            ignoreErrors(bool): if True ignore errors
        Returns:
            WikidataResult: the result of the add operation
        """
        mappings = [
            PropertyMapping(
                column="instanceof",
                propertyName="instanceof",
                propertyId="P31",
                propertyType=WdDatatype.itemid,
                value="Q1143604",
            ),
            PropertyMapping(
                column="part of the series",
                propertyName="part of the series",
                propertyId="P179",
                propertyType=WdDatatype.itemid,
                value="Q27230297",
            ),
            PropertyMapping(
                column="volume",
                propertyName="volume",
                propertyId="P478",
                propertyType=WdDatatype.string,
                qualifierOf="part of the series",
            ),  # ToDo: refactor qualifier of anchor column or property name?
            PropertyMapping(
                column="short name",
                propertyName="short name",
                propertyId="P1813",
                propertyType=WdDatatype.text,
            ),
            PropertyMapping(
                column="pubDate",
                propertyName="publication date",
                propertyId="P577",
                propertyType=WdDatatype.date,
            ),
            PropertyMapping(
                column="title",
                propertyName="title",
                propertyId="P1476",
                propertyType=WdDatatype.text,
            ),
            PropertyMapping(
                column="ceurwsUrl",
                propertyName="described at URL",
                propertyId="P973",
                propertyType=WdDatatype.url,
            ),
            PropertyMapping(
                column="language of work or name",
                propertyName="language of work or name",
                propertyId="P407",
                propertyType=WdDatatype.itemid,
                qualifierOf="ceurwsUrl",
            ),
            PropertyMapping(
                column="fullWorkUrl",
                propertyName="full work available at URL",
                propertyId="P953",
                propertyType=WdDatatype.url,
            ),
            PropertyMapping(
                column="urn",
                propertyName="URN-NBN",
                propertyId="P4109",
                propertyType=WdDatatype.extid,
            ),
        ]
        reference = UrlReference(url=record.get("ceurwsUrl"))
        result = self.wd.add_record(
            record=record,
            property_mappings=mappings,
            write=write,
            ignore_errors=ignoreErrors,
            reference=reference,
        )
        return result

    def askWikidata(self, askQuery: str) -> bool:
        try:
            qres = self.sparql.rawQuery(askQuery).convert()
            return qres.get("boolean", False)
        except Exception as ex:
            print(ex)
            return False

    def checkIfProceedingsFromExists(self, volumeNumber: int, eventItemQid: str | None) -> bool:
        """Returns True if the is proceedings from relation already exists between the given proceedings and event"""
        eventVar = "?event"
        if eventItemQid is not None:
            eventVar = f"wd:{eventItemQid}"
        proceedingsWikidataId = self.getWikidataIdByVolumeNumber(number=volumeNumber)
        query = f"""ASK{{ wd:{proceedingsWikidataId} wdt:P4745 {eventVar}.}}"""
        proceedingExists = self.askWikidata(query)
        return proceedingExists

    def hasItemPropertyValueFor(self, item, propertyId: str):
        """
        ask wikidata if the given item has a value for the given property
        Args:
            item: item Qid
            propertyId: property Pid
        Returns:
            True if the item has the property else False
        """
        query = f"""ASK{{ wd:{item} wdt:{propertyId} ?value.}}"""
        return self.askWikidata(query)

    def addLinkBetweenProceedingsAndEvent(
        self,
        eventItemQid: str,
        volumeNumber: int | None = None,
        proceedingsWikidataId: str | None = None,
        write: bool = True,
        ignoreErrors: bool = False,
    ) -> WikidataResult:
        """
        add the link between the wikidata proceedings item and the given event wikidata item
        Args:
            volumeNumber: ceurws volume number of the proceedings
            eventItemQid: wikidata Qid of the event
            proceedingsWikidataId: wikidata id of the proceedings item
            write(bool): if True actually write
            ignoreErrors(bool): if True ignore errors

        Returns:
            WikidataResult: the result of the add operation

        Raises:
            ValueError: if the volume number is not provided or the volume is not unique in Wikidata
        """
        if proceedingsWikidataId is None:
            proceedingsWikidataId = self.getWikidataIdByVolumeNumber(number=volumeNumber)
        if proceedingsWikidataId is None:
            raise ValueError("Volume is not unique → Proceedings item can not be determined")
        mappings = [
            PropertyMapping(
                column="isProceedingsFrom",
                propertyName="is proceedings from",
                propertyId="P4745",
                propertyType=WdDatatype.itemid,
            )
        ]
        reference = None
        if volumeNumber is not None:
            volume_url = Volume.getVolumeUrlOf(volumeNumber)
            reference = UrlReference(volume_url)
        record = {"isProceedingsFrom": eventItemQid}
        result = self.wd.add_record(
            item_id=proceedingsWikidataId,
            record=record,
            property_mappings=mappings,
            write=write,
            ignore_errors=ignoreErrors,
            reference=reference,
        )
        return result

    def doAddEventToWikidata(self, record: dict, write: bool = True, ignoreErrors: bool = False):
        """
        Creates a wikidata event entry for the given record
        Args:
            record(dict): the data to add
            write(bool): if True actually write
            ignoreErrors(bool): if True ignore errors

        Returns:
            WikidataResult: the result of the add operation
        """
        entityQid = record.get("instanceOf")
        # entity = record.get("description")
        mappings = [
            PropertyMapping(
                column="instanceof",
                propertyName="instanceof",
                propertyId="P31",
                propertyType=WdDatatype.itemid,
                value=entityQid,
            ),
            PropertyMapping(
                column="short name",
                propertyName="short name",
                propertyId="P1813",
                propertyType=WdDatatype.text,
            ),
            PropertyMapping(
                column="describedAt",
                propertyName="described at URL",
                propertyId="P973",
                propertyType=WdDatatype.url,
            ),
            PropertyMapping(
                column="language of work or name",
                propertyName="language of work or name",
                propertyId="P407",
                propertyType=WdDatatype.itemid,
                qualifierOf="describedAt",
                value="Q1860",
            ),
            PropertyMapping(
                column="title",
                propertyName="title",
                propertyId="P1476",
                propertyType=WdDatatype.text,
            ),
            PropertyMapping(
                column="dblpEventId",
                propertyName="DBLP event ID",
                propertyId="P10692",
                propertyType=WdDatatype.extid,
            ),
            PropertyMapping(
                column="start time",
                propertyName="start time",
                propertyId="P580",
                propertyType=WdDatatype.date,
            ),
            PropertyMapping(
                column="end time",
                propertyName="end time",
                propertyId="P582",
                propertyType=WdDatatype.date,
            ),
            PropertyMapping(
                column="locationWikidataId",
                propertyName="location",
                propertyId="P276",
                propertyType=WdDatatype.itemid,
            ),
            PropertyMapping(
                column="countryWikidataId",
                propertyName="country",
                propertyId="P17",
                propertyType=WdDatatype.itemid,
            ),
        ]
        reference_url = record.pop("referenceUrl")
        reference = UrlReference(url=reference_url)
        result = self.wd.add_record(
            record=record,
            property_mappings=mappings,
            write=write,
            ignore_errors=ignoreErrors,
            reference=reference,
        )
        return result

    def addDblpPublicationId(
        self,
        volumeNumber: int,
        dblpRecordId: str | None = None,
        write: bool = True,
        ignoreErrors: bool = False,
    ) -> WikidataResult:
        """
        try to add the dblp publication id (P8978) to the proceedings record
        Args:
            volumeNumber: ceurws volume number of the proceedings
            dblpRecordId: dblp record id to add to the proceedings item. If None query dblp for the record id
            write: if True actually write
            ignoreErrors(bool): if True ignore errors

        Returns:
            WikidataResult: the result of the add operation
        """
        proceedingsWikidataId = self.getWikidataIdByVolumeNumber(number=volumeNumber)
        if proceedingsWikidataId is None:
            return WikidataResult(msg="Proceedings item can not be determined")
        if self.hasItemPropertyValueFor(item=proceedingsWikidataId, propertyId="P8978"):
            return WikidataResult(msg="dblp publication id is already assigned to the proceedings item")
        if dblpRecordId is None:
            dblpRecordIds = self.dblpEndpoint.getDblpIdByVolumeNumber(volumeNumber)
            if len(dblpRecordIds) == 1:
                dblpRecordId = dblpRecordIds[0]
            elif len(dblpRecordIds) > 1:
                return WikidataResult(msg=f"More than one proceedings record found ({dblpRecordIds})")
            else:
                return WikidataResult(msg=f"Proceedings of volume {volumeNumber} are not in dblp")
        mappings = [
            PropertyMapping(
                column="DBLP publication ID",
                propertyName="DBLP publication ID",
                propertyId="P8978",
                propertyType=WdDatatype.extid,
            )
        ]
        wdMetadata = [
            {
                "Entity": "proceedings",
                "Column": "DBLP publication ID",
                "PropertyName": "DBLP publication ID",
                "PropertyId": "P8978",
                "Type": "extid",
                "Qualifier": None,
                "Lookup": "",
            }
        ]
        mapDict, _ = LOD.getLookup(wdMetadata, "PropertyId")
        volume_url = Volume.getVolumeUrlOf(volumeNumber)
        reference = UrlReference(volume_url)
        record = {"DBLP publication ID": dblpRecordId}
        result = self.wd.add_record(
            item_id=proceedingsWikidataId,
            record=record,
            property_mappings=mappings,
            write=write,
            ignore_errors=ignoreErrors,
            reference=reference,
        )
        return result

    def addAcronymToItem(
        self,
        itemId: str,
        acronym: str,
        desc: str | None = None,
        label: str | None = None,
        write: bool = True,
        ignoreErrors: bool = False,
    ):
        """
        add the acronym to the given item
        Args:
            itemId: item to add the acronym to
            acronym(str): acronym of the item
            desc(str): optional description of the item
            label(str): optional label of the item
            write(bool): if True actually write
            ignoreErrors(bool): if True ignore errors

        Returns:
            (qid, errors) id of the created entry and occurred errors
        """
        wdMetadata = [
            {
                "Column": "short name",
                "PropertyName": "short name",
                "PropertyId": "P1813",
                "Type": "text",
                "Lookup": "",
            }
        ]
        record = {"short name": acronym, "description": desc, "label": label}
        map_dict, _ = LOD.getLookup(wdMetadata, "PropertyId")
        qId, errors = self.wd.addDict(
            itemId=itemId,
            row=record,
            mapDict=map_dict,
            write=write,
            ignoreErrors=ignoreErrors,
        )
        return qId, errors

    def addOfficialWebsiteToItem(
        self,
        itemId: str,
        officialWebsite: str,
        write: bool = True,
        ignoreErrors: bool = False,
    ):
        """
        add the official website to the given item
        Args:
            itemId: item to add the official website to
            officialWebsite(str): officialWebsite of the item
            write(bool): if True actually write
            ignoreErrors(bool): if True ignore errors

        Returns:
            WikidataResult: the result of the add operation
        """
        mappings = [
            PropertyMapping(
                column="official website",
                propertyName="official website",
                propertyId="P856",
                propertyType=WdDatatype.url,
            ),
            PropertyMapping(
                column="language of work or name",
                propertyName="language of work or name",
                propertyId="P407",
                propertyType=WdDatatype.itemid,
            ),
        ]
        record = {
            "official website": officialWebsite,
            "language of work or name": "Q1860",
        }
        qId, errors = self.wd.add_record(
            item_id=itemId,
            record=record,
            property_mappings=mappings,
            write=write,
            ignore_errors=ignoreErrors,
        )
        return qId, errors

    def getWikidataIdByVolumeNumber(self, number: int | None) -> str | None:
        """
        query wikidata for the qId of the proceedings of the given volume number
        Args:
            number: volume number

        Returns:
            str: wikidata id corresponding to the given volume number
            None: if the corresponding wikidata id was not found
        """
        if number is None:
            return None
        query = f"""SELECT * WHERE{{ ?proceeding p:P179 [ps:P179 wd:Q27230297; pq:P478 "{number}"].}}"""
        qres = self.sparql.queryAsListOfDicts(query)
        qid = None
        if qres is not None and qres != []:
            qids = [record.get("proceeding").split("/")[-1] for record in qres]
            if len(qids) > 1:
                print("CEUR-WS volume number is not unique")
            else:
                qid = qids[0]
        return qid

    def getWikidataIdByDblpEventId(self, entityId: str | None, volumeNumber: int | None = None) -> list[str]:
        """
        query wikidata for the qId of items that correspond to the given dblpEventId
        Args:
            entityId: id of a dblp event
            volumeNumber: volume number

        Returns:
            list of matching wikidata items
        """
        dblpEventId = self.dblpEndpoint.convertEntityIdToUrlId(entityId=entityId)
        dblpIds = [entityId, dblpEventId]
        dblpIdsStr = " ".join([f'"{dblpId}"' for dblpId in dblpIds])
        urls = ""
        if entityId is not None:
            urls = " ".join(
                [
                    f"<{self.dblpEndpoint.toDblpUrl(entityId)}>",
                    f"<{self.dblpEndpoint.toDblpUrl(entityId, True)}>",
                ]
            )
        volumeQuery = ""
        if volumeNumber is not None:
            volumeQuery = f"""
            UNION
                  {{
                  ?proceeding p:P179 [ps:P179 wd:Q27230297; pq:P478 "{volumeNumber}"].
                  ?proceeding wdt:P4745 ?qid.
                  }}
            """
        query = f"""SELECT DISTINCT ?qid
            WHERE{{
              VALUES ?url {{ {urls} }}
              VALUES ?dblpEventId {{ {dblpIdsStr} }}
              VALUES ?eventType {{wd:Q2020153 wd:Q40444998}}
              {{?qid wdt:P31 ?eventType; wdt:P973 ?url}}
              UNION
              {{?qid wdt:P31 ?eventType; wdt:P10692 ?dblpEventId}}
              {volumeQuery}
            }}
        """
        qres = self.sparql.queryAsListOfDicts(query)
        qIds = []
        if qres is not None and qres != []:
            qIds = [self.removeWdPrefix(record.get("qid")) for record in qres]
        return qIds

    @classmethod
    def getEventNameFromTitle(cls, title: str) -> str:
        """
        Get the event name from the given proceedings title
        Args:
            title: title of the proceeding

        Returns:
            name of the event
        """
        prefixes = [
            "Proceedings of the",
            "Proceedings of",
            "Joint Proceedings of the",
            "Joint Proceedings of",
            "Joint Proceedings",
            "Joint Proceeding of the",
            "Joint Proceeding of",
            "Selected Papers of the",
            "Selected Contributions of the",
            "Workshops Proceedings for the",
            "Supplementary Proceedings of the",
            "Short Paper Proceedings of",
            "Short Paper Proceedings of the",
            "Working Notes Proceedings of the",
            "Working Notes of",
            "Working Notes for",
            "Joint Workshop Proceedings of the",
            "Joint Workshop Proceedings of",
            "Workshop Proceedings from",
            "Workshop and Poster Proceedings of the",
            "Workshops Proceedings and Tutorials of the",
            "Extended Papers of the",
            "Short Papers Proceedings of the",
            "Short Papers Proceedings of",
            "Proceedings of the Selected Papers of the",
            "Proceedings of the Working Notes of",
            "Proceedings of the Doctoral Consortium Papers Presented at the",
            "Selected Contributions to the",
            "Selected and Revised Papers of",
            "Selected Papers of",
            "Up-and-Coming and Short Papers of the",
            "Academic Papers at",
            "Poster Track of the",
            "Actes de la",
            "Post-proceedings of the",
            "Late Breaking Papers of the",
            "Anais do",
            "Proceedings del",
            "Proceedings",
            "Gemeinsamer Tagungsband der",
            "Local Proceedings of the",
            "Local Proceedings and Materials of",
        ]
        postfixes = [
            "Workshop Proceedings",
            "Proceedings",
            "Conference Proceedings",
            "Workshops Proceedings",
            "Adjunct Proceedings",
            "Poster and Demo Proceedings",
            "(full papers)",
        ]
        if title is not None:
            prefixes.sort(key=lambda prefix: len(prefix), reverse=True)
            for prefix in prefixes:
                if title.lower().startswith(prefix.lower()):
                    title = title[len(prefix) :]
                    title = title.strip()
                    break
            postfixes.sort(key=lambda postfix: len(postfix), reverse=True)
            for postfix in postfixes:
                if title.lower().endswith(postfix.lower()):
                    title = title[: -len(postfix)]
                    title = title.strip(" .,")
                    break
        return title

    @classmethod
    def getEventTypeFromTitle(cls, title: str) -> tuple[str | None, str | None]:
        """
        Extract the event type from the given title
        Assumption: the most specific event type mentioned is the correct one, i.e. "workshop" wins over "conference"
        Args:
            title: title of the event

        Returns:
            wikidata id and label of the event type
        """
        if title is None or title == "":
            return None, None
        academicConference = ("Q2020153", "academic conference")
        academicWorkshop = ("Q40444998", "academic workshop")
        if "workshop" in title.lower():
            return academicWorkshop
        elif "conference" in title.lower() or "symposium" in title.lower():
            return academicConference
        else:
            return academicWorkshop

    def doCreateEventItemAndLinkProceedings(
        self,
        volume: Volume,
        proceedingsWikidataId: str | None = None,
        write: bool = False,
    ) -> dict[str, WikidataResult]:
        """
        Create an event wikidata item for the given volume and link the proceedings with the event
        Args:
            volume: volume to create the event for
            proceedingsWikidataId: proceedings wikidata id of the event
            write: If True actually write

        Returns:
            dict[str, WikidataResult]: the results of the individual steps keyed by step name
        """
        results = {}
        vol_number = volume.number
        if (
            proceedingsWikidataId is None
            and vol_number is not None
            and self.checkIfProceedingsFromExists(vol_number, eventItemQid=None)
        ):
            # link between proceedings and event already exists
            proceedingsWikidataId = self.getWikidataIdByVolumeNumber(number=vol_number)
            results["Proceedings"] = WikidataResult(
                qid=proceedingsWikidataId,
                msg=f"Proceedings for Vol-{vol_number} already exists",
            )
        dblpEntityIds = self.dblpEndpoint.getDblpIdByVolumeNumber(vol_number)
        dblpEntityId = None
        msg = None
        if len(dblpEntityIds) > 1:
            msg = f"Multiple dblpEventIds found for Vol-{vol_number}: {','.join(dblpEntityIds)}"
        elif len(dblpEntityIds) == 1:
            dblpEntityId = dblpEntityIds[0]
        else:
            dblpEntityId = None
        results["dblp"] = WikidataResult(qid=dblpEntityId, msg=msg)
        wdItems = self.getWikidataIdByDblpEventId(dblpEntityId, vol_number)
        msg = ""
        eventQid = None
        if len(wdItems) == 0:
            # event item does not exist → create a new one
            volume.resolveLoctime()
            eventRecord = self.getWikidataEventRecord(volume)
            event_result = self.doAddEventToWikidata(record=eventRecord, write=write)
            eventQid = event_result.qid
            results["Event"] = event_result
        elif len(wdItems) == 1:
            results["Event"] = WikidataResult(
                # the event item already exists
                qid=wdItems[0],
                msg="Event item already exists;",
            )
        else:
            results["Event"] = WikidataResult(msg=f"Multiple event entries exist: {','.join(wdItems)}")
        if eventQid is not None:
            # add link between Proceedings and the event item
            link_result = self.addLinkBetweenProceedingsAndEvent(
                volumeNumber=vol_number,
                eventItemQid=eventQid,
                proceedingsWikidataId=proceedingsWikidataId,
                write=write,
            )
            link_result.msg = "Added Link between Proceedings and Event item;"
            results["link"] = link_result
        return results

    @classmethod
    def removeWdPrefix(cls, value: str):
        """
        removes the wikidata entity prefix
        Args:
            value: wikidata entity url
        """
        wd_prefix = "http://www.wikidata.org/entity/"
        if value is not None and isinstance(value, str) and value.startswith(wd_prefix):
            value = value[len(wd_prefix):]
        return value

    def getAuthorByIds(self, identifiers: dict) -> dict[str, str]:
        """
        Based on the given identifiers get potential author items
        the names of the identifiers must be according to DblpAuthorIdentifier
        Args:
            identifiers: known identifiers of the author
        """
        if identifiers is None or len(identifiers) == 0:
            return dict()
        id_map = DblpAuthorIdentifier.getAllAsMap()
        optional_clauses = []
        for id_name, id_value in identifiers.items():
            if id_value is not None and id_value != "":
                id_query = None
                if id_name in id_map:
                    id_query = DblpAuthorIdentifier.getWikidataIdQueryPart(id_name, id_value, "?person")
                else:
                    if id_name == "homepage":
                        id_query = f"{{ ?person wdt:P856 <{id_value}>. }}"
                if id_query is not None:
                    optional_clauses.append(id_query)
        id_queries = "\nUNION\n".join(optional_clauses)
        query = f"""SELECT DISTINCT ?person ?personLabel
                    WHERE
                    {{
                        {id_queries}
                        ?person rdfs:label ?personLabel. FILTER(lang(?personLabel)="en").
                    }}"""
        qres = self.sparql.queryAsListOfDicts(query)
        res = dict()
        for record in qres:
            if record is None or len(record) == 0:
                continue
            item_id = self.removeWdPrefix(record.get("person"))
            name = record.get("personLabel")
            res[item_id] = name
        return res

__init__(baseurl='https://www.wikidata.org', debug=False, dblp_endpoint_url=None)

Constructor

Parameters:

Name               Type        Description                            Default
baseurl            str         the baseurl of the wikidata endpoint   'https://www.wikidata.org'
debug              bool        if True switch on debugging            False
dblp_endpoint_url  str | None  sparql endpoint url of dblp            None
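
A hedged construction sketch; the dblp endpoint URL shown below is an assumption for illustration, not taken from the source:

from ceurws.wikidatasync import WikidataSync

# default construction: www.wikidata.org and the module's default dblp endpoint
wd_sync = WikidataSync()

# explicit construction with an assumed dblp SPARQL endpoint URL
wd_sync = WikidataSync(
    baseurl="https://www.wikidata.org",
    debug=True,
    dblp_endpoint_url="https://qlever.cs.uni-freiburg.de/api/dblp",  # assumed URL
)
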
Source code in ceurws/wikidatasync.py
def __init__(
    self,
    baseurl: str = "https://www.wikidata.org",
    debug: bool = False,
    dblp_endpoint_url: str | None = None,
):
    """
    Constructor

    Args:
        baseurl(str): the baseurl of the wikidata endpoint
        debug(bool): if True switch on debugging
        dblp_endpoint_url: sparql endpoint url of dblp
    """
    if dblp_endpoint_url is None:
        dblp_endpoint_url = DBLP_ENDPOINT.endpoint
    self.debug = debug
    self.prepareVolumeManager()
    self.preparePaperManager()
    self.prepareRDF()
    self.wdQuery = self.qm.queriesByName["Proceedings"]
    self.baseurl = baseurl
    self.wd = Wikidata(debug=debug)
    self.sqldb = SQLDB(CEURWS.CACHE_FILE, check_same_thread=False)
    self.procRecords = None
    self.procsByVolnumber = None
    self.dblpEndpoint = DblpEndpoint(endpoint=dblp_endpoint_url)
    self.wikidata_endpoint: Endpoint | None = None

addAcronymToItem(itemId, acronym, desc=None, label=None, write=True, ignoreErrors=False)

add the acronym to the given item

Parameters:

    itemId: item to add the acronym to
    acronym(str): acronym of the item
    desc(str): optional description of the item
    label(str): optional label of the item
    write(bool): if True actually write
    ignoreErrors(bool): if True ignore errors

Returns:

(qid, errors): the id of the created entry and any errors that occurred
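
A dry-run sketch (wd_sync as constructed above); the item id and acronym are hypothetical:

qid, errors = wd_sync.addAcronymToItem(
    itemId="Q113544823",    # hypothetical event item
    acronym="FOO-WS 2022",  # hypothetical acronym
    write=False,            # dry run - nothing is written to Wikidata
)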

Source code in ceurws/wikidatasync.py
def addAcronymToItem(
    self,
    itemId: str,
    acronym: str,
    desc: str | None = None,
    label: str | None = None,
    write: bool = True,
    ignoreErrors: bool = False,
):
    """
    add the acronym to the given item
    Args:
        itemId: item to add the acronym to
        acronym(str): acronym of the item
        desc(str): optional description of the item
        label(str): optional label of the item
        write(bool): if True actually write
        ignoreErrors(bool): if True ignore errors

    Returns:
        (qid, errors) id of the created entry and occurred errors
    """
    wdMetadata = [
        {
            "Column": "short name",
            "PropertyName": "short name",
            "PropertyId": "P1813",
            "Type": "text",
            "Lookup": "",
        }
    ]
    record = {"short name": acronym, "description": desc, "label": label}
    map_dict, _ = LOD.getLookup(wdMetadata, "PropertyId")
    qId, errors = self.wd.addDict(
        itemId=itemId,
        row=record,
        mapDict=map_dict,
        write=write,
        ignoreErrors=ignoreErrors,
    )
    return qId, errors

addDblpPublicationId(volumeNumber, dblpRecordId=None, write=True, ignoreErrors=False)

try to add the dblp publication id (P8978) to the proceedings record

Parameters:

    volumeNumber: ceurws volume number of the proceedings
    dblpRecordId: dblp record id to add to the proceedings item; if None, query dblp for the record id
    write: if True actually write
    ignoreErrors(bool): if True ignore errors

Returns:

WikidataResult: the result of the add operation
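
A dry-run sketch (wd_sync as constructed above); the volume number is hypothetical:

result = wd_sync.addDblpPublicationId(volumeNumber=3262, write=False)  # hypothetical volume
print(result.qid, result.msg)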

Source code in ceurws/wikidatasync.py
def addDblpPublicationId(
    self,
    volumeNumber: int,
    dblpRecordId: str | None = None,
    write: bool = True,
    ignoreErrors: bool = False,
) -> WikidataResult:
    """
    try to add the dblp publication id (P8978) to the proceedings record
    Args:
        volumeNumber: ceurws volume number of the proceedings
        dblpRecordId: dblp record id to add to the proceedings item. If None query dblp for the record id
        write: if True actually write
        ignoreErrors(bool): if True ignore errors

    Returns:
        WikidataResult: the result of the add operation
    """
    proceedingsWikidataId = self.getWikidataIdByVolumeNumber(number=volumeNumber)
    if proceedingsWikidataId is None:
        return WikidataResult(msg="Proceedings item can not be determined")
    if self.hasItemPropertyValueFor(item=proceedingsWikidataId, propertyId="P8978"):
        return WikidataResult(msg="dblp publication id is already assigned to the proceedings item")
    if dblpRecordId is None:
        dblpRecordIds = self.dblpEndpoint.getDblpIdByVolumeNumber(volumeNumber)
        if len(dblpRecordIds) == 1:
            dblpRecordId = dblpRecordIds[0]
        elif len(dblpRecordIds) > 1:
            return WikidataResult(msg=f"More than one proceedings record found ({dblpRecordIds})")
        else:
            return WikidataResult(msg=f"Proceedings of volume {volumeNumber} are not in dblp")
    mappings = [
        PropertyMapping(
            column="DBLP publication ID",
            propertyName="DBLP publication ID",
            propertyId="P8978",
            propertyType=WdDatatype.extid,
        )
    ]
    wdMetadata = [
        {
            "Entity": "proceedings",
            "Column": "DBLP publication ID",
            "PropertyName": "DBLP publication ID",
            "PropertyId": "P8978",
            "Type": "extid",
            "Qualifier": None,
            "Lookup": "",
        }
    ]
    mapDict, _ = LOD.getLookup(wdMetadata, "PropertyId")
    volume_url = Volume.getVolumeUrlOf(volumeNumber)
    reference = UrlReference(volume_url)
    record = {"DBLP publication ID": dblpRecordId}
    result = self.wd.add_record(
        item_id=proceedingsWikidataId,
        record=record,
        property_mappings=mappings,
        write=write,
        ignore_errors=ignoreErrors,
        reference=reference,
    )
    return result

addLinkBetweenProceedingsAndEvent(eventItemQid, volumeNumber=None, proceedingsWikidataId=None, write=True, ignoreErrors=False)

add the link between the wikidata proceedings item and the given event wikidata item

Parameters:

    eventItemQid: wikidata Qid of the event
    volumeNumber: ceurws volume number of the proceedings
    proceedingsWikidataId: wikidata id of the proceedings item
    write(bool): if True actually write
    ignoreErrors(bool): if True ignore errors

Returns:

WikidataResult: the result of the add operation

Raises:

ValueError: if the volume number is not provided or the volume is not unique in Wikidata
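
A dry-run sketch (wd_sync as constructed above); the Qid and volume number are hypothetical:

result = wd_sync.addLinkBetweenProceedingsAndEvent(
    eventItemQid="Q106245764",  # hypothetical event item
    volumeNumber=3262,          # hypothetical volume
    write=False,                # dry run
)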

Source code in ceurws/wikidatasync.py
def addLinkBetweenProceedingsAndEvent(
    self,
    eventItemQid: str,
    volumeNumber: int | None = None,
    proceedingsWikidataId: str | None = None,
    write: bool = True,
    ignoreErrors: bool = False,
) -> WikidataResult:
    """
    add the link between the wikidata proceedings item and the given event wikidata item
    Args:
        volumeNumber: ceurws volume number of the proceedings
        eventItemQid: wikidata Qid of the event
        proceedingsWikidataId: wikidata id of the proceedings item
        write(bool): if True actually write
        ignoreErrors(bool): if True ignore errors

    Returns:
        WikidataResult: the result of the add operation

    Raises:
        ValueError: if the volume number is not provided or the volume is not unique in Wikidata
    """
    if proceedingsWikidataId is None:
        proceedingsWikidataId = self.getWikidataIdByVolumeNumber(number=volumeNumber)
    if proceedingsWikidataId is None:
        raise ValueError("Volume is not unique → Proceedings item can not be determined")
    mappings = [
        PropertyMapping(
            column="isProceedingsFrom",
            propertyName="is proceedings from",
            propertyId="P4745",
            propertyType=WdDatatype.itemid,
        )
    ]
    reference = None
    if volumeNumber is not None:
        volume_url = Volume.getVolumeUrlOf(volumeNumber)
        reference = UrlReference(volume_url)
    record = {"isProceedingsFrom": eventItemQid}
    result = self.wd.add_record(
        item_id=proceedingsWikidataId,
        record=record,
        property_mappings=mappings,
        write=write,
        ignore_errors=ignoreErrors,
        reference=reference,
    )
    return result

addOfficialWebsiteToItem(itemId, officialWebsite, write=True, ignoreErrors=False)

add the official website to the given item

Parameters:

    itemId: item to add the official website to
    officialWebsite(str): officialWebsite of the item
    write(bool): if True actually write
    ignoreErrors(bool): if True ignore errors

Returns:

WikidataResult: the result of the add operation
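
A dry-run sketch (wd_sync as constructed above); the item id and website are hypothetical:

qid, errors = wd_sync.addOfficialWebsiteToItem(
    itemId="Q106245764",                           # hypothetical item
    officialWebsite="https://example.org/foo-ws",  # hypothetical website
    write=False,
)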

Source code in ceurws/wikidatasync.py
def addOfficialWebsiteToItem(
    self,
    itemId: str,
    officialWebsite: str,
    write: bool = True,
    ignoreErrors: bool = False,
):
    """
    add the official website to the given item
    Args:
        itemId: item to add the official website to
        officialWebsite(str): officialWebsite of the item
        write(bool): if True actually write
        ignoreErrors(bool): if True ignore errors

    Returns:
        WikidataResult: the result of the add operation
    """
    mappings = [
        PropertyMapping(
            column="official website",
            propertyName="official website",
            propertyId="P856",
            propertyType=WdDatatype.url,
        ),
        PropertyMapping(
            column="language of work or name",
            propertyName="language of work or name",
            propertyId="P407",
            propertyType=WdDatatype.itemid,
        ),
    ]
    record = {
        "official website": officialWebsite,
        "language of work or name": "Q1860",
    }
    qId, errors = self.wd.add_record(
        item_id=itemId,
        record=record,
        property_mappings=mappings,
        write=write,
        ignore_errors=ignoreErrors,
    )
    return qId, errors

addProceedingsToWikidata(record, write=True, ignoreErrors=False)

Creates a wikidata entry for the given record

Parameters:

Name          Type  Description             Default
record        dict  the data to add         required
write         bool  if True actually write  True
ignoreErrors  bool  if True ignore errors   False
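
A dry-run sketch (wd_sync as constructed above); the volume number is hypothetical. With write=True the method logs in before and logs out after delegating to doAddProceedingsToWikidata:

volume = wd_sync.volumesByNumber[3262]  # hypothetical volume number
record = wd_sync.getWikidataProceedingsRecord(volume)
result = wd_sync.addProceedingsToWikidata(record, write=False)
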
Source code in ceurws/wikidatasync.py
def addProceedingsToWikidata(self, record: dict, write: bool = True, ignoreErrors: bool = False):
    """
    Creates a wikidata entry for the given record

    Args:
        record(dict): the data to add
        write(bool): if True actually write
        ignoreErrors(bool): if True ignore errors

    """
    if write:
        self.login()
    result = self.doAddProceedingsToWikidata(record, write, ignoreErrors)
    if write:
        self.logout()
    return result

addVolume(volume)

add the given volume

Parameters:

Name    Type    Description        Default
volume  Volume  the volume to add  required
Source code in ceurws/wikidatasync.py
def addVolume(self, volume: Volume):
    """
    add the given volume

    Args:
        volume(Volume): the volume to add
    """
    self.volumeList.append(volume)
    self.volumesByNumber[volume.number] = volume
    self.volumeCount += 1

checkIfProceedingsFromExists(volumeNumber, eventItemQid)

Returns True if the "is proceedings from" relation already exists between the given proceedings and event
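
A usage sketch (wd_sync as constructed above); the volume number is hypothetical. Passing eventItemQid=None checks for a link to any event:

exists = wd_sync.checkIfProceedingsFromExists(volumeNumber=3262, eventItemQid=None)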

Source code in ceurws/wikidatasync.py
def checkIfProceedingsFromExists(self, volumeNumber: int, eventItemQid: str | None) -> bool:
    """Returns True if the is proceedings from relation already exists between the given proceedings and event"""
    eventVar = "?event"
    if eventItemQid is not None:
        eventVar = f"wd:{eventItemQid}"
    proceedingsWikidataId = self.getWikidataIdByVolumeNumber(number=volumeNumber)
    query = f"""ASK{{ wd:{proceedingsWikidataId} wdt:P4745 {eventVar}.}}"""
    proceedingExists = self.askWikidata(query)
    return proceedingExists

doAddEventToWikidata(record, write=True, ignoreErrors=False)

Creates a wikidata event entry for the given record

Parameters:

    record(dict): the data to add
    write(bool): if True actually write
    ignoreErrors(bool): if True ignore errors

Returns:

WikidataResult: the result of the add operation
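
A dry-run sketch (wd_sync as constructed above); the volume number is hypothetical. The record must carry a referenceUrl, as getWikidataEventRecord provides:

volume = wd_sync.volumesByNumber[3262]  # hypothetical volume number
volume.resolveLoctime()                 # resolve location and time as doCreateEventItemAndLinkProceedings does
event_record = wd_sync.getWikidataEventRecord(volume)
result = wd_sync.doAddEventToWikidata(record=event_record, write=False)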

Source code in ceurws/wikidatasync.py
def doAddEventToWikidata(self, record: dict, write: bool = True, ignoreErrors: bool = False):
    """
    Creates a wikidata event entry for the given record
    Args:
        record(dict): the data to add
        write(bool): if True actually write
        ignoreErrors(bool): if True ignore errors

    Returns:
        WikidataResult: the result of the add operation
    """
    entityQid = record.get("instanceOf")
    # entity = record.get("description")
    mappings = [
        PropertyMapping(
            column="instanceof",
            propertyName="instanceof",
            propertyId="P31",
            propertyType=WdDatatype.itemid,
            value=entityQid,
        ),
        PropertyMapping(
            column="short name",
            propertyName="short name",
            propertyId="P1813",
            propertyType=WdDatatype.text,
        ),
        PropertyMapping(
            column="describedAt",
            propertyName="described at URL",
            propertyId="P973",
            propertyType=WdDatatype.url,
        ),
        PropertyMapping(
            column="language of work or name",
            propertyName="language of work or name",
            propertyId="P407",
            propertyType=WdDatatype.itemid,
            qualifierOf="describedAt",
            value="Q1860",
        ),
        PropertyMapping(
            column="title",
            propertyName="title",
            propertyId="P1476",
            propertyType=WdDatatype.text,
        ),
        PropertyMapping(
            column="dblpEventId",
            propertyName="DBLP event ID",
            propertyId="P10692",
            propertyType=WdDatatype.extid,
        ),
        PropertyMapping(
            column="start time",
            propertyName="start time",
            propertyId="P580",
            propertyType=WdDatatype.date,
        ),
        PropertyMapping(
            column="end time",
            propertyName="end time",
            propertyId="P582",
            propertyType=WdDatatype.date,
        ),
        PropertyMapping(
            column="locationWikidataId",
            propertyName="location",
            propertyId="P276",
            propertyType=WdDatatype.itemid,
        ),
        PropertyMapping(
            column="countryWikidataId",
            propertyName="country",
            propertyId="P17",
            propertyType=WdDatatype.itemid,
        ),
    ]
    reference_url = record.pop("referenceUrl")
    reference = UrlReference(url=reference_url)
    result = self.wd.add_record(
        record=record,
        property_mappings=mappings,
        write=write,
        ignore_errors=ignoreErrors,
        reference=reference,
    )
    return result
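
The record keys must match the column names of the mappings above; a sketch of such a record with illustrative values (getWikidataEventRecord builds this shape from a volume):

# illustrative event record; all values are made up for the example
event_record = {
    "instanceOf": "Q40444998",  # academic workshop
    "title": "Third Workshop on Semantic Web",
    "label": "Third Workshop on Semantic Web",
    "short name": "SemWeb 2023",
    "start time": "2023-05-01",
    "end time": "2023-05-02",
    "referenceUrl": "http://ceur-ws.org/Vol-9999/",  # popped and used as UrlReference
}
result = wd_sync.doAddEventToWikidata(record=event_record, write=False)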

doAddProceedingsToWikidata(record, write=True, ignoreErrors=False)

Creates a wikidata proceedings entry for the given record

Parameters:

    record (dict): the data to add (required)
    write (bool): if True actually write (default: True)
    ignoreErrors (bool): if True ignore errors (default: False)

Returns:

    WikidataResult: the result of the add operation

Source code in ceurws/wikidatasync.py
def doAddProceedingsToWikidata(
    self, record: dict, write: bool = True, ignoreErrors: bool = False
) -> WikidataResult:
    """
    Creates a wikidata proceedings entry for the given record

    Args:
        record(dict): the data to add
        write(bool): if True actually write
        ignoreErrors(bool): if True ignore errors
    Returns:
        WikidataResult: the result of the add operation
    """
    mappings = [
        PropertyMapping(
            column="instanceof",
            propertyName="instanceof",
            propertyId="P31",
            propertyType=WdDatatype.itemid,
            value="Q1143604",
        ),
        PropertyMapping(
            column="part of the series",
            propertyName="part of the series",
            propertyId="P179",
            propertyType=WdDatatype.itemid,
            value="Q27230297",
        ),
        PropertyMapping(
            column="volume",
            propertyName="volume",
            propertyId="P478",
            propertyType=WdDatatype.string,
            qualifierOf="part of the series",
        ),  # ToDo: refactor qualifier of anchor column or property name?
        PropertyMapping(
            column="short name",
            propertyName="short name",
            propertyId="P1813",
            propertyType=WdDatatype.text,
        ),
        PropertyMapping(
            column="pubDate",
            propertyName="publication date",
            propertyId="P577",
            propertyType=WdDatatype.date,
        ),
        PropertyMapping(
            column="title",
            propertyName="title",
            propertyId="P1476",
            propertyType=WdDatatype.text,
        ),
        PropertyMapping(
            column="ceurwsUrl",
            propertyName="described at URL",
            propertyId="P973",
            propertyType=WdDatatype.url,
        ),
        PropertyMapping(
            column="language of work or name",
            propertyName="language of work or name",
            propertyId="P407",
            propertyType=WdDatatype.itemid,
            qualifierOf="ceurwsUrl",
        ),
        PropertyMapping(
            column="fullWorkUrl",
            propertyName="full work available at URL",
            propertyId="P953",
            propertyType=WdDatatype.url,
        ),
        PropertyMapping(
            column="urn",
            propertyName="URN-NBN",
            propertyId="P4109",
            propertyType=WdDatatype.extid,
        ),
    ]
    reference = UrlReference(url=record.get("ceurwsUrl"))
    result = self.wd.add_record(
        record=record,
        property_mappings=mappings,
        write=write,
        ignore_errors=ignoreErrors,
        reference=reference,
    )
    return result
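
A sketch of a matching record with illustrative values; getWikidataProceedingsRecord produces this shape from a volume:

# illustrative proceedings record; ceurwsUrl doubles as the reference URL
proc_record = {
    "title": "Proceedings of the Third Workshop on Semantic Web",
    "label": "Proceedings of the Third Workshop on Semantic Web",
    "short name": "SemWeb 2023",
    "volume": "9999",  # volume number; mapped as a string qualifier of P179
    "pubDate": "2023-05-15",
    "ceurwsUrl": "http://ceur-ws.org/Vol-9999/",
    "fullWorkUrl": "http://ceur-ws.org/Vol-9999/",
    "language of work or name": "Q1860",  # English
}
result = wd_sync.doAddProceedingsToWikidata(proc_record, write=False)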

doCreateEventItemAndLinkProceedings(volume, proceedingsWikidataId=None, write=False)

Create an event wikidata item for the given volume and link the proceedings with the event

Parameters:

    volume (Volume): volume to create the event for (required)
    proceedingsWikidataId (str | None): proceedings wikidata id of the event (default: None)
    write (bool): if True actually write (default: False)

Returns:

    dict[str, WikidataResult]: the results of the individual steps, keyed by "Proceedings", "dblp", "Event" and "link"

Source code in ceurws/wikidatasync.py
def doCreateEventItemAndLinkProceedings(
    self,
    volume: Volume,
    proceedingsWikidataId: str | None = None,
    write: bool = False,
) -> dict[str, WikidataResult]:
    """
    Create event  wikidata item for given volume and link the proceedings with the event
    Args:
        volume: volume to create the event for
        proceedingsWikidataId: proceedings wikidata id of the event
        write: If True actually write

    Returns:
        proceedingsQId, eventQId, msg
    """
    results = {}
    vol_number = volume.number
    if (
        proceedingsWikidataId is None
        and vol_number is not None
        and self.checkIfProceedingsFromExists(vol_number, eventItemQid=None)
    ):
        # link between proceedings and event already exists
        proceedingsWikidataId = self.getWikidataIdByVolumeNumber(number=vol_number)
        results["Proceedings"] = WikidataResult(
            qid=proceedingsWikidataId,
            msg=f"Proceedings for Vol-{vol_number} already exists",
        )
    dblpEntityIds = self.dblpEndpoint.getDblpIdByVolumeNumber(vol_number)
    dblpEntityId = None
    msg = None
    if len(dblpEntityIds) > 1:
        msg = f"Multiple dblpEventIds found for Vol-{vol_number}: {','.join(dblpEntityIds)}"
    elif len(dblpEntityIds) == 1:
        dblpEntityId = dblpEntityIds[0]
    else:
        dblpEntityId = None
    results["dblp"] = WikidataResult(qid=dblpEntityId, msg=msg)
    wdItems = self.getWikidataIdByDblpEventId(dblpEntityId, vol_number)
    msg = ""
    eventQid = None
    if len(wdItems) == 0:
        # event item does not exist → create a new one
        volume.resolveLoctime()
        eventRecord = self.getWikidataEventRecord(volume)
        event_result = self.doAddEventToWikidata(record=eventRecord, write=write)
        eventQid = event_result.qid
        results["Event"] = event_result
    elif len(wdItems) == 1:
        results["Event"] = WikidataResult(
            # the event item already exists
            qid=wdItems[0],
            msg="Event item already exists;",
        )
    else:
        results["Event"] = WikidataResult(msg=f"Multiple event entries exist: {','.join(wdItems)}")
    if eventQid is not None:
        # add link between Proceedings and the event item
        link_result = self.addLinkBetweenProceedingsAndEvent(
            volumeNumber=vol_number,
            eventItemQid=eventQid,
            proceedingsWikidataId=proceedingsWikidataId,
            write=write,
        )
        link_result.msg = "Added Link between Proceedings and Event item;"
        results["link"] = link_result
    return results
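
A sketch of inspecting the per-step results (wd_sync and volume are illustrative names):

results = wd_sync.doCreateEventItemAndLinkProceedings(volume, write=False)
for step, result in results.items():
    # each WikidataResult carries a qid and a msg
    print(step, result.qid, result.msg)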

from_args(args) classmethod

create a WikidataSync object from the given command line arguments

Parameters:

    args (Namespace): the command line arguments (required)
Source code in ceurws/wikidatasync.py
@classmethod
def from_args(cls, args) -> "WikidataSync":
    """
    create a WikidataSync object from the given command line arguments

    Args:
        args(Namespace): the command line arguments
    """
    wd_en = args.wikidata_endpoint_name
    dblp_en = args.dblp_endpoint_name
    wd_sync = cls.from_endpoint_names(wd_en, dblp_en, debug=args.debug)
    return wd_sync

from_endpoint_names(wd_en, dblp_en, debug=False) classmethod

create a WikidataSync object from the given endpoint names

Parameters:

    wd_en (str): wikidata endpoint name (required)
    dblp_en (str): dblp endpoint name (required)
Source code in ceurws/wikidatasync.py
@classmethod
def from_endpoint_names(cls, wd_en: str, dblp_en: str, debug: bool = False) -> "WikidataSync":
    """
    create a WikidataSync object from the given endpoint names

    Args:
        wd_en(str): wikidata endpoint name
        dblp_en(str): dblp endpoint name
    """
    endpoints = EndpointManager.getEndpoints()
    if wd_en not in endpoints:
        raise Exception(f"invalid wikidata endpoint name {wd_en}\nsee sparqlquery -le ")
    if dblp_en not in endpoints:
        raise Exception(f"invalid dblp endpoint name {dblp_en}\nsee sparqlquery -le ")
    dblp_ep = endpoints[dblp_en]
    wd_ep = endpoints[wd_en]
    wd_sync = cls(
        baseurl=wd_ep.endpoint,
        dblp_endpoint_url=dblp_ep.endpoint,
        debug=debug,
    )
    wd_sync.wikidata_endpoint = wd_ep
    return wd_sync
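
A usage sketch; the endpoint names "wikidata" and "dblp" are illustrative and must exist in your EndpointManager configuration:

from ceurws.wikidatasync import WikidataSync

wd_sync = WikidataSync.from_endpoint_names("wikidata", "dblp", debug=True)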

getAuthorByIds(identifiers)

Based on the given identifiers get potential author items; the names of the identifiers must be according to DblpAuthorIdentifier

Parameters:

    identifiers (dict): known identifiers of the author (required)

Source code in ceurws/wikidatasync.py
def getAuthorByIds(self, identifiers: dict) -> dict[str, str]:
    """
    Based on the given identifiers get potential author items
    the names of the identifiers must be according to DblpAuthorIdentifier
    Args:
        identifiers: known identifiers of the author
    """
    if identifiers is None or len(identifiers) == 0:
        return dict()
    id_map = DblpAuthorIdentifier.getAllAsMap()
    optional_clauses = []
    for id_name, id_value in identifiers.items():
        if id_value is not None and id_value != "":
            id_query = None
            if id_name in id_map:
                id_query = DblpAuthorIdentifier.getWikidataIdQueryPart(id_name, id_value, "?person")
            else:
                if id_name == "homepage":
                    id_query = f"{{ ?person wdt:P856 <{id_value}>. }}"
            if id_query is not None:
                optional_clauses.append(id_query)
    id_queries = "\nUNION\n".join(optional_clauses)
    query = f"""SELECT DISTINCT ?person ?personLabel
                WHERE
                {{
                    {id_queries}
                    ?person rdfs:label ?personLabel. FILTER(lang(?personLabel)="en").
                }}"""
    qres = self.sparql.queryAsListOfDicts(query)
    res = dict()
    for record in qres:
        if record is None or len(record) == 0:
            continue
        item_id = self.removeWdPrefix(record.get("person"))
        name = record.get("personLabel")
        res[item_id] = name
    return res
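
A sketch of a lookup by homepage; keys other than "homepage" must use the identifier names defined by DblpAuthorIdentifier:

candidates = wd_sync.getAuthorByIds({"homepage": "http://www.example.org/john"})
for qid, name in candidates.items():
    print(qid, name)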

getEventNameFromTitle(title) classmethod

Get the event name from the given proceedings title

Parameters:

    title (str): title of the proceedings

Returns:

    str: name of the event

Source code in ceurws/wikidatasync.py
@classmethod
def getEventNameFromTitle(cls, title: str) -> str:
    """
    Get the event name from the given proceedings title
    Args:
        title: title of the proceeding

    Returns:
        name of the event
    """
    prefixes = [
        "Proceedings of the",
        "Proceedings of",
        "Joint Proceedings of the",
        "Joint Proceedings of",
        "Joint Proceedings",
        "Joint Proceeding of the",
        "Joint Proceeding of",
        "Selected Papers of the",
        "Selected Contributions of the",
        "Workshops Proceedings for the",
        "Supplementary Proceedings of the",
        "Short Paper Proceedings of",
        "Short Paper Proceedings of the",
        "Working Notes Proceedings of the",
        "Working Notes of",
        "Working Notes for",
        "Joint Workshop Proceedings of the",
        "Joint Workshop Proceedings of",
        "Workshop Proceedings from",
        "Workshop and Poster Proceedings of the",
        "Workshops Proceedings and Tutorials of the",
        "Extended Papers of the",
        "Short Papers Proceedings of the",
        "Short Papers Proceedings of",
        "Proceedings of the Selected Papers of the",
        "Proceedings of the Working Notes of",
        "Proceedings of the Doctoral Consortium Papers Presented at the",
        "Selected Contributions to the",
        "Selected and Revised Papers of",
        "Selected Papers of",
        "Up-and-Coming and Short Papers of the",
        "Academic Papers at",
        "Poster Track of the",
        "Actes de la",
        "Post-proceedings of the",
        "Late Breaking Papers of the",
        "Anais do",
        "Proceedings del",
        "Proceedings",
        "Gemeinsamer Tagungsband der",
        "Local Proceedings of the",
        "Local Proceedings and Materials of",
    ]
    postfixes = [
        "Workshop Proceedings",
        "Proceedings",
        "Conference Proceedings",
        "Workshops Proceedings",
        "Adjunct Proceedings",
        "Poster and Demo Proceedings",
        "(full papers)",
    ]
    if title is not None:
        prefixes.sort(key=lambda prefix: len(prefix), reverse=True)
        for prefix in prefixes:
            if title.lower().startswith(prefix.lower()):
                title = title[len(prefix) :]
                title = title.strip()
                break
        postfixes.sort(key=lambda postfix: len(postfix), reverse=True)
        for postfix in postfixes:
            if title.lower().endswith(postfix.lower()):
                title = title[: -len(postfix)]
                title = title.strip(" .,")
                break
    return title
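
Since this is a classmethod it can be tried standalone:

from ceurws.wikidatasync import WikidataSync

name = WikidataSync.getEventNameFromTitle("Proceedings of the Third Workshop on Semantic Web")
print(name)  # Third Workshop on Semantic Web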

getEventTypeFromTitle(title) classmethod

Extract the event type from the given title

Assumption: the lowest mentioned type is the correct one

Parameters:

    title (str): title of the event

Returns:

    tuple[str | None, str | None]: wikidata id and label of the event type

Source code in ceurws/wikidatasync.py
@classmethod
def getEventTypeFromTitle(cls, title: str) -> tuple[str | None, str | None]:
    """
    Extract the event type from the given title
    Assumption: lowest mentioned type is the correct one
    Args:
        title: title of the event

    Returns:
        wikidata id and label of the event type
    """
    if title is None or title == "":
        return None, None
    academicConference = ("Q2020153", "academic conference")
    academicWorkshop = ("Q40444998", "academic workshop")
    if "workshop" in title.lower():
        return academicWorkshop
    elif "conference" in title.lower() or "symposium" in title.lower():
        return academicConference
    else:
        return academicWorkshop
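
Also a classmethod; "workshop" in the title takes precedence over "conference":

qid, label = WikidataSync.getEventTypeFromTitle("International Semantic Web Conference")
print(qid, label)  # Q2020153 academic conference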

getEventWdItemsByUrn(urn)

Queries the wikidata proceedings that have the given urn assigned to P4109 and returns the assigned event

Parameters:

    urn (str): URN id to query for

Returns:

    list[str]: list of corresponding wikidata item ids or an empty list if no matching item is found

Source code in ceurws/wikidatasync.py
def getEventWdItemsByUrn(self, urn: str) -> list[str]:
    """
    queries the wikidata proceedings that have the given urn assigned to P4109 and returns the assigned event
    Args:
        urn: URN id to query for

    Returns:
        List of corresponding wikidata item ids or an empty list if no matching item is found
    """
    query = f"""SELECT ?event WHERE{{ ?proceeding wdt:P4109 "{urn}"; wdt:P4745 ?event .}}"""
    qres = self.sparql.queryAsListOfDicts(query)
    wdItems = [record.get("event") for record in qres]
    return wdItems

getEventsOfProceedings(itemId)

Get the item ids of the events that the given proceedings item is the proceedings from

Parameters:

    itemId (str): Qid of the proceedings

Returns:

    list[str]: list of the events

Source code in ceurws/wikidatasync.py
def getEventsOfProceedings(self, itemId: str) -> list[str]:
    """
    get the item ids of the events that the given proceedings item is the proceedings from
    Args:
        itemId: Qid of the proceedings

    Returns:
        List of the events
    """
    query = f"""SELECT ?event WHERE {{ wd:{itemId} wdt:P4745 ?event.}}"""
    qres = self.sparql.queryAsListOfDicts(query)
    wdItems = [record.get("event")[len("http://www.wikidata.org/entity/") :] for record in qres]
    return wdItems

getEventsOfProceedingsByVolnumber(volnumber)

Get the item ids of the events that the given proceedings item is the proceedings from

Parameters:

    volnumber (int | str): Volume number of the proceedings

Returns:

    list[str]: list of the events

Source code in ceurws/wikidatasync.py
def getEventsOfProceedingsByVolnumber(self, volnumber: int | str) -> list[str]:
    """
    get the item ids of the events that the given proceedings item is the proceedings from
    Args:
        volnumber: Volume number of the proceedings

    Returns:
        List of the events
    """
    query = f"""SELECT ?event 
                WHERE {{
                ?proceeding wdt:P31 wd:Q1143604; 
                            p:P179 [ps:P179 wd:Q27230297; pq:P478 "{volnumber}"]; 
                            wdt:P4745 ?event.}}
    """
    qres = self.sparql.queryAsListOfDicts(query)
    wdItems = [record.get("event")[len("http://www.wikidata.org/entity/") :] for record in qres]
    return wdItems

getProceedingWdItemsByUrn(urn)

Queries the wikidata items that have the given urn for the property P4109

Parameters:

    urn (str): URN id to query for

Returns:

    list[str]: list of corresponding wikidata item ids or an empty list if no matching item is found

Source code in ceurws/wikidatasync.py
def getProceedingWdItemsByUrn(self, urn: str) -> list[str]:
    """
    queries the wikidata items that have the given urn for the property P4109
    Args:
        urn: URN id to query for

    Returns:
        List of corresponding wikidata item ids or an empty list if no matching item is found
    """
    query = f"""SELECT ?proceeding WHERE{{ ?proceeding wdt:P4109 "{urn}"}}"""
    qres = self.sparql.queryAsListOfDicts(query)
    wdItems = [record.get("proceeding") for record in qres]
    return wdItems
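
A sketch with an illustrative URN value (wd_sync is a configured WikidataSync instance):

items = wd_sync.getProceedingWdItemsByUrn("urn:nbn:de:0074-9999-9")  # illustrative URN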

getProceedingsForVolume(searchVolnumber)

get the proceedings record for the given searchVolnumber

Parameters:

    searchVolnumber (int): the number of the volume to search (required)

Returns:

    dict: the record for the proceedings in wikidata
    None: if the proceedings record is not found for the given searchVolnumber

Source code in ceurws/wikidatasync.py
def getProceedingsForVolume(self, searchVolnumber: int) -> dict | None:
    """
    get the proceedings record for the given searchVolnumber

    Args:
        searchVolnumber(int): the number of the volume to search

    Returns:
        dict: the record for the proceedings in wikidata
        None: if the proceedings record is not found for the given searchVolnumber
    """
    if self.procRecords is None:
        self.loadProceedingsFromCache()
    if self.procsByVolnumber is None:
        self.procsByVolnumber: dict[int, dict] = {}
        if isinstance(self.procRecords, list):
            for procRecord in self.procRecords:
                volnumber = procRecord.get("sVolume", None)
                if volnumber is None:
                    procRecord.get("Volume", None)
                if volnumber is not None:
                    self.procsByVolnumber[int(volnumber)] = procRecord
    volProcRecord = self.procsByVolnumber.get(searchVolnumber, None)
    return volProcRecord

getRecentlyAddedVolumeList()

Get the list of volumes that have recently been added; we do not expect deletions

Returns:

    tuple[dict[int, dict], list[dict]]: the refreshed volumes by number and the list of volume numbers recently added

Source code in ceurws/wikidatasync.py
def getRecentlyAddedVolumeList(self) -> tuple[dict[int, dict], list[dict]]:
    """
    get the list of volumes that have recently been added
    we do not expect deletions

    Returns:
        the refreshed volumes by number and the list of volume numbers recently added

    """
    self.prepareVolumeManager()
    refreshVm = VolumeManager()
    parser_config = ParserConfig()
    parser_config.force_download = True
    self.vm.set_down_to_volume(parser_config)
    refreshVm.loadFromIndexHtml(parser_config=parser_config)
    refreshVolumesByNumber, _duplicates = LOD.getLookup(refreshVm.getList(), "number")
    # https://stackoverflow.com/questions/3462143/get-difference-between-two-lists
    newVolumes = list(set(list(refreshVolumesByNumber.keys())) - set(list(self.volumesByNumber.keys())))
    return refreshVolumesByNumber, newVolumes

getWikidataEventRecord(volume)

get the wikidata Record for the given volume

Source code in ceurws/wikidatasync.py
def getWikidataEventRecord(self, volume: Volume):
    """
    get the wikidata Record for the given volume
    """
    volumeTitle = volume.title
    volumeNumber = volume.number
    dblpEntityIds = self.dblpEndpoint.getDblpIdByVolumeNumber(number=volumeNumber)
    title = label = instanceOf = description = None
    if volumeTitle:
        instanceOf, description = self.getEventTypeFromTitle(volumeTitle)
        title = label = self.getEventNameFromTitle(volumeTitle)
    start_time = volume.dateFrom
    end_time = volume.dateTo
    record = {
        "title": title,
        "label": label,
        "description": description,
        "instanceOf": instanceOf,
        "short name": volume.acronym,
        "locationWikidataId": volume.cityWikidataId,
        "countryWikidataId": volume.countryWikidataId,
        "start time": start_time.isoformat() if start_time is not None else start_time,
        "end time": end_time.isoformat() if end_time is not None else end_time,
        "referenceUrl": volume.getVolumeUrl(),
    }
    if dblpEntityIds is not None and len(dblpEntityIds) > 0:
        dblpEntityId = dblpEntityIds[0]
        record["describedAt"] = self.dblpEndpoint.toDblpUrl(dblpEntityId)
        record["language of work or name"] = "Q1860"
        record["dblpEventId"] = self.dblpEndpoint.convertEntityIdToUrlId(entityId=dblpEntityId)
    # the modeling of virtual events has changed in wikidata
    # virtual event (Q7935096) is discontinued for conferences
    # if volume.isVirtualEvent():
    #     record["instanceOf"] = [instanceOf, "Q7935096"]
    return record

getWikidataIdByDblpEventId(entityId, volumeNumber=None)

Query wikidata for the qId of items that correspond to the given dblpEventId

Parameters:

    entityId (str | None): id of a dblp event
    volumeNumber (int | None): volume number (default: None)

Returns:

    list[str]: list of matching wikidata items

Source code in ceurws/wikidatasync.py
def getWikidataIdByDblpEventId(self, entityId: str | None, volumeNumber: int | None = None) -> list[str]:
    """
    query wikidata for the qId of items that correspond to the given dblpEventId
    Args:
        entityId: id of a dblp event
        volumeNumber: volume number

    Returns:
        list of matching wikidata items
    """
    dblpEventId = self.dblpEndpoint.convertEntityIdToUrlId(entityId=entityId)
    dblpIds = [entityId, dblpEventId]
    dblpIdsStr = " ".join([f'"{dblpId}"' for dblpId in dblpIds])
    urls = ""
    if entityId is not None:
        urls = " ".join(
            [
                f"<{self.dblpEndpoint.toDblpUrl(entityId)}>",
                f"<{self.dblpEndpoint.toDblpUrl(entityId, True)}>",
            ]
        )
    volumeQuery = ""
    if volumeNumber is not None:
        volumeQuery = f"""
        UNION
              {{
              ?proceeding p:P179 [ps:P179 wd:Q27230297; pq:P478 "{volumeNumber}"].
              ?proceeding wdt:P4745 ?qid.
              }}
        """
    query = f"""SELECT DISTINCT ?qid
        WHERE{{
          VALUES ?url {{ {urls} }}
          VALUES ?dblpEventId {{ {dblpIdsStr} }}
          VALUES ?eventType {{wd:Q2020153 wd:Q40444998}}
          {{?qid wdt:P31 ?eventType; wdt:P973 ?url}}
          UNION
          {{?qid wdt:P31 ?eventType; wdt:P10692 ?dblpEventId}}
          {volumeQuery}
        }}
    """
    qres = self.sparql.queryAsListOfDicts(query)
    qIds = []
    if qres is not None and qres != []:
        qIds = [self.removeWdPrefix(record.get("qid")) for record in qres]
    return qIds

getWikidataIdByVolumeNumber(number)

Query wikidata for the qId of the proceedings of the given volume number

Parameters:

    number (int | None): volume number

Returns:

    str: wikidata id corresponding to the given volume number
    None: if the corresponding wikidata id was not found

Source code in ceurws/wikidatasync.py
def getWikidataIdByVolumeNumber(self, number: int | None) -> str | None:
    """
    query wikidata for the qId of the proceedings of the given volume number
    Args:
        number: volume number

    Returns:
        str: wikidata id corresponding to the given volume number
        None: if the corresponding wikidata id was not found
    """
    if number is None:
        return None
    query = f"""SELECT * WHERE{{ ?proceeding p:P179 [ps:P179 wd:Q27230297; pq:P478 "{number}"].}}"""
    qres = self.sparql.queryAsListOfDicts(query)
    qid = None
    if qres is not None and qres != []:
        qids = [record.get("proceeding").split("/")[-1] for record in qres]
        if len(qids) > 1:
            print("CEUR-WS volume number is not unique")
        else:
            qid = qids[0]
    return qid
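
A sketch with an illustrative volume number:

qid = wd_sync.getWikidataIdByVolumeNumber(number=3262)
print(qid)  # a Q-id string, or None if no proceedings item was found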

getWikidataProceedingsRecord(volume)

get the wikidata Record for the given volume

Source code in ceurws/wikidatasync.py
def getWikidataProceedingsRecord(self, volume):
    """
    get the wikidata Record for the given volume
    """
    record = {
        "title": getattr(volume, "title", None),
        "label": getattr(volume, "title", None),
        "description": f"Proceedings of {getattr(volume, 'acronym', None)} workshop",
        "urn": getattr(volume, "urn", None),
        "short name": getattr(volume, "acronym", None),
        "volume": getattr(volume, "number", None),
        "pubDate": getattr(volume, "pubDate", None),
        "ceurwsUrl": getattr(volume, "url", None),
        "language of work or name": "Q1860",
        "fullWorkUrl": getattr(volume, "url", None),
    }
    if isinstance(record.get("pubDate"), datetime.datetime):
        record["pubDate"] = record["pubDate"].isoformat()
    return record

hasItemPropertyValueFor(item, propertyId)

Ask wikidata if the given item has a value for the given property

Parameters:

    item (str): item Qid
    propertyId (str): property Pid

Returns:

    True if the item has the property else False

Source code in ceurws/wikidatasync.py
def hasItemPropertyValueFor(self, item, propertyId: str):
    """
    ask wikidata if the given item has a value for the given property
    Args:
        item: item Qid
        propertyId: property Pid
    Returns:
        True if the item has the property else False
    """
    query = f"""ASK{{ wd:{item} wdt:{propertyId} ?value.}}"""
    return self.askWikidata(query)
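
A sketch: check whether an item already has a publication date (P577); Q27230297 is the CEUR workshop proceedings series item used in the mappings above:

has_pub_date = wd_sync.hasItemPropertyValueFor(item="Q27230297", propertyId="P577")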

loadProceedingsFromCache()

load the proceedings records from the cache

Source code in ceurws/wikidatasync.py
def loadProceedingsFromCache(self):
    """
    load the proceedings records from the cache
    """
    sqlQuery = "SELECT * from Proceedings"
    self.procRecords = self.sqldb.query(sqlQuery)
    return self.procRecords

preparePaperManager()

prepare my paper manager

Source code in ceurws/wikidatasync.py
def preparePaperManager(self):
    """
    prepare my paper manager
    """
    self.pm = PaperManager()
    if self.pm.isCached():
        self.pm.fromStore(cacheFile=CEURWS.CACHE_FILE)
    else:
        print(
            "PaperManager not cached you might want to run ceur-ws --recreate",
            file=sys.stderr,
        )

prepareVolumeManager()

prepare my volume manager

Source code in ceurws/wikidatasync.py
def prepareVolumeManager(self):
    """
    prepare my volume manager
    """
    self.vm = VolumeManager()
    self.vm.load()
    self.volumesByNumber, _duplicates = LOD.getLookup(self.vm.getList(), "number")
    self.volumeList = self.vm.getList()
    self.volumeCount = len(self.volumeList)
    self.volumeOptions = {}
    reverse_keys = sorted(self.volumesByNumber.keys(), reverse=True)
    for volume_number in reverse_keys:
        volume = self.volumesByNumber[volume_number]
        self.volumeOptions[volume.number] = f"Vol-{volume.number}:{volume.title}"

removeWdPrefix(value) classmethod

Removes the wikidata entity prefix

Parameters:

    value (str): wikidata entity url

Source code in ceurws/wikidatasync.py
@classmethod
def removeWdPrefix(cls, value: str):
    """
    removes the wikidata entity prefix
    Args:
        value: wikidata entity url
    """
    wd_prefix = "http://www.wikidata.org/entity/"
    if value is not None and isinstance(value, str) and value.startswith(wd_prefix):
        value = value[len(wd_prefix) :]
    return value
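
As a classmethod this can be tried directly:

from ceurws.wikidatasync import WikidataSync

print(WikidataSync.removeWdPrefix("http://www.wikidata.org/entity/Q2020153"))  # Q2020153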

storeVolumes()

store my volumes

Source code in ceurws/wikidatasync.py
def storeVolumes(self):
    """
    store my volumes
    """
    self.vm.store()

update(withStore=True)

update my table from the Wikidata Proceedings SPARQL query

Source code in ceurws/wikidatasync.py
def update(self, withStore: bool = True):
    """
    update my table from the Wikidata Proceedings SPARQL query
    """
    if self.debug:
        print(f"Querying proceedings from {self.baseurl} ...")
    # query proceedings
    wd_proceedings_records: list[dict] = self.sparql.queryAsListOfDicts(self.wdQuery.query)
    # query events
    event_query = self.qm.queriesByName["EventsByProceeding"]
    wd_event_records: list[dict] = self.sparql.queryAsListOfDicts(event_query.query)
    # add events to proceeding records
    proceedings_event_map, _duplicates = LOD.getLookup(wd_event_records, "item")
    for proceedings_record in wd_proceedings_records:
        item = proceedings_record.get("item")
        if item in proceedings_event_map:
            event_record = proceedings_event_map.get(item)
            proceedings_record.update(**event_record)
    primaryKey = "URN_NBN"
    withCreate = True
    withDrop = True
    entityInfo = self.sqldb.createTable(
        wd_proceedings_records,
        "Proceedings",
        primaryKey,
        withCreate,
        withDrop,
        sampleRecordCount=5000,
        failIfTooFew=False,
    )
    procsByURN, duplicates = LOD.getLookup(wd_proceedings_records, "URN_NBN")
    if withStore:
        self.sqldb.store(procsByURN.values(), entityInfo, executeMany=True, fixNone=True)
    if self.debug:
        print(f"stored {len(procsByURN.values())} proceedings records")
    if len(duplicates) > 0:
        print(f"found {len(duplicates)} duplicates URN entries")
        if len(duplicates) < 10:
            print(duplicates)
    return wd_proceedings_records

workshop

Created on 2020-11-12

@author: wf

Workshop

a single Workshop

Source code in ceurws/workshop.py
class Workshop:
    """
    a single Workshop
    """

    def __init__(self):
        """
        Constructor
        """

    @staticmethod
    def ofURI(uri):
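        """
        create a Workshop from the XML content at the given URI
        """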
        xml = urlopen(uri).read().decode()
        ws = Workshop()
        ws.wsdict = xmltodict.parse(xml)
        return ws

__init__()

Constructor

Source code in ceurws/workshop.py
def __init__(self):
    """
    Constructor
    """