In [1]:
from collections import defaultdict
from html import escape
from pathlib import Path
from urllib.parse import quote

import pandas as pd
from flashtext import KeywordProcessor
from IPython.display import display, HTML

In [2]:
# Directory holding the WikiCSSH v1 CSV exports, relative to the notebook.
wikicssh_path = Path("../data/v1")
# Materialise the glob generator so the listing can be displayed and indexed.
wikicssh_files = [*wikicssh_path.glob("./*.csv")]
wikicssh_files

[WindowsPath('../data/v1/WikiCSSH_categories.csv'),
 WindowsPath('../data/v1/WikiCSSH_category2page.csv'),
 WindowsPath('../data/v1/WikiCSSH_category_links.csv'),
 WindowsPath('../data/v1/WikiCSSH_category_links_all.csv'),
 WindowsPath('../data/v1/Wikicssh_core_categories.csv'),
 WindowsPath('../data/v1/WikiCSSH_page2redirect.csv')]

In [3]:
%%time
# Lookup from page title to the list of category titles it is filed under;
# get_html reads it to turn every matched page into category counts.
page2cats = (
    pd.read_csv('../data/v1/WikiCSSH_category2page.csv')
    .groupby("page_title")["cat_title"]
    .agg(list)
    .to_dict()
)

Wall time: 20.4 s


In [4]:
# Preview the core categories. The file is addressed by name rather than by
# its position in `wikicssh_files`: glob results come back in filesystem
# order, so a positional index can point at a different CSV on another machine.
pd.read_csv(wikicssh_path / "Wikicssh_core_categories.csv").head()

Unnamed: 0,category,level
0,Computer_science,1
1,Mathematics,1
2,Information_science,1
3,Computer_engineering,1
4,Statistics,1


In [5]:
# Single flashtext matcher shared by the whole notebook; the category and page
# keywords are registered on it in the following cells. It is built with the
# library defaults (the sample output below shows lower-cased keywords matching
# capitalised text, i.e. matching is case-insensitive).
processor = KeywordProcessor()

In [6]:
%%time
# categories: every category title becomes a keyword (lower-cased, underscores
# turned into spaces) that resolves to the clean name "Category:<title>".
processor.add_keywords_from_dict(
    dict(
        (f'Category:{title}', [title.lower().replace("_", " ")])
        for title in pd.read_csv("../data/v1/WikiCSSH_categories.csv")["category"].values
    )
)

Wall time: 124 ms


In [7]:
%%time
# pages: the last column of each row is registered as a keyword that resolves
# to the first column (the page title) as its clean name.
# NOTE(review): the last column is presumably the redirect title - confirm
# against the CSV header.
for record in pd.read_csv('../data/v1/WikiCSSH_page2redirect.csv').values:
    page_title, surface_form = record[0], record[-1]
    # An empty last cell is read by pandas as a float NaN; in that case the
    # page title itself is used as the keyword.
    if isinstance(surface_form, float):
        surface_form = page_title
    processor.add_keyword(surface_form.lower().replace("_", " "), page_title)



Wall time: 8.44 s


In [8]:
# Sample input 1: description of a workshop on Scientific Knowledge Graphs (two paragraphs).
text = """In the last decade, we experienced an urgent need for a flexible, context-sensitive, fine-grained, and machine-actionable representation of scholarly knowledge and corresponding infrastructures for knowledge curation, publishing and processing. Such technical infrastructures are becoming increasingly popular in representing scholarly knowledge as structured, interlinked, and semantically rich Scientific Knowledge Graphs (SKG). Knowledge graphs are large networks of entities and relationships, usually expressed in W3C standards such as OWL and RDF. SKGs focus on the scholarly domain and describe the actors (e.g., authors, organizations), the documents (e.g., publications, patents), and the research knowledge (e.g., research topics, tasks, technologies) in this space as well as their reciprocal relationships. These resources provide substantial benefits to researchers, companies, and policymakers by powering several data-driven services for navigating, analysing, and making sense of research dynamics. Some examples include Microsoft Academic Graph (MAG), Open Academic Graph (combining MAG and AMiner), ScholarlyData, PID Graph, Open Research Knowledge Graph, OpenCitations, and OpenAIRE research graph. Current challenges in this area include: i) the design of ontologies able to conceptualise scholarly knowledge, ii) (semi-)automatic extraction of entities and concepts, integration of information from heterogeneous sources, identification of duplicates, finding connections between entities, and iii) the development of new services using this data, that allow to explore this information, measure research impact and accelerate science. 
This workshop aims at bringing together researchers and practitioners from different fields (including, but not limited to, Digital Libraries, Information Extraction, Machine Learning, Semantic Web, Knowledge Engineering, Natural Language Processing, Scholarly Communication, and Bibliometrics) in order to explore innovative solutions and ideas for the production and consumption of Scientific Knowledge Graphs (SKGs)."""

In [9]:
# Raw matcher output for the current `text`: one (clean name, start offset,
# end offset) tuple per match, in document order.
processor.extract_keywords(text, span_info=True)

[('Experience', 23, 34),
 ('Granularity', 85, 97),
 ('Scholarly_method', 140, 149),
 ('Knowledge', 150, 159),
 ('Knowledge', 198, 207),
 ('Scholarly_method', 326, 335),
 ('Knowledge', 336, 345),
 ('Semantics', 378, 390),
 ('Knowledge', 407, 416),
 ('Category:Graphs', 417, 423),
 ('Knowledge', 431, 440),
 ('Category:Graphs', 441, 447),
 ('Entity', 470, 478),
 ('World_Wide_Web_Consortium', 519, 532),
 ('Scholarly_method', 572, 581),
 ('Document', 649, 658),
 ('Research', 698, 706),
 ('Knowledge', 707, 716),
 ('Research', 724, 732),
 ('Category:Space', 770, 775),
 ('Research', 867, 878),
 ('Business', 880, 889),
 ('Research', 996, 1004),
 ('CONFIG.SYS', 1029, 1036),
 ('Microsoft_Academic', 1037, 1055),
 ('Academy_(educational_institution)', 1074, 1082),
 ('Open_research', 1143, 1156),
 ('Ontology_(information_science)', 1157, 1172),
 ('Research', 1202, 1210),
 ('Category:Area', 1245, 1249),
 ('CONFIG.SYS', 1250, 1257),
 ('Category:Design', 1266, 1272),
 ('Ontology', 1276, 1286),
 ('Concep

In [10]:
def get_html(text, processor):
    """Render ``text`` as HTML with every matched keyword linked to Wikipedia.

    Parameters
    ----------
    text : str
        Document to tag.
    processor : object
        Matcher exposing ``extract_keywords(text, span_info=True)`` that
        returns ``(entity, start, end)`` tuples. The spans are assumed to be
        non-overlapping and in document order, as they are consumed in a
        single left-to-right pass.

    Returns
    -------
    IPython.display.HTML
        A block with two sections: the tagged document (newlines rendered as
        ``<br/>``) and the predicted categories with their counts, most
        frequent first.

    Notes
    -----
    * An entity whose name starts with ``"Category:"`` counts once for that
      category; any other entity counts once for each category listed for it
      in the module-level ``page2cats`` dict (none if it is missing there).
    * The text following the last match is kept, so a document without any
      match is rendered unchanged instead of coming out empty.
    * Document text and entity titles are HTML-escaped / URL-quoted, so
      ``<``, ``&`` or an apostrophe in a title cannot break the markup.
    """
    category_prefix = "Category:"
    spans = processor.extract_keywords(text, span_info=True)
    prev = 0
    parts = []
    category_counts = defaultdict(int)
    for entity, start, end in spans:
        if entity.startswith(category_prefix):
            # Strip only the leading prefix, not later occurrences in the name.
            entity_cats = [entity[len(category_prefix):]]
        else:
            entity_cats = page2cats.get(entity, [])
        for cat in entity_cats:
            category_counts[cat] += 1
        if start > prev:
            parts.append(escape(text[prev:start], quote=False))
        parts.append(_wiki_link(entity, entity, escape(text[start:end], quote=False)))
        prev = end
    # The loop only emits text that precedes a match; flush what follows the
    # last one (or the whole document when nothing matched).
    parts.append(escape(text[prev:], quote=False))
    tagged_doc = "".join(parts).replace("\n", "<br/>")

    # sorted() is stable, so categories with equal counts keep first-seen order.
    ranked = sorted(category_counts.items(), key=lambda item: item[1], reverse=True)
    category_links = []
    for cat, count in ranked:
        link = _wiki_link(category_prefix + cat, cat, escape(cat, quote=False))
        category_links.append(f"{link} ({count})")
    pred_categories = " | ".join(category_links)

    final_div = f"""<div>
    <div>
        <h3>Tagged document:</h3>
        {tagged_doc}
    </div>
    <div>
        <h3>Predicted categories:</h3>
        {pred_categories}
    </div>
    </div>"""
    return HTML(final_div)


def _wiki_link(page, title, label_html):
    """Return an ``<a>`` tag for the English-Wikipedia page ``page``.

    ``page`` is percent-quoted for the URL (``/ : ( ) ,`` are left readable),
    ``title`` is escaped for use inside the single-quoted attribute, and
    ``label_html`` must already be HTML-safe.
    """
    href = "https://en.wikipedia.org/wiki/" + quote(page, safe="/:(),")
    return f"<a href='{href}' title='{escape(title, quote=True)}'>{label_html}</a>"
    

In [11]:
# Render the workshop description held in `text`: linked entities plus category tally.
display(get_html(text, processor))

In [12]:
# Sample input 2: a list of topics on scholarly data, one topic per line.
text = """Methods for extracting entities (methods, research topics, technologies, tasks, materials, metrics, research contributions) and relationships from research publications
Methods for extracting metadata about authors, documents, datasets, grants, affiliations and others.
Data models (e.g., ontologies, vocabularies, schemas) for the description of scholarly data and the linking between scholarly data/software and academic papers that report or cite them
Description of citations for scholarly articles, data and software and their interrelationships
Applications for the (semi-)automatic annotation of scholarly papers
Theoretical models describing the rhetorical and argumentative structure of scholarly papers and their application in practice
Methods for quality assessment of scientific knowledge graphs
Description and use of provenance information of scholarly data
Methods for the exploration, retrieval and visualization of scientific knowledge graphs
Pattern discovery of scholarly data
Scientific claims identification from textual contents
Automatic or semi-automatic approaches to making sense of research dynamics
Content- and data-based analysis on scholarly papers
Automatic semantic enhancement of existing scholarly libraries and papers
Reconstruction, forecasting and monitoring of scholarly data
Novel user interfaces for interaction with paper, metadata, content, software and data
Visualisation of related papers or data according to multiple dimensions (semantic similarity of abstracts, keywords, etc.)
Applications for making sense of scholarly data"""

In [13]:
# Render the topic list held in `text`; each newline becomes a <br/> in the output.
display(get_html(text, processor))

In [14]:
# Sample input 3: three paragraphs on machine learning and deep learning for big data.
text="""One of the most common AI techniques used for processing big data is machine learning, a self-adaptive algorithm that gets increasingly better analysis and patterns with experience or with newly added data.

If a digital payments company wanted to detect the occurrence or potential for fraud in its system, it could employ machine learning tools for this purpose. The computational algorithm built into a computer model will process all transactions happening on the digital platform, find patterns in the data set, and point out any anomaly detected by the pattern.

Deep learning, a subset of machine learning, utilizes a hierarchical level of artificial neural networks to carry out the process of machine learning. The artificial neural networks are built like the human brain, with neuron nodes connected together like a web. While traditional programs build analysis with data in a linear way, the hierarchical function of deep learning systems enables machines to process data with a nonlinear approach."""

In [15]:
# Render the machine-learning passage held in `text`.
display(get_html(text, processor))

In [16]:
# Sample input 4: abstract on commonsense knowledge graph reasoning (CKGR).
text="""Commonsense knowledge graph reasoning(CKGR) is the task of predicting a missing entity given one existing and the relation in a commonsense knowledge graph (CKG). Existing methods can be classified into two categories generation method and selection method. Each method has its own advantage. We theoretically and empirically compare the two methods, finding the selection method is more suitable than the generation method in CKGR. Given the observation, we further combine the structure of neural Text Encoder and Knowledge Graph Embedding models to solve the selection method's two problems, achieving competitive results. We provide a basic framework and baseline model for subsequent CKGR tasks by selection methods."""

In [17]:
# Render the CKGR abstract held in `text`.
display(get_html(text, processor))

In [18]:
# Sample input 5: abstract on MeSH-based novelty measures for MEDLINE articles.
text="""We introduce several measures of novelty for a scientific article in MEDLINE based on the temporal profiles of its assigned Medical Subject Headings (MeSH). First, temporal profiles for all MeSH terms (and pairs of MeSH terms) were characterized empirically and modelled as logistic growth curves. Second, a paper's novelty is captured by its youngest MeSH (and pairs of MeSH) as measured in years and volume of prior work. Across all papers in MEDLINE published since 1985, we find that individual concept novelty is rare (2.7% of papers have a MeSH ≤ 3 years old; 1.0% have a MeSH ≤ 20 papers old), while combinatorial novelty is the norm (68% have a pair of MeSH ≤ 3 years old; 90% have a pair of MeSH ≤ 10 papers old). Furthermore, these novelty measures exhibit complex correlations with article impact (as measured by citations received) and authors' professional age."""

In [19]:
# Render the MeSH novelty abstract held in `text`.
display(get_html(text, processor))

In [20]:
# Sample input 6: introductory passage on deep learning. As the last value
# assigned to `text`, it is also what the interactive text area is pre-filled with.
text="""Deep Learning is a subfield of machine learning concerned with algorithms inspired by the structure and function of the brain called artificial neural networks.
If you are just starting out in the field of deep learning or you had some experience with neural networks some time ago, you may be confused. I know I was confused initially and so were many of my colleagues and friends who learned and used neural networks in the 1990s and early 2000s.
The leaders and experts in the field have ideas of what deep learning is and these specific and nuanced perspectives shed a lot of light on what deep learning is all about.
In this post, you will discover exactly what deep learning is by hearing from a range of experts and leaders in the field."""

In [21]:
# Render the deep-learning introduction held in `text`.
display(get_html(text, processor))

In [22]:
# Number of distinct page titles present in the page -> categories lookup.
len(page2cats)

181070

## Interactive usage

In [23]:
from ipywidgets import interact_manual, widgets, Layout

In [24]:
# Free-text box pre-filled with the sample currently held in `text`.
text_area_widget = widgets.Textarea(
    value=text,
    placeholder="Type your text here",
    description='String:',
    disabled=False,
    rows=10,
    layout=Layout(width="90%")
)
# interact_manual re-runs get_html only when its button is pressed, not on
# every keystroke; the trailing ';' hides the returned function's repr.
interact_manual(lambda text: get_html(text, processor), text=text_area_widget);

interactive(children=(Textarea(value='Deep Learning is a subfield of machine learning concerned with algorithm…