In [1]:
%load_ext graph_notebook.magics
Could not find a valid configuration. Do not forget to validate your settings using %graph_notebook_config.
In [2]:
%%graph_notebook_config
{
  "host": "fuseki",
  "port": 3030,
  "ssl": false,
  "sparql": {
    "path": "spase/sparql"
  }
}
set notebook config to:
{
  "host": "fuseki",
  "port": 3030,
  "proxy_host": "",
  "proxy_port": 8182,
  "ssl": false,
  "ssl_verify": true,
  "sparql": {
    "path": "spase/sparql"
  },
  "gremlin": {
    "traversal_source": "g",
    "username": "",
    "password": "",
    "message_serializer": "graphsonv3"
  },
  "neo4j": {
    "username": "neo4j",
    "password": "password",
    "auth": true,
    "database": null
  }
}
Out[2]:
<graph_notebook.configuration.generate_config.Configuration at 0xffff67450c70>
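As a quick sanity check (a minimal sketch, not part of the notebook's original flow; it assumes the config above resolves to the endpoint URL http://fuseki:3030/spase/sparql), a plain HTTP ASK query can confirm the dataset is reachable before running the cells below:

import requests

resp = requests.get(
    'http://fuseki:3030/spase/sparql',
    params={'query': 'ASK { ?s ?p ?o }'},
    headers={'Accept': 'application/sparql-results+json'},
)
print(resp.json())  # expect {'head': {}, 'boolean': True} once data is loaded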
In [3]:
%%sparql --store-to catalog_by_phenomenon_type

PREFIX space: <http://purl.org/net/schemas/space/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX spase: <http://www.spase-group.org/data/schema/>

SELECT ?phenomenonType (COUNT(?sub) AS ?count) WHERE {
  ?sub a spase:Catalog .
  ?sub spase:has_phenomenon_type ?phenomenonTypeURI .
  ?phenomenonTypeURI rdfs:label ?phenomenonType .
} GROUP BY ?phenomenonType ORDER BY DESC(?count)
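The --store-to flag keeps the raw response in the named Python variable using the standard SPARQL 1.1 JSON results layout, which the plotting cells below index into. An illustrative sketch of the shape (the values here are made up):

example_result = {
    'head': {'vars': ['phenomenonType', 'count']},
    'results': {'bindings': [
        {'phenomenonType': {'type': 'literal', 'value': 'Aurora'},
         'count': {'type': 'literal',
                   'datatype': 'http://www.w3.org/2001/XMLSchema#integer',
                   'value': '42'}},
        # ... one dict per result row; every value arrives as a string
    ]}
}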
In [4]:
import plotly.graph_objects as go

# Extracting data for plotting
phenomenon_types = [entry['phenomenonType']['value'] for entry in catalog_by_phenomenon_type['results']['bindings']]
counts = [int(entry['count']['value']) for entry in catalog_by_phenomenon_type['results']['bindings']]

# Plotting
fig = go.Figure(go.Bar(
    y=phenomenon_types,
    x=counts,
    orientation='h',  # horizontal orientation
    marker_color='skyblue'
))

fig.update_layout(
    title='Catalog Counts by Phenomenon Type',
    xaxis_title='Count',
    yaxis_title='Phenomenon Type',
    yaxis=dict(autorange="reversed"),  # Reverse the y-axis to have the highest count at the top
    height=600,  # Set the height of the figure
    width=800,  # Set the width of the figure
    margin=dict(l=100, r=50, t=50, b=50),  # Adjust margins for better visualization
    showlegend=False  # Hide the legend
)

fig.show()
In [5]:
%%sparql --store-to instruments_time_line

PREFIX space: <http://purl.org/net/schemas/space/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX spase: <http://www.spase-group.org/data/schema/>

SELECT ?instrumentName ?start_date ?stop_date  WHERE {
  ?instrument  spase:has_operating_span ?span .
  ?instrument a spase:Instrument .
  ?instrument spase:has_resource_header ?instrumentHeader .
  ?instrumentHeader spase:resource_name ?instrumentName .
  ?span spase:start_date ?start_date .
  OPTIONAL {
    ?span spase:stop_date ?stop_date .
  }
} LIMIT 20
In [6]:
from datetime import datetime

from dateutil import parser

# plotly.graph_objects is already imported as go above

# Extracting data; stop_date is OPTIONAL in the query, so rows without one
# simply omit the key from their binding dict
bindings = instruments_time_line['results']['bindings']
instrument_names = [entry['instrumentName']['value'] for entry in bindings]
start_dates = [parser.parse(entry['start_date']['value']) for entry in bindings]
stop_dates = [parser.parse(entry['stop_date']['value']) if 'stop_date' in entry else None
              for entry in bindings]

# Plotting
fig = go.Figure()

for instrument, start_date, stop_date in zip(instrument_names, start_dates, stop_dates):
    if stop_date is None:
        stop_date = datetime.now()  # If the instrument is still in operation, set current date
    fig.add_trace(go.Scatter(x=[start_date, stop_date], y=[instrument, instrument], mode='lines+markers', name=instrument))

# Setting labels and title; the y-axis is categorical (instrument names), so
# plotly generates the tick labels itself and no tickvals/ticktext override
# (which would mismatch integer positions against string categories) is needed
fig.update_layout(
    yaxis=dict(title='Instruments'),
    xaxis=dict(title='Time'),
    title='Timeline of Instruments',
)
fig.update_traces(showlegend=False)


fig.show()
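The per-instrument trace loop above can also be replaced with plotly.express.timeline by reshaping the lists into a DataFrame first (an alternative sketch, not the notebook's original approach; it reuses the variables extracted above):

import pandas as pd
import plotly.express as px

timeline_df = pd.DataFrame({
    'instrument': instrument_names,
    'start': start_dates,
    'stop': [d if d is not None else datetime.now() for d in stop_dates],
})
fig = px.timeline(timeline_df, x_start='start', x_end='stop', y='instrument',
                  title='Timeline of Instruments')
fig.update_yaxes(autorange='reversed')  # first queried instrument at the top
fig.show()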
In [7]:
%%sparql --store-to counts_by_region
PREFIX space: <http://purl.org/net/schemas/space/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX spase: <http://www.spase-group.org/data/schema/>

SELECT ?region (COUNT(?s) AS ?count) WHERE {
  ?s spase:has_observed_region ?o .
  ?o rdfs:label ?region .
} GROUP BY ?region
In [8]:
import pandas as pd
import plotly.express as px


# Extracting data from SPARQL results
data = counts_by_region["results"]["bindings"]

# Extracting region values and counts
regions = [entry["region"]["value"] for entry in data]
counts = [int(entry["count"]["value"]) for entry in data]

# Creating DataFrame
df = pd.DataFrame({"region": regions, "count": counts})

# Sort the DataFrame by the length of the "region" strings in descending order
df['region_length'] = df['region'].apply(len)
df = df.sort_values(by='region_length', ascending=False).drop(columns='region_length')

# Collect only the most specific region paths: rows are sorted longest-first,
# so a path can be dropped as soon as an already-kept path extends it
specific_paths = set()

for _, row in df.iterrows():
    # A kept path must continue past a '.' separator to make this row redundant;
    # a bare startswith would wrongly match e.g. 'Sun' against 'Sunspot'
    if any(path.startswith(row['region'] + '.') for path in specific_paths):
        continue
    specific_paths.add(row['region'])

# Filter the DataFrame to keep only the rows with the most specific paths
df_filtered = df[df['region'].isin(specific_paths)].reset_index(drop=True)

# Splitting the 'region' column by '.' into one column per hierarchy level
# (assumes the deepest path has exactly four levels; shorter paths are padded
# with None, which px.sunburst treats as early-terminating branches)
df_filtered[['level1', 'level2', 'level3', 'level4']] = df_filtered['region'].str.split('.', expand=True)

fig = px.sunburst(df_filtered, path=['level1', 'level2', 'level3', 'level4'], values='count')
fig.show()
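If region paths shallower or deeper than four levels ever appear, the fixed four-column split above would raise; a depth-agnostic variant (a sketch under that assumption) builds the column list from whatever the split yields:

sunburst_df = df_filtered[['region', 'count']].copy()
levels = sunburst_df['region'].str.split('.', expand=True)
levels.columns = [f'level{i + 1}' for i in range(levels.shape[1])]
sunburst_df = pd.concat([sunburst_df, levels], axis=1)
fig = px.sunburst(sunburst_df, path=list(levels.columns), values='count')
fig.show()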
In [9]:
%%sparql --store-to instruments_by_location

PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX spase: <http://www.spase-group.org/data/schema/>

SELECT ?observatoryName ?lat ?long (COUNT(?instrument) AS ?count) WHERE {
  ?observatory a spase:Observatory .
  ?observatory spase:has_resource_header ?header .
  ?header spase:resource_name ?observatoryName .
  ?instrument a spase:Instrument .
  ?instrument spase:has_observatory ?observatory .
  ?observatory spase:has_location ?location .
  ?location spase:latitude ?lat .
  ?location spase:longitude ?long .
} GROUP BY ?observatoryName ?lat ?long ORDER BY DESC(?count)
In [10]:
import re

# Extracting data; some longitude literals carry an 'E' suffix (e.g. '204.7E'),
# so strip it and fold 0-360 east longitudes into the -180..180 range
# that scatter_geo expects
data = []
for binding in instruments_by_location['results']['bindings']:
    long_str = binding['long']['value']
    long_match = re.match(r'(-?\d+(\.\d+)?)E', long_str)
    if long_match:
        long_val = float(long_match.group(1))
        if long_val > 180:
            long_val -= 360
    else:
        long_val = float(long_str)

    data.append({
        'observatoryName': binding['observatoryName']['value'],
        'lat': float(binding['lat']['value']),
        'long': long_val,
        'count': int(binding['count']['value'])
    })
# Create DataFrame
df = pd.DataFrame(data)

# Plot the observatories on a world map, with bubble size proportional to instrument count
fig = px.scatter_geo(df, lat='lat', lon='long', size='count', hover_name='observatoryName',
                     projection='natural earth', title='Instruments per Observatory')
fig.show()
fig.show()
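If the catalog ever mixes 'E'- and 'W'-suffixed longitude literals (an assumption; the cell above only handles the 'E' form), a slightly more defensive parser could replace the inline regex:

def parse_longitude(s: str) -> float:
    # Accepts '204.7E', '70.5W', or a plain signed number; returns -180..180
    m = re.match(r'(-?\d+(?:\.\d+)?)([EW])?$', s)
    if m is None:
        raise ValueError(f'unparseable longitude: {s!r}')
    val = float(m.group(1))
    if m.group(2) == 'W':
        val = -val
    return val - 360 if val > 180 else val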