alexwlchan (
alexwlchan) wrote in
dw_dev 2019-11-28 08:25 pm
![[personal profile]](https://www.dreamwidth.org/img/silk/identity/user.png)
![[site community profile]](https://www.dreamwidth.org/img/comm_staff.png)
Entry tags:
Searching your Dreamwidth posts by more than one tag
I was chatting to
cesy this evening, and we were chatting about tagging on Dreamwidth (because I've just written about how I use tags to organise my scanned documents). In particular, we were talking about ways to find posts with particular combinations of tags.
As far as we both know (please correct us if we're wrong!), you can only filter your posts by a particular tag. For example, to see all of my posts tagged with "meta", I'd visit https://alexwlchan.dreamwidth.org/tag/meta.
What if you want to search by more than one tag? For example:
I already had code that uses the XML-RPC API to get all my posts (to get a backup of my Dreamwidth entries). I added some extra filtering, and now it can search posts using the queries of the form above.
MIT license.
![[personal profile]](https://www.dreamwidth.org/img/silk/identity/user.png)
As far as we both know (please correct us if we're wrong!), you can only filter your posts by a particular tag. For example, to see all of my posts tagged with "meta", I'd visit https://alexwlchan.dreamwidth.org/tag/meta.
What if you want to search by more than one tag? For example:
- Which posts have I tagged with reviews and quotes? (An AND query)
- Which posts have I tagged with at least one of person:alex or person:lexie? (An OR query)
I already had code that uses the XML-RPC API to get all my posts (to get a backup of my Dreamwidth entries). I added some extra filtering, and now it can search posts using the queries of the form above.
Usage
- You need Python installed (downloads page). Python 2.7 or 3.x is fine; if you already have Python installed on your computer, that should be fine.
- Copy the code below into a file, for example `search_dreamwidth_posts_by_tag.py`.
- Run the script with Python, for example by typing `python search_dreamwidth_posts_by_tag.py` in a terminal.
Reusing the code
If you know a bit of Python, you should be able to pull out bits of this code and reuse it elsewhere -- the XML-RPC API client, downloading all your posts, checking a user's password. You could modify it to find posts by different criteria: posts within a particular date range, or posted at the weekend, or that don't contain the letter e.
MIT license.
The code
#!/usr/bin/env python
# -*- encoding: utf-8
"""
A script for doing complex tag queries on your Dreamwidth posts.
As far as I know, you can only filter your posts by a single tag.
For example, to see all of my posts tagged with "meta", I'd visit
https://alexwlchan.dreamwidth.org/tag/meta
This script allows you to make more complex queries. For example:
* Find every post that is tagged with "reviews" and "quotes"
  (an ALL/AND query)
* Find every post that is tagged with "person:alex" or "person:lexie"
  (an ANY/OR query)
The results are written to a spreadsheet which includes the subject, URL,
tag list and date of the matching posts.
"""
from __future__ import print_function
import csv
import datetime as dt
import getpass
import hashlib
import os
import sys
try:
from urllib import quote_plus
from xmlrpclib import Binary, Fault, ServerProxy
except ImportError: # Python 3
raw_input = input
from urllib.parse import quote_plus
from xmlrpc.client import Binary, Fault, ServerProxy
def md5(s):
    """
    Return the hexadecimal MD5 digest of ``s``.

    ``s`` may be text or bytes.  Bytes are hashed as-is; text is encoded as
    UTF-8 first, so a password containing non-ASCII characters no longer
    raises ``UnicodeEncodeError``.  For pure-ASCII input the result is the
    same as with an ASCII encoding.

    MD5 is used here only because the challenge/response login scheme in
    ``DreamwidthAPI.auth_info`` is built from MD5 digests.
    """
    # On Python 2 ``bytes`` is ``str``, so values from ``getpass`` or
    # ``raw_input`` are hashed without an implicit ASCII round trip.
    if not isinstance(s, bytes):
        s = s.encode("utf-8")
    return hashlib.md5(s).hexdigest()
class DreamwidthAPI(object):
    """
    Wrapper around the Dreamwidth XML-RPC API.

    See http://wiki.dwscoalition.org/wiki/index.php/XML-RPC_Protocol
    """

    def __init__(self, username, password):
        """
        Store the credentials and create the XML-RPC proxy used by every
        subsequent call.
        """
        self.username = username
        self.password = password
        self.server = ServerProxy("https://www.dreamwidth.org/interface/xmlrpc")

    def auth_info(self):
        """
        Fetch a fresh challenge from the server and return the dict of
        authentication fields to merge into a request.

        The response is ``md5(challenge + md5(password))``.  This method
        requests a new challenge every time it is called.
        """
        # Invoke LJ.XMLRPC.getchallenge on the endpoint.
        # See https://dw-dev-training.dreamwidth.org/58924.html?thread=383532
        challenge_resp = self.server.LJ.XMLRPC.getchallenge()
        auth_challenge = challenge_resp["challenge"]
        auth_response = md5(auth_challenge + md5(self.password))
        return {
            "username": self.username,
            "auth_method": "challenge",
            "auth_challenge": auth_challenge,
            "auth_response": auth_response,
        }

    def call_endpoint(self, method_name, data=None):
        """
        Call ``LJ.XMLRPC.<method_name>`` with ``data`` plus authentication
        fields and ``ver=1``, and return the server's response.

        ``data`` is not modified: the request is built on a copy, so the
        caller's dict never accumulates auth fields.
        """
        payload = dict(data) if data else {}
        payload.update(self.auth_info())
        payload["ver"] = "1"

        method = getattr(self.server, "LJ.XMLRPC." + method_name)
        return method(payload)

    def get_all_posts(self):
        """
        Generate every event returned by ``getevents``, 50 at a time,
        never yielding the same ``itemid`` twice.

        Paging stops as soon as a response contains no unseen events
        (which includes an empty response).
        """
        data = {
            "selecttype": "lastn",
            "howmany": 50,
        }

        # I'm doing my own book-keeping of event IDs to ensure that this function
        # never returns a duplicate item, even if we mess up the API call and
        # end up retrieving an item more than once.
        #
        # This will happen on the ``beforedate`` boundary, because I deliberately
        # fudge the date slightly to ensure we're getting everything before *or on*
        # the time specified by ``beforedate``.
        seen_event_ids = set()

        while True:
            resp = self.call_endpoint("getevents", data=data)
            events = resp["events"]

            found_new_event = False
            for event in events:
                event_id = event["itemid"]
                if event_id in seen_event_ids:
                    continue
                seen_event_ids.add(event_id)
                found_new_event = True
                yield event

            # If we've seen every event in this array already, we must be at
            # the end of the journal.  Abort!
            if not found_new_event:
                break

            # Page backwards from the *second*-earliest logtime.  This ensures
            # that if there were multiple posts at the same time as the
            # earliest event in the response, we'll get all of them.
            sorted_logtimes = sorted(set(event["logtime"] for event in events))
            if len(sorted_logtimes) > 1:
                data["beforedate"] = sorted_logtimes[1]
            else:
                # Only one distinct logtime in this response (for example a
                # journal with a single post): there is no second-earliest
                # value, so fall back to the only one instead of raising
                # IndexError.
                data["beforedate"] = sorted_logtimes[0]
def write_posts_to_path(out_path, posts):
    """
    Given an iterable of posts from the XML-RPC API, write them
    as a CSV to `out_path`.

    The CSV has the columns ``subject``, ``url``, ``tags`` and ``date``.
    Rows are written to ``out_path + ".tmp"`` first and the file is moved
    into place only after every post has been written, so a failure part
    way through never leaves a truncated file at `out_path`.
    """
    tmp_path = out_path + ".tmp"

    # The csv module writes its own "\r\n" line endings.  The file must not
    # translate newlines again, or Windows output has a blank row after
    # every record: that means newline="" on Python 3 and binary mode on
    # Python 2.
    if sys.version_info[0] >= 3:
        outfile = open(tmp_path, "w", newline="")
    else:
        outfile = open(tmp_path, "wb")

    def as_text(value):
        # The subject can arrive wrapped in an XML-RPC ``Binary`` object.
        # NOTE(review): presumably the tag list can too -- confirm against
        # the API; decoding it is harmless when it is already a string.
        if isinstance(value, Binary):
            return value.data.decode("utf8")
        return value

    with outfile:
        fieldnames = ["subject", "url", "tags", "date"]
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        for post in posts:
            writer.writerow({
                "subject": as_text(post.get("subject", "")),
                "url": post["url"],
                "tags": as_text(post["props"].get("taglist", "")),
                "date": post["eventtime"],
            })

    # os.replace (Python 3.3+) overwrites an existing target on every
    # platform; os.rename raises on Windows if `out_path` already exists.
    move_into_place = getattr(os, "replace", os.rename)
    move_into_place(tmp_path, out_path)
def get_posts(api, should_include_post):
    """
    Lazily yield each post from ``api.get_all_posts()`` for which
    ``should_include_post(post)`` is truthy, in the order the API
    returns them.
    """
    every_post = api.get_all_posts()
    for candidate in every_post:
        if not should_include_post(candidate):
            continue
        yield candidate
def parse_tags(tag_string):
    """
    Split a comma-separated string into a list of tag names.

    Whitespace around each tag is stripped; empty entries (from blank
    input, doubled commas or a trailing comma) and repeated tags are
    dropped.  The order of first appearance is kept.
    """
    tags = []
    for raw_tag in tag_string.split(","):
        tag = raw_tag.strip()
        if tag and tag not in tags:
            tags.append(tag)
    return tags


def make_tag_filter(tags, operator):
    """
    Build the predicate that decides whether a post is included.

    ``operator`` is "ALL" (the post must carry every tag in ``tags``) or
    "ANY" (at least one of them).  Raises ``ValueError`` for anything else.
    """
    if operator == "ALL":
        combine = all
    elif operator == "ANY":
        combine = any
    else:
        raise ValueError("Unrecognised operator: %s" % operator)

    # Should we include this post in the list?  Change the logic in this function
    # to customise the posts that appear -- for example, you could instead
    # find posts in a particular date range, or every post that was written on
    # a Tuesday, or contains the letter "X" in the title.
    def should_include_post(post):
        post_tag_string = post["props"].get("taglist", "")
        # NOTE(review): presumably the tag list can arrive as an XML-RPC
        # Binary, the way subjects do -- confirm against the API.
        if isinstance(post_tag_string, Binary):
            post_tag_string = post_tag_string.data.decode("utf8")
        post_tags = set(t.strip() for t in post_tag_string.split(","))
        return combine(t in post_tags for t in tags)

    return should_include_post


def main():
    """
    Interactive entry point: log in, ask for tags and an ALL/ANY operator,
    then write the matching posts to a timestamped CSV in the current
    directory.

    Exit codes: 1 login failed, 2 unknown tag(s), 3 unrecognised operator,
    4 no tags entered.
    """
    username = raw_input("Username: ")
    password = getpass.getpass()

    api = DreamwidthAPI(username=username, password=password)

    # Check the user's password is correct before proceeding.
    try:
        api.call_endpoint("login")
    except Fault as err:
        print("Error from Dreamwidth: is your password correct?", file=sys.stderr)
        print(
            "XML-RPC fault %d: %s" % (err.faultCode, err.faultString),
            file=sys.stderr
        )
        sys.exit(1)

    # Ask the user for a list of tags they want to search.
    tag_string = raw_input("What tags would you like to search for? (comma separated) ")
    tags = parse_tags(tag_string)

    if not tags:
        print("You didn't enter any tags.", file=sys.stderr)
        sys.exit(4)

    # If they're only searching for a single tag, they can already view that
    # on the Dreamwidth website -- send them there instead.
    if len(tags) == 1:
        print("")
        print("You can see all posts tagged with '%s' by visiting:" % tags[0])
        print("")
        print("https://%s.dreamwidth.org/tag/%s" % (username, quote_plus(tags[0])))
        sys.exit(0)

    # Call the getusertags XML-RPC endpoint.  This tells us all the tags the user
    # has defined.  We can check the tags we've been asked to search actually
    # exist in the user's tag list.
    user_tags_resp = api.call_endpoint("getusertags")
    user_tags = {t["name"] for t in user_tags_resp["tags"]}

    not_in_user_tags = [t for t in tags if t not in user_tags]
    if not_in_user_tags:
        if len(not_in_user_tags) == 1:
            print(
                "You don't have a '%s' tag. Typo?" % not_in_user_tags[0],
                file=sys.stderr
            )
        else:
            missing = ", ".join("'%s'" % t for t in not_in_user_tags)
            print(
                "You don't have the tags %s. Typos?" % missing,
                file=sys.stderr
            )
        sys.exit(2)

    # Ask them what sort of query they want to do.
    operator = raw_input("Match ALL tags or ANY tags? ").strip().upper()
    if operator not in ("ALL", "ANY"):
        print("Unrecognised operator: %s" % operator, file=sys.stderr)
        sys.exit(3)

    # TODO: It would be nice if this slug told you something about the tag query
    # you'd used.
    slug = dt.datetime.now().strftime("%Y-%m-%d_%H-%M")
    out_path = "dreamwidth_tag_query_%s.csv" % slug

    should_include_post = make_tag_filter(tags, operator)

    print("Working!")
    matching_posts = get_posts(api, should_include_post=should_include_post)
    write_posts_to_path(out_path, posts=matching_posts)
    print("Your list of posts is in %s" % out_path)


if __name__ == "__main__":
    main()
no subject
no subject
https://github.com/dreamwidth/dw-free/issues/1166
no subject
e.g. https://dw-dev.dreamwidth.org/tag/github,jquery,javascript?mode=or
Thank you for the link!
no subject
no subject